# Parse dataset

In [1]:
# LOAD DATASET
import os
from collections import OrderedDict
from load_data import parse_XML
from random import shuffle

# Initialize data structures
corpus_tg="/Users/chiarasemenzin/Desktop/MscProject/corpus/Tagged/"
corpus_ut="/Users/chiarasemenzin/Desktop/MscProject/corpus/Untagged/"

# Maps each tagged XML file name to the [phrase_list, lemmas] pair returned by
# parse_XML.xml_iteration for that file.
TAG_DATA=OrderedDict()


# PARSE XML TAGGED
print("PARSING DATASET\n")
for root, dirs, files in os.walk(corpus_tg):
    # Sorted so that the insertion order of TAG_DATA (and therefore the seeded
    # train/test splits built from it later) does not depend on the order in
    # which the filesystem happens to list the directory.
    for file in sorted(fi for fi in files if fi.endswith(".xml")):
        print("Parsing ",file,"...")
        # Join with `root`, not `corpus_tg`: os.walk also yields files found in
        # subdirectories, for which `corpus_tg+file` would be a wrong path.
        phrase_list,lemmas=parse_XML.xml_iteration(os.path.join(root,file))
        TAG_DATA[file]=[phrase_list,lemmas]


PARSING DATASET

Parsing  porcodioaugu.xml ...
Parsing  porcodiolenz.xml ...


In [2]:
# PARSE XML UNTAGGED

# Maps each untagged XML file name to the value returned by
# parse_XML.xml_iteration(..., tagged=False) for that file.
UNTAG_DATA=OrderedDict()

for root, dirs, files in os.walk(corpus_ut):
    # Sorted for a reproducible insertion order (see how the pooled token list
    # is later sampled); os.walk's listing order is filesystem dependent.
    for file in sorted(fi for fi in files if fi.endswith(".xml")):
        print("Parsing ",file,"...")
        try:
            # Join with `root` so files inside subdirectories resolve correctly.
            phrase_list=parse_XML.xml_iteration(os.path.join(root,file),tagged=False)
        except Exception as err:
            # Still best effort (a file that fails to parse is skipped), but the
            # failure is reported instead of being silently swallowed, and
            # KeyboardInterrupt/SystemExit are no longer caught.
            print("Skipping ",file,": ",repr(err))
            continue
        UNTAG_DATA[file]=phrase_list
# Printed once after the whole walk rather than once per visited directory.
print("Done.")

Parsing  1903AUGU-N.xml ...
Parsing  1922AUGU.xml ...
Parsing  1897LENZ-9.xml ...
Parsing  1897LENZ-8.xml ...
Parsing  1913GUEV-1.xml ...
Parsing  1897LENZ-11.xml ...
Parsing  1897LENZ-10.xml ...
Parsing  1910AUGU-4.xml ...
Parsing  1902AUGU.xml ...
Parsing  1897LENZ-1.xml ...
Parsing  1910AUGU-5.xml ...
Parsing  1897LENZ-3.xml ...
Parsing  1897LENZ-2.xml ...
Parsing  1910AUGU-2.xml ...
Parsing  1897LENZ-6.xml ...
Parsing  1897LENZ-7.xml ...
Parsing  1910AUGU-3.xml ...
Parsing  1621VALD.xml ...
Parsing  1910AUGU-1.xml ...
Parsing  1897LENZ-5.xml ...
Parsing  1897LENZ-4.xml ...
Parsing  1765FEBR-2.xml ...
Parsing  1765FEBR-3.xml ...
Parsing  1765FEBR-1.xml ...
Parsing  1903AUGU-1.xml ...
Parsing  1930MOES.xml ...
Done.


# Get data stats

In [3]:
## GET FREQUENCIES FUNCT DEF ++ CAN EXPORT
def get_freqs(pool):
    """Count the occurrences of each item in ``pool``.

    Returns a list of ``(item, count)`` tuples ordered from the highest count
    to the lowest. Items with equal counts come out in reverse order of their
    first appearance, because a stable ascending sort is reversed.
    """
    counts = {}
    for token in pool:
        counts[token] = counts.get(token, 0) + 1
    ascending = sorted(counts.items(), key=lambda pair: pair[1])
    return ascending[::-1]



In [4]:
## POOL UNTAGGED DATA
# Flatten the per-file entries of UNTAG_DATA into one list, in file order.
UNTAG_pool=[w for value in UNTAG_DATA.values() for w in value]

# (item, count) pairs, most frequent first.
top_terms=get_freqs(UNTAG_pool)

print("Total untagged tokens: ",len(UNTAG_pool))
print("\nTotal untagged unique terms: ",len(top_terms))
print("\nTop 10 types: ",top_terms[:10])
print("\nSum frequencies top 10: ",sum(count for _,count in top_terms[:10]))

Total untagged tokens:  116548

Total untagged unique terms:  34872

Top 10 types:  [('feichi', 2145), ('piam', 1965), ('kiñe', 1800), ('tañi', 1239), ('mapu', 1081), ('veimeu', 837), ('piŋei', 696), ('tëfachi', 693), ('dios', 636), ('domo', 614)]

Sum frequencies top 10:  11706


In [5]:
# The 10 most frequent types cover roughly 10% of the untagged tokens
# (11706 of 116548 in the recorded run), not one third of the dataset.

import pandas
import matplotlib.pyplot as plt

# Horizontal bar chart of the 20 most frequent untagged types.
freqdf = pandas.DataFrame(top_terms[0:20], columns=['Word', 'Count']).set_index('Word')
ax = freqdf.plot.barh()
ax.set_title("20 most frequent untagged types")
ax.set_xlabel("Count")
plt.show()


<matplotlib.axes._subplots.AxesSubplot at 0x11c25a780>

In [6]:
## FLAT (POOLED) TAGGED DATA from Data Dict

# Every TAG_DATA value is a [sources, targets] pair; each side is an iterable
# of phrases, and each phrase is flattened word by word into one pool.
pool_tagged_sources=[]
pool_tagged_targets=[]

for entry in TAG_DATA.values():
    pool_tagged_sources.extend(word for phrase in entry[0] for word in phrase)
    pool_tagged_targets.extend(word for phrase in entry[1] for word in phrase)
print(len(pool_tagged_sources))



3962


In [7]:
## GET UNSEEN WORDS in UNTAGGED

# `words` holds the untagged types, `wordsu` the tagged source types.
words=set(UNTAG_pool)
wordsu=set(pool_tagged_sources)
print(len(words))
print("Total unseen types in Untagged data: ",len(words-wordsu))
print("Total seen words: ",len(wordsu&words))

34872
Total unseen types in Untagged data:  34436
Total seen words:  436


# Spelling Normalizer

In [8]:
# SPELL CHANGE 
#import change
from load_data import process_spelling

# Ordered table of spelling substitutions. Each pair is handed, one at a time
# and in this order, to the process_spelling helpers by change()/change_dict().
# NOTE(review): each pair looks like (old spelling, new spelling) — confirm
# against load_data.process_spelling.
changes=(("ʎ","ll"),("t'","tr"),("ə","ü"),("v","f"),("k","c"),("ù","ü"),("ŋ","ng"),("í","i"),("á","a"),("l'","l"))

def change(pool,changes):
    """Run ``pool`` through every entry of ``changes``, in order.

    Each entry is passed to ``process_spelling.process_pool`` together with
    the result of the previous step; the result of the last step is returned
    (``pool`` itself when ``changes`` is empty).
    """
    result=pool
    for substitution in changes:
        result=process_spelling.process_pool(result,substitution)
    return result
    
def change_dict(datadict,changes):
    """Run ``datadict`` through every entry of ``changes``, in order.

    Each entry is passed to ``process_spelling.process_spelling`` together
    with the result of the previous step; the result of the last step is
    returned (``datadict`` itself when ``changes`` is empty).
    """
    result=datadict
    for substitution in changes:
        result=process_spelling.process_spelling(result,substitution)
    return result



In [9]:
# Apply the full substitution table to both corpora via change_dict().
# NOTE(review): whether process_spelling.process_spelling returns a new
# structure or modifies UNTAG_DATA/TAG_DATA in place cannot be seen from this
# notebook — confirm before relying on the originals staying un-normalized.
UNTAG_DATA_norm=change_dict(UNTAG_DATA,changes)
TAG_DATA_norm=change_dict(TAG_DATA,changes)



PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE
PROCESSING UNTAGGED FILE


In [10]:
# FLAT NORMALIZED DATA

## POOL TAGGED DATA

# Flatten the normalized [sources, targets] pairs word by word.
pool_tag_s_norm=[]
pool_tag_t_norm=[]

for _,entry in TAG_DATA_norm.items():
    pool_tag_s_norm.extend(word for phrase in entry[0] for word in phrase)
    pool_tag_t_norm.extend(word for phrase in entry[1] for word in phrase)

print("Total tagged normalized tokens: ",len(pool_tag_s_norm))
print("Total tagged normalized types: ",len(set(pool_tag_s_norm)))
print("\n>>>Before: total tagged types: ",len(set(pool_tagged_sources)))


## COLLECT TAGGED PHRASES
# Phrase-level lists (phrases are kept whole, not flattened into words) taken
# from the tagged corpus: sources in all_tagged_phr_s, targets in
# all_tagged_phr_t.
# NOTE(review): this reads TAG_DATA, not TAG_DATA_norm — confirm that the
# phrase-level data is meant to stay un-normalized.
all_tagged_phr_s=[]
all_tagged_phr_t=[]

for entry in TAG_DATA.values():
    all_tagged_phr_s.extend(entry[0])
    all_tagged_phr_t.extend(entry[1])

# Flatten the normalized untagged corpus into a single list.
UNTAG_pool_norm=[w for _,value in UNTAG_DATA_norm.items() for w in value]

print("\nTotal untagged normalized tokens: ",len(UNTAG_pool_norm))
print("Total untagged normalized types: ",len(set(UNTAG_pool_norm)))
print("\n>>>Before: total untagged types: ",len(set(UNTAG_pool)))


Total tagged normalized tokens:  3962
Total tagged normalized types:  746

>>>Before: total tagged types:  813

Total untagged normalized tokens:  116548
Total untagged normalized types:  33533

>>>Before: total untagged types:  34872


In [11]:
# Type overlap between the normalized untagged pool (`words`) and the
# normalized tagged sources (`wordsu`).
words=set(UNTAG_pool_norm)
wordsu=set(pool_tag_s_norm)

print("Total unseen words in Untagged Data: ",len(words-wordsu))
print("\nTotal seen words: ",len(wordsu&words))

Total unseen words in Untagged Data:  33092

Total seen words:  441


After normalization there are fewer unique terms and fewer unseen words.

### Split Dataset

In [16]:
# sklearn.cross_validation was deprecated and later removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection. The fallback keeps the
# cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Token-level split: 20% held out for test, then 20% of the remainder for
# validation (64% / 16% / 20% overall). random_state pins the shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    pool_tagged_sources, pool_tagged_targets, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

In [17]:
##DUMMY BASELINE

from Models import Dummy_Baseline
from collections import OrderedDict

# Train+validation pairs whose source and target are both longer than 2.
# The pairs are filtered jointly: filtering the source list and the target list
# independently drops different positions from each and leaves X_train_all[i]
# and y_train_all[i] no longer belonging to the same example.
pairs_all=[(x,y) for x,y in zip(X_train+X_val,y_train+y_val)
           if len(x)>2 and len(y)>2]
X_train_all=[x for x,_ in pairs_all]
y_train_all=[y for _,y in pairs_all]

# NOTE(review): the baseline is still fit on X_train/y_train only, as in the
# original cell; X_train_all/y_train_all are built but not used here — confirm
# which training set the reported accuracy should be based on.
predictions=Dummy_Baseline.dummy_predict(X_train,y_train,X_test)
accuracy=Dummy_Baseline.accuracy(predictions,y_test)
print(accuracy)

NameError: name 'X_train_all' is not defined

### Baseline data

In [9]:
# Type overlap between the test sources (`words`) and the training sources
# (`wordsu`); the last expression displays the number of test types.
words=set(X_test)
wordsu=set(X_train)
print("Total unseen words: ",len(words-wordsu))
print("\nTotal seen words: ",len(wordsu&words))
len(words)

Total unseen words:  92

Total seen words:  244


336

### Spelling Normalization

In [10]:
# Normalize the spelling of the source side of every split (train, test, val,
# in that order); the target lists are left untouched by this cell.
X_train,X_test,X_val=[change(split,changes) for split in (X_train,X_test,X_val)]

NameError: name 'change' is not defined

In [11]:
# Recompute the test/train type overlap on the current X_test and X_train.
words=set(X_test)
wordsu=set(X_train)

print("Total unseen words after normalization:: ",len(words-wordsu))
print("\nTotal seen words after normalization: ",len(wordsu&words))

Total unseen words after normalization::  92

Total seen words after normalization:  244


# Write to file - no context


In [12]:
# WRITE TO FILE INCLUDING TEST - NORMAL
# TO NOT INCLUDE TEST: set Test=False
from load_data import write_noctx

# Output location handed to write_noctx as its first argument.
op="/Users/chiarasemenzin/Desktop/MscProject/corpus/Tagged"

# Positional order: train, validation, then test (sources before targets).
write_noctx.write_noctx(op, X_train, y_train, X_val, y_val, X_test, y_test)

In [18]:
# TEST UNSEEN ONLY

# Types in `words` that are absent from `wordsu`, each paired with the target
# of its first occurrence in the test split.
unseen_sources=set(words)-set(wordsu)
unseen_targets=[y_test[X_test.index(i)] for i in unseen_sources]

# Explicit UTF-8: the corpus contains non-ASCII letters and open()'s default
# encoding is platform dependent. The `with` statement closes both files, so
# no explicit close() calls are needed afterwards.
with open('corpus/Tagged/test-sources-unseen', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/test-targets-unseen', "w", encoding="utf-8") as t:
    for word, lemma in zip(unseen_sources,unseen_targets):
        # Lower-case, then separate the characters with single spaces.
        s.write("{}\n".format(" ".join(word.lower())))
        t.write("{}\n".format(" ".join(lemma.lower())))

In [19]:
# TEST UNSEEN ONLY CONTEXT

# Types in `words` that are absent from `wordsu`, each paired with the target
# of its first occurrence in the test split.
unseen_sources=set(words)-set(wordsu)
unseen_targets=[y_test[X_test.index(i)] for i in unseen_sources]

# Explicit UTF-8: the corpus contains non-ASCII letters and open()'s default
# encoding is platform dependent. The `with` statement closes both files, so
# no explicit close() calls are needed afterwards.
with open('corpus/Tagged/test-sources-unseen-ctx', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/test-targets-unseen-ctx', "w", encoding="utf-8") as t:
    for word, lemma in zip(unseen_sources,unseen_targets):
        # Lower-case, space-separate the characters, wrap in <w>...</w>.
        s.write("<w>{}</w>\n".format(" ".join(word.lower())))
        t.write("<w>{}</w>\n".format(" ".join(lemma.lower())))


In [20]:
# TEST SEEN ONLY
# Types present in both `wordsu` and `words`, each paired with the target of
# its first occurrence in the test split.
seen_sources=wordsu.intersection(set(words))
seen_targets=[y_test[X_test.index(i)] for i in seen_sources]

# Explicit UTF-8: the corpus contains non-ASCII letters and open()'s default
# encoding is platform dependent. The `with` statement closes both files, so
# no explicit close() calls are needed afterwards.
with open('corpus/Tagged/test-sources-seen', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/test-targets-seen', "w", encoding="utf-8") as t:
    for word, lemma in zip(seen_sources,seen_targets):
        # Lower-case, then separate the characters with single spaces.
        s.write("{}\n".format(" ".join(word.lower())))
        t.write("{}\n".format(" ".join(lemma.lower())))

In [21]:
# TEST TYPES ONLY
# Distinct test sources, each paired with the target of its first occurrence
# in the test split.
types_test=set(X_test)
types_targets=[y_test[X_test.index(i)] for i in types_test]

# Explicit UTF-8: the corpus contains non-ASCII letters and open()'s default
# encoding is platform dependent. The `with` statement closes both files, so
# no explicit close() calls are needed afterwards.
with open('corpus/Tagged/test-sources-types', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/test-targets-types', "w", encoding="utf-8") as t:
    for word, lemma in zip(types_test,types_targets):
        # Lower-case, then separate the characters with single spaces.
        s.write("{}\n".format(" ".join(word.lower())))
        t.write("{}\n".format(" ".join(lemma.lower())))


In [22]:
# SPELL CHECKED TYPES

# Substitution table, re-declared here with the same pairs in the same order
# as in the spelling-normalizer cell.
changes=(
    ("ʎ","ll"),
    ("t'","tr"),
    ("ə","ü"),
    ("v","f"),
    ("k","c"),
    ("ù","ü"),
    ("ŋ","ng"),
    ("í","i"),
    ("á","a"),
    ("l'","l"),
)

# Normalize the test types first, then their targets.
types_test_norm=change(types_test,changes)
types_targets_norm=change(types_targets,changes)

In [23]:
# WRITE SPELL CHECKED TYPES

# Explicit UTF-8: the corpus contains non-ASCII letters and open()'s default
# encoding is platform dependent. The `with` statement closes both files, so
# no explicit close() calls are needed afterwards.
with open('corpus/Tagged/test-sources-types-norm', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/test-targets-types-norm', "w", encoding="utf-8") as t:
    for word, lemma in zip(types_test_norm,types_targets_norm):
        # Lower-case, then separate the characters with single spaces.
        s.write("{}\n".format(" ".join(word.lower())))
        t.write("{}\n".format(" ".join(lemma.lower())))



In [24]:
# TEST UNTAGGED
import random

# Draw up to 700 untagged tokens without replacement. A dedicated, seeded
# generator makes the sample reproducible across runs, and sampling (instead of
# shuffling in place) leaves the order of UNTAG_pool untouched for other cells.
UNTAG_pool_shuffled=random.Random(1).sample(UNTAG_pool,min(700,len(UNTAG_pool)))

# Explicit UTF-8 because the tokens contain non-ASCII letters; the `with`
# statement closes the file, so no explicit close() is needed.
with open('corpus/Tagged/test-sources-untag', "w", encoding="utf-8") as s:
    for word in UNTAG_pool_shuffled:
        # Lower-case, then separate the characters with single spaces.
        s.write("{}\n".format(" ".join(word.lower())))


# Format with context

In [25]:
# sklearn.cross_validation was deprecated and later removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection. The fallback keeps the
# cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Phrase-level split: 20% held out for test, then 20% of the remainder for
# validation (64% / 16% / 20% overall). random_state pins the shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    all_tagged_phr_s, all_tagged_phr_t, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

In [29]:
"""WRITE CORPUS DATA TO LEMATUS FILE FORMAT"""

from load_data import format_context

# Directory that receives every context-formatted file.
context_dir="/Users/chiarasemenzin/Desktop/MscProject/corpus/Context/"

# NOTE(review): format_context_targets is neither imported nor defined in the
# visible cells — confirm where it lives and bring it into scope before running
# this cell on a fresh kernel.

# One export per value of n (20 first, then 5, as before). Each file name is
# prefixed with n. The module-level names n and outfile_* are still assigned,
# so after the cell they hold the n=5 values.
for n in (20,5):
    outfile_sources_tr="{}{}-sources_train.txt".format(context_dir,n)
    outfile_sources_dev="{}{}-sources_dev.txt".format(context_dir,n)
    # The n=20 test-sources path previously lacked the "corpus/" component and
    # pointed outside the directory used by every other output file.
    outfile_sources_test="{}{}-sources_test.txt".format(context_dir,n)

    outfile_targets_tr="{}{}-targets_train.txt".format(context_dir,n)
    outfile_targets_dev="{}{}-targets_dev.txt".format(context_dir,n)
    outfile_targets_test="{}{}-targets_test.txt".format(context_dir,n)

    # Sources go through the function inside the imported format_context
    # module; calling the module object itself would raise a TypeError.
    format_context.format_context(X_train,outfile_sources_tr,n)
    format_context_targets(y_train,outfile_targets_tr,n)

    format_context.format_context(X_val,outfile_sources_dev,n)
    # Validation targets go to the *_dev path; `outfile_targets_val` was never
    # defined and raised a NameError.
    format_context_targets(y_val,outfile_targets_dev,n)

    format_context.format_context(X_test,outfile_sources_test,n)
    format_context_targets(y_test,outfile_targets_test,n)

trimming input to  20 cause c is  24
presu, meli, kovienu, ñi, so
final ['presu', 'meli', 'kovienu', 'ñi', 'so']
trimming input to  20 cause c is  21
meli, kovienu, ñi, soltau, k
final ['meli', 'kovienu', 'ñi', 'soltau', 'k']
trimming input to  20 cause c is  21
kovienu, ñi, soltau, ka, kiñ
final ['kovienu', 'ñi', 'soltau', 'ka', 'kiñ']
trimming input to  20 cause c is  21
ñi, soltau, ka, kiñe, soleqa
final ['ñi', 'soltau', 'ka', 'kiñe', 'soleqa']
trimming input to  20 cause c is  22
iñ, uneivok, ilem, userp, ie
final ['ie', 'userp', 'ilem', 'uneivok', 'iñ']
trimming input to  20 cause c is  24
uatlos, iñ, uneivok, ilem, u
final ['u', 'ilem', 'uneivok', 'iñ', 'uatlos']
trimming input to  20 cause c is  21
ak, uatlos, iñ, uneivok, ile
final ['ile', 'uneivok', 'iñ', 'uatlos', 'ak']
trimming input to  20 cause c is  21
eñik, ak, uatlos, iñ, uneivo
final ['uneivo', 'iñ', 'uatlos', 'ak', 'eñik']
trimming input to  20 cause c is  26
domo, nie, kechu, piñen, epu, 
final ['domo', 'nie', 'kechu

trimming input to  20 cause c is  24
adew, im, uhcak, iñ, onəp, uem, 
final ['', 'uem', 'onəp', 'iñ', 'uhcak', 'im', 'adew']
trimming input to  20 cause c is  22
ip, adew, im, uhcak, iñ, onəp, u
final ['u', 'onəp', 'iñ', 'uhcak', 'im', 'adew', 'ip']
trimming input to  20 cause c is  26
ina, nor, leuvu, rume, lot'a, 
final ['ina', 'nor', 'leuvu', 'rume', "lot'a", '']
trimming input to  20 cause c is  23
nor, leuvu, rume, lot'a, cha
final ['nor', 'leuvu', 'rume', "lot'a", 'cha']
trimming input to  20 cause c is  23
leuvu, rume, lot'a, chapad, 
final ['leuvu', 'rume', "lot'a", 'chapad', '']
trimming input to  20 cause c is  21
emur, uvuel, ron, ani, lumuh
final ['lumuh', 'ani', 'ron', 'uvuel', 'emur']
trimming input to  20 cause c is  26
a'tol, emur, uvuel, ron, ani, 
final ['', 'ani', 'ron', 'uvuel', 'emur', "a'tol"]
trimming input to  20 cause c is  23
dapahc, a'tol, emur, uvuel, 
final ['', 'uvuel', 'emur', "a'tol", 'dapahc']
trimming input to  20 cause c is  22
wedá, wentru, nge, n·ai

final ['ule', 'utnum', 'iñat', 'uripmihc']
trimming input to  20 cause c is  21
pichi, che, pe, ñi, mətte, ŋüm
final ['pichi', 'che', 'pe', 'ñi', 'mətte', 'ŋüm']
trimming input to  20 cause c is  23
ihcavt, uripmihc, iñat, ut
final ['ut', 'iñat', 'uripmihc', 'ihcavt']
trimming input to  20 cause c is  22
che, pe, ñi, mətte, ŋüma, ñi, ñu
final ['che', 'pe', 'ñi', 'mətte', 'ŋüma', 'ñi', 'ñu']
trimming input to  20 cause c is  23
ihcip, ihcavt, uripmihc, i
final ['i', 'uripmihc', 'ihcavt', 'ihcip']
trimming input to  20 cause c is  23
pe, ñi, mətte, ŋüma, ñi, ñuke, k
final ['pe', 'ñi', 'mətte', 'ŋüma', 'ñi', 'ñuke', 'k']
trimming input to  20 cause c is  22
ehc, ihcip, ihcavt, uripmi
final ['uripmi', 'ihcavt', 'ihcip', 'ehc']
trimming input to  20 cause c is  21
ñi, mətte, ŋüma, ñi, ñuke, kew
final ['ñi', 'mətte', 'ŋüma', 'ñi', 'ñuke', 'kew']
trimming input to  20 cause c is  24
ep, ehc, ihcip, ihcavt, urip
final ['urip', 'ihcavt', 'ihcip', 'ehc', 'ep']
trimming input to  20 cause c is  2

trimming input to  20 cause c is  24
ñi, plata, ka, chilla, ñi, doy, 
final ['ñi', 'plata', 'ka', 'chilla', 'ñi', 'doy', '']
trimming input to  20 cause c is  24
fil, eliwiamhsip, atalp, a
final ['a', 'atalp', 'eliwiamhsip', 'fil']
trimming input to  20 cause c is  22
plata, ka, chilla, ñi, doy, kü
final ['plata', 'ka', 'chilla', 'ñi', 'doy', 'kü']
trimming input to  20 cause c is  21
iñ, fil, eliwiamhsip, atal
final ['atal', 'eliwiamhsip', 'fil', 'iñ']
trimming input to  20 cause c is  24
ka, chilla, ñi, doy, küme, kaw
final ['ka', 'chilla', 'ñi', 'doy', 'küme', 'kaw']
trimming input to  20 cause c is  21
atalp, iñ, fil, eliwiamhsi
final ['eliwiamhsi', 'fil', 'iñ', 'atalp']
trimming input to  20 cause c is  22
chilla, ñi, doy, küme, kawel
final ['chilla', 'ñi', 'doy', 'küme', 'kawel']
trimming input to  20 cause c is  23
ak, atalp, iñ, fil, eliwiamh
final ['eliwiamh', 'fil', 'iñ', 'atalp', 'ak']
trimming input to  20 cause c is  25
ñi, doy, küme, kawellu, təku, 
final ['ñi', 'doy', 'k

final ['iñ', 'ŋapil', 'eñik', 'ek', 'ütna', 'uem']
trimming input to  20 cause c is  22
pu, tañi, küdau, tvachi, lip
final ['pu', 'tañi', 'küdau', 'tvachi', 'lip']
trimming input to  20 cause c is  22
ewen, uem, ütna, ek, eñik, ŋap
final ['ŋap', 'eñik', 'ek', 'ütna', 'uem', 'ewen']
trimming input to  20 cause c is  23
tañi, küdau, tvachi, lipaŋ, 
final ['tañi', 'küdau', 'tvachi', 'lipaŋ', '']
trimming input to  20 cause c is  24
up, ewen, uem, ütna, ek, eñik, ŋ
final ['ŋ', 'eñik', 'ek', 'ütna', 'uem', 'ewen', 'up']
trimming input to  20 cause c is  23
iñat, up, ewen, uem, ütna, ek, e
final ['e', 'ek', 'ütna', 'uem', 'ewen', 'up', 'iñat']
trimming input to  20 cause c is  22
uadük, iñat, up, ewen, uem, üt
final ['üt', 'uem', 'ewen', 'up', 'iñat', 'uadük']
trimming input to  20 cause c is  21
ihcavt, uadük, iñat, up, ewe
final ['ewe', 'up', 'iñat', 'uadük', 'ihcavt']
trimming input to  20 cause c is  22
ŋapil, ihcavt, uadük, iñat, 
final ['', 'iñat', 'uadük', 'ihcavt', 'ŋapil']
trimming 

trimming input to  20 cause c is  27
wəla, kiñe, meu, ká, wep, pish
final ['wəla', 'kiñe', 'meu', 'ká', 'wep', 'pish']
trimming input to  20 cause c is  23
kiñe, meu, ká, wep, pishmaiw
final ['kiñe', 'meu', 'ká', 'wep', 'pishmaiw']
trimming input to  20 cause c is  21
meu, ká, wep, pishmaiwile, ü
final ['meu', 'ká', 'wep', 'pishmaiwile', 'ü']
trimming input to  20 cause c is  23
eliwiamhsip, pew, ák, uem, e
final ['e', 'uem', 'ák', 'pew', 'eliwiamhsip']
trimming input to  20 cause c is  26
wed, winkul, meu, vau, ina, re, 
final ['wed', 'winkul', 'meu', 'vau', 'ina', 're', '']
trimming input to  20 cause c is  23
winkul, meu, vau, ina, re, lel
final ['winkul', 'meu', 'vau', 'ina', 're', 'lel']
trimming input to  20 cause c is  23
meu, vau, ina, re, lelvun, vei, 
final ['meu', 'vau', 'ina', 're', 'lelvun', 'vei', '']
trimming input to  20 cause c is  23
vau, ina, re, lelvun, vei, ula, 
final ['vau', 'ina', 're', 'lelvun', 'vei', 'ula', '']
trimming input to  20 cause c is  23
ani, uav, u

chad, trawall, tolto, ka, ñi, 
final ['chad', 'trawall', 'tolto', 'ka', 'ñi', '']
trimming input to  20 cause c is  27
ahsew, ak, iñ, ak, ugne, ertek, 
final ['', 'ertek', 'ugne', 'ak', 'iñ', 'ak', 'ahsew']
trimming input to  20 cause c is  21
trawall, tolto, ka, ñi, plat
final ['trawall', 'tolto', 'ka', 'ñi', 'plat']
trimming input to  20 cause c is  24
dahc, ahsew, ak, iñ, ak, ugne, e
final ['e', 'ugne', 'ak', 'iñ', 'ak', 'ahsew', 'dahc']
trimming input to  20 cause c is  21
tolto, ka, ñi, plata, witra, e
final ['tolto', 'ka', 'ñi', 'plata', 'witra', 'e']
trimming input to  20 cause c is  22
llawart, dahc, ahsew, ak, iñ, 
final ['', 'iñ', 'ak', 'ahsew', 'dahc', 'llawart']
trimming input to  20 cause c is  22
ka, ñi, plata, witra, el, chem, 
final ['ka', 'ñi', 'plata', 'witra', 'el', 'chem', '']
trimming input to  20 cause c is  21
otlot, llawart, dahc, ahse
final ['ahse', 'dahc', 'llawart', 'otlot']
trimming input to  20 cause c is  24
ñi, plata, witra, el, chem, no, 
final ['ñi', 'p

trimming input to  20 cause c is  22
meli, mari, mansun, pe, ka, mə
final ['meli', 'mari', 'mansun', 'pe', 'ka', 'mə']
trimming input to  20 cause c is  21
akaw, ek, ahcuv, akatap, elə
final ['elə', 'akatap', 'ahcuv', 'ek', 'akaw']
trimming input to  20 cause c is  22
mari, mansun, pe, ka, məle, ma
final ['mari', 'mansun', 'pe', 'ka', 'məle', 'ma']
trimming input to  20 cause c is  21
ilem, akaw, ek, ahcuv, akata
final ['akata', 'ahcuv', 'ek', 'akaw', 'ilem']
trimming input to  20 cause c is  23
mansun, pe, ka, məle, mari, ke
final ['mansun', 'pe', 'ka', 'məle', 'mari', 'ke']
trimming input to  20 cause c is  25
iram, ilem, akaw, ek, ahcuv, a
final ['a', 'ahcuv', 'ek', 'akaw', 'ilem', 'iram']
trimming input to  20 cause c is  21
pe, ka, məle, mari, kechu, tor
final ['pe', 'ka', 'məle', 'mari', 'kechu', 'tor']
trimming input to  20 cause c is  25
nusnam, iram, ilem, akaw, ek, 
final ['', 'ek', 'akaw', 'ilem', 'iram', 'nusnam']
trimming input to  20 cause c is  22
ep, nusnam, iram, ilem,

final ['yem', 'mai', 'ñi', 'peñi', 'em', 'pí', 'atap']
trimming input to  20 cause c is  23
ilap, emük, étəm, eliwiamh
final ['eliwiamh', 'étəm', 'emük', 'ilap']
trimming input to  20 cause c is  24
mai, ñi, peñi, em, pí, atapay, f
final ['mai', 'ñi', 'peñi', 'em', 'pí', 'atapay', 'f']
trimming input to  20 cause c is  26
mey, ilap, emük, étəm, eliwi
final ['eliwi', 'étəm', 'emük', 'ilap', 'mey']
trimming input to  20 cause c is  21
ñi, peñi, em, pí, atapay, füch
final ['ñi', 'peñi', 'em', 'pí', 'atapay', 'füch']
trimming input to  20 cause c is  29
iam, mey, ilap, emük, étəm, el
final ['el', 'étəm', 'emük', 'ilap', 'mey', 'iam']
trimming input to  20 cause c is  23
peñi, em, pí, atapay, fücha, e
final ['peñi', 'em', 'pí', 'atapay', 'fücha', 'e']
trimming input to  20 cause c is  31
iñ, iam, mey, ilap, emük, étəm, 
final ['', 'étəm', 'emük', 'ilap', 'mey', 'iam', 'iñ']
trimming input to  20 cause c is  22
em, pí, atapay, fücha, eimi, k
final ['em', 'pí', 'atapay', 'fücha', 'eimi', 'k']

trimming input to  20 cause c is  24
ruka, meu, pe, doi, küme, vald
final ['ruka', 'meu', 'pe', 'doi', 'küme', 'vald']
trimming input to  20 cause c is  22
meu, pe, doi, küme, valdivia, 
final ['meu', 'pe', 'doi', 'küme', 'valdivia', '']
trimming input to  20 cause c is  23
pe, doi, küme, valdivia, ñi, r
final ['pe', 'doi', 'küme', 'valdivia', 'ñi', 'r']
trimming input to  20 cause c is  21
doi, küme, valdivia, ñi, ruk
final ['doi', 'küme', 'valdivia', 'ñi', 'ruk']
trimming input to  20 cause c is  24
aividlav, emük, iod, ep, uem, 
final ['', 'uem', 'ep', 'iod', 'emük', 'aividlav']
trimming input to  20 cause c is  22
iñ, aividlav, emük, iod, ep, u
final ['u', 'ep', 'iod', 'emük', 'aividlav', 'iñ']
trimming input to  20 cause c is  24
ngəf, amu, pishmaiwile, ñi, 
final ['ngəf', 'amu', 'pishmaiwile', 'ñi', '']
trimming input to  20 cause c is  23
amu, pishmaiwile, ñi, ruká, 
final ['amu', 'pishmaiwile', 'ñi', 'ruká', '']
trimming input to  20 cause c is  22
pishmaiwile, ñi, ruká, meu, 


trimming input to  20 cause c is  21
fücha, məle, ñi, utrul, plat
final ['fücha', 'məle', 'ñi', 'utrul', 'plat']
trimming input to  20 cause c is  22
lurtu, iñ, eləm, ahcüf, yapa
final ['yapa', 'ahcüf', 'eləm', 'iñ', 'lurtu']
trimming input to  20 cause c is  21
atalp, lurtu, iñ, eləm, ahcü
final ['ahcü', 'eləm', 'iñ', 'lurtu', 'atalp']
trimming input to  20 cause c is  21
tveichi, chaʎa, chem, ñi, ml
final ['tveichi', 'chaʎa', 'chem', 'ñi', 'ml']
trimming input to  20 cause c is  22
iñ, mehc, aʎahc, ihcievt, ut
final ['ut', 'ihcievt', 'aʎahc', 'mehc', 'iñ']
trimming input to  20 cause c is  21
elm, iñ, mehc, aʎahc, ihciev
final ['ihciev', 'aʎahc', 'mehc', 'iñ', 'elm']
trimming input to  20 cause c is  21
kiñe, kuikui, mle, kiñe, wel
final ['kiñe', 'kuikui', 'mle', 'kiñe', 'wel']
trimming input to  20 cause c is  25
kuikui, mle, kiñe, welu, ina, 
final ['kuikui', 'mle', 'kiñe', 'welu', 'ina', '']
trimming input to  20 cause c is  22
mle, kiñe, welu, ina, pukem, a
final ['mle', 'kiñe', 

final ['o', 'uehc', 'uma', 'ihcavt', 'üpür', 'at']
trimming input to  20 cause c is  21
amu, ket'an, kachiʎa, meu, p
final ['amu', "ket'an", 'kachiʎa', 'meu', 'p']
trimming input to  20 cause c is  22
avt, at, üpür, ihcavt, uma, ue
final ['ue', 'uma', 'ihcavt', 'üpür', 'at', 'avt']
trimming input to  20 cause c is  22
ket'an, kachiʎa, meu, pu, lo
final ["ket'an", 'kachiʎa', 'meu', 'pu', 'lo']
trimming input to  20 cause c is  21
uma, avt, at, üpür, ihcavt, um
final ['um', 'ihcavt', 'üpür', 'at', 'avt', 'uma']
trimming input to  20 cause c is  24
na'tek, uma, avt, at, üpür, ih
final ['ih', 'üpür', 'at', 'avt', 'uma', "na'tek"]
trimming input to  20 cause c is  21
aʎihcak, na'tek, uma, avt, a
final ['a', 'avt', 'uma', "na'tek", 'aʎihcak']
trimming input to  20 cause c is  22
uem, aʎihcak, na'tek, uma, a
final ['a', 'uma', "na'tek", 'aʎihcak', 'uem']
trimming input to  20 cause c is  21
up, uem, aʎihcak, na'tek, um
final ['um', "na'tek", 'aʎihcak', 'uem', 'up']
trimming input to  20 cause

trimming input to  20 cause c is  21
elm, iñat, avt, iam, uma, numa
final ['numa', 'uma', 'iam', 'avt', 'iñat', 'elm']
trimming input to  20 cause c is  23
wenüi, mütte, cheŋar, kewa, 
final ['wenüi', 'mütte', 'cheŋar', 'kewa', '']
trimming input to  20 cause c is  24
awek, raŋehc, ettüm, iünew, 
final ['', 'iünew', 'ettüm', 'raŋehc', 'awek']
trimming input to  20 cause c is  24
ruka, məttewe, weza, ruka, p
final ['ruka', 'məttewe', 'weza', 'ruka', 'p']
trimming input to  20 cause c is  22
məttewe, weza, ruka, puüʎi, 
final ['məttewe', 'weza', 'ruka', 'puüʎi', '']
trimming input to  20 cause c is  24
weza, ruka, puüʎi, ta, iʎvod, 
final ['weza', 'ruka', 'puüʎi', 'ta', 'iʎvod', '']
trimming input to  20 cause c is  23
ruka, puüʎi, ta, iʎvod, wauw, 
final ['ruka', 'puüʎi', 'ta', 'iʎvod', 'wauw', '']
trimming input to  20 cause c is  21
azew, ewettəm, akur, ihcav
final ['ihcav', 'akur', 'ewettəm', 'azew']
trimming input to  20 cause c is  25
akur, azew, ewettəm, akur, i
final ['i', 'akur'

final ['', 'agn', 'ihcafət', 'emük', 'urtnew']
trimming input to  20 cause c is  24
meu, nga, ləkai, weda, kon, mi, 
final ['meu', 'nga', 'ləkai', 'weda', 'kon', 'mi', '']
trimming input to  20 cause c is  23
upe, urtnew, emük, ihcafət, 
final ['', 'ihcafət', 'emük', 'urtnew', 'upe']
trimming input to  20 cause c is  21
nga, ləkai, weda, kon, mi, ruk
final ['nga', 'ləkai', 'weda', 'kon', 'mi', 'ruk']
trimming input to  20 cause c is  23
uem, upe, urtnew, emük, ihca
final ['ihca', 'emük', 'urtnew', 'upe', 'uem']
trimming input to  20 cause c is  21
ləkai, weda, kon, mi, ruká, me
final ['ləkai', 'weda', 'kon', 'mi', 'ruká', 'me']
trimming input to  20 cause c is  26
agn, uem, upe, urtnew, emük, i
final ['i', 'emük', 'urtnew', 'upe', 'uem', 'agn']
trimming input to  20 cause c is  23
weda, kon, mi, ruká, meu, pi, sh
final ['weda', 'kon', 'mi', 'ruká', 'meu', 'pi', 'sh']
trimming input to  20 cause c is  24
iakəl, agn, uem, upe, urtnew, 
final ['', 'urtnew', 'upe', 'uem', 'agn', 'iakəl']
t

trimming input to  20 cause c is  22
duŋu, kəpa, anai, müchai, mə
final ['duŋu', 'kəpa', 'anai', 'müchai', 'mə']
trimming input to  20 cause c is  22
kəpa, anai, müchai, məte, mu
final ['kəpa', 'anai', 'müchai', 'məte', 'mu']
trimming input to  20 cause c is  22
anai, müchai, məte, mupi, kü
final ['anai', 'müchai', 'məte', 'mupi', 'kü']
trimming input to  20 cause c is  21
müchai, məte, mupi, küpa, eŋ
final ['müchai', 'məte', 'mupi', 'küpa', 'eŋ']
trimming input to  20 cause c is  23
iana, apək, uŋud, ipum, uem, m
final ['m', 'uem', 'ipum', 'uŋud', 'apək', 'iana']
trimming input to  20 cause c is  24
məte, mupi, küpa, eŋu, chei, t
final ['məte', 'mupi', 'küpa', 'eŋu', 'chei', 't']
trimming input to  20 cause c is  22
iahcüm, iana, apək, uŋud, ip
final ['ip', 'uŋud', 'apək', 'iana', 'iahcüm']
trimming input to  20 cause c is  22
etəm, iahcüm, iana, apək, uŋ
final ['uŋ', 'apək', 'iana', 'iahcüm', 'etəm']
trimming input to  20 cause c is  22
ipum, etəm, iahcüm, iana, ap
final ['ap', 'iana

ihcip, etəm, ein, ehcil, iñ, a
final ['a', 'iñ', 'ehcil', 'ein', 'etəm', 'ihcip']
trimming input to  20 cause c is  24
anasnam, ihcip, etəm, ein, e
final ['e', 'ein', 'etəm', 'ihcip', 'anasnam']
trimming input to  20 cause c is  21
ak, anasnam, ihcip, etəm, ei
final ['ei', 'etəm', 'ihcip', 'anasnam', 'ak']
trimming input to  20 cause c is  21
ein, ak, anasnam, ihcip, etə
final ['etə', 'ihcip', 'anasnam', 'ak', 'ein']
trimming input to  20 cause c is  23
va, ein, ak, anasnam, ihcip, e
final ['e', 'ihcip', 'anasnam', 'ak', 'ein', 'va']
trimming input to  20 cause c is  24
ekrüm, va, ein, ak, anasnam, i
final ['i', 'anasnam', 'ak', 'ein', 'va', 'ekrüm']
trimming input to  20 cause c is  22
uŋe, ekrüm, va, ein, ak, anasn
final ['anasn', 'ak', 'ein', 'va', 'ekrüm', 'uŋe']
trimming input to  20 cause c is  27
ürkü, ñi, palí, umau, pí, pish
final ['ürkü', 'ñi', 'palí', 'umau', 'pí', 'pish']
trimming input to  20 cause c is  23
ñi, palí, umau, pí, pishmaiw
final ['ñi', 'palí', 'umau', 'pí', 'p

elu, kiñe, kat', welu, wüne, p
final ['elu', 'kiñe', "kat'", 'welu', 'wüne', 'p']
trimming input to  20 cause c is  24
kiñe, kat', welu, wüne, putu, 
final ['kiñe', "kat'", 'welu', 'wüne', 'putu', '']
trimming input to  20 cause c is  21
ule, iam, naknak, ihcip, mau
final ['mau', 'ihcip', 'naknak', 'iam', 'ule']
trimming input to  20 cause c is  24
kat', welu, wüne, putu, kiñe, 
final ["kat'", 'welu', 'wüne', 'putu', 'kiñe', '']
trimming input to  20 cause c is  21
eñik, ule, iam, naknak, ihci
final ['ihci', 'naknak', 'iam', 'ule', 'eñik']
trimming input to  20 cause c is  27
welu, wüne, putu, kiñe, vaso, 
final ['welu', 'wüne', 'putu', 'kiñe', 'vaso', '']
trimming input to  20 cause c is  25
'tak, eñik, ule, iam, naknak, 
final ['', 'naknak', 'iam', 'ule', 'eñik', "'tak"]
trimming input to  20 cause c is  23
wüne, putu, kiñe, vaso, mans
final ['wüne', 'putu', 'kiñe', 'vaso', 'mans']
trimming input to  20 cause c is  24
ulew, 'tak, eñik, ule, iam, na
final ['na', 'iam', 'ule', 'eñik', 

trimming input to  20 cause c is  23
eye, naq, küpa, kiñe, domo, ra
final ['eye', 'naq', 'küpa', 'kiñe', 'domo', 'ra']
trimming input to  20 cause c is  23
eñik, apük, qan, eye, roñes, u
final ['u', 'roñes', 'eye', 'qan', 'apük', 'eñik']
trimming input to  20 cause c is  23
omod, eñik, apük, qan, eye, ro
final ['ro', 'eye', 'qan', 'apük', 'eñik', 'omod']
trimming input to  20 cause c is  23
mai, amu, mai, vei, ürke, mai, k
final ['mai', 'amu', 'mai', 'vei', 'ürke', 'mai', 'k']
trimming input to  20 cause c is  22
amu, mai, vei, ürke, mai, küpa, 
final ['amu', 'mai', 'vei', 'ürke', 'mai', 'küpa', '']
trimming input to  20 cause c is  23
mai, vei, ürke, mai, küpa, am, r
final ['mai', 'vei', 'ürke', 'mai', 'küpa', 'am', 'r']
trimming input to  20 cause c is  22
vei, ürke, mai, küpa, am, rupa, 
final ['vei', 'ürke', 'mai', 'küpa', 'am', 'rupa', '']
trimming input to  20 cause c is  22
ürke, mai, küpa, am, rupa, pi, n
final ['ürke', 'mai', 'küpa', 'am', 'rupa', 'pi', 'n']
trimming input to 

trimming input to  20 cause c is  21
kechu, we, kuram, vei, pi, tam
final ['kechu', 'we', 'kuram', 'vei', 'pi', 'tam']
trimming input to  20 cause c is  24
we, kuram, vei, pi, tami, ñawe, 
final ['we', 'kuram', 'vei', 'pi', 'tami', 'ñawe', '']
trimming input to  20 cause c is  22
kuram, vei, pi, tami, ñawe, kü
final ['kuram', 'vei', 'pi', 'tami', 'ñawe', 'kü']
trimming input to  20 cause c is  26
vei, pi, tami, ñawe, küpa, ko, m
final ['vei', 'pi', 'tami', 'ñawe', 'küpa', 'ko', 'm']
trimming input to  20 cause c is  23
pi, tami, ñawe, küpa, ko, mett
final ['pi', 'tami', 'ñawe', 'küpa', 'ko', 'mett']
trimming input to  20 cause c is  21
tami, ñawe, küpa, ko, mettaw
final ['tami', 'ñawe', 'küpa', 'ko', 'mettaw']
trimming input to  20 cause c is  21
ip, iev, maruk, ew, uhcek, müv
final ['müv', 'uhcek', 'ew', 'maruk', 'iev', 'ip']
trimming input to  20 cause c is  21
imat, ip, iev, maruk, ew, uhce
final ['uhce', 'ew', 'maruk', 'iev', 'ip', 'imat']
trimming input to  20 cause c is  25
ewañ,

final ['', 'ihcavt', 'müñü', 'oli', 'mak', 'emük']
trimming input to  20 cause c is  23
mak, emük, mak, oli, müñü, ihc
final ['ihc', 'müñü', 'oli', 'mak', 'emük', 'mak']
trimming input to  20 cause c is  22
mula, üt'uv, vücha, lil, meu, 
final ['mula', "üt'uv", 'vücha', 'lil', 'meu', '']
trimming input to  20 cause c is  21
üt'uv, vücha, lil, meu, la, ka
final ["üt'uv", 'vücha', 'lil', 'meu', 'la', 'ka']
trimming input to  20 cause c is  22
vücha, lil, meu, la, kai, ka, ur
final ['vücha', 'lil', 'meu', 'la', 'kai', 'ka', 'ur']
trimming input to  20 cause c is  21
lil, meu, la, kai, ka, urvi, no, k
final ['lil', 'meu', 'la', 'kai', 'ka', 'urvi', 'no', 'k']
trimming input to  20 cause c is  21
meu, la, kai, ka, urvi, no, ko, me
final ['meu', 'la', 'kai', 'ka', 'urvi', 'no', 'ko', 'me']
trimming input to  20 cause c is  21
lil, ahcüv, vu'tü, alum, eñi
final ['eñi', 'alum', "vu'tü", 'ahcüv', 'lil']
trimming input to  20 cause c is  24
uem, lil, ahcüv, vu'tü, alum, 
final ['', 'alum', "vu't

trimming input to  20 cause c is  22
eyeu, amu, anai, tañi, ruka, m
final ['eyeu', 'amu', 'anai', 'tañi', 'ruka', 'm']
trimming input to  20 cause c is  21
iñat, iana, uma, ueye, emük, e
final ['e', 'emük', 'ueye', 'uma', 'iana', 'iñat']
trimming input to  20 cause c is  23
akur, iñat, iana, uma, ueye, e
final ['e', 'ueye', 'uma', 'iana', 'iñat', 'akur']
trimming input to  20 cause c is  22
uem, akur, iñat, iana, uma, ue
final ['ue', 'uma', 'iana', 'iñat', 'akur', 'uem']
trimming input to  20 cause c is  22
cham, tvachi, ruka, küme, kü
final ['cham', 'tvachi', 'ruka', 'küme', 'kü']
trimming input to  20 cause c is  22
tvachi, ruka, küme, küpa, kr
final ['tvachi', 'ruka', 'küme', 'küpa', 'kr']
trimming input to  20 cause c is  23
ruka, küme, küpa, krüv, lar, c
final ['ruka', 'küme', 'küpa', 'krüv', 'lar', 'c']
trimming input to  20 cause c is  21
küme, küpa, krüv, lar, chei, a
final ['küme', 'küpa', 'krüv', 'lar', 'chei', 'a']
trimming input to  20 cause c is  22
emük, akur, ihcavt, mah

trimming input to  20 cause c is  24
ehcni, ma'tün, at, uŋud, ne'
final ["ne'", 'uŋud', 'at', "ma'tün", 'ehcni']
trimming input to  20 cause c is  26
ep, ehcni, ma'tün, at, uŋud, n
final ['n', 'uŋud', 'at', "ma'tün", 'ehcni', 'ep']
trimming input to  20 cause c is  21
uemiev, ep, ehcni, ma'tün, a
final ['a', "ma'tün", 'ehcni', 'ep', 'uemiev']
trimming input to  20 cause c is  21
tvachi, ilo, küme, weda, nüm
final ['tvachi', 'ilo', 'küme', 'weda', 'nüm']
trimming input to  20 cause c is  21
adew, emük, oli, ihcavt, apü
final ['apü', 'ihcavt', 'oli', 'emük', 'adew']
trimming input to  20 cause c is  21
ümün, adew, emük, oli, ihcav
final ['ihcav', 'oli', 'emük', 'adew', 'ümün']
trimming input to  20 cause c is  21
mle, pichi, che, t'ipa, vau, t
final ['mle', 'pichi', 'che', "t'ipa", 'vau', 't']
trimming input to  20 cause c is  21
pichi, che, t'ipa, vau, ta, ml
final ['pichi', 'che', "t'ipa", 'vau', 'ta', 'ml']
trimming input to  20 cause c is  23
uav, api't, ehc, ihcip, elm, u
final ['u'

final ['mau', 'ihcavt', 'uaduk', 'nüvlel']
trimming input to  20 cause c is  22
doi, küme, tañi, kat'ü, mawi
final ['doi', 'küme', 'tañi', "kat'ü", 'mawi']
trimming input to  20 cause c is  24
uem, nüvlel, uaduk, ihcavt, 
final ['', 'ihcavt', 'uaduk', 'nüvlel', 'uem']
trimming input to  20 cause c is  22
küme, tañi, kat'ü, mawida, m
final ['küme', 'tañi', "kat'ü", 'mawida', 'm']
trimming input to  20 cause c is  23
iod, uem, nüvlel, uaduk, ihc
final ['ihc', 'uaduk', 'nüvlel', 'uem', 'iod']
trimming input to  20 cause c is  22
tañi, kat'ü, mawida, meu, ch
final ['tañi', "kat'ü", 'mawida', 'meu', 'ch']
trimming input to  20 cause c is  21
emük, iod, uem, nüvlel, uadu
final ['uadu', 'nüvlel', 'uem', 'iod', 'emük']
trimming input to  20 cause c is  22
kat'ü, mawida, meu, cheu, ta
final ["kat'ü", 'mawida', 'meu', 'cheu', 'ta']
trimming input to  20 cause c is  25
iñat, emük, iod, uem, nüvlel, 
final ['', 'nüvlel', 'uem', 'iod', 'emük', 'iñat']
trimming input to  20 cause c is  25
ü'tak, iña

trimming input to  20 cause c is  23
pe, ñi, kure, fei, uma, ñi, ruká, 
final ['pe', 'ñi', 'kure', 'fei', 'uma', 'ñi', 'ruká', '']
trimming input to  20 cause c is  22
ulew, uem, ákur, iñ, up, ief, up
final ['up', 'ief', 'up', 'iñ', 'ákur', 'uem', 'ulew']
trimming input to  20 cause c is  21
ñi, kure, fei, uma, ñi, ruká, me
final ['ñi', 'kure', 'fei', 'uma', 'ñi', 'ruká', 'me']
trimming input to  20 cause c is  24
ep, ulew, uem, ákur, iñ, up, ief, 
final ['', 'ief', 'up', 'iñ', 'ákur', 'uem', 'ulew', 'ep']
trimming input to  20 cause c is  22
iñ, ep, ulew, uem, ákur, iñ, up, i
final ['i', 'up', 'iñ', 'ákur', 'uem', 'ulew', 'ep', 'iñ']
trimming input to  20 cause c is  21
eruk, iñ, ep, ulew, uem, ákur, i
final ['i', 'ákur', 'uem', 'ulew', 'ep', 'iñ', 'eruk']
trimming input to  20 cause c is  22
ief, eruk, iñ, ep, ulew, uem, ák
final ['ák', 'uem', 'ulew', 'ep', 'iñ', 'eruk', 'ief']
trimming input to  20 cause c is  21
amu, ief, eruk, iñ, ep, ulew, ue
final ['ue', 'ulew', 'ep', 'iñ', 'eru

trimming input to  20 cause c is  22
eliwiamhsip, ekulef, árt
final ['árt', 'ekulef', 'eliwiamhsip']
trimming input to  20 cause c is  22
katrü, chi, traru, femngec
final ['katrü', 'chi', 'traru', 'femngec']
trimming input to  20 cause c is  22
uhsik, eliwiamhsip, ekul
final ['ekul', 'eliwiamhsip', 'uhsik']
trimming input to  20 cause c is  22
chi, traru, femngechi, mon
final ['chi', 'traru', 'femngechi', 'mon']
trimming input to  20 cause c is  21
ürtak, uhsik, eliwiamhsi
final ['eliwiamhsi', 'uhsik', 'ürtak']
trimming input to  20 cause c is  21
traru, femngechi, montu, p
final ['traru', 'femngechi', 'montu', 'p']
trimming input to  20 cause c is  24
ihc, ürtak, uhsik, eliwiam
final ['eliwiam', 'uhsik', 'ürtak', 'ihc']
trimming input to  20 cause c is  27
femngechi, montu, pi, pish
final ['femngechi', 'montu', 'pi', 'pish']
trimming input to  20 cause c is  29
urart, ihc, ürtak, uhsik, el
final ['el', 'uhsik', 'ürtak', 'ihc', 'urart']
trimming input to  20 cause c is  22
ihcegnmef, u

NameError: name 'format_context_targets' is not defined

In [None]:
# Flatten the per-file entries of TAG_DATA into two corpus-wide lists:
# entry[0] holds the phrases of one file, entry[1] the matching lemmas.
all_tagged_phr_s=[]
all_tagged_phr_t=[]

for entry in TAG_DATA.values():
    all_tagged_phr_s.extend(entry[0])
    all_tagged_phr_t.extend(entry[1])

In [None]:
# Spelling normalization of context (lists of phrases).
# Each rule is an (old, new) pair handed to `change` together with a phrase.
# NOTE(review): the only definition of `change` visible in this part of the
# notebook sits in a later cell ("WRITE SPELL CHECKED TO H-A") - confirm it is
# available before this cell runs on a fresh kernel.
changes=(("ʎ","ll"),("t'","tr"),("ə","ü"),("v","f"),("k","c"),("ù","ü"),("ŋ","ng"),("í","i"),("á","a"),("l'","l"))

normalized_phr_s=[change(phrase,changes) for phrase in all_tagged_phr_s]
normalized_phr_t=[change(phrase,changes) for phrase in all_tagged_phr_t]

# Shuffle sources and targets together so that each pair stays aligned.
# NOTE(review): this shuffles the un-normalized lists (and rebinds them as
# tuples); normalized_phr_s / normalized_phr_t keep their original order -
# confirm that this is intended.
paired_phrases = list(zip(all_tagged_phr_s, all_tagged_phr_t))
shuffle(paired_phrases)
all_tagged_phr_s, all_tagged_phr_t = zip(*paired_phrases)


# Format for hard attention and spellcheck

In [82]:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
from load_data import process_spelling
from random import shuffle

## Nb. 80% condition. To obtain 50/50 unseen data swap mapu_train and mapu_valid data

# Tab-separated file without a header row; the columns are read positionally:
# 0 -> source form, 1 -> features, 2 -> target.
all_data = pd.read_csv('/Users/chiarasemenzin/Downloads/mapu_data_all', sep="\t", header=None)

all_types_sources, all_types_feats, all_types_tgt = (
    list(all_data[column]) for column in (0, 1, 2)
)

In [83]:
# Two-stage split with identical settings: first hold out 20% of all items as
# test data, then hold out 20% of the remainder as validation data.
# Only sources and targets are split here; all_types_feats is not passed in.
split_options = dict(test_size=0.2, random_state=1)

X_train, X_test, y_train, y_test = train_test_split(
    all_types_sources, all_types_tgt, **split_options)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, **split_options)

In [85]:
# CHECK UNSEEN WORDS IN TYPES DATASET
#
# Counts how many test items do / do not occur among the (lower-cased) training
# sources.
# NOTE(review): training sources are lower-cased but test items are compared
# as-is, so a capitalised test item can never match - confirm this is intended.

all_train_sources=[i.lower() for i in X_train]
# Build the lookup set once instead of rebuilding it for every test item.
train_vocab=set(all_train_sources)

count=0
unseen=0
for i in X_test:
    if i in train_vocab:
        count+=1
    else:
        unseen+=1

print("Total train types: {}".format(len(train_vocab)))
print("Total test types: {}".format(len(set(X_test))))
print("Seen in training {}".format(count))
print("Unseen in training {}".format(unseen))

Total train types: 1058
Total test types: 361
Seen in training 56
Unseen in training 310
['pilu', 'doy', 'məten', 'kelluen', 'kəpaiwelan', 'Müchai', 'deqin', 'veimu', 'kəpatun', 'nümekelai']


In [86]:
# WRITE H-A FILES
#
# One line per item: source, features and target separated by "\t " (the space
# after each tab is part of the existing file format and is kept).
#
# The features must come from the same row of the full data set as the
# source/target pair. Indexing all_types_feats with a position inside X_train /
# X_val / X_test (as was done before) pairs every item with the features of an
# unrelated row, because the splits are shuffled while all_types_feats is not.
# Instead, features are looked up by their (source, target) pair; a queue per
# pair keeps duplicated pairs from reusing the same row twice.
from collections import defaultdict, deque

feats_by_pair=defaultdict(deque)
for source,feats,target in zip(all_types_sources,all_types_feats,all_types_tgt):
    feats_by_pair[(source,target)].append(feats)


def _write_ha_split(outfile, sources, targets, feats_lookup):
    """Write one split to `outfile`, consuming one feature entry per written row.

    Raises ValueError when a (source, target) pair has no remaining feature
    entry, i.e. the split does not stem from the data the lookup was built on.
    """
    with open(outfile,"w",encoding="utf-8") as of:
        for source,target in zip(sources,targets):
            remaining=feats_lookup[(source,target)]
            if not remaining:
                raise ValueError(
                    "no features left for source {!r} / target {!r}".format(source,target))
            of.write("{}\t {}\t {}\n".format(source,remaining.popleft(),target))


ha_dir="/Users/chiarasemenzin/Desktop/Mscproject/hard-attention/"
for split_name,split_sources,split_targets in (("train",X_train,y_train),
                                               ("dev",X_val,y_val),
                                               ("test",X_test,y_test)):
    outfile=ha_dir+"mapu_{}_80".format(split_name)
    _write_ha_split(outfile,split_sources,split_targets,feats_by_pair)

In [31]:
# WRITE IN LEMATUS FORMAT
# Hands the train / validation / test sources and targets to the project's
# write_noctx helper, with `op` as its first argument.
# NOTE(review): here the helper is called as `write_noctx.write_noctx(...)`
# (module attribute), while other cells call `write_noctx(...)` directly -
# confirm which form matches the import that is actually in scope.

op="/Users/chiarasemenzin/Desktop/MscProject/corpus/Types_only"

write_noctx.write_noctx(op,
            X_train,
            y_train,
            X_val,
            y_val,
            X_test=X_test,
            y_test=y_test)

## SPELLING CHANGES

In [11]:
# Spelling normalisation rules, each an (old, new) pair passed on to `change`.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'change' is not defined"; the only definition visible in this
# part of the notebook sits in a later cell - make sure it runs first.
changes=(("ʎ","ll"),("t'","tr"),("ə","ü"),("v","f"),("k","c"),("ù","ü"),("ŋ","ng"),("í","i"),("á","a"),("l'","l"))

# NOTE(review): dev_data is read from "mapu_test" and test_data from
# "mapu_valid" - confirm that this swap is intentional.
train_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_train', sep="\t", header=None)
dev_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_test', sep="\t", header=None)
test_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_valid', sep="\t", header=None)


# Column 0 holds the source forms, column 2 the targets.
train_types_sources=list(train_data[0])
test_types_sources=list(test_data[0])
dev_types_sources=list(dev_data[0])

train_types_tgt=list(train_data[2])
test_types_tgt=list(test_data[2])
dev_types_tgt=list(dev_data[2])

# SOURCES
# The *_types_sources lists keep the raw spelling; the normalised versions are
# stored in *_types and written back into column 0 of each frame.

train_types=change(train_types_sources,changes)
test_types=change(test_types_sources,changes)
dev_types=change(dev_types_sources,changes)


train_data[0]=train_types
dev_data[0]=dev_types
test_data[0]=test_types

# TARGETS
# Unlike the sources, the *_types_tgt names are rebound to the normalised values.

train_types_tgt=change(train_types_tgt,changes)
test_types_tgt=change(test_types_tgt,changes)
dev_types_tgt=change(dev_types_tgt,changes)


train_data[2]=train_types_tgt
dev_data[2]=dev_types_tgt
test_data[2]=test_types_tgt


# COUNT UNSEEN WORDS

all_train_sources_n=train_types
# Build the lookup set once instead of rebuilding it for every test item.
train_vocab_n=set(all_train_sources_n)

count=0
unseen=0
for i in test_types:
    if i in train_vocab_n:
        count+=1
    else:
        unseen+=1

print("Total train types: {}".format(len(train_vocab_n)))
print("Total test types: {}".format(len(set(test_types))))
print("Seen in training {}".format(count))
print("Unseen in training {}".format(unseen))



NameError: name 'change' is not defined

In [None]:
# WRITE TYPES FOR HARD ATTENTION
# Each frame is written as a tab-separated file without index or header row.

ha_spellchecked_path="/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_{}_spellchkd_50"

for split_name, split_frame in (("train", train_data),
                                ("test", test_data),
                                ("dev", dev_data)):
    split_frame.to_csv(ha_spellchecked_path.format(split_name),
                       index=False,
                       header=False,
                       sep="\t")

In [None]:
# WRITE TYPES FOR LEMATUS
# Passes train / dev / test sources and targets to the write_noctx helper, with
# the output location 'corpus/Types/' as first argument.
# NOTE(review): in the "SPELLING CHANGES" cell the *_types_sources lists keep
# the raw spelling while the *_types_tgt lists are rebound to the normalised
# spelling, so this call mixes raw sources with normalised targets - confirm
# whether train_types / dev_types / test_types were meant here.

write_noctx('corpus/Types/',
            train_types_sources,
            train_types_tgt,
            dev_types_sources,
            dev_types_tgt,
            test_types_sources,
            test_types_tgt)

## READ IN EXTRA DATA

In [None]:
from pandas import DataFrame, read_csv
import pandas as pd 

# Excel sheet with the extra types; later cells read its "Tagged" and "Form" columns.
extra_types = r'/Users/chiarasemenzin/Desktop/MscProject/corpus/800mostFreq.xlsx'
df_types = pd.read_excel(extra_types)
# Last expression of the cell: shows the first rows as a quick visual check.
df_types.head()

In [None]:
## GET LEMMAS (targets)
#
# Extracts the lemma="..." attribute(s) from every entry of the "Tagged" column.
# Several lemma attributes in one entry are concatenated into a single lemma.

import re
ty=list(df_types["Tagged"])
sources_final=list(df_types['Form'])

# One list of matches per row (possibly empty).
targets=[re.findall(r"lemma=\"([\w,\-\'']+)\"", i) for i in ty]
for e,i in enumerate(targets):
    if len(i)>1:
        targets[e]=["".join(i)]

# Keep exactly one target per row. Flattening the list of lists would silently
# drop rows without a lemma match and shift every following lemma against
# sources_final; such rows get an empty-string placeholder instead.
rows_without_lemma=[e for e,i in enumerate(targets) if not i]
if rows_without_lemma:
    print("WARNING: no lemma found in {} row(s): {}".format(
        len(rows_without_lemma), rows_without_lemma[:10]))

targets_final = [i[0] if i else "" for i in targets]


In [None]:
# Check NEW words in bens data
# A form counts as new when it does not occur among the tagged pool sources.

wordsu=set(pool_tagged_sources)
new=set(sources_final)

sources_new=[form for form in new if form not in wordsu]
print("Yay! There are {} unseen types in Ben's data".format(len(sources_new)))

In [None]:
# Write the extra word/lemma pairs as two parallel files, one item per line,
# lower-cased and with the characters of each item separated by single spaces.
# The encoding is set explicitly because the data contains non-ASCII letters;
# the `with` statement closes both files, so no explicit close() is needed.
with open('corpus/Tagged/extraben-sources', "w", encoding="utf-8") as s, \
     open('corpus/Tagged/extraben-targets', "w", encoding="utf-8") as t:
    for word, lemma in zip(sources_final,targets_final):
        s.write("{}\n".format(" ".join(word.lower())))
        t.write("{}\n".format(" ".join(lemma.lower())))

#### FINAL TESTING PREDICTIONS

In [None]:
# EXTRA TEST TYPES to predict FOR FINAL TESTING
import pandas as pd
import regex as re

file = r'/Users/chiarasemenzin/Desktop/MscProject/corpus/test_set_items_tag.xlsx'
df = pd.read_excel(file)
sources_pool=list(df['Word'])
tagged=list(df["Tagged"])

# GET LEMMAS (TARGETS)
# Every entry of targets_pool is a list: the lemma matches of one row, collapsed
# into a single concatenated lemma when the row has more than one match.
lemma_pattern=r"lemma=\"([\w,\-\'']+)\""

targets_pool=[]
for tag_string in tagged:
    found=re.findall(lemma_pattern, tag_string)
    targets_pool.append(["".join(found)] if len(found)>1 else found)


print("SANITY CHECK > LENGTH OF TARGETS: ",len(targets_pool)) #is a list of lists
print("SANITY CHECK > LENGTH OF SOURCES: ",len(sources_pool))


In [None]:
# GET OTHER FEATURES (TARGETS column)
# For every tagged entry keep the first pos="..." value (default "V" when there
# is none) and the first corresp="..." value with its spaces removed (default
# "go" when there is none).
pos_pattern=r"pos=\"([\w,\-\'']+)\""
corresp_pattern=r"corresp=\"([.()\s\w,\-\''\/\\?]+)\""

POS=[]
for tag_string in tagged:
    found=re.findall(pos_pattern, tag_string)
    POS.append(found[0] if found else "V")

corresps=[]
for tag_string in tagged:
    found=re.findall(corresp_pattern, tag_string)
    first=found[0] if found else "go"
    corresps.append(first.replace(" ", ""))

print(">Sanity check:\n\n translations found: \n\n{},\n\n POS found:\n\n {}".format(corresps[0:10],POS[0:10]))


In [None]:
# WRITE ALL-TAGGED-DATA TRAIN AND DEV
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn
# releases and is only kept as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 10% of the tagged pool as dev data (fixed seed for reproducibility).
X_train, X_dev, y_train, y_dev = train_test_split(
    pool_tagged_sources, pool_tagged_targets, test_size=0.1, random_state=1)

op="/Users/chiarasemenzin/Desktop/MscProject/corpus/Tagged"

# NOTE(review): another cell calls this helper as `write_noctx.write_noctx(...)`;
# confirm which form matches the import that is in scope.
write_noctx(op,
            X_train,
            y_train,
            X_dev,
            y_dev,
           test=False)

In [None]:
# WRITE BEN TEST SOURCES AND TARGETS
# Both files hold one item per line with the characters separated by spaces.
# The encoding is set explicitly because the data contains non-ASCII letters;
# the `with` statements close the files, so no explicit close() is needed.

with open('corpus/Tagged/test-sources-ben', "w", encoding="utf-8") as s:
    for word in sources_pool:
        s.write("{}\n".format(" ".join(word)))

fixed_pool_targets=[]
with open('corpus/Tagged/test-targets-ben', "w", encoding="utf-8") as s:
    for lemma_parts in targets_pool:
        # targets_pool holds a list of matches per row: first join the list into
        # one string, then put a space between all of its characters.
        lemma=" ".join(lemma_parts)
        spaced_lemma=" ".join(lemma)
        # The un-spaced form is kept for the spell-check step further down.
        fixed_pool_targets.append(spaced_lemma.replace(" ", ""))
        s.write("{}\n".format(spaced_lemma))

In [None]:
## WRITE SPELL CHECKED TO H-A

from load_data import process_spelling
def change(pool,changes):
    """Apply every rule in `changes` to `pool`, one after the other.

    Each rule is passed to process_spelling.process_pool together with the
    result of the previous rule; the result of the last rule is returned.
    With no rules, `pool` is returned unchanged.
    """
    current=pool
    for rule in changes:
        current=process_spelling.process_pool(current,rule)
    return current

# Spelling rules, each an (old, new) pair applied in order by `change`.
changes=(("ʎ","ll"),("t'","tr"),("ə","ü"),("v","f"),("k","c"),("ù","ü"),("ŋ","ng"),("í","i"),("á","a"),("l'","l"))

test_sources_ben_sc=change(sources_pool,changes)
test_targets_ben_sc=change(fixed_pool_targets,changes)

# Quick visual check of the first ten normalised items.
print(test_sources_ben_sc[:10])
print(test_targets_ben_sc[:10])
# WRITE SPELL CHECKED BEN DATA TO TEST ON H-A
# One line per item: source, "POS=...,corresp=..." features, target.

outfile="/Users/chiarasemenzin/Desktop/hard-attention-test"
ha_line="{}\t POS={},corresp={}\t {}\n"
with open(outfile,"w") as of:
    of.writelines(
        ha_line.format(*row)
        for row in zip(test_sources_ben_sc,POS,corresps,test_targets_ben_sc)
    )



## ++++ DANGER ZONE ++++

In [None]:
## Basically get data from other models

In [None]:
import pandas as pd
from load_data import process_spelling

# Spelling rules, each an (old, new) pair applied in order by `change`.
changes=(("ʎ","ll"),("t'","tr"),("ə","ü"),("v","f"),("k","c"),("ù","ü"),("ŋ","ng"),("í","i"),("á","a"),("l'","l"))

# All six files live in the data directory of the same Lematus model.
lematus_data_dir='/Users/chiarasemenzin/Desktop/MscProject/toms_lematus/models/mapu-21-char-context-fresh/data/'


def _read_lematus_file(name):
    """Read one header-less file from the Lematus data directory."""
    return pd.read_csv(lematus_data_dir+name, header=None)


train_data = _read_lematus_file('train-sources')
dev_data = _read_lematus_file('dev-sources')
test_data = _read_lematus_file('test-sources')

train_targets = _read_lematus_file('train-targets')
dev_targets = _read_lematus_file('dev-targets')
test_targets = _read_lematus_file('test-targets')


# Only column 0 of each frame is normalised.
# NOTE(review): read_csv is used with its default separator - confirm that the
# lines of these files never contain that separator.
changed_test_sources=change(test_data[0],changes)
changed_test_targets=change(test_targets[0],changes)

changed_dev_sources=change(dev_data[0],changes)
changed_dev_targets=change(dev_targets[0],changes)

changed_train_sources=change(train_data[0],changes)
changed_train_targets=change(train_targets[0],changes)



In [None]:
# Everything seen before testing: train sources plus dev sources.
# A set union is used instead of `+` so the result is the same whether
# `change` returns plain lists or another iterable type.
all_tr=set(changed_train_sources) | set(changed_dev_sources)
test_vocab=set(changed_test_sources)

# Test items that never occur in train/dev. The former intersection yielded the
# items that ARE shared with train/dev, i.e. the opposite of "unseen".
unseen=test_vocab - all_tr

# Only the last bare expression of a cell is displayed, so print all counts.
print("Train+dev types: {}".format(len(all_tr)))
print("Test items: {}".format(len(changed_test_sources)))
print("Unseen test types: {}".format(len(unseen)))

In [None]:
#train_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_train', sep="\t", header=None)
#dev_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_test', sep="\t", header=None)
#test_data = pd.read_csv('/Users/chiarasemenzin/Desktop/MscProject/hard-attention/mapu_valid', sep="\t", header=None)


#train_types_sources=list(train_data[0])
#test_types_sources=list(test_data[0])
#dev_types_sources=list(dev_data[0])
#train_types_tgt=list(train_data[2])
#test_types_tgt=list(test_data[2])
#dev_types_tgt=list(dev_data[2])



In [28]:



def _write_parallel_files(sources_path, targets_path, sources, targets):
    """Write sources and targets to two parallel files, one item per line.

    Items are written unchanged (no lower-casing, no character spacing).
    The encoding is explicit because the data contains non-ASCII letters, and
    the `with` statement closes both files.
    """
    with open(sources_path, "w", encoding="utf-8") as s, \
         open(targets_path, "w", encoding="utf-8") as t:
        for word, lemma in zip(sources, targets):
            s.write("{}\n".format(word))
            t.write("{}\n".format(lemma))


# TRAIN, DEV and TEST are written the same way, in this order.
for split_name, split_sources, split_targets in (
        ("train", changed_train_sources, changed_train_targets),
        ("dev", changed_dev_sources, changed_dev_targets),
        ("test", changed_test_sources, changed_test_targets)):
    _write_parallel_files(
        'corpus/Ctx-20-checked/{}-sources-spellchecked'.format(split_name),
        'corpus/Ctx-20-checked/{}-targets-spellchecked'.format(split_name),
        split_sources,
        split_targets)





NameError: name 'changed_train_sources' is not defined