## Intro

In [1]:
# import necessary packages
import fitz #this is PyMuPDF, for reading PDFs
import nltk
from nltk.tokenize import RegexpTokenizer
from pprint import pprint
import re
import json

## Set up navigation between data structures using functions

### Helper functions

In [2]:
def cons_tier(word):
    '''
        A function to replace all the vowels in a word into V. 
        
        Input: a word
            ex) word = jeets
        
        Output: the word with all its vowels replaced with V. (Also condenses multiple Vs into a single V.)
            ex) cons_sv = jVts
    '''

    # replace all vowels with V
    cons = "chjkmnpqrstw"
    vows = "aáαeoôɔiu"

    cons_word = ""
    for c in word:
        if c in cons:
            cons_word += c
        elif c in vows:
            cons_word += "V"
            
    # condense instances of V+ into V
    cons_sv = ""
    
    for i in range(len(cons_word)):
        if cons_word[i] != cons_word[i-1]:
            cons_sv += cons_word[i]

    return cons_sv   

In [3]:
def get_translit_map(word):
    '''
        A function to map every character in a word to its possible corresponding characters in the standard orthography.
        
        Input: a word
            ex) "jeets"
            
        Output: a list of lists; each list contains all possible corresponding characters in the standard orthography.
            ex) [['c'], ['i'], ['i'], ['t', ''], ['', 's', 'sh']]
    '''
    # hard code a mapping between each character and its possible corresponding characters in the standard orthography.
    c_map = {
        "a" : ["a", "á"],
        "α" : ["a", "áw", "ô", "u", "á"],
        "á" : ["á"],
        "b" : ["p"],
        "c" : ["c", "sh"],
        "d" : ["t"],
        "e" : ["i"],
        "g" : ["c", "k", "q"],
        "h" : ["h", ""],
        "i" : ["", "ay", "y", "i"],
        "j" : ["c"],
        "k" : ["", "k"],
        "m" : ["m"],
        "n" : ["ô", "n"],
        "o" : ["", "a", "á", "o", "uw", "w", "ô", "u"],
        "ɔ" : ["á", "áw"],
        "ô" : ["ô"],
        "p" : ["p"],
        "q" : ["q"],
        "r" : ["", "r"],
        "s" : ["", "s", "sh"],
        "t" : ["t", ""],
        "u" : ["", "a", "o", "ô", "u", "uw"],
        "w" : ["", "w"],
        "y" : ["", "y"],
        "z" : ["y", "s"]
    }

    # build the translit map
    word += "0"
    translit_map = []

    for i in range(len(word) - 1):

        if word[i] in c_map.keys():
            translit_map.append(c_map[word[i]])
        
    return translit_map

In [4]:
# testing get_translit_map()
practice_word = "cáqan"
prac_tmap = get_translit_map(practice_word)
print(prac_tmap)

[['c', 'sh'], ['á'], ['q'], ['a', 'á'], ['ô', 'n']]


In [5]:
def translit(translit_map):
    '''
        A function to create a list of possible transliterated words from the translit_map
        
        Input: translit_map
            ex) [['c'], ['i'], ['i'], ['t', ''], ['', 's', 'sh']]
        
        Output: (nested) list of possible transliterations
            ex) [['ciit', 'ciits', 'ciitsh', 'cii', 'ciis', 'ciish']]
            
        Note: does not "clean up" the transliterations (get rid of unlikely ones); that's done in a later step (cons_translit)
    '''
    # take first two lists and give one list housing every combination; repeat until one item remains. 
    while len(translit_map) > 1:
        new = [[i + j for i in translit_map[0] for j in translit_map[1]]]

        new.extend(translit_map[2:])
        translit_map = new
    
    return translit_map

In [6]:
# testing translit()
prac_translit = translit(prac_tmap)
print(prac_translit)

[['cáqaô', 'cáqan', 'cáqáô', 'cáqán', 'sháqaô', 'sháqan', 'sháqáô', 'sháqán']]


In [7]:
def cons_translit(word, translit):
    '''
        A function to convert a list of possible transliterations to a consonant tier.
        
        Inputs:
            - word: the original word, just so we can use its length when disposing of unlikely transliterations
            - tanspose: a translit generated by the function translit()
            ex) ("jeets", [['ciit', 'ciits', 'ciitsh', 'cii', 'ciis', 'ciish']])
            
        Output: A new list of possible transliterations, this time with all of them consonant tier-ified.
            ex) ['cVt', 'cVts', 'cV', 'cVs', 'cVsh']

    '''
    # first clean out unlikely transliterations
    cleaned_translits = []
    
    for t in translit[0]:
        
        # get rid of transliterations that are super different in length from the original word
        if abs(len(word) - len(t)) <= 2:
            t = ''.join(sorted(set(t), key=t.index))

            # get rid of transliterations with illegal consonant clusters
            if t not in cleaned_translits\
            and re.findall(r"qw", t) == []\
            and re.findall(r"kw", t) == []\
            and re.findall(r"w[qyrm]", t) == []\
            and re.findall(r"c[ktqw]", t) == []\
            and re.findall(r"tch", t) == []\
            and re.findall(r"tsh", t) == []\
            and re.findall(r"t[kq]", t) == []\
            and re.findall(r"m[twr]", t) == []\
            and re.findall(r"^[tkwy][tk]", t) == []\
            and re.findall(r"^ksh", t) == []\
            and re.findall(r"ktc", t) == []\
            and re.findall(r"r[tm]", t) == []\
            and len(re.findall(r"[aáoôiu]", t)) > 0: # I can't remember why I used this one
                cleaned_translits.append(t)

    # now consonant tier-ify them all
    cons_translits = []

    for t in cleaned_translits:
        cons_t = cons_tier(t)
        cons_t_nv = ""

        for i in range(len(cons_t)):
            
            # don't allow double characters
            if cons_t[i] != cons_t[i-1]:
                cons_t_nv += cons_t[i]

        cons_translits.append(cons_t_nv)
    
    return(cons_translits)

In [8]:
# test cons_translit()
prac_ctrans = cons_translit(practice_word, prac_translit)
print(prac_ctrans)

['cVqV', 'cVqVn', 'cVqV', 'cVqn', 'shVqV', 'shVqVn', 'shVqV', 'shVqn']


In [9]:
def find_stressed_token(token):
    """
          This function takes a token and return the token with stress marked.

          Arguments: token (str): a string to be analyzed.
    """
    strong_vowels = ["á", "o", "ô", "i"]
    weak_vowels = ["a", "u"]
    vowels = strong_vowels + weak_vowels
    underlying_vowels = []
    stressable_vowels = []
    stressed_vowel = []
    stressed_token = ""
    underlying_form = list(token)

    for i in range(len(underlying_form)):
        if underlying_form[i] in vowels:
            underlying_vowels.append((underlying_form[i], i)) 
      
    index = 0
    while index < len(underlying_vowels[:-1]):
        if underlying_vowels[index][0] in strong_vowels:
            stressable_vowels.append((underlying_vowels[index], index))
            index += 1
        elif underlying_vowels[index][0] in weak_vowels and underlying_vowels[index+1][0] in weak_vowels:
            stressable_vowels.append((underlying_vowels[index + 1], index + 1))
            index +=2
        else:
            index+=1
    

    original_index = ""

    if len(underlying_vowels) == 0:
        return token
    
    elif len(underlying_vowels) <= 2:
        original_index = underlying_vowels[0][1]
        stressed_vowel = underlying_vowels[0][0]
        stressed_vowel_location = 1
    
    elif len(stressable_vowels) > 0:
        original_index = stressable_vowels[-1][0][1]
        stressed_vowel = stressable_vowels[-1][0][0]
        stressed_vowel_location = stressable_vowels[-1][1] + 1
    
    copy = underlying_form[:]   
    copy[original_index:original_index] = ["*"]
    stressed_token = stressed_token.join(copy)
 
    return stressed_token

alg = ["cáqan", "nutusuwis", "kunawash", "mata", "noruthulo", "monuto", "smuto", "mandoh", "mo", "upulohu"]
for word in alg:
    print(find_stressed_token(word))

c*áqan
nut*usuwis
kun*awash
m*ata
noruth*ulo
m*onuto
sm*uto
m*andoh
m*o
upul*ohu


In [10]:
def stress_notes(token):
    """
        This function returns information about stress assignment for the given token.
        Arguments: token (str): a string to be analyzed.
    """
    strong_vowels = ["á", "o", "ô", "i"]
    weak_vowels = ["a", "u"]
    vowels = strong_vowels + weak_vowels
    underlying_vowels = []
    stressable_vowels = []
    stressed_vowel = []
    stressed_token = ""
    underlying_form = list(token)
    
    for i in range(len(underlying_form)):
        if underlying_form[i] in vowels:
            underlying_vowels.append((underlying_form[i], i)) 
    
    index = 0
    while index < len(underlying_vowels[:-1]):
        if underlying_vowels[index][0] in strong_vowels:
            stressable_vowels.append((underlying_vowels[index], index))
            index += 1
        elif underlying_vowels[index][0] in weak_vowels and underlying_vowels[index+1][0] in weak_vowels:
            stressable_vowels.append((underlying_vowels[index + 1], index + 1))
            index +=2
        else:
            index+=1
  
    original_index = ""

    if len(underlying_vowels) == 0:
        return token
    
    elif len(underlying_vowels) <= 2:
        original_index = underlying_vowels[0][1]
        stressed_vowel = underlying_vowels[0][0]
        stressed_vowel_location = 1
    
    elif len(stressable_vowels) > 0:
        original_index = stressable_vowels[-1][0][1]
        stressed_vowel = stressable_vowels[-1][0][0]
        stressed_vowel_location = stressable_vowels[-1][1] + 1
    
    copy = underlying_form[:]
    copy[original_index:original_index] = ["*"]
    stressed_token = stressed_token.join(copy)
 
    return "The stress falls on "  + str(stressed_vowel) + ", which is vowel #" + str(stressed_vowel_location) + " in " +token + ". It's original index is " + str(original_index)

alg = ["cáqan", "nutusuwis", "kunawash", "mata", "noruthulo", "monuto", "smuto", "mandoh", "mo", "upulohu"]
for word in alg:
    print(stress_notes(word))

The stress falls on á, which is vowel #1 in cáqan. It's original index is 1
The stress falls on u, which is vowel #2 in nutusuwis. It's original index is 3
The stress falls on a, which is vowel #2 in kunawash. It's original index is 3
The stress falls on a, which is vowel #1 in mata. It's original index is 1
The stress falls on u, which is vowel #3 in noruthulo. It's original index is 6
The stress falls on o, which is vowel #1 in monuto. It's original index is 1
The stress falls on u, which is vowel #1 in smuto. It's original index is 2
The stress falls on a, which is vowel #1 in mandoh. It's original index is 1
The stress falls on o, which is vowel #1 in mo. It's original index is 1
The stress falls on o, which is vowel #3 in upulohu. It's original index is 4


In [11]:
def find_iambs(token):
    """ 
        This function returns a list of iambs in the given token.
        Arguments: token (str): a string to be analyzed.
    """

    pattern = re.compile(r"(a|u)[^aáiuoô](a|u)")
    iambs = pattern.findall(token)

    return iambs

words = ["caqaanonaáaoa", "caqanaeag"]
for w in words:
    print(find_iambs(w))

[('a', 'a')]
[('a', 'a'), ('a', 'a')]


In [12]:
def find_bimoraic_vowels(token):
    """
        This function returns a list of bimoraic vowels in the given token.
        Arguments: token (str): a string to be analyzed.
    """
  
    pattern = re.compile(r"[á|o|ô|i]")
    bimoraic_vowels = pattern.findall(token)

    return bimoraic_vowels

find_bimoraic_vowels("caqaôno")

['ô', 'o']

In [13]:
# define edit_distance function
def edit_distance(str1, str2):
    """
        A function to implement the algorithm calculating
        the minimal edit distance between two strings.
        Copied from class notebook.
    
    Arguments:
      -- str1 (str): some string;
      -- str2 (str): another string.
      
    Outputs:
      -- int: the smallest edit distance in-between
              str1 and str2.
    """
    M = [[None for i in range (len(str1) + 1)] for j in range(len(str2) + 1)]
    
    for i in range(len(M[0])):
      M[0][i] = i
      for j in range(1, len(M)):
        M[j][0] = j

    for n in range(1, len(M[0])): # n is for each column
      for m in range(1, len(M)): # m is for each row

        str1_symbol = str1[n-1] # the symbol from string1 can be found with n
        str2_symbol = str2[m-1] # the symbol from string2 can be found with m

        if str1_symbol == str2_symbol:
          M[m][n] = M[m-1][n-1]
        else:
          M[m][n] = min(M[m-1][n-1], M[m-1][n], M[m][n-1]) + 1  

    return M[m][n]

### Main functions

In [14]:
# CREATE new subentry FROM token
def subentrify(token, source, language):
    '''
        A function to turn a set of values into a labeled dictionary of values (subentry)
        
        Inputs: values to populate the fields of a subentry; must be in the following order:
            - token (original spelling of token found in source text)
            - source (name of source text where token was found)
            - language (language of the token)
            
        Output: subentry (a dictionary; keys are field labels and values are values analogous to Inputs)
    '''
    
    new_subentry = {"token": token,
                    "source": source,
                    "language": language,
                    "english_translations": [],
                    "stressed_token": ""}
    
    return new_subentry

In [1]:
# CREATE new Entry FROM subentry
class Entry(object):
    '''
        A class for word entries.
        
        Attributes:
        - entry_title: (string) the title of the entry
        - master_info: (dictionary) a dictionary containing standardized info
        - subentries:  (list of dictionaries) a list of subentries, each of which is a labeled
                       dictionary created by the function subentrify()
        - liav_ready:  formats the Entry in a dictionary appropriate for adding to JSON files
        
        Methods:
        - display(self): displays Entry to screen
        - create_subentry(self, subentry_values): adds a subentry to the Entry
    '''
    
    def __init__(self, entry_title, subentries = None, master_info = None):
        self.entry_title = entry_title.lower()
        self.subentries = [] if subentries is None else subentries
        self.master_info = {"standard_spelling": None,
                            "IPA": "",
                            "stress": None,
                            "stress_notes": None,
                            "iambs" : None,
                            "bimoraic_vowels" : None,
                            "standard_eng_trans": []} if master_info is None else master_info
        self.liav_ready = {"entry_title": self.entry_title, "master_info": self.master_info, "subentries": self.subentries}
        
        if type(self.subentries) is not list:
            self.subentries = [self.subentries]
        
    def display(self):
        '''
            A function to display the Entry to the screen in a human-readable format
        '''
        print("entry_title:", self.entry_title)
        print("master_info:")
        pprint(self.master_info)
        print("subentries:")
        pprint(self.subentries)
        print()
        
    def add_subentry(self, subentry):
        '''
            A function to add a subentry to the entry
        '''
        self.subentries.append(subentry)
    
    def translit_entry(self):
        '''
            A function to populate the "standard_spelling" field with a transliteration of the entry 
            into the standard orthography
        '''
        if self.master_info["standard_spelling"] is None:
            
            # generate list of possible transliterations
            options = cons_translit(self.entry_title, translit(get_translit_map(self.entry_title)))
            
            # display options to user
            for i in range(len(options)):
                print(i, "\t", options[i])
            
            # get user input on which transliteration is best
            message = "Which of the following is the correct transliteration of " + self.entry_title + "? "
            choice = int(input(message))
            
            # populate "standard_spelling" field with user's choice
            self.master_info["standard_spelling"] = options[choice]

    def stress_assigner(self):
        '''
            A function to populate the "stressed" and "stress_notes" fields by calling
            find_stressed_token() and stress_notes()
        '''
        if self.master_info["stress"] is None:
            self.master_info["stress"] = find_stressed_token(self.entry_title)
            self.master_info["stress_notes"] = stress_notes(self.entry_title)
            
    def get_iambs(self):
        '''
            A function to populate the "iambs" field with the list of iambs in the word.
        '''
        if self.master_info["iambs"] is None:
            self.master_info["iambs"] = find_iambs(self.entry_title)
        
    def get_bimoraic_vowels(self):
        '''
            A function to populate the "bimoraic_vowels" field with the list of iambs in the word.
        '''
        if self.master_info["bimoraic_vowels"] is None:
            self.master_info["bimoraic_vowels"] = find_bimoraic_vowels(self.entry_title)

In [16]:
# testing Entry()
prac_entry = Entry("cáqan")
prac_entry.display()

entry_title: cáqan
master_info:
{'IPA': '',
 'bimoraic_vowels': None,
 'iambs': None,
 'standard_eng_trans': [],
 'standard_spelling': None,
 'stress': None,
 'stress_notes': None}
subentries:
[]



In [17]:
# testing .translit_entry()
prac_entry.translit_entry()
print()
prac_entry.display()

0 	 cVqV
1 	 cVqVn
2 	 cVqV
3 	 cVqn
4 	 shVqV
5 	 shVqVn
6 	 shVqV
7 	 shVqn
Which of the following is the correct transliteration of cáqan? 1

entry_title: cáqan
master_info:
{'IPA': '',
 'bimoraic_vowels': None,
 'iambs': None,
 'standard_eng_trans': [],
 'standard_spelling': 'cVqVn',
 'stress': None,
 'stress_notes': None}
subentries:
[]



In [18]:
# testing .get_iambs()
prac_entry.get_iambs()
prac_entry.display()

entry_title: cáqan
master_info:
{'IPA': '',
 'bimoraic_vowels': None,
 'iambs': [],
 'standard_eng_trans': [],
 'standard_spelling': 'cVqVn',
 'stress': None,
 'stress_notes': None}
subentries:
[]



In [19]:
# testing .get_bimoraic_vowels()
prac_entry.get_bimoraic_vowels()
prac_entry.display()

entry_title: cáqan
master_info:
{'IPA': '',
 'bimoraic_vowels': ['á'],
 'iambs': [],
 'standard_eng_trans': [],
 'standard_spelling': 'cVqVn',
 'stress': None,
 'stress_notes': None}
subentries:
[]



In [20]:
# testing .stress_assigner()
prac_entry.stress_assigner()
prac_entry.display()

entry_title: cáqan
master_info:
{'IPA': '',
 'bimoraic_vowels': ['á'],
 'iambs': [],
 'standard_eng_trans': [],
 'standard_spelling': 'cVqVn',
 'stress': 'c*áqan',
 'stress_notes': "The stress falls on á, which is vowel #1 in cáqan. It's "
                 'original index is 1'}
subentries:
[]



In [21]:
# CREATE new liav
class Liav(object):
    '''
        A class to store a list of Entries.
        
        Attributes:
            - metadata: for now, this is empty.
            - entries: list of Entries
            
        Methods:
            - save_ready: formats the liav to be json-friendly; also nicely human readable
            - save_liav: writes the liav to a .txt JSON file
            - add_entry: adds an Entry
            - find_existing_matches: finds words whose standardized spellings exactly match one of the translit
                    options for the given word
            - find_similar_words: finds words whose standardized spellings are similar to the translit options for the given word
            - stress_assign_all: populates each entry's 'stressed' and 'stress_notes' fields
            - add_cons_tier_all: populates each entry's 'standard_spelling' field (for now)
            - bimoraic_all: populates each entry's 'bimoraic_vowels' field
            - iambs_all: populates each entry's 'iambs' field
    '''
    def __init__(self):
        self.metadata = []
        self.entries = []
        
    def save_ready(self):
        '''
            A function to put the liav in a json-friendly (and also human readable) format
        '''
        formatted_entries = []
        
        for entry in self.entries:
            formatted_entries.append(entry.liav_ready)
            
        save_ready = {"entries": formatted_entries,
                     "metadata": self.metadata
                     }
        
        return save_ready
    
    def save_liav(self, filename):
        '''
            A function to write a liav to a file.
        '''
        print("starting save_liav")
        with open(filename, 'w') as outfile:
            json.dump(self.save_ready(), outfile)
        
    def add_entry(self, entries):
        '''
            A function to add an entry or list of entries to the liav.
        '''
        if type(entries) is list:
            self.entries.extend(entries)
            
        if type(entries) is Entry:
            self.entries.append(entries)
       
    def find_existing_matches(self, word):
        '''
            A function to find words in the liav that exactly match a given word.
           
           Input: - word (str): word that you want to find matches for
           
           Output: - matches (list): list of exact matches in the liav
        '''
        matches = []
        for entry in self.entries:
            if word == entry.entry_title:
                matches.append(word)
        
        # will change it to the below later after running code for populating standard_spelling
        #cons_translits = cons_translit(word, translit(get_translit_map(word)))
        #matches = {}

        #for t in cons_translits:
            
            #for entry in self.entries:
                
                #if t == entry.entry_title:
                #if t == entry.master_info["standard_spelling"]:  
                    #matches[t] = entry.entry_title

        return(matches)
        
    def find_similar_words(self, word):
        '''
            A function to find similar words in the liav to a given word.
            
            Input: - word (str): word that you want to find matches for
            
            Output: - similars (list): list of words from liav that are within a certain edit distance (5) from the given word
        '''
        similars = []
        
        for entry in self.entries:
            ed = edit_distance(word, entry.entry_title)
            if ed < 6:
                similars.append((entry.entry_title, ed))
        similars.sort(key=lambda tup: tup[1])
        
        return similars
        
    def stress_assign_all(self):
        '''
            A function to apply stress_assigner() to every entry in the liav.
        '''
        for entry in self.entries:
            entry.stress_assigner()
            
    def add_cons_tier_all(self):
        '''
            A function to apply add_consonant_tier() to every entry in the liav.
        '''
        for entry in self.entries:
            entry.add_consonant_tier()
            
    def bimoraic_all(self):
        '''
            A function to apply bimoraic_vowels() to every entry in the liav.
        '''
        for entry in self.entries:
            entry.get_bimoraic_vowels()  
    
    def iambs_all(self):
        '''
            A function to apply get_iambs() to every entry in the liav.
        '''
        for entry in self.entries:
            entry.get_iambs()

In [22]:
# testing Liav() class
prac_liav = Liav()
prac_liav.add_entry(prac_entry)
prac_liav.add_entry(Entry("tokena", subentrify("tokena", "SOURCE", "LANGUAGE")))
pprint(prac_liav.save_ready())
prac_liav.stress_assign_all()
pprint(prac_liav.save_ready())

{'entries': [{'entry_title': 'cáqan',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['á'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': 'cVqVn',
                              'stress': 'c*áqan',
                              'stress_notes': 'The stress falls on á, which is '
                                              "vowel #1 in cáqan. It's "
                                              'original index is 1'},
              'subentries': []},
             {'entry_title': 'tokena',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
       

In [23]:
# decode the dictionary that load_laiv reads in and turn it into a laiv object
def decode_liav(json_data):
    '''
        A function to take a dictionary containing JSON data and return a liav object.
    '''
    entry_list = []
    
    for entry in json_data['entries']:
        entry_list.append(Entry(entry['entry_title'], entry['subentries'], entry['master_info']))
        
    liav = Liav()
    
    for entry in entry_list:
        liav.entries.append(entry)
        
    return liav

In [24]:
# load a .txt JSON file into a liav object
def load_liav(filename):
    '''
        A function to read a file and give you a liav.
    '''
    print("starting load_liav")
    
    with open(filename) as json_file:
        
        json_data = json.load(json_file)
        liav = decode_liav(json_data)
        
    return liav

In [25]:
# testing load_liav (and decode_liav)
liav_ff = load_liav("liav_d6.txt")
#print(liav_ff.save_ready())

starting load_liav


## Main Op 1: Fielding Diaries PDF --> JSON .txt file and back

#### Creating a JSON file (.txt) from the Fielding Diaries PDF


In [26]:
## for now, I'm going to arbitrarily define the variables we'll use to populate the fields 'source', 'language', etc
# In the future, we will add more code to allow the user to populate those fields
# Or to automatically populate them from the document

source = "Fielding Diary"
language = "Mohegan"
eng_translations = []

In [27]:
## read PDF

# get source file path
source_name = 'Fielding_diaries_May_30_1904_transcription_by_Speck_Stephanie_Fielding.pdf'

# use PyMuPDF to read in PDF
source_doc = fitz.open(source_name)

# find out how many pages are in the source text
num_pages = source_doc.pageCount

# make a list text_by_page of strings, each string containing the full text of a single page
text_by_page = []
for i in range(num_pages):
    text_by_page.append(source_doc.loadPage(i).getText("text"))

In [28]:
# tokenize PDF by page
tokens_by_page = [nltk.word_tokenize(p) for p in text_by_page]

In [29]:
# remove English words, duplicates, punctuation, and numbers
fld_tokens = [] #variables starting with fld_ are for the Fielding Diaries data 

english = set(nltk.corpus.words.words())
punctuation = ',.!:\'()-‘’–?&[];'

for page in tokens_by_page:
    
    for token in page:
        
        # some words aren't picked out by nltk's english corpus; try them minus their last letter or two
        cut_token = token[:-1]
        cutt_token = token[:-2]
        
        if token.lower() not in english and \
                cut_token.lower() not in english and \
                cutt_token.lower() not in english and \
                token not in fld_tokens and \
                token.lower() not in fld_tokens and \
                token not in punctuation and \
                not re.match(r"\d+(\.)*", token) and \
                not re.match(r"(.)*\d(.)*", token):
            
                fld_tokens.append(token)
                
# screen again for duplicates
fld_tokens = list(set(fld_tokens))
#print(fld_tokens)

In [30]:
# Ask Annotator to remove bad words
fld_tokens_to_add = fld_tokens

In [31]:
# CHECKPOINT: let's look at fld_tokens_to_add
print(fld_tokens_to_add)

['kuqamuq', 'tcα´ntci·', 'yeiyo', 'gαtai´nαmαng', 'sômi', 'Nutakamô', 'Jebi', 'wodjuner', 'Gowhik', 'Mohecks', 'weakchu', 'Wiyôko', 'kiskuk', 'dji·bai', 'mihkikuwôk', 'dja´gwan', 'gαdαp·u·´', 'kunatotumunáw', 'gαso´sαn·i·', 'muhtutiyásuw', 'tcu´yα', 'mα´ndunag', 'Fidelia', 'Nuwikináwô', 'dji·´tsαg', 'Weyoungo', 'joggwon', 'Micuwak', 'gertumor', 'mα´tce', 'zi·´bugαg', 'Wi·´yαηgu', 'Diaries', 'Nuwusáyki', 'Mertarwe', 'miyáw', 'αndai·´', 'wowu´tαg', 'nunáwô', 'jogwon', 'wonjug', 'ski·´dαmbak', 'weger', 'kusôhsuni', 'scanned', 'cáqansh', 'cuncherche', 'nuppoo', 'tuggum', 'skug', 'meaguwog', 'piyámáq', 'ski´dαmb', 'Ithaca', 'nupuyan', 'identifies', 'iwák', 'αndai´', 'dja´gwanc', 'Wacônáw', 'kiyaw', 'kutayunamawô', 'mαt·αd´i´a´zu', 'Stephanie', 'chunche', 'diaries', 'woto´d', 'kuwatunum', 'wikco', 'cáqan', 'mi·´ki·gwaηg', 'nαwi·gi·no´wα', 'Pennsylvania', 'muhtáwi', 'su´mi·', 'attesting', 'skedumbork', 'côhtamhutut', 'get-vti-objective', 'Jeets', 'muttuddeyozzo', 'citsak', 'mαta´wi·', 'Gertod

In [32]:
# make liav from list of tokens
liav = Liav()
for token in fld_tokens:
    liav.add_entry((Entry(entry_title=token, subentries=subentrify(token, "Fielding Diaries", "Mohegan"))))
pprint(liav.save_ready())

{'entries': [{'entry_title': 'kuqamuq',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'kuqamuq'}},
             {'entry_title': 'tcα´ntci·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': N

              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'skug'}},
             {'entry_title': 'meaguwog',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_n

                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'citsak'}},
             {'entry_title': 'mαta´wi·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
             

              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'pi·´ɔ·mag'}},
             {'entry_title': 'ya´yu',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'ya´yu'}},
             {'entry_title': 'wutonuk',
              'master_info': 

                             'stressed_token': '',
                             'token': 'yáyuw'}},
             {'entry_title': 'norwor',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'norwor'}},
             {'entry_title': 'apuw',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
     

In [33]:
# save liav to .txt JSON file
liav.save_liav("liav_d7.txt")

starting save_liav


#### Reading in the JSON file we just wrote

In [34]:
liav_from_file = load_liav("liav_d7.txt")
pprint(liav_from_file.save_ready())

starting load_liav
{'entries': [{'entry_title': 'kuqamuq',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'kuqamuq'}},
             {'entry_title': 'tcα´ntci·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                      

              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Diaries'}},
             {'entry_title': 'nuwusáyki',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stre

                             'stressed_token': '',
                             'token': 'mαt·αd´i´a´zu'}},
             {'entry_title': 'stephanie',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Stephanie'}},
             {'entry_title': 'chunche',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_

              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'ewor'}},
             {'entry_title': 'i·´wag',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_not

                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'uppoo'}},
             {'entry_title': 'mi·tcs',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                   

                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Sosunneun'}},
             {'entry_title': 'wikuw',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'wikuw'}},
             {'e

## Main Op 2: find (and merge) similar words

In [35]:
print(liav.find_similar_words("cetsack"))

[('citsak', 2), ('citsak', 2), ('ithaca', 4), ('kiskuk', 5), ('cáqan', 5), ('jeets', 5), ('catalog', 5), ('cánaw', 5), ('meachs', 5), ('ôtay', 5), ('wutonuk', 5), ('skok', 5), ('wôcak', 5), ('côci', 5), ('cipay', 5), ('kutomák', 5)]


## Main Op 3: add linguistic data

In [36]:
pprint(liav_from_file.save_ready())

{'entries': [{'entry_title': 'kuqamuq',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'kuqamuq'}},
             {'entry_title': 'tcα´ntci·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': N

                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'dja´gwan'}},
             {'entry_title': 'gαdαp·u·´',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'gαdαp·u·´'}},
             {'entry_title': 'kunatotumunáw',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                             

                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'scanned'}},
             {'entry_title': 'cáqansh',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                

                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'skedumbork'}},
             {'entry_title': 'côhtamhutut',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'côhtamhutut'}},
   

                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'wámi'}},
             {'entry_title': 'wáhtôw',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'wáhtôw'}},
             {'entry_title': 'ôtay',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
   

             {'entry_title': 'berkedum',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': None,
                              'stress_notes': None},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Berkedum'}},
             {'entry_title': 'bα´ki·dαm',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': None,
                              'iambs': None,
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress':

In [37]:
liav_from_file.iambs_all()

In [38]:
#pprint(liav_from_file.save_ready())

In [39]:
liav_from_file.bimoraic_all()

In [40]:
#pprint(liav_from_file.save_ready())

In [41]:
liav_from_file.stress_assign_all()
pprint(liav_from_file.save_ready())

{'entries': [{'entry_title': 'kuqamuq',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': [],
                              'iambs': [('u', 'a')],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'kuq*amuq',
                              'stress_notes': 'The stress falls on a, which is '
                                              "vowel #2 in kuqamuq. It's "
                                              'original index is 3'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'kuqamuq'}},
             {'entry_title': 'tcα´ntci·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['i'],
             

             {'entry_title': 'jogwon',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o', 'o'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'j*ogwon',
                              'stress_notes': 'The stress falls on o, which is '
                                              "vowel #1 in jogwon. It's "
                                              'original index is 1'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'jogwon'}},
             {'entry_title': 'wonjug',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o'],
                      

                              'standard_spelling': None,
                              'stress': 'c*áqan',
                              'stress_notes': 'The stress falls on á, which is '
                                              "vowel #1 in cáqan. It's "
                                              'original index is 1'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'cáqan'}},
             {'entry_title': 'mi·´ki·gwaηg',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['i', 'i'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'mi·´k*i·gwaηg',
                              'stress_not

              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Citsak'}},
             {'entry_title': 'meachs',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': [],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'me*achs',
                              'stress_notes': 'The stress falls on a, which is '
                                              "vowel #1 in meachs. It's "
                                              'original index is 2'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries'

             {'entry_title': 'côci',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['ô', 'i'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'c*ôci',
                              'stress_notes': 'The stress falls on ô, which is '
                                              "vowel #1 in côci. It's original "
                                              'index is 1'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'côci'}},
             {'entry_title': 'wa´djαnα',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': [],
                              '

                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'w*ikuw',
                              'stress_notes': 'The stress falls on i, which is '
                                              "vowel #1 in wikuw. It's "
                                              'original index is 1'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'wikuw'}},
             {'entry_title': 'mi·´zɔ·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['i'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'm*i·´zɔ·',


In [42]:
pprint(liav_from_file.save_ready())

{'entries': [{'entry_title': 'kuqamuq',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': [],
                              'iambs': [('u', 'a')],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'kuq*amuq',
                              'stress_notes': 'The stress falls on a, which is '
                                              "vowel #2 in kuqamuq. It's "
                                              'original index is 3'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'kuqamuq'}},
             {'entry_title': 'tcα´ntci·',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['i'],
             

                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'micuw*ak',
                              'stress_notes': 'The stress falls on a, which is '
                                              "vowel #3 in micuwak. It's "
                                              'original index is 5'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'Micuwak'}},
             {'entry_title': 'gertumor',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'gert

                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'αndai´'}},
             {'entry_title': 'dja´gwanc',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': [],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'dj*a´gwanc',
                              'stress_notes': 'The stress falls on a, which is '
                                              "vowel #1 in dja´gwanc. It's "
                                              'original index is 2'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token':

              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o', 'o'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'j*oggwonch',
                              'stress_notes': 'The stress falls on o, which is '
                                              "vowel #1 in joggwonch. It's "
                                              'original index is 1'},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'joggwonch'}},
             {'entry_title': 'sosunne',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o'],
                              'iambs': [],
        

                                              "It's original index is 4"},
              'subentries': {'english_translations': [],
                             'language': 'Mohegan',
                             'source': 'Fielding Diaries',
                             'stressed_token': '',
                             'token': 'αndai´gαtu·´mαk'}},
             {'entry_title': 'connecticut',
              'master_info': {'IPA': '',
                              'bimoraic_vowels': ['o', 'i'],
                              'iambs': [],
                              'standard_eng_trans': [],
                              'standard_spelling': None,
                              'stress': 'connect*icut',
                              'stress_notes': 'The stress falls on i, which is '
                                              "vowel #2 in connecticut. It's "
                                              'original index is 7'},
              'subentries': {'english_translations': [],
  

## Main Op 3: finding existing matches for words

In [43]:
# must do cons_tier_assignment to liav first!
print(liav_from_file.find_existing_matches("ewog"))

['ewog']
