In [42]:
    import xml.etree.ElementTree as ET # parses XML files
    import string
    import os

# Extract Data from flextext XML files

In [43]:
# Placeholder written into any slot for which no annotation was found.
# The filter functions compare against this value to detect missing tiers.
TEMP = '@@@'

# text-level flextext XML attribute values (item "type")
TITLE_TYPE = 'title'
COMMENT_TYPE = 'comment'
# flextext XML "lang" attribute values for languages/scripts used in titles or translations
ENGLISH = 'en'
INDONESIAN = 'id'

# IGT tier-level flextext XML attribute values (item "type")
TXT = 'txt' # surface morph/segment AND transcribed text
CAN_MORPHEME = 'cf' # canonical (underlying) morpheme
GLOSS = 'gls' # morpheme gloss AND sentence free translation
M_POS = 'msa' # morpheme-level POS, i.e. what category an affix attaches to
WORD_POS = 'pos' # word-level POS
PUNCT = 'punct' # punctuation

# morpheme types (flextext XML "type" attribute on <morph>)
MWE = 'phrase' # multiword expression
PREFIX = 'prefix'
SUFFIX = 'suffix'
CIRCUMFIX = 'circumfix'
PROCLITIC = 'proclitic'
ENCLITICS = ['enclitic', 'clitic'] # NOTE: clitic functions as enclitic in some FLEx databases (e.g. lmk)
INFIXES = ['infix', 'infixing interfix']
STEMS = ['stem', 'bound stem', 'bound root', 'bound root A', 'root', 'particle']
COMPOUND2 = 'bound root B' # second half of a compound stem

# Segment boundary symbols. Each FLEx database marks these in its own way; add yours here.
# NOTE: FLEx databases handle circumfixes differently
# NOTE: these symbols will need to be removed before re-importing to FLEx
CIRCUM_PRE = '>'
CIRCUM_POST = '<'
CIRCUM_HOLE = '<>'
CLITIC = '='
BOUNDROOT = '*'

# Generic glosses / POS tags assigned by this notebook
PROPER_NOUN_GLOSS = 'NPROP'
DIGIT_GLOSS_POS = 'NUM'

# Output dictionary keys (line-level, then word-level)
TITLE = 'text_title'
COMMENT = 'text_comment'
SEGNUM = 'line#'
FT = 'free_transl'
ORIG_LINE = 'orig_line'
WORDS = 'words'
TOKEN = 'token'
POS = 'wPOS'
MORPHEMES = "morphemes"

In [44]:
def getTitleComment(xmlsection):
    '''Find the title(s) and the English comment of one interlinear text.

    Some documents carry both an English and a native-language title.
    If both are found they are joined as "<english> // <native>";
    if only one is found it is used alone.

    xmlsection: XML element whose descendant <item> elements are searched.
    Returns (title, comment). Title falls back to "NO TITLE FOUND" and
    comment to "No comment" when nothing usable is present.'''

    eng_title = None
    non_eng_title = None
    comment = "No comment"

    for item_lin in xmlsection.iter('item'):
        item_text = item_lin.text
        # An empty element (<item .../>) has text None. Treat it as absent,
        # otherwise None would become the title or break the ' // ' join.
        if item_text is None:
            continue
        item_type = item_lin.get('type')
        is_english = item_lin.get('lang') == ENGLISH
        if item_type == TITLE_TYPE:
            if is_english:
                eng_title = item_text
            else:
                non_eng_title = item_text
        elif item_type == COMMENT_TYPE and is_english:
            comment = item_text

    # use either or both titles, English first
    found_titles = [t for t in (eng_title, non_eng_title) if t is not None]
    title = ' // '.join(found_titles) if found_titles else "NO TITLE FOUND"

    return title, comment

In [45]:
#############################
'''These cleaning functions handle pecularities of a corpus or non-conventional IGT annotations'''

def cleanWord(IGTstring):
    '''Normalise the text of one word token.

    IGTstring: token text, or None for an empty XML element.
    Returns the lowercased token with surrounding whitespace and edge
    hyphens removed and word-internal hyphens turned into "~".
    Returns '' for None, so callers that skip empty tokens also skip
    empty elements instead of receiving the literal word "none".'''

    if IGTstring is None:
        return ''

    # remove padding first so that edge hyphens really are at the edge
    IGTstring = str(IGTstring).strip()

    # TODO?: phrasal lexemes to be separated by double tilde
    #IGTstring = IGTstring.strip().replace(' ', '~~')
    # Strip hyphens from words. This handles hyphen as Cyrillic quotation mark
    IGTstring = IGTstring.strip('-')
    # Use tilde in hyphenated words. Don't confuse w hyphen as morpheme breaks
    IGTstring = IGTstring.replace('-', '~')

    return IGTstring.strip().lower()
    
    
def cleanMorph(IGTstring):
    '''Remove unexpected symbols in surface morphs and canonical morphemes
    (includes infixes and circumfix halves).

    IGTstring: morph/morpheme text (str).
    Returns the lowercased string with inner spaces replaced by ".",
    null-morpheme symbols unified to "NULL" and "*" turned into "-".'''

    # drop padding first so it is not turned into leading/trailing periods
    IGTstring = IGTstring.strip()

    # separate multiple words in morpheme string with period
    IGTstring = IGTstring.replace(' ', '.')

    # 'Ø' would be lowercased to the ordinary letter 'ø' and then missed by
    # the null-symbol replacement, so map it to '∅' (which has no lowercase
    # form) before lowercasing.
    IGTstring = IGTstring.replace('Ø', '∅')

    IGTstring = IGTstring.lower()

    # make null morpheme symbol consistent across databases, avoid encoding bugs
    IGTstring = IGTstring.replace('∅', 'NULL').replace('zero', 'NULL')
    # add your null morpheme symbol here
    IGTstring = IGTstring.replace('*0','NULL') # lez
    # make * on bound roots into suffix hyphens
    IGTstring = IGTstring.replace('*', '-') # ntu

    # NOTE: add here any pre-processing specific to a database
    #IGTstring = IGTstring.replace('*', '') # NTU

    return IGTstring.strip()


def cleanGloss(IGTstring, morpheme_type):
    '''Preprocess a morpheme gloss.
    Follow Leipzig glossing rules where possible.

    IGTstring: gloss text (str).
    morpheme_type: morpheme type label; glosses of anything not listed in
        STEMS are upper-cased.
    Returns the gloss with hyphens and inner spaces replaced by ".".'''

    # remove padding before spaces are converted, otherwise a gloss such as
    # "3PL " would come out as "3PL." with a spurious trailing period
    IGTstring = IGTstring.strip()

    # separate multiple words in glosses with period, per linguistic convention
    IGTstring = IGTstring.replace('-','.').replace(' ', '.')

    # make affix glosses all caps, per linguistic convention
    if morpheme_type not in STEMS:
        IGTstring = IGTstring.upper()

    return IGTstring


def cleanPOS(IGTstring):
    '''Preprocess a morpheme-level or word-level POS tag.

    IGTstring: POS text (str).
    Returns the tag with all spaces removed and FLEx-inserted hyphens
    dropped from known tags.'''

    #TODO: reverse before returning to FLEx

    ### NOTE: add here any pre-processing specific to a database.
    # These patterns contain spaces, so they must run BEFORE spaces are
    # removed below; otherwise they can never match.
    IGTstring = IGTstring.replace('N (kx cl)', 'N(kx.cl)') ## Natugu [ntu] morpheme pos

    # remove spaces inside tags so that each tag is a single token
    IGTstring = IGTstring.replace(' ', '')
    # remove FLEx-inserted hyphens, to reduce confusion w morpheme delimiter
    #TODO: reverse before returning to FLEx
    IGTstring = IGTstring.replace('pro-form', 'proform').replace('Nom-1','Nom1')

    return IGTstring.strip()


In [46]:
def getInfixedStem(wordtxt, morphitem, infix):
    '''Split an infixed stem into the parts before and after the infix.

    Infixed stems need special processing, especially for non-neural
    models that require glosses for every segment.

    wordtxt: surface string of the word containing the infix.
    morphitem: <morph> XML element of the stem.
    infix: morpheme info list of the infix; element 0 is its surface form
        surrounded by dashes.
    Returns (pre_temp_morph, post_temp_morph), each
    [morph, morpheme, gloss, mpos], with TEMP in any slot not found.'''

    pre_temp_morph = [TEMP, TEMP, TEMP, TEMP]
    post_temp_morph = [TEMP, TEMP, TEMP, TEMP]

    infix = infix[0][1:-1] # remove dashes surrounding infixes
    # treat strings surrounding the (first occurrence of the) infix as stems;
    # an empty separator is rejected by str.split, so guard against it
    stemhalves = wordtxt.split(infix, 1) if infix else [wordtxt]
    if len(stemhalves) < 2:
        # infix not found in the word: there is no second half
        stemhalves.append('')

    # get other tiers
    for item in morphitem.iter('item'):
        tier = item.get('type')
        text = item.text
        # skip untyped, empty and placeholder annotations
        if tier is None or text is None or text.strip() in ('', '<NotSure>'):
            continue
        # get surface morph, treat same as stem halves
        if tier == TXT:
            pre_temp_morph[0] = cleanMorph(stemhalves[0])
            post_temp_morph[0] = cleanMorph(stemhalves[1])
        # canonical morpheme, will be nothing for first half if infixed
        elif tier == CAN_MORPHEME:
            pre_temp_morph[1] = cleanMorph(text)
            post_temp_morph[1] = cleanMorph(text)
        # gloss, same for both; both halves are stems, so glosses keep their case
        elif tier == GLOSS:
            pre_temp_morph[2] = cleanGloss(text, 'stem')
            post_temp_morph[2] = cleanGloss(text, 'stem')
        # morpheme pos
        elif tier == M_POS:
            pre_temp_morph[3] = cleanPOS(text)
            post_temp_morph[3] = cleanPOS(text)

    return pre_temp_morph, post_temp_morph

In [47]:
def getMorpheme(morphitem, morphemetype, numaffix):
    '''Collect the annotation tiers of one morpheme segment.

    morphitem: <morph> XML element.
    morphemetype: value of the element's "type" attribute (may be None).
    numaffix: list of the morpheme types seen so far in the word,
        including this one.
    Returns [morph, morpheme, gloss, mpos, morphemetype]; slots without an
    annotation hold TEMP and the last slot holds morphemetype as received.

    To record more information per segment:
    1st. Add a slot to the result list and a slot index for it.
    2nd. Add an elif branch for the new tier using the attribute you want,
        e.g. 'morpheme type'. If necessary, create a special delimiter and
        write a "cleaning" function.
    3rd. Check that the result list still matches the entries in temp_morph
        and does not mess up punctuation processing.'''

    # slot positions inside the result list
    SURFACE_SLOT, CANONICAL_SLOT, GLOSS_SLOT, MPOS_SLOT = 0, 1, 2, 3
    segment = [TEMP, TEMP, TEMP, TEMP, morphemetype]

    # a missing type is assumed to be a stem; all stem-like types share one label
    if morphemetype is None or morphemetype in STEMS:
        morphemetype = 'stem'
    # 2nd half of compound stems (bound root B) counts as a (derivational) suffix
    if morphemetype == COMPOUND2:
        morphemetype = 'suffix'

    # report morpheme types this function does not know about
    handled_types = STEMS + INFIXES + ENCLITICS + [PROCLITIC, PREFIX, SUFFIX,
                                                   MWE, CIRCUMFIX, COMPOUND2]
    if morphemetype not in handled_types:
        print("\nThis morpheme type XML attribute is not handled yet in getMorpheme(): " + morphemetype)

    def mark_clitic(form):
        '''attach the clitic boundary on the side facing the host'''
        if morphemetype in ENCLITICS:
            return CLITIC + form
        if morphemetype == PROCLITIC:
            return form + CLITIC
        return form

    # read the IGT tiers of this morpheme
    for item in morphitem.iter('item'):
        raw = item.text
        if raw is None:
            continue
        tier = item.get('type')

        if tier == TXT:
            # surface morph
            segment[SURFACE_SLOT] = mark_clitic(cleanMorph(raw))

        elif tier == CAN_MORPHEME:
            # canonical morpheme
            # TODO: do not assume only 1 circumfix per word
            if morphemetype == CIRCUMFIX:
                opened = cleanMorph(raw) + CIRCUM_PRE
                if len(numaffix) == 1:
                    # word-initial first half: treat as prefix
                    segment[CANONICAL_SLOT] = opened
                else:
                    # first half not word-initial: treat as infix
                    segment[CANONICAL_SLOT] = CIRCUM_POST + opened
            elif '-...-' in raw:
                # circumfix halves become pre/suffix, the circumfixed stem stays a stem
                if morphemetype in STEMS or morphemetype == MWE:
                    segment[CANONICAL_SLOT] = cleanMorph(raw).replace('-...-', '')
                elif morphemetype == PREFIX:
                    segment[CANONICAL_SLOT] = cleanMorph(raw).replace('-...-', CIRCUM_PRE)
                elif morphemetype == SUFFIX:
                    segment[CANONICAL_SLOT] = CIRCUM_POST + cleanMorph(raw).replace('-...-', '')
            else:
                segment[CANONICAL_SLOT] = mark_clitic(cleanMorph(raw))

        elif tier == GLOSS:
            segment[GLOSS_SLOT] = cleanGloss(raw, morphemetype)

        elif tier == M_POS:
            # morpheme-level POS
            segment[MPOS_SLOT] = cleanPOS(raw)

    return segment

In [48]:
def getWPOS(current_tokenXML, current_token, current_token_type):
    '''Determine the word-level POS tag of one token.

    current_tokenXML: <word> XML element.
    current_token: cleaned token string.
    current_token_type: "type" attribute of the token's first <item>.
    Returns the cleaned POS tag, DIGIT_GLOSS_POS for digit tokens,
    PUNCT.upper() for punctuation, or TEMP when no tag is annotated.'''
    temp_wpos = TEMP

    for word_item in current_tokenXML.iter('item'):
        # an empty <item type="pos"/> has text None, which cleanPOS cannot handle
        if word_item.get('type') == WORD_POS and word_item.text is not None:
            # a tag that is empty after cleaning counts as missing
            temp_wpos = cleanPOS(word_item.text) or TEMP
    # generic POS for digits
    if current_token.isdigit():
        temp_wpos = DIGIT_GLOSS_POS
    # generic POS for punctuation
    if current_token_type == PUNCT or current_token == '~':
        temp_wpos = PUNCT.upper()

    return temp_wpos

## Main Extraction Function

In [49]:
def extract_flextext(flextext_filename):
    '''Read a FLExText XML file holding any number of texts.

    flextext_filename: path of the .flextext file.
    Returns a list of line dicts:
    [{text_title, text_comment, line#, free_transl, orig_line, "words":[
                {token, wPOS, "morphemes":[
                    [morph, morpheme, gloss, mpos, morphemetype]
    ]}]}]
    Also prints corpus statistics and the first words of the first line.'''

    root = ET.parse(flextext_filename).getroot()
    lines = []
    total_lexemes = 0 #No punct or digits. NOTE: MWE is 1 lexeme
    total_tokens = 0
    pos_tags_in_corpus = set()

    for text in root.iter('interlinear-text'):
        title,comment = getTitleComment(text)

        # This gets info at the phrase (sentence/line), word, morpheme level, ignores paragraph breaks
        for line_idx,phrase in enumerate(text.iter('phrase')):
            temp_words = []
            orig_line = []

            # Phrase-level items (segnum, free translations) are direct children
            # of <phrase>. Look at all of them: checking only the first one
            # misses the translation whenever the segnum item comes first.
            lineid = None
            temp_transl = TEMP
            have_english = False
            for phrase_item in phrase.findall('item'):
                item_text = phrase_item.text
                if item_text is None:
                    continue
                item_type = phrase_item.get('type')
                # FLExtext "segnum" is ID for phrases
                if item_type == 'segnum':
                    if lineid is None:
                        lineid = item_text
                # Free translation: prefer English, else keep the first one found
                # TODO: handle as many languages if needed
                elif item_type == GLOSS:
                    is_english = phrase_item.get('lang') == ENGLISH
                    if (is_english and not have_english) or temp_transl == TEMP:
                        temp_transl = item_text
                        have_english = have_english or is_english
            if lineid is None:
                lineid = str(line_idx)

            # This gets token (word) level info, as tokenized by FLEx user.
            # Note: MWE (phrasal lexemes) are one "word"
            for token in phrase.iter('word'):
                token_item = token.find('item')
                # a word without any text cannot be processed
                if token_item is None or token_item.text is None:
                    continue
                tokentype = token_item.get('type')
                token_string = cleanWord(token_item.text)

                # Uncomment line below to ignore punctuation (but not digits)
                #if tokentype != PUNCT and token_string != '~' and token_string != '':
                # Uncomment line below to ignore punctuation & digits
                #if tokentype != PUNCT and not token_string.isdigit() and token_string != '~' and token_string != '':
                # Uncomment line below to keep punctuation and digits, only ignore empty items
                if token_string != '':
                    total_tokens+=1
                    orig_line.append(token_string)
                    temp_morphemes = []
                    affix_order = [] # order of affixes to align infixes later
                    # get total lexemes
                    if tokentype != PUNCT and token_string != '~' and not token_string.isdigit():
                        total_lexemes+=1

                    # get word-level POS
                    temp_wpos = getWPOS(token, token_string, tokentype)
                    pos_tags_in_corpus.add(temp_wpos)

                    # get interlinear tiers for word segments, if any
                    if token.find('morphemes') is None:  #TODO?: eliminate this line, use filter function
                        # unsegmented word: the token stands in for its own morph(eme)
                        temp_morphemes.append([token_string, token_string, TEMP, temp_wpos, tokentype])
                    else:
                        for morph in token.iter('morph'):
                            morphemetype = morph.get('type')

                            # TODO: for non-neural models (need input/output alignment)
                            # morpheme type will determine what part of string is infix.
                            # Infixed stems are not split yet; see getInfixedStem().
                            affix_order.append(morphemetype)

                            temp_morph = getMorpheme(morph, morphemetype, affix_order)

                            # Add generic gloss to unglossed proper nouns
                            if temp_wpos == 'nprop' and morphemetype in STEMS:
                                if temp_morph[2] == TEMP:
                                    temp_morph[2] = PROPER_NOUN_GLOSS

                            # add morpheme to list of word's segments
                            temp_morphemes.append(temp_morph)

                    # create word dict
                    temp_words.append({TOKEN:token_string, POS:temp_wpos, MORPHEMES:temp_morphemes})

            # add line
            orig_line = ' '.join(orig_line)
            temp_line = {TITLE:title, COMMENT:comment, SEGNUM:lineid, FT:temp_transl, ORIG_LINE:orig_line, WORDS:temp_words}
            lines.append(temp_line)

    # corpus statistics
    print("Parts of speech found in corpus:", pos_tags_in_corpus, end='\n\n')
    print("All Tokens:", total_tokens)
    print("Lexemes, ignoring punctuation and digits:", total_lexemes, end='\n\n')
    # sanity check first line (a file without phrases has none)
    if lines:
        print(lines[0][WORDS][:10])
    print()

    return lines

### Filtering 

word_by_morphemes -> `lines[line_idx]["words"][word_idx]["morphemes"]`, i.e. [[morph, morpheme, gloss, mpos, morphemetype],...]

lexical_item -> `lines[line_idx]["words"][word_idx]["token"]`, i.e. the word string

#### Morpheme level Filters

In [50]:
def glossed(word_by_morphemes):
    '''True when every segment of the word carries a gloss
    (gloss slot, index 2, is not the TEMP placeholder);
    assumes segmentation is complete'''
    # all() stops at the first unglossed segment
    return all(segment[2] != TEMP for segment in word_by_morphemes)
    
    
def surf_segmented(word_by_morphemes):
    '''False only for a word that consists of a single segment whose
    surface morph (index 0) is the TEMP placeholder, i.e. a word that
    has not been surface segmented'''
    lone_segment = len(word_by_morphemes) == 1
    return not (lone_segment and word_by_morphemes[0][0] == TEMP)


def can_segmented(word_by_morphemes):
    '''False only for a word that consists of a single segment whose
    canonical morpheme (index 1) is the TEMP placeholder, i.e. a word
    that has not been canonically segmented'''
    lone_segment = len(word_by_morphemes) == 1
    return not (lone_segment and word_by_morphemes[0][1] == TEMP)

#### Word level filters

In [51]:
def multiword(lexical_item):
    '''True when the lexical item is a multiword expression, i.e. its
    original text contains a space, a tilde or a hyphen'''
    separators = (' ', '~', '-')
    return any(sep in lexical_item for sep in separators)


def selected_pos(word_postag, select_pos_tags=None):
    '''Tell whether a word-level POS tag is one of the selected tags.

    word_postag: word-level POS tag to check.
    select_pos_tags: tags to keep; defaults to the list below.
    Returns True when the tag is selected. An empty selection means
    "no restriction", so every tag is accepted. Intended use:
    `if not selected_pos(tag): good4training = False`.'''
    #SELECT_POS_TAGS = ['Vnf', 'v', 'msd', 'Vf','n','IMPV','cop']
    SELECT_POS_TAGS = []
    if select_pos_tags is None:
        select_pos_tags = SELECT_POS_TAGS
    # check word level POS tag
    return not select_pos_tags or word_postag in select_pos_tags

#### COMBINE FILTER FUNCTIONS HERE 

In [52]:
def filtering(extractedtexts, bysentence):
    '''Split the extracted words into training and non-training data.

    Write custom filter functions above and add calls here;
    add/remove function calls as needed.

    extractedtexts: list of line dicts as returned by extract_flextext().
    bysentence: if True, a line is kept for training only when ALL of its
        words pass the filters, and an EOS marker word is appended to every
        line; if False, each word is sorted on its own.
    Returns (for_training, non_training), two lists of word dictionaries.'''

    for_training = []
    non_training = []
    for line in extractedtexts:
        linewords = []
        linestatus = [] # to check for any filtered tokens

        for word in line[WORDS]:
            good4training = True

            # Uncomment if statements to add word level filtering
            # that filters certain word types or POS tags
            # To filter out MWE
            #if multiword(word[TOKEN]): good4training = False
            # To filter out unselected, unspecified POS
            #if not selected_pos(word[POS]): good4training = False
            # To filter out digits
            #if word[POS] == DIGIT_GLOSS_POS: good4training = False
            # To filter out punctuation
            #if word[POS] == PUNCT.upper(): good4training = False

            # Uncomment if statements to filter for specific annotation
            if good4training:
                # For training to gloss
                #if not glossed(word[MORPHEMES]): good4training = False
                # For training surface segmentation
                #if not surf_segmented(word[MORPHEMES]): good4training = False
                # For training canonical (underlying) segmentation
                #if not can_segmented(word[MORPHEMES]): good4training = False
                # For training POS tagging
                if word[POS] == TEMP: good4training = False

            linewords.append(word)
            linestatus.append(good4training)

        # For training by sentences w/o filtered tokens
        if bysentence:
            # End of sentence marker. MORPHEMES is a list of segments, so the
            # marker is one segment in the same 5-slot layout as real morphemes.
            EOS = {TOKEN:'EOS', POS:'@EOS@', MORPHEMES:[['@EOS@', '@EOS@', '@EOS@', '@EOS@', '@EOS@']]}
            linewords.append(EOS)
            if all(linestatus):
                for_training.extend(linewords)
            else:
                non_training.extend(linewords)
        # For training by word, not sentence: sort every word by its OWN status
        # (the status of the last word must not decide for the whole line)
        else:
            for word, keep in zip(linewords, linestatus):
                if keep:
                    for_training.append(word)
                else:
                    non_training.append(word)

    print("Total training examples, after filtering:", len(for_training), end='\n\n')
    return for_training,non_training

# Write to files

Get this list of words:  

Current: `[{title, comment, line#, free_transl, origline_w_digits_punct, words":[
                {word_txt, wpostag, "morphemes":[
                    [morph, morpheme, gloss, mpos, morphemetype]` 

Old: `[{"text_title":title, "text_comment":comment, "words":[
            {"segnum":line#, "orig_word":word, "POS":postag, "morphemes": [
                            [morph, morpheme, gloss, mpos, morphemetype]
                         ]}]` 

to files with one word per line

In [78]:
def check_alignment(a, b):
    '''Raise ValueError unless the two sequences have the same length.'''
    if len(a) == len(b):
        return
    raise ValueError("must be same number of morph(emes) and gloss in a word")
        
def poslines(listofwords, listofPOStags):
    '''Arranges words and POS tags by sentences for training.

    Whitespace inside each word is removed, then words and tags are cut
    into sentences at the EOS markers ('EOS' for words, '@EOS@' for tags).
    Returns (sentences_of_words, sentences_of_tags), each a list of
    space-separated strings; text after the last marker forms a final
    (possibly empty) sentence.'''
    DELIM = '%%'

    def as_sentences(joined, marker):
        # cut at the marker, then turn the delimiter back into spaces
        return [chunk.replace(DELIM, ' ').strip() for chunk in joined.split(marker)]

    compact_words = [''.join(word.split()) for word in listofwords]
    word_sentences = as_sentences(DELIM.join(compact_words), 'EOS')
    tag_sentences = as_sentences(DELIM.join(listofPOStags), '@EOS@')
    return word_sentences, tag_sentences
        

def dataFiles(extracted_words, training_task, outfilepath):
    '''Writes two files: X and y (tokens and annotations; input and output).

    extracted_words: list of word dictionaries.
    training_task: one of '_pos', '_gls', '_canSeg', '_surSeg',
        '_surSegGls', '_canSegGls'; selects what goes into the output file.
    outfilepath: path prefix; the files are named
        <outfilepath><training_task>.input and .output.
    An unknown task prints a message and writes nothing.
    Raises ValueError (from check_alignment) if morph(eme)s and glosses of
    a word differ in number.'''

    known_tasks = ('_pos', '_gls', '_canSeg', '_surSeg', '_surSegGls', '_canSegGls')
    # reject an unknown task once, before any misaligned file can be written
    if training_task not in known_tasks:
        print("Output format not found.")
        return

    input_data = []
    output_data = []

    for word in extracted_words:
        # input string (X)
        input_data.append(' '.join(word[TOKEN])) # insert space between chars
        # output types (y)
        wPOS_tag = word[POS]
        canonical_morphemes = []
        surface_morphemes = []
        glosses = []
        for morpheme in word[MORPHEMES]:
            surface_morphemes.append(morpheme[0])
            canonical_morphemes.append(morpheme[1])
            glosses.append(morpheme[2])

        # determines what will be written to output file
        #TODO: _canSegGls & _canSeg must handle null morphemes for non-neural models (CRF)
        if training_task == '_pos':
            output_data.append(wPOS_tag)
        elif training_task == '_gls':
            output_data.append(' '.join(glosses))
        elif training_task == '_canSeg':
            output_data.append(' '.join(canonical_morphemes))
        elif training_task == '_surSeg':
            output_data.append(' '.join(surface_morphemes))
        elif training_task == '_surSegGls':
            check_alignment(surface_morphemes, glosses)
            combined_seg_gls = [morph+'#'+gloss for morph,gloss in zip(surface_morphemes, glosses)]
            output_data.append(' '.join(combined_seg_gls))
        elif training_task == '_canSegGls':
            check_alignment(canonical_morphemes, glosses)
            combined_seg_gls = [morpheme+'#'+gloss for morpheme,gloss in zip(canonical_morphemes, glosses)]
            output_data.append(' '.join(combined_seg_gls))

    # prepare for sentence level POS tagging
    if training_task == '_pos':
        input_data, output_data = poslines(input_data, output_data)
        # poslines yields an empty sentence after the final EOS marker; drop
        # that pair instead of chopping the last character of the file
        if input_data and output_data and input_data[-1] == '' and output_data[-1] == '':
            input_data = input_data[:-1]
            output_data = output_data[:-1]

    with open(outfilepath+training_task+'.input', 'w', encoding='utf8') as infile:
        infile.write('\n'.join(input_data))
    with open(outfilepath+training_task+'.output', 'w', encoding='utf8') as outfile:
        outfile.write('\n'.join(output_data))

# Sample Run Code: Extract Surface Segmentation Data to Files

In [79]:
####### EXTRACT from flextext#######
def main(tostorepath, dbfile, bysentence=False, tasks=None):
    '''Extract a flextext file, filter it and write the data files.

    tostorepath: path prefix for the output files.
    dbfile: path of the .flextext file to extract.
    bysentence: passed on to filtering(); True keeps whole sentences together.
    tasks: list of task names for dataFiles(); defaults to the notebook-level
        DESIRED_TASKS, which must then be defined before main() is called.'''
    if tasks is None:
        tasks = DESIRED_TASKS

    master_data = extract_flextext(dbfile)

    # NOTE: FIRST EDIT filtering() function to suit your task!
    training_words, unannotated_words = filtering(master_data, bysentence)
    print("Training examples:\n", training_words[-5:])
    print("Unannotated data:\n", unannotated_words[-5:])

    for task in tasks:
        # write all extracted tokens to the _Master file
        dataFiles(training_words+unannotated_words, task, tostorepath+'_Master')
        # write filtered-out tokens to the _U(nannotated) file
        if unannotated_words:
            dataFiles(unannotated_words, task, tostorepath+'_U')
        # write tokens that passed the filters to the _T(raining) file
        dataFiles(training_words, task, tostorepath+'_T')

In [82]:
####### FOR FILTERING ####### 
# lezgi pos tags: {'ordnum', 'Vnf', 'num', 'indfpro', 'nprop', 'emph', 'Vocpart', 'proform', 'multipnum', 'prep', 'adv', 'post', 'ptcp', 'pers', 'verbprt', 'coordconn', 'adj', 'v', 'conn', 'poss', 'pro', 'prt', 'det', 'dem', 'interj', 'msd', 'subordconn', 'Vf', 'cardnum', 'n', 'interrog', 'recp'}
# Alas pos tags: {'num', 'n', 'refl', 'Aux', 'vt', 'cop', 'clf', 'adv', 'prt', 'Adj', 'cardnum', 'vi', 'stc', 'existmrkr', 'quant', 'relpro', 'ordnum', 'vd', 'distrnum', 'adj', 'Prep', 'nprop', 'interj', 'Conj', 'dem', 'v', 'pro'}
# Upper Tanana Pos tags: {'dem', 'advlizer', 'nvp', 'nprop', 'inter', 'proform', 'imp', 'coordconn', 'v', '@@@', 'nomprt', 'verbprt', 'adv', 'adj', 'NUM', 'n', 'interj', 'pro', 'onom', 'PUNCT', 'cardnum', 'quant', 'mod', 'DM', 'dir', 'post'}
# To decipher task names:
### gls = glossing only, Seg = segmentation
### can = canonical (underlying) morphemes, sur = surface morphs
### SegGls = segmentation+glossing, pos = (word) POS tagging
### All possible tasks: ['_canSeg', '_surSeg', '_gls', '_canSegGls', '_surSegGls', '_pos']
# Tasks to write data files for; read by main()
DESIRED_TASKS = ['_pos']
# Language codes; each one is used to build an input file name and an output path prefix below
LANGS = ['tau']

####### FILE LOCATIONS ####### 
# NOTE: both paths are relative to the directory the notebook is run from
for lang in LANGS:
    # output path prefix; main() appends _Master/_U/_T, dataFiles() the task name and extension
    STORE = r"../../Teaching/NLPWorkshop/Alberta/data/"+lang+'/'+lang
    # flextext file to read
    TO_EXTRACT = r'./flextexts/'+lang+'-all_txts.flextext'
    main(STORE, TO_EXTRACT, bysentence=True)

Parts of speech found in corpus: {'coordconn', 'advlizer', 'interj', 'imp', 'nprop', 'verbprt', 'DM', 'cardnum', 'proform', 'pro', '@@@', 'nvp', 'onom', 'NUM', 'mod', 'dem', 'inter', 'v', 'nomprt', 'adj', 'n', 'quant', 'PUNCT', 'adv', 'dir', 'post'}

All Tokens: 17587
Lexemes, ignoring punctuation and digits: 14099

[{'token': 'keey', 'wPOS': 'n', 'morphemes': [['keey', 'keey', 'village', 'n', 'stem']]}, {'token': 'tah', 'wPOS': 'post', 'morphemes': [['tah', 'tah', 'at:AR', 'post', 'stem']]}, {'token': 'hihneeshyąą', 'wPOS': 'v', 'morphemes': [['hih-', 'h-', '3PL.S.', 'v:Any', 'prefix'], ['nee-', 'nee-', 'QUAL:DH.PFV:Ø.', 'Verb', 'prefix'], ['shyąą', 'shyąą', 'grow:PFV', 'v', 'stem']]}, {'token': 'jah', 'wPOS': 'adv', 'morphemes': [['jah', 'jah', 'here', 'adv', 'stem']]}, {'token': 'dineh', 'wPOS': '@@@', 'morphemes': [['dineh', 'dineh', 'person', 'n', 'stem']]}, {'token': 'huuniign', 'wPOS': 'v', 'morphemes': [['huu-', 'huu-', '3SG.S:QUAL:DH.PFV:Ø.', 'Verb', 'prefix'], ['niign', 'niig