# Extract Data from flextext XML files

In [135]:
import xml.etree.ElementTree as ET # parses XML files
import string
import os
import json

## Variables, Generic Tags/Morphemes, and Other Codes
GLOBAL variables that match flextext XML attributes and other information required for consistent handling of the IGT data.

In [136]:
# placeholder for unannotated tiers
TEMP = '@UNK@'

# Generic glosses & POS tags for tokens usually left unannotated
PROPER_NOUN = 'nprop'
DIGIT = 'num'

### FLEXTEXT ATTRIBUTES
# flextext XML text/discourse attributes
TITLE_TYPE = 'title'
COMMENT_TYPE = 'comment'
# flextext XML attributes for languages/scripts used in title or translations
ENGLISH = 'en'
INDONESIAN = 'id'

# flextext XML IGT tier attributes
TXT = 'txt' # surface morph/segment AND transcribed text
CAN_MORPHEME = 'cf' # canonical (underlying) morpheme
GLOSS = 'gls' # morpheme gloss AND sentence free translation
M_POS = 'msa' # morpheme-level pos = what POS the affix attaches to
WORD_POS = 'pos' # word-level pos
WORD_GLOSS = 'gls' # word level gloss
PUNCT = 'punct' # punctuation

# flextext XML morpheme attributes
MWE = 'phrase' # multiword expression
PREFIX = 'prefix'
SUFFIX = 'suffix'
CIRCUMFIX = 'circumfix'
INFIXES = ['infix', 'infixing interfix']
PROCLITIC = 'proclitic'
ENCLITICS = ['enclitic', 'clitic'] # Enclitics labed as 'clitic' in some FLEx databases (e.g. lmk)
COMPOUND2 = 'bound root B' # Treat second root in compound word as suffix when delimiter markers are needed
STEM = 'stem'
# all morpheme types
STEMS = {'stem', 'bound stem', 'bound root', 'bound root A', 'root', 'particle', 'root'}
AFFIXES = [[MWE, PREFIX, SUFFIX, CIRCUMFIX, PROCLITIC], ENCLITICS, INFIXES]

### Segment boundaries markers used in flextext XML
# Affix types boundaries are marked uniquely in FLEx.
# NOTE: Each FLEx database may handle circumfixes boundaries differently.
# NOTE: these symbols will need to be removed before re-importing to FLEx.
CIRCUM_PRE = '>'
CIRCUM_POST = '<'
CIRCUM_HOLE = '<>'
CLITIC = '='
BOUNDROOT = '*'

### Keys for output dictionary/JSON
TITLE = 'text_title'
COMMENT = 'text_comment'
SEGNUM = 'line#'
FT = 'free_transl'
ORIG_LINE = 'orig_line'
WORDS = 'words'
TOKEN = 'token'
POS = 'wPOS'
WGLOSS = 'word_gloss'
MORPHEMES = 'morphemes'

## Extract Metadata for Each Text
These functions get metadata associated with each text/discourse such as title and comments.

In [137]:
def getTitleComment(xmlsection):
    '''find title and comment if in this section
    some documents have both and english and native language titles
    these checks assure that the both will always be used if found separated by //
    if only one of them is found then it is used
    if none are found return NO TITLE FOUND'''
    
    title = "NO TITLE FOUND" 
    eng_title = TEMP
    non_eng_title = TEMP
    comment = "No comment"
    
    for item_lin in xmlsection.iter('item'):
        if item_lin.get('type') == TITLE_TYPE and item_lin.get('lang') == ENGLISH:
            eng_title = item_lin.text
        if item_lin.get('type') == TITLE_TYPE and item_lin.get('lang') != ENGLISH:
            non_eng_title = item_lin.text
        if item_lin.get('type') == COMMENT_TYPE and item_lin.get('lang') == ENGLISH:
            comment = item_lin.text
    # check languages of title and add either or both
    if eng_title != TEMP and non_eng_title == TEMP:
        title = eng_title 
    elif eng_title == TEMP and non_eng_title != TEMP:
        title = non_eng_title
    elif eng_title != TEMP and non_eng_title != TEMP:
        title = eng_title + ' // ' + non_eng_title 
        
    return title, comment

## Preprocessing for Consistent and Simplified Data
These functions can handle pecularities of a particular corpus and non-conventional IGT annotations. Some may be dispreferred for your purposes and can be edited or commented out, e.g. should bound roots be treated as other roots/stems? 

Most of this preprocessing must be reversed before re-importing to FLEx. 

In [138]:
def cleanWord(IGTstring):  
    '''Handle unconventional or confusing use of punctuation in words'''
    
    IGTstring = str(IGTstring)
    
    # phrasal lexemes delimited by double tilde
    # IGTstring = IGTstring.strip().replace(' ', '~~')
    # Strip hyphens from words - hyphens used as Cyrillic quotation mark 
    IGTstring = IGTstring.strip('-')
    # Use tilde in hyphenated words to avoid confusing morpheme delimiter,
    # because not all linguists segment hyphenated words on morpheme tier
    IGTstring = IGTstring.replace('-', '~')
    
    return IGTstring.strip().lower()
    
    
def cleanMorph(IGTstring):
    '''Handle inconsistent or unconventional annotations in morpheme tiers
    (e.g. includes infixes and circumfix halves)'''
    
    # Replace spaces with period
    IGTstring = IGTstring.replace(' ', '.')
    # Make morphemes tiers are case insensitive 
    IGTstring = IGTstring.lower()
    # Null morpheme symbol should be consistent across databases, avoid encoding issues
    # Add unique null morpheme symbol below
    IGTstring = IGTstring.replace('Ø','NULL').replace('∅', 'NULL').replace('zero', 'NULL')
    IGTstring = IGTstring.replace('∅', 'NULL')
    IGTstring = IGTstring.replace('zero', 'NULL')
    IGTstring = IGTstring.replace('*0','NULL') # Lezgi [lez] 
    
    ##OPTIONAL, un/comment as needed
    # Remove * on bound roots, to treat as regular stem/root
    IGTstring = IGTstring.replace(BOUNDROOT, '-') # ntu
        
    return IGTstring.strip()


def cleanGloss(IGTstring, morpheme_type):
    '''Handles inconsistent or unconventional glosses.
    Follows Leipzig glossing rules where possible'''
    
    # Delimit multiple senses in glosses with '.''
    IGTstring = IGTstring.replace('-','.')
    IGTstring = IGTstring.replace(' ', '.')
    # Affix glosses should be in CAPS
    if morpheme_type not in STEMS:
        IGTstring = IGTstring.upper()
    
    return IGTstring.strip()


def cleanWGloss(IGTstring):
    '''Handle inconsistencies and unconventional word level glosses'''
    
    # Strip hyphens from word glosses. This handles hyphen as Cyrillic quotation mark 
    IGTstring = IGTstring.strip('-')
    # Use tilde in hyphenated words to avoid confusing with morpheme delimiter
    IGTstring = IGTstring.replace('-', '~')
    
    return IGTstring.strip()


def cleanPOS(IGTstring):
    '''Handles inconsistencies and unconventional morpheme-level and word-level POS'''
    
    ### OPTIONAL: add other pre-processing specific to a database
    IGTstring = IGTstring.replace('N (kx cl)', 'N(kx.cl)') # Natugu [ntu] morpheme pos
    
    ## Delimit multiple tags per token with '.'
    IGTstring = IGTstring.replace(' ', '.')
    # Remove FLEx-inserted hyphens, to avoid confusing with morpheme delimiter
    IGTstring = IGTstring.replace('pro-form', 'proform')
    IGTstring = IGTstring.replace('Nom-1','Nom1')
    
    return IGTstring.strip()   

## Extract Interlinear Tiers Annotations

### Morpheme Tier Annotations
This includes surface morph, morpheme, morpheme type and boundary delimiters, morpheme "POS" (category of attachment), etc.

In [139]:
def getInfixedStem(wordtxt, morphitem, infix):
    '''Infixed stems need special processing,
    especially for NLP models that require one gloss per morpheme'''
    
    # NOTE: FLEx seems to always put infix before its stem
    # THIS CODE CUT FROM MAIN EXTRACTION FUNCTION. 
    # In future may need to handle inflectional infixes 
    # instead treating as part of stem
    # because all my databases so far only have derivationl infixes, and few very of them
                                #preinfix, postinfix = getInfixedStem(str(wrd), morph, temp_word[-1])
                                # insert first half of prefix for surface segmentation
                                #infix_index = len(affix_order)-2
                                #temp_word.insert(infix_index-1, preinfix)
                                # add second half of infixed stem
                                #temp_morph = postinfix
    
    pre_temp_morph = [TEMP, TEMP, TEMP, TEMP]
    post_temp_morph = [TEMP, TEMP, TEMP, TEMP]
    
    infix = infix[0][1:-1] # remove dashes surrounding infixes
    stemhalves = wordtxt.split(infix) # treat strings surrounding infixes as stems
    
    # get other tiers
    for item in morphitem.iter('item'):
        if item.get('type') != None or item.text != '' or item.text != '<NotSure>' or item.text != ' ':
            # get surface morph, treat same as stem halves
            if (item.get('type') == TXT):
                pre_temp_morph[0] = cleanGloss(stemhalves[0])
                post_temp_morph[0] = cleanGloss(stemhalves[1])
            # canonical morpheme, will be nothing for first half if infixed
            elif(item.get('type') == CAN_MORPHEME):
                pre_temp_morph[1] = cleanMorph(item.text)
                post_temp_morph[1] = cleanMorph(item.text)
            # gloss, same for both
            elif(item.get('type') == GLOSS):
                # separate multi-word glosses with "."
                pre_temp_morph[2] = cleanGloss(item.text)
                post_temp_morph[2] = cleanGloss(item.text)
            # morpheme pos
            elif(item.get('type') == M_POS):
                pre_temp_morph[3] = cleanPOS(item.text)
                post_temp_morph[3] = cleanPOS(item.text)
        else:
            continue
    
    return pre_temp_morph, post_temp_morph

In [140]:
def generalizeMorphemeType(morphemetype):
    '''Reduces number of morpheme types allowed in FLEx'''
    
    unknown_morphemetype = True
    
    ## Stems: Use same tag for all stem-like morphemes
    # Assume missing morpheme type attribute indicates stem
    if morphemetype == None or morphemetype in STEMS:
        morphemetype = STEM  
        unknown_morphemetype = False
    # Treat 2nd half of compound word (bound root B) as suffix to keep boundary marker
    elif morphemetype == COMPOUND2:
        morphemetype = SUFFIX
        unknown_morphemetype = False
        
    ## Catch any morpheme types not handled yet
    elif unknown_morphemetype:
        for affixtype in AFFIXES:
            if morphemetype in affixtype:
                unknown_morphemetype = False
                break
    if unknown_morphemetype:
        print("\nThis morpheme type XML attribute is not handled yet in getMorpheme(): " + morphemetype)
    
    return morphemetype


def affixDelimiter(morphemetype, morphemetext):
    '''Add delimiter indicating affix type'''
    
    if morphemetype in ENCLITICS:
        return CLITIC + cleanMorph(morphemetext)
    elif morphemetype == PROCLITIC:
        return cleanMorph(morphemetext) + CLITIC
    else:
        return cleanMorph(morphemetext)

    
def circumfixDelimiter(morphemetype, morphemetext, numaffix):
    '''FLEx handles circumfixes differently.
    This makes that consistent at canonical morpheme level.
    TODO?: do not assume only 1 circumfix per word'''
    
    if morphemetype == CIRCUMFIX:
        # if first half of circumfix is word-initial, treat as prefix
        if len(numaffix) == 1:
            return cleanMorph(morphemetext) + CIRCUM_PRE
        # if first half of circumfix is not word-initial, treat as infix
        else:
            return CIRCUM_POST + cleanMorph(morphemetext) + CIRCUM_PRE
    
    # Treat second circumfix half as suffix and circumfixed stem as stem
    elif '-...-' in morphemetext:
        if morphemetype in STEMS or morphemetype == MWE:
            return cleanMorph(item.text).replace('-...-', '')
        elif morphemetype == PREFIX:
            return cleanMorph(morphemetext).replace('-...-', CIRCUM_PRE)
        elif morphemetype == SUFFIX:
            return CIRCUM_POST + cleanMorph(morphemetext).replace('-...-', '')

         
def getMorpheme(morphsubitems, morphemetype, numaffix):
    '''OUTPUT for each morpheme: [morph, morpheme, gloss, mpos, morphemetype]
    
    To add more tiers annotations to this array:
    1st. Add another holding place in the morph_info array with TEMP variable
    2nd. Give index for new tier annotation.
    3rd. Add elif statement for new tier using the attribute needed, e.g. 'morpheme type'.
        If necessary, create special delimiter and add/edit a "preprocessing/cleaning" function above.
    4th. Check that that morph_info array matches entries in temp_morph
        and does not mess up with punctuation processing, etc.'''
    
    # 1st. temporary array for morpheme information
    morph_info = [TEMP, TEMP, TEMP, TEMP, generalizeMorphemeType(morphemetype)]
    
    # 2nd. indexes for types of information to be in morph_info
    MORPH_IDX = 0
    MORPHEME_IDX = 1
    GLOSS_IDX = 2
    M_POS_IDX = 3
    TYPE_IDX = 4
    
    # 3rd. extract IGT tier annotations for morpheme
    for item in morphsubitems.iter('item'):
        if item.text != None:
            # TIER => surface morph (txt)
            if (item.get('type') == TXT):
                morph_info[MORPH_IDX] = affixDelimiter(morphemetype, item.text)

            # TIER => canonical morpheme (cf)
            elif(item.get('type') == CAN_MORPHEME):
                if morphemetype == CIRCUMFIX or '-...-' in item.text: 
                    morph_info[MORPHEME_IDX] = circumfixDelimiter(morphemetype, item.text, numaffix)
                else:
                    morph_info[MORPH_IDX] = affixDelimiter(morphemetype, item.text)
                    
            # TIER => morpheme gloss
            elif (item.get('type') == GLOSS):
                morph_info[GLOSS_IDX] = cleanGloss(item.text, morphemetype)
            
            # TIER: morpheme pos
            elif(item.get('type') == M_POS):
                morph_info[M_POS_IDX] = cleanPOS(item.text)
                           
    return morph_info

### Word Level Tier Annotations
This includes word level POS tag, word gloss, etc

In [141]:
def genericPOS(current_token, current_token_type):
    '''Add missing POS tag for digit, punct'''
    
    if current_token.isdigit():
        return DIGIT
    
    if current_token_type == PUNCT or current_token == '~':
        return PUNCT.upper()
    
    return TEMP


def getWPOS(current_tokenXML, current_token, current_token_type):
    '''Extract word level POS tags'''
    
    temp_wpos = TEMP
    for word_item in current_tokenXML.iter('item'):
        if word_item.get('type') == WORD_POS:
            temp_wpos = cleanPOS(word_item.text)
    
    if temp_wpos == TEMP:
        temp_wpos = genericPOS(current_token, current_token_type)
    
    return temp_wpos


def getWordGloss(current_tokenXML, current_token, current_token_type):
    '''Extract word level gloss if present. 
    Currently ASSUMES ONLY TWO WORD GLOSS LANGS and ENG IS always 1st.
    TODO: Find a way not to depend on index of word level XML nodes'''
    
    word_gloss = TEMP
    # Can't iterate on only one level easily with ETree, so use index of word level XML nodes
    if len(current_tokenXML) < 4:
        # If punctuation gloss missing, copy from original text
        if current_tokenXML[0].get('type') == PUNCT:
            word_gloss = PUNCT
    elif current_tokenXML[-2].get('type') == WORD_GLOSS and current_tokenXML[-2].get('lang') == 'en':
        if current_tokenXML[-2].text != None:
            word_gloss = cleanWGloss(current_tokenXML[-2].text)
    
    # Choose English gloss.
    elif current_tokenXML[-3].get('type') == WORD_GLOSS and current_tokenXML[-3].get('lang') == 'en':
         if current_tokenXML[-3].text != None:
            word_gloss = cleanWGloss(current_tokenXML[-3].text)
    
    return word_gloss

### Line/Sentence/Phrase Tier Annotations

This includes free translations, etc.

In [142]:
def getFreeTransl(phrase):
    '''Get Free translations.
    # TODO: handle as many languages if needed'''
    
    if phrase.find('item').get('type') == GLOSS:
        if phrase.find('item').get('lang') == ENGLISH:
            return phrase.find('item').text
        else:
            return phrase.find('item').text
    return TEMP

## Main Extraction

Words: 

Current: `[{title, comment, line#, free_transl, origline_w_digits_punct, "words":[
                {'token': word_txt, 'wPOS':wpostag, 'word_gloss':wgloss, "morphemes":[
                    [morph, morpheme, gloss, mpos, morphemetype]` 
                    
         e.g. words: [{'token': 'birii', 'wPOS': 'V.ISA.MRK.UV', 'morphemes': [['biri', 'bori', 'give', 'V', 'stem'], ['-i', '-ei', 'ISA.MRK.UV.IMP', 'V.ISA>V.ISA.MRK.UV', 'suffix']], 'word_gloss': 'give'}, {'token': 'ihi', 'wPOS': 'PERS.PRO', 'morphemes': [['ihi', 'ihi', 'kami', 'PERS.PRO', 'stem']], 'word_gloss': 'we'}, {'token': 'egas', 'wPOS': 'N', 'morphemes': [['egas', 'egas', 'beras', 'N', 'root']], 'word_gloss': 'rice'}, {'token': '.', 'wPOS': 'PUNCT', 'morphemes': [['.', '.', '@@@', 'PUNCT', 'punct']], 'word_gloss': 'punct'}]

Old: `[{"text_title":title, "text_comment":comment, "words":[
            {"segnum":line#, "token":word, "POS":postag, "morphemes": [
                            [morph, morpheme, gloss, mpos, morphemetype]
                         ]}]` 

In [143]:
def genericGloss(temp_wpos, morphemetype):
    '''Add generic gloss to unglossed 
    proper nouns (stem), punct, and digits.'''
    if temp_wpos == PROPER_NOUN and morphemetype in STEMS:
        return PROPER_NOUN.upper()
    if temp_wpos == DIGIT:
        return DIGIT.upper()
    elif temp_wpos == PUNCT:
        return PUNCT.upper()
    return TEMP


def genericMorpheme(token_string, morph, temp_wpos, morphemetype):
    '''Proper nouns (stem), punct, and digits only:
    Copy morph if morpheme missing. 
    Copy original word string if missing morph.'''
    if (temp_wpos == PROPER_NOUN and morphemetype in STEMS) or temp_wpos == DIGIT or temp_wpos == PUNCT:
        if morph == TEMP:
            return token_string, token_string
        else:
            return morph, morph
    return morph, TEMP


#### MAIN EXTRACTION ####
def extract_flextext(flextext_filename):
    '''Takes FLExText XML any number of texts. OUTPUT list of line dicts: 
    [{title, comment, line#, free_transl, origline_w_digits_punct, "words":[
                {word_txt, wpostag, wgloss, "morphemes":[
                    [morph, morpheme, gloss, mpos, morphemetype]]}]}]'''

    root = ET.parse(flextext_filename).getroot()
    lines = []
    total_lexemes = 0 # Don't count punct or digits. NOTE: MWE is 1 lexeme
    total_tokens = 0
    pos_tags_in_corpus = set()
    
    for text in root.iter('interlinear-text'):
        title,comment = getTitleComment(text)
        # Extract phrase (sentence/line), word, morpheme level, ignore paragraph breaks
        for line_idx,phrase in enumerate(text.iter('phrase')):
            temp_line = {}
            temp_words = []
            no_punct_line = ''
            orig_line = []

            # FLExtext "segnum" is ID for phrases
            if phrase.find('item').get('type') == 'segnum':
                lineid = phrase.find('item').text
            else:
                lineid = str(line_idx)                
            
            # Extract free translations
            temp_transl = getFreeTransl(phrase)
            
            # Extract token (word) level info, as tokenized by FLEx user.
            # Note: MWE (phrasal lexems) are treated as one "word"
            for token_idx,token in enumerate(phrase.iter('word')):
                tokentype = token.find('item').get('type')
                token_string = cleanWord(token.find('item').text)
                
                if token_string: # Ignore blank strings
                    total_tokens+=1
                    orig_line.append(token_string)
                    temp_morphemes = []
                    affix_order = [] # order of affixes to align infixes later         
                    
                    # Count lexemes
                    if tokentype != PUNCT and token_string != '~' and not token_string.isdigit():
                        no_punct_line += token_string
                        total_lexemes+=1    
                    
                    # Extract word-level tier annotations
                    temp_wgloss = getWordGloss(token, token_string, tokentype)
                    temp_wpos = getWPOS(token, token_string, tokentype)
                    pos_tags_in_corpus.add(temp_wpos)
                    
                    # Extract morpheme level tier annotations, if any
                    if token.find('morphemes') == None:  
                        temp_morphemes.append([token_string, TEMP, TEMP, temp_wpos, tokentype])
                    else:
                        for morphsubitem in token.iter('morph'):
                            morphemetype = morphsubitem.get('type')
                            
                            # TODO: for non-neural models which need input/output alignment
                            # morpheme type will determine what part of string is infix
                            affix_order.append(morphemetype)
                            
                            # TODO: Handle infixes
                            #if len(affix_order) >= 2 and affix_order[-2] in infixes:
                            #else: 
                            temp_morph = getMorpheme(morphsubitem, morphemetype, affix_order)
                            
                            # Create generic gloss and morpheme segments
                            if temp_morph[2] == TEMP: 
                                temp_morph[2] = genericGloss(temp_wpos, morphemetype)
                            if temp_morph[1] == TEMP:
                                temp_morph[0], temp_morph[1] = genericMorpheme(token_string, temp_morph[0], temp_wpos, morphemetype)
                            
                            temp_morphemes.append(temp_morph)
                    
                    temp_words.append({TOKEN:token_string, POS:temp_wpos, MORPHEMES:temp_morphemes, WGLOSS:temp_wgloss})            
        
            orig_line = ' '.join(orig_line)
            temp_line = {TITLE:title, COMMENT:comment, SEGNUM:lineid, FT:temp_transl, ORIG_LINE:orig_line, WORDS:temp_words}
            lines.append(temp_line)
    
    # Print corpus statistics
    print("Parts of speech found in corpus:", pos_tags_in_corpus, end='\n\n')
    print("All Tokens:", total_tokens)
    print("Lexemes, ignoring punctuation and digits:", total_lexemes)
    # sanity check first line
    print('Initial Extraction Sanity Check:', lines[0][WORDS][:10])
    print('Extraction done...\n')
                                
    return lines

## Post Extraction Filtering 

Filtering has to happen after extraction because all the preprocessing, "cleaning", and adding generic tags and morphemes has to be done first. 

### Morpheme Filters

To filter out words or lines based on morpheme annotations

In [144]:
def glossed(word_by_morphemes):
    '''checks gloss of every morpheme in a word
        if missing glosses, flags word as unglossed;
        assumes segmentation is complete'''
    for segment in word_by_morphemes:
        if segment[2] == TEMP:
            return False
    return True
    
def surfSegmented(word_by_morphemes):
    '''flags words that have not been segmented 
    (i.e. no <morphemes> tag in XML, or no morph)
    should not flag monomorphemic words'''
    if len(word_by_morphemes) == 1 and word_by_morphemes[0][0] == TEMP:
        return False
    return True

def canonSegmented(word_by_morphemes):
    '''no words that have not been canonically segmented'''
    if len(word_by_morphemes) == 1 and word_by_morphemes[0][1] == TEMP:
        return False
    return True

### Word filters

Filter words or lines based on word annotations

In [145]:
def properNoun(word_postag):
    if word_postag == PROPER_NOUN:
        return True
    return False

def punct(word_postag):
    if word_postag == PUNCT:
        return True
    return False
                           
def multiword(lexical_item):
    '''flag if original text of token has spaces'''
    if ' ' in lexical_item or '~' in lexical_item or '-' in lexical_item:
        return True
    return False

def selected_pos(word_postag):
    '''filter a list of specified word level POS'''    
    # check word level POS tag
    if word_postag not in SELECT_POS_TAGS:
        return True
    return False

### Line Filters

Filter lines or texts based on line/sentence/phrase/clause annotations, e.g. free translation

In [146]:
def hasFreeTrans(ft_line):
    '''Checks if has (English) free translation.
    TODO: Check by language'''
    if ft_line:
        return True
    return False

### COMBINE FILTERING FUNCTIONS HERE AS NEEDED

In [147]:
def filtering(extractedtexts, task, bysentence, useoriginalline):
    '''Write custom filter functions above, un/comment calls here
        as needed for your purposes to create gold standard dataset.'''
    
    gold_standard = []
    unlabeled = []
    punctuation = 0
    digits = 0
    tokens = 0
    # End of sentence marker
    EOS = {TOKEN:'EOS', POS:'@EOS@', MORPHEMES:['@EOS@', '@EOS@', '@EOS@', '@EOS@', '@EOS@'], WGLOSS: '@EOS@'}
    
    for line in extractedtexts:
        linewords = [] 
        linestatus = [] # Check for unannotated items
        temp_gold_standard = []
        temp_unlabeled = [] 
        
        for word in line[WORDS]:
            tokens+=1
            good4training = True
            
            ## Find missing morpheme glosses
            if (task == '_gls' or task == '_surSegGls' or task == '_canSegGls') and not glossed(word[MORPHEMES]): good4training = False
            
            ## Find missing canonical (underlying) segments
            if (task == '_canSeg' or task == '_canSegGls') and not canonSegmented(word[MORPHEMES]): good4training = False
            
            ## Find missing surface segments
            if (task == '_surSeg' or task == '_surSegGls') and not surfSegmented(word[MORPHEMES]):  good4training = False
            
            ## Find missing word POS tags
            if (task == '_pos' or task == '_infl') and word[POS] == TEMP: good4training = False
            
            ## Find missing word gloss
            if task == '_wrdgls' and word[WGLOSS] == TEMP: good4training = False
            
            ### Uncomment lines below to find missing annoations or undesirable tokens ####
            ## Filter out MWE
            #if multiword(word['token']): good4training = False
            
            ## Filter out unknown, unselected, or unspecified POS
            #if not selected_pos(word[POS]): good4training = False
            
            ## Filter out digits
            #if word[POS] == DIGIT: 
                #good4training = False
                #digits+=1
            
            ## Filter out punctuation
            #if isPunct(word[POS]): 
                #good4training = False
                #punctuation+=1
            
            ### Split filtered and unfiltered words
            if not bysentence:
                if good4training:
                    temp_gold_standard.append(word)
                else:
                    temp_unlabeled.append(word)
                    
            # Create new sentences from unfiltered word/pos
            else:
                linewords.extend(word)
                linestatus.append(good4training)
                
        ### Combine datasets
        # Filter and split sentences
        if bysentence:
            ## Add end of sentence marker: to recreate sentences after training NLP model by word tokens
            #linewords.append(EOS)
            if all(linestatus):
                if useoriginalline: 
                    gold_standard.append(line)
                else: 
                    gold_standard.append(linewords)
            else:
                if useoriginalline: 
                    unlabeled.append(line)
                else: 
                    unlabeled.append(linewords)

        # Filter words
        else:
            gold_standard.extend(temp_gold_standard)
            unlabeled.extend(temp_unlabeled)
    
    print('Post filtering statistics:')
    print("\tTotal words after filtering:", tokens)
    print("\tTotal training examples, after filtering for", task, len(gold_standard))
    print("\tTotal punctuation and digits:", punctuation+digits, end='\n\n')
    
    return gold_standard, unlabeled

## Write to Files

### Final Checks, Formatting,  and Write to Text Files

Doublecheck there's a gloss for every morph(eme) and same number POS tags / word glosses as tokens.

In [148]:
def aligned(a, b):
    if len(a) != len(b):
        raise ValueError("must be same number of morph(emes) and gloss in a word")
        
# Prepare sentence level alignments for text files
def poslines(listofwords, listofPOStags):
    "Arranges POS by sentences for training"
    stringtags = '%%'.join(listofPOStags)
    listofwords = [''.join(word.split()) for word in listofwords]
    stringwords = '%%'.join(listofwords)
    bysenttags = stringtags.split('@EOS@')
    bysentwords = stringwords.split('EOS')
    return [' '.join(sent.split('%%')).strip() for sent in bysentwords], [' '.join(sent.split('%%')).strip() for sent in bysenttags]

# Prepare sentence level alignments for text files
def wrdglsLines(listofwords, listofwordglosses):
    "Arranges word glosses by sentences for training"
    stringtags = '%%'.join(listofwordglosses)
    listofwords = [''.join(word.split()) for word in listofwords]
    stringwords = '%%'.join(listofwords)
    bysenttags = stringtags.split('@EOS@')
    bysentwords = stringwords.split('EOS')
    return [' '.join(sent.split('%%')).strip() for sent in bysentwords], [' '.join(sent.split('%%')).strip() for sent in bysenttags]

In [149]:
def dataFiles(extracted_words, training_task, outfilepath):
    '''Writes two text files: X and y (tokens and annotations; input and output)'''
    
    input_data = []
    output_data = []
    
    for word in extracted_words: 
        if training_task != '_gls':
            # input string (X)
            input_data.append(' '.join(word[TOKEN])) # insert space between chars
        
        # output types (y)
        wPOS_tag = word[POS]
        word_gloss = word[WGLOSS]
        inflection_gloss = [word[POS]]
        canonical_morphemes = []
        surface_morphemes = []
        glosses = []
        for morpheme in word[MORPHEMES]:
            surface_morphemes.append(morpheme[0])
            canonical_morphemes.append(morpheme[1])
            glosses.append(morpheme[2])
            if morpheme[-1] != STEM:
                inflection_gloss.append(morpheme[2])

        # determines what will be written to output file
        #TODO: _canSegGls & _canSeg must handle null morphemes for non-neural models (CRF)
        if training_task == '_pos':
            output_data.append(wPOS_tag)
        elif training_task == '_wrdgls':
            output_data.append(wordgloss)
        elif training_task == '_gls':
            input_data.append(' '.join(surface_morphemes)) # input string (X)
            output_data.append(' '.join(glosses)) 
        elif training_task == '_canSeg':
            output_data.append(' '.join(canonical_morphemes))
        elif training_task == '_surSeg':
            output_data.append(' '.join(surface_morphemes))
        elif training_task == '_surSegGls':
            aligned(surface_morphemes, glosses)
            combined_seg_gls = [morph+'#'+glosses[i] for i,morph in enumerate(surface_morphemes)]
            output_data.append(' '.join(combined_seg_gls))
        elif training_task == '_canSeg':
            output_data.append(' '.join(canonical_morphemes))
        elif training_task == '_canSegGls':
            aligned(canonical_morphemes, glosses)
            combined_seg_gls = [morpheme+'#'+glosses[i] for i,morpheme in enumerate(canonical_morphemes)]
            output_data.append(' '.join(combined_seg_gls))
        elif training_task == '_infl':
            output_data.append(' '.join(inflection_gloss))
        else:
            print("Output format not found.")
    
    # prepare for sentence level POS tagging and word glossing
    if training_task == '_pos':
        input_data, output_data = poslines(input_data, output_data)
    if training_task == '_wrdgls':
        input_data, output_data = wrdglsLines(input_data, output_data)
    
    # write master data files
    with open(outfilepath+training_task+'.input', 'w', encoding='utf8') as I:
        I.write('\n'.join(input_data)[:-1])
    with open(outfilepath+training_task+'.output', 'w', encoding='utf8') as O:
        O.write('\n'.join(output_data)[:-1])

### Dump to Json File

In [150]:
def flextext2Json(pathname, gold_standard):
    with open(pathname+'.json', 'w') as write_file:
        json.dump(gold_standard, write_file)

## Main code 

In [151]:
####### EXTRACT from flextext#######
def main(tostorepath, dbfile, task, bysentence, useoriginalline, json):
    
    master_data = extract_flextext(dbfile)

    # NOTE: FIRST EDIT filtering() function to suit your task!
    gold_standard, unannotated = filtering(master_data, task, bysentence, useoriginalline)
    print("\nSanity check training examples:\n", gold_standard[-5:])
    print("\nSanity check unannotated data:\n", unannotated[-5:])
    
    #### Write files ####
    ### Text files, word per line
    if not json:
        ## write all extracted tokens to _M file 
        dataFiles(gold_standard+unannotated, task, tostorepath+'_Master')
        ## write filtered out tokens to _U(nannotated) file
        if unannotated:
            dataFiles(unannotated, task, tostorepath+'_U')
        ## write remaining tokens to T(raining)/L(abeled) file
        dataFiles(gold_standard, task, tostorepath+'_L')
    
    ### Filtered data only to JSON file
    if json:
        flextext2Json(tostorepath+task, gold_standard)

# Sample Run 

In [152]:
#### NOTE: EDIT filtering() function above to suit your purposes!!! ####

#### FOR POS FILTERING:
### Current possible POS tags to select
## lezgi pos tags: {'ordnum', 'Vnf', 'num', 'indfpro', 'nprop', 'emph', 'Vocpart', 'proform', 'multipnum', 'prep', 'adv', 'post', 'ptcp', 'pers', 'verbprt', 'coordconn', 'adj', 'v', 'conn', 'poss', 'pro', 'prt', 'det', 'dem', 'interj', 'msd', 'subordconn', 'Vf', 'cardnum', 'n', 'interrog', 'recp'}
## Alas pos tags: {'num', 'n', 'refl', 'Aux', 'vt', 'cop', 'clf', 'adv', 'prt', 'Adj', 'cardnum', 'vi', 'stc', 'existmrkr', 'quant', 'relpro', 'ordnum', 'vd', 'distrnum', 'adj', 'Prep', 'nprop', 'interj', 'Conj', 'dem', 'v', 'pro'}
## Upper Tanana Pos tags: {'dem', 'advlizer', 'nvp', 'nprop', 'inter', 'proform', 'imp', 'coordconn', 'v', '@@@', 'nomprt', 'verbprt', 'adv', 'adj', 'NUM', 'n', 'interj', 'pro', 'onom', 'PUNCT', 'cardnum', 'quant', 'mod', 'DM', 'dir', 'post'}
## Bonggi POS tags: {'V.ACH.ABIL', 'V.ACL', 'CLF', 'V.ST1', 'INT.EXP.ST', 'Coordconj', 'PUNCT', 'INTNS', 'REFL.PRO', 'V.ISA.MRK.UV', 'V.ST.PER', 'PP', 'Adv.loc', 'V.CAUS.UV', 'N.CHAR.NMLZ', 'Prep', 'ADV.MAN', 'V.ACY.SE', 'N.Temp', 'QW', 'V.ISA', 'V.ST.POSS', 'MO.ACY.PATH', 'ADJ', 'V', 'V.ACH', 'N.ABSTR', 'INDEF.PRO', 'Rel', 'V.ST.DES', 'Mult.Num', 'V.ISA.UV', 'V.CAUS', 'PersName', '@@@', 'V.ST.POS', 'MOD', 'N.PROD', 'Interj', 'V.ABL.MOD', 'V.INSTRV', 'TNS', 'EX.ST', 'N.peg--an', 'ST.ATTR', 'AUX.UV', 'PERS.PRO', 'DEM.PRO', 'Det.PN', 'Subordconj', 'PEG-tsc.AV<N', 'Adv.temp', 'NP', 'V.ACY.C2', 'Adv.epis', 'NEG', 'Card.Num', 'MO.ACY.M', 'V.PET', 'N.Voc', 'V.ADVRS.ACH', 'Det', 'V.PLAC.ACY', 'ASP', 'V.DES.MOD', 'Adv.pace', 'N.p-.-an', 'Conn', 'CLS.MENS', 'PEG-tsc.UV<V', 'V.CAUS.AV', 'V.ST.POSS.denom', 'N.Prop', 'QUAN', 'V.ISA.AV', 'V.ST.EMO', 'Adv.freq', 'CLS.SORT', 'V.ST', 'NEG.PRO', 'N.DIMIN.redup', 'ADNOM.DEM', 'COND.ST', 'PRT', 'V.ST.COG', 'V.ACD.MOD', 'PEG-tsc.AV<V', 'V.ST2', 'peg-verb.temp.sub.cl', 'V.ACY.M', 'N', 'Ord.Num', 'V.ACY', 'Adv'}
#SELECT_POS_TAGS = []

#### TASKS: 
### All possible tasks: ['_canSeg', '_surSeg', '_gls', '_canSegGls', '_surSegGls', '_pos', '_wrdgls', '_infl']
### gls = glossing only, seg = segmentation only,
### can = canonical (underlying) morphemes, surf = surface morphs
### SegGls = segmentation+glossing, pos = (word) POS tagging
### infl = (re)inflection, affix glosses for words and POS tags
DESIRED_TASKS = ['_surSeg'] 

#### LANGUAGES:
LANGS = ['bdg','btz','cho','lez','ntu','tau'] # ['bdg','btz','cho','lez','ntu','tau'] 

####### RUN CODE: 
for lang in LANGS:
    print('\n\nLANGUAGE:', lang)
    for task in DESIRED_TASKS:
        store = r"../Research/MorphologyLRL/LLMinferring/"+lang
        to_extract = r'./flextexts/'+lang+'-all_txts.flextext'
        main(store, to_extract, task, bysentence=True, useoriginalline=True, json=True)



LANGUAGE: bdg
Parts of speech found in corpus: {'PEG-tsc.AV.<.V', 'ASP', 'CLF', 'ST.ATTR', 'V.ADVRS.ACH', 'N.ABSTR', 'Prep', 'PEG-tsc.AV.<.N', 'V.PLAC.ACY', 'Adv.loc', 'N.Temp', 'AUX.UV', 'V.ISA.UV', 'REFL.PRO', 'PRT', 'Ord.Num', 'ADNOM.DEM', 'N.p-.-an', 'Adv', 'INDEF.PRO', 'V.ST.PER', 'N.Voc', 'NEG.PRO', 'Subordconj', 'V.ST', 'NEG', 'N', 'V.PET', 'NP', 'CLS.MENS', '@UNK@', 'ADJ', 'peg-verb.temp.sub.cl', 'V.ST1', 'V.ACL', 'V.ISA', 'Conn', 'INTNS', 'N.peg-.-an', 'QUAN', 'PP', 'EX.ST', 'CLS.SORT', 'Det.PN', 'N.Prop', 'Coordconj', 'ADV.MAN', 'PUNCT', 'QW', 'PERS.PRO', 'TNS', 'V.CAUS.UV', 'V.DES.MOD', 'V.ACH', 'PersName', 'MOD', 'Card.Num', 'Rel', 'Adv.temp', 'Det', 'N.DIMIN.redup', 'V.ST.EMO', 'V.CAUS.AV', 'V.CAUS', 'V.ST2', 'V.ST.POSS.denom', 'N.CHAR.NMLZ', 'PEG-tsc.UV.<.V', 'V.ST.COG', 'N.PROD', 'V.ST.DES', 'Adv.pace', 'V.ACY.C2', 'V.ISA.AV', 'MO.ACY.PATH', 'V.ISA.MRK.UV', 'V.ACY.M', 'V.ACY', 'Adv.epis', 'V.ACD.MOD', 'V.ST.POS', 'V.INSTRV', 'INT.EXP.ST', 'V.ST.POSS', 'V.ACH.ABIL', 'Ad



LANGUAGE: btz
Parts of speech found in corpus: {'pro', 'vi', 'relpro', 'Conj', 'v', 'PUNCT', 'prt', 'Prep', 'Aux', 'existmrkr', 'adv', 'refl', 'distrnum', 'cardnum', 'Adj', 'ordnum', 'adj', 'nprop', 'interj', 'vd', 'dem', 'stc', 'quant', 'clf', 'vt', 'n', 'cop'}

All Tokens: 4285
Lexemes, ignoring punctuation and digits: 3839
Initial Extraction Sanity Check: [{'token': 'alkisah', 'wPOS': 'n', 'morphemes': [['alkisah', '@UNK@', 'The.story.is.told', 'n', 'stem']], 'word_gloss': 'The story is told'}, {'token': ',', 'wPOS': 'PUNCT', 'morphemes': [[',', '@UNK@', '@UNK@', 'PUNCT', 'punct']], 'word_gloss': 'punct'}, {'token': 'ni', 'wPOS': 'Prep', 'morphemes': [['ni', '@UNK@', 'in,.at.(space)', 'Prep', 'stem']], 'word_gloss': 'in, at (space)'}, {'token': 'sebuah', 'wPOS': 'distrnum', 'morphemes': [['sebuah', '@UNK@', 'one', 'distrnum', 'stem']], 'word_gloss': 'one'}, {'token': 'kute', 'wPOS': 'n', 'morphemes': [['kute', '@UNK@', 'city', 'n', 'stem']], 'word_gloss': 'city'}, {'token': '(', '



LANGUAGE: lez
Parts of speech found in corpus: {'pro', 'poss', 'Voc.part', 'v', 'PUNCT', 'IMPV', 'prt', 'prep', 'adv', 'post', 'emph', 'indfpro', 'subordconn', 'proform', '@UNK@', 'verbprt', 'interrog', 'num', 'cardnum', 'ordnum', 'adj', 'multipnum', 'pers', 'nprop', 'interj', 'det', 'dem', 'conn', 'coordconn', 'recp', 'n', 'cop'}

All Tokens: 18750
Lexemes, ignoring punctuation and digits: 13953
Initial Extraction Sanity Check: [{'token': 'са', 'wPOS': 'cardnum', 'morphemes': [['са', '@UNK@', 'one', 'cardnum', 'stem']], 'word_gloss': 'one'}, {'token': 'юкъуз', 'wPOS': 'n', 'morphemes': [['юкъ', '@UNK@', '@UNK@', '@UNK@', 'stem'], ['-ди', '@UNK@', 'OBL', 'n:Oblique-erg', 'suffix'], ['-з', '@UNK@', 'DAT', 'n:SemCase', 'suffix']], 'word_gloss': 'day'}, {'token': 'зун', 'wPOS': 'pers', 'morphemes': [['зун', '@UNK@', '@UNK@', '@UNK@', 'stem']], 'word_gloss': '1sg.abs'}, {'token': 'хуьряй', 'wPOS': 'n', 'morphemes': [['хуьр', '@UNK@', 'village', '<Not.Sure>', 'stem'], ['-да', '@UNK@', 'IN



LANGUAGE: ntu
Parts of speech found in corpus: {'INTJ', 'V.(caus)', 'pro', 'be.V.', 'Pers.pro', 'Nom.phrase', 'Ord', 'CONJ', 'PCLF', 'PUNCT', 'VP', 'PN', 'NEG', 'N', 'z-Nom', 'Particle', 'N.comp', 'V.(comp)', 'NP', 'N(kx.cl)', 'vi.', 'GEN', 'vt.', '@UNK@', 'interrog', 'num', 'Phrase', 'V', 'Adj', 'Det', 'nprop', 'Nom1', 'PREP', 'RPRN', 'NP.(comp)', 'Clause', 'Adv', 'A-D-P2', 'DEM', 'NUM', 'V.neg', 'C-fix-Nom', 'Poss.pro', 'SUBR'}

All Tokens: 20054
Lexemes, ignoring punctuation and digits: 16690
Initial Extraction Sanity Check: [{'token': '29', 'wPOS': 'num', 'morphemes': [['29', '29', 'NUM', '@UNK@', 'stem']], 'word_gloss': '@UNK@'}, {'token': '.', 'wPOS': 'PUNCT', 'morphemes': [['.', '@UNK@', '@UNK@', 'PUNCT', 'punct']], 'word_gloss': 'punct'}, {'token': 'vex', 'wPOS': '@UNK@', 'morphemes': [['ve', '@UNK@', 'accompany', 'V', 'stem'], ['==ä', '@UNK@', '=1ᴍꞮɴI', 'Nom1', 'enclitic']], 'word_gloss': '@UNK@'}, {'token': 'dckta foks', 'wPOS': '@UNK@', 'morphemes': [['dckta.foks', '@UNK@'



LANGUAGE: tau
Parts of speech found in corpus: {'nvp', 'pro', 'mod', 'dir', 'v', 'PUNCT', 'onom', 'post', 'adv', 'proform', 'nomprt', '@UNK@', 'verbprt', 'num', 'inter', 'cardnum', 'adj', 'interj', 'nprop', 'advlizer', 'dem', 'quant', 'imp', 'coordconn', 'n', 'DM'}

All Tokens: 17587
Lexemes, ignoring punctuation and digits: 14099
Initial Extraction Sanity Check: [{'token': 'keey', 'wPOS': 'n', 'morphemes': [['keey', '@UNK@', 'village', 'n', 'stem']], 'word_gloss': 'village'}, {'token': 'tah', 'wPOS': 'post', 'morphemes': [['tah', '@UNK@', 'at:AR', 'post', 'stem']], 'word_gloss': 'at:AR'}, {'token': 'hihneeshyąą', 'wPOS': 'v', 'morphemes': [['h-', '@UNK@', '3PL.S.', 'v:Any', 'prefix'], ['nee-', '@UNK@', 'QUAL:DH.PFV:Ø.', 'Verb', 'prefix'], ['shyąą', '@UNK@', 'grow:PFV', 'v', 'stem']], 'word_gloss': '@UNK@'}, {'token': 'jah', 'wPOS': 'adv', 'morphemes': [['jah', '@UNK@', 'here', 'adv', 'stem']], 'word_gloss': 'here'}, {'token': 'dineh', 'wPOS': '@UNK@', 'morphemes': [['dineh', '@UNK@'