Original code by Zachary J. Ryan 
at CU Boulder
Spring 2020

In [2]:
#API for parsing XML docs
import xml.etree.ElementTree as ET
from itertools import chain
import sklearn
import csv



def _title_and_comment(text_elem):
    '''Return (title, comment) for one <interlinear-text> element.

    The English title wins when both an English and a non-English title exist;
    if neither exists the title is "NO TITLE FOUND". The comment is the text of
    the last English 'comment' item, or "No comment" when there is none.
    '''
    comment = "No comment"
    eng_title = '~~~'
    other_title = '~~~'
    for item in text_elem.iter('item'):
        kind = item.get('type')
        is_english = item.get('lang') == 'en'
        if kind == 'title':
            if is_english:
                eng_title = item.text
            else:
                other_title = item.text
        elif kind == 'comment' and is_english:
            comment = item.text

    if eng_title != '~~~':
        title = eng_title
    elif other_title != '~~~':
        title = other_title
    else:
        title = "NO TITLE FOUND"
    return title, comment


def _iter_phrases(text_elem):
    '''Yield every <phrase> found under paragraphs/paragraph/phrases of a text.'''
    for paragraphs in text_elem.iter('paragraphs'):
        for paragraph in paragraphs.iter('paragraph'):
            for phrases in paragraph.iter('phrases'):
                yield from phrases.iter('phrase')


def _parse_morph(morph):
    '''Return [citation form, gloss, morpheme-level pos] for one <morph> element.

    A field whose <item> is absent, or present without text, keeps the '~~~'
    placeholder. To extract more tiers, add a slot to `fields` and a matching
    branch below (and keep the punctuation entry in _parse_word the same length).
    '''
    fields = ['~~~', '~~~', '~~~']
    for item in morph.iter('item'):
        text = item.text
        if text is None:
            # Empty tier: leave the placeholder instead of storing None/crashing.
            continue
        kind = item.get('type')
        if kind == 'cf':
            fields[0] = text
        elif kind == 'gls':
            # Separate multi-word glosses with "." so "-" can join morphemes later.
            fields[1] = text.strip().replace(' ', '.').replace('-', '.')
        elif kind == 'msa':
            fields[2] = text.replace(' ', '').replace('pro-form', 'proform')
    return fields


def _parse_word(word):
    '''Return [word form, [[morph, gloss, msa], ...], word-level pos] for a <word>.'''
    first_item = word.find('item')

    if first_item is not None and first_item.get('type') == 'punct':
        mark = first_item.text
        # Punctuation gets a single pseudo-morpheme shaped like a real one.
        return [mark, [[str(mark), str(mark), 'punct']], 'punct']

    form = '~~~'
    if first_item is not None and first_item.text is not None:
        form = first_item.text.replace(' ', '~')

    morphemes = [_parse_morph(morph) for morph in word.iter('morph')]

    word_pos = '~~~'
    for item in word.iter('item'):
        if item.get('type') == 'pos' and item.text is not None:
            word_pos = item.text.replace('pro-form', 'proform')

    return [form, morphemes, word_pos]


def XMLtoArray(filename, stems=False):
    '''Parse a FLExText .xml export (one or more texts) into a list of phrase rows.

    Parameters:
        filename: path (or file object) accepted by xml.etree.ElementTree.parse.
        stems: accepted for backward compatibility; currently not used.

    Returns a list with one entry per phrase:
        [title, segnum, words, translation, comment]
    where `words` is a list of
        [word form, [[morpheme, gloss, morpheme pos], ...], word pos]
    and '~~~' marks any value that was not found in the file.

    Notes:
        * segnum is the text of the phrase's first direct <item>.
        * translation is the last English 'gls' item that is a direct child of
          the phrase; word- and morpheme-level glosses are never used for it.
        * A phrase without any <words> container produces no row.
    '''
    datalists = []

    # open XML doc using xml parser
    root = ET.parse(filename).getroot()
    for lin in root.iter('interlinear-text'):
        title, comment = _title_and_comment(lin)

        for phrase in _iter_phrases(lin):
            word_groups = list(phrase.iter('words'))
            if not word_groups:
                continue

            segnum_item = phrase.find('item')
            segnum = segnum_item.text if segnum_item is not None else '~~~'

            # Collect the words of every <words> container into ONE row so the
            # phrase is emitted exactly once.
            words = []
            for group in word_groups:
                for word in group.iter('word'):
                    words.append(_parse_word(word))

            # Only look at the phrase's own items: iterating descendants would
            # let a morpheme gloss masquerade as the free translation.
            translation = '~~~'
            for item in phrase.findall('item'):
                if item.get('type') == 'gls' and item.get('lang') == 'en':
                    translation = item.text

            datalists.append([title, segnum, words, translation, comment])

    return datalists


def arrayToCSV(xml, langID, fileName):
    '''Write parsed phrase rows to a comma-delimited CSV file, one line per phrase.

    Parameters:
        xml: list of rows shaped like the output of XMLtoArray, i.e.
             [title, segnum, words, translation, comment] with each word being
             [word form, [[morpheme, gloss, morpheme pos], ...], word pos].
        langID: language identifier written in the second column of every line.
        fileName: path of the CSV file to create (overwritten if it exists).

    Columns written, in order:
        segnum, langID, phrase, morphemes, glosses, translation, "ML ID",
        comment, title, morpheme pos, word pos.
    Words are followed by a space in the phrase / word-pos columns and by a tab
    in the morpheme, gloss and morpheme-pos columns. Inside one word, morphemes
    are joined with a space, and glosses / morpheme pos with "-".
    '''
    # The transcriptions contain many non-ASCII characters, so the encoding is
    # fixed to UTF-8 instead of depending on the platform default.
    with open(fileName, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for row in xml:
            words = row[2]

            phrase_col = ''.join(str(word[0]) + ' ' for word in words)
            word_pos_col = ''.join(str(word[2]) + ' ' for word in words)

            morph_col = ''.join(
                ' '.join(str(morph[0]) for morph in word[1]) + '\t'
                for word in words)
            gloss_col = ''.join(
                '-'.join(str(morph[1]) for morph in word[1]) + '\t'
                for word in words)
            msa_col = ''.join(
                '-'.join(str(morph[2]) for morph in word[1]) + '\t'
                for word in words)

            writer.writerow([
                row[1],        # segnum
                langID,
                phrase_col,
                morph_col,
                gloss_col,
                row[3],        # English translation
                "ML ID",
                row[4],        # comment found in the file
                row[0],        # title / Text_ID
                msa_col,       # morpheme-level POS
                word_pos_col,  # word-level POS
            ])

In [3]:
# Directory prefix for the '<lang>-all_txts.flextext' input files.
SOURCE_DIR = r'../'
# Alternative, longer language list (currently disabled):
#lgs = ['btz', 'lez', 'lez', 'ntu', 'mni']
lgs = ['mni']
for lg in lgs:
    # Input is read from SOURCE_DIR; the CSV is written to the working directory.
    source_file = '{}{}-all_txts.flextext'.format(SOURCE_DIR, lg)
    outputfile = '{}-CLDF.csv'.format(lg)
    datalists = XMLtoArray(source_file, stems=True)
    # Print the first parsed phrase as a quick visual check of the structure.
    print(datalists[0])
    arrayToCSV(datalists, lg, outputfile)
    

['The Right Attitude, the Right Form', '1', [['khúdə́m', [['khút', 'hand', 'n'], ['lə́m', 'path', 'n']], 'n'], ['oynə', [['oy', 'be', '<NotSure>'], ['-nə', '.ADV', 'Attachestoanycategory']], '~~~'], [',', [[',', ',', 'punct']], 'punct'], ['yam', [['yam', 'much', '<NotSure>']], '~~~'], ['waŋnə', [['waŋ', 'high', '<NotSure>'], ['-nə', '.ADV', 'Attachestoanycategory']], '~~~'], ['haygətnə', [['hay', 'sway', 'v'], ['-khət', '.UP', 'v>v'], ['-nə', '.ADV', 'Attachestoanycategory']], 'v'], ['kəyno', [['kəri', 'what', '<NotSure>'], ['=no', '=INQ', '<NotSure>']], 'adv'], ['təwbidrə́gəsú', [['təw', 'do', '<NotSure>'], ['-pi', '.REC', 'Attachestoanycategory'], ['-tə', '.NEG', '<NotSure>'], ['-lə́gə', '.AFTER', 'Attachestoanycategory'], ['-čhú', '.ALSO', 'Attachestoanycategory']], '~~~'], [',', [[',', ',', 'punct']], 'punct'], ['tə́rahumdə', [['tə́ra', 'ten', '<NotSure>'], ['hum', 'three', '<NotSure>'], ['=tə', '==LOC', '<NotSure>']], '~~~'], ['oybəsú', [['oy', 'be', '<NotSure>'], ['-pə', '.NOM', 