`open()` is called here with two arguments. The first argument is a string containing the filename.
The second argument (the *mode*) is another string containing a few characters
describing the way in which the file will be used.
The mode can be `'r'` when the file will only be read.
See https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files



In [1]:
import pandas

In [2]:
# Read the whole HTML export into memory as a single string.
# The encoding is passed explicitly: without it open() decodes with a
# platform-dependent default, and this text is non-ASCII (Cyrillic plus
# characters such as 'ʔ' and 'æ' that legacy 8-bit code pages cannot hold).
with open('Nenets_interlinear_gloss.htm', 'r', encoding='utf-8') as file:
    file_guts = file.read()

In [3]:
type(file_guts)

str

In [4]:
# Cut the document at every <span class="ref"> opening tag. Element 0 of the split is
# whatever precedes the first tag, so the [1:] slice drops it.
separated_by_ref = file_guts.split("<span class=\"ref\">")[1:]

In [5]:
separated_by_ref[0]

'tesjada_nisjami 001</span>\n\n<span class="st">Сэвни чахад мадорота яха вархана илеваʔ.</span>\n\n<span class="ts">Sæwnʼi tʼaxad madorota jaxa warxana jilʼewaʔ </span>\n\n<table><tr>\n<td colspan = 2><span class="tx">Sæwnʼi</span></td><td colspan = 2><span class="tx">tʼaxad</span></td><td colspan = 1><span class="tx">madorota</span></td><td colspan = 1><span class="tx">jaxa</span></td><td colspan = 2><span class="tx">warxana</span></td><td colspan = 2><span class="tx">jilʼewaʔ.</span></td></tr>\n<tr><td><span class="mb">sæw</span></td><td><span class="mb">-nʼi</span></td><td><span class="mb">tʼa</span></td><td><span class="mb">-xad</span></td><td><span class="mb">madorota</span></td><td><span class="mb">jaxa</span></td><td><span class="mb">war</span></td><td><span class="mb">-xana</span></td><td><span class="mb">jilʼe</span></td><td><span class="mb">-waʔ</span></td></tr>\n\n<tr><td><span class="mp">sæw</span></td><td><span class="mp">-nʼi</span></td><td><span class="mp">tʼaʔ</span></t

In [6]:
# Trial run on the first section only: take the text after the <span class="st">
# opening tag and cut it at the following </span>.
cyrillic_transliteration = separated_by_ref[0].split('<span class=\"st\">')[1].split('</span>')[0]

In [7]:
cyrillic_transliteration

'Сэвни чахад мадорота яха вархана илеваʔ.'

In [8]:
def parseout_span_contents(class_id, sections):
    """Pull the text of one ``<span>`` out of every section.

    For each string in ``sections`` the result is the text that follows the
    first ``<span class="{class_id}">`` opening tag, cut at the next
    ``</span>`` (or at the next occurrence of the same opening tag, whichever
    comes first).

    Args:
        class_id: value of the span's ``class`` attribute to look for.
        sections: iterable of HTML fragments (strings).

    Returns:
        A list with one extracted string per section, in input order.

    Raises:
        IndexError: if a section does not contain the opening tag.
    """
    opening_tag = f'<span class=\"{class_id}\">'
    return [
        chunk.split(opening_tag)[1].split('</span>')[0]
        for chunk in sections
    ]

In [9]:
# One extraction pass per span class; the class ids are listed in the same order
# as the variables they fill ('st' -> cyr_translit, 'ts' -> lat_translit, ...).
cyr_translit, lat_translit, eng_translat, rus_translat, ger_translat = (
    parseout_span_contents(span_class, separated_by_ref)
    for span_class in ('st', 'ts', 'ft', 'fr', 'fg')
)

In [10]:
# Alternative that also works: let pandas parse every <table> in the file in one call.
#all_tables = pandas.read_html(file_guts)

In [11]:
from io import StringIO

def parseout_tables(sections):
    """Parse the first ``<table>`` of every section into a DataFrame.

    Args:
        sections: iterable of HTML fragments (strings), each containing a
            ``<table>...</table>`` element.

    Returns:
        A list with one ``pandas.DataFrame`` per section, in input order.

    Raises:
        IndexError: if a section has no ``<table>`` opening tag.
    """
    frames = []
    for chunk in sections:
        # Isolate the markup between <table> and the first </table> after it ...
        after_open = chunk.split('<table>')[1]
        inner_html = after_open.split('</table>')[0]
        # ... then re-wrap it so read_html sees exactly one complete table.
        markup = f'<table>{inner_html}</table>'
        frames.append(pandas.read_html(StringIO(markup))[0])
    return frames

In [12]:
all_tables = parseout_tables(separated_by_ref)

In [13]:
all_tables[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Sæwnʼi,Sæwnʼi,tʼaxad,tʼaxad,madorota,jaxa,warxana,warxana,jilʼewaʔ.,jilʼewaʔ.
1,sæw,-nʼi,tʼa,-xad,madorota,jaxa,war,-xana,jilʼe,-waʔ
2,sæw,-nʼi,tʼaʔ,-xVd,madorota,jaxa,war,-xVna,jilʼe,-waʔ
3,n,-n.case-poss,pp,-n.case,adj,n,n,-n.case,v,-v.pn
4,глаз,-GEN.1SG.PL,туда,-ABL,излучинами,река,берег,-LOC,жить,-1PL
5,eye,-GEN.1SG.PL,there,-ABL,winding,river,coast,-LOC,live,-1PL
6,Auge,-GEN.1SG.PL,dorthin,-ABL,kurvig,Fluss,Ufer,-LOC,leben,-1PL


In [14]:
# .loc takes [rows, columns]: the ":" before the comma selects every row and the 0
# after it selects the column labelled 0. .loc is label-based, not positional; this
# table's columns carry the integer labels 0..9, so label 0 is also the first column.
all_tables[0].loc[:,0]

0    Sæwnʼi
1       sæw
2       sæw
3         n
4      глаз
5       eye
6      Auge
Name: 0, dtype: object

In [15]:
all_tables[0].shape[1]

10

In [16]:
# List of lists of dictionaries: the outer list has one entry per table, the
# inner list one dictionary per table column. Rows 0, 3, 4, 5 and 6 of the
# table supply the five fields of each dictionary.
listOfSentencesOfStory = [
    [
        {
            "phonemic": table.loc[0, column],
            "pos_tag": table.loc[3, column],
            "Russian": table.loc[4, column],
            "English": table.loc[5, column],
            "German": table.loc[6, column],
        }
        for column in range(table.shape[1])
    ]
    for table in all_tables
]

In [17]:
listOfSentencesOfStory[0]

[{'phonemic': 'Sæwnʼi',
  'pos_tag': 'n',
  'Russian': 'глаз',
  'English': 'eye',
  'German': 'Auge'},
 {'phonemic': 'Sæwnʼi',
  'pos_tag': '-n.case-poss',
  'Russian': '-GEN.1SG.PL',
  'English': '-GEN.1SG.PL',
  'German': '-GEN.1SG.PL'},
 {'phonemic': 'tʼaxad',
  'pos_tag': 'pp',
  'Russian': 'туда',
  'English': 'there',
  'German': 'dorthin'},
 {'phonemic': 'tʼaxad',
  'pos_tag': '-n.case',
  'Russian': '-ABL',
  'English': '-ABL',
  'German': '-ABL'},
 {'phonemic': 'madorota',
  'pos_tag': 'adj',
  'Russian': 'излучинами',
  'English': 'winding',
  'German': 'kurvig'},
 {'phonemic': 'jaxa',
  'pos_tag': 'n',
  'Russian': 'река',
  'English': 'river',
  'German': 'Fluss'},
 {'phonemic': 'warxana',
  'pos_tag': 'n',
  'Russian': 'берег',
  'English': 'coast',
  'German': 'Ufer'},
 {'phonemic': 'warxana',
  'pos_tag': '-n.case',
  'Russian': '-LOC',
  'English': '-LOC',
  'German': '-LOC'},
 {'phonemic': 'jilʼewaʔ.',
  'pos_tag': 'v',
  'Russian': 'жить',
  'English': 'live',
  'Ger

In [18]:
#Attempt 1
# One dictionary per sentence: the sentence's standalone transliterations and
# translations, plus the list of per-column word dictionaries from its table.
listOfSentencesOfStory = []
for j, _ in enumerate(separated_by_ref):  # j is the sentence index
    table = all_tables[j]

    # First: one dictionary per column of sentence j's table
    # (rows 0, 3, 4, 5 and 6 supply the five fields).
    words = [
        {
            "phonemic": table.loc[0, i],
            "pos_tag": table.loc[3, i],
            "Russian": table.loc[4, i],
            "English": table.loc[5, i],
            "German": table.loc[6, i],
        }
        for i in range(table.shape[1])
    ]

    # Second: wrap that list together with the sentence-level strings.
    listOfSentencesOfStory.append({
        "cyrillic_transliteration": cyr_translit[j],
        "latin_transliteration": lat_translit[j],
        "Russian_translation": rus_translat[j],
        "English_translation": eng_translat[j],
        "German_translation": ger_translat[j],
        "wordInfo_dictionary": words,
    })

In [19]:
eng_translat[0].find("river")

44

In [20]:
# Draft search query: five sentence-level fields plus a nested dictionary of
# word-level fields. None marks a field that is not being searched on.
# The word-level values are bare strings here; the later version of this query
# wraps them in lists.
query = {
    'cyr_translit': None,
    'lat_translit': None,
    'rus_translat': None,
    'eng_translat': "river",
    'ger_translat': None,
    'wordInfo_dictionary': {
        'phonemic': "jaxa",
        'pos_tag': None,
        'Russian': None,
        'English': "-1SG",
        'German': None
    }
}

In [21]:
# Bind `sentence` to the first entry of listOfSentencesOfStory for inspection:
# the loop is left on its first pass, so the loop variable keeps that first item.
for sentence in listOfSentencesOfStory:
    break

In [22]:
sentence["English_translation"].find("river")

44

In [23]:
sentence

{'cyrillic_transliteration': 'Сэвни чахад мадорота яха вархана илеваʔ.',
 'latin_transliteration': 'Sæwnʼi tʼaxad madorota jaxa warxana jilʼewaʔ ',
 'Russian_translation': 'Как я себя помню, мы жили на берегу извилистой реки.',
 'English_translation': 'As far as I remember, we lived by a winding river bank.',
 'German_translation': 'Soweit ich mich erinnere, wohnten wir am Ufer eines sich schlängelnden Flusses.',
 'wordInfo_dictionary': [{'phonemic': 'Sæwnʼi',
   'pos_tag': 'n',
   'Russian': 'глаз',
   'English': 'eye',
   'German': 'Auge'},
  {'phonemic': 'Sæwnʼi',
   'pos_tag': '-n.case-poss',
   'Russian': '-GEN.1SG.PL',
   'English': '-GEN.1SG.PL',
   'German': '-GEN.1SG.PL'},
  {'phonemic': 'tʼaxad',
   'pos_tag': 'pp',
   'Russian': 'туда',
   'English': 'there',
   'German': 'dorthin'},
  {'phonemic': 'tʼaxad',
   'pos_tag': '-n.case',
   'Russian': '-ABL',
   'English': '-ABL',
   'German': '-ABL'},
  {'phonemic': 'madorota',
   'pos_tag': 'adj',
   'Russian': 'излучинами',
  

In [24]:
# Search query in the shape doSearch reads: None leaves a field unconstrained,
# each sentence-level value is a substring to look for, and each word-level value
# is a list of terms to look for among the sentence's word dictionaries.
query = {
    'cyr_translit': None,
    'lat_translit': None,
    'rus_translat': None,
    'eng_translat': "river",
    'ger_translat': None,
    'wordInfo_dictionary': {
        'phonemic': ["jaxa"],
        'pos_tag': None,
        'Russian': None,
        'English': ["-1SG"],
        'German': None
    }
}

In [25]:
query['wordInfo_dictionary']

{'phonemic': ['jaxa'],
 'pos_tag': None,
 'Russian': None,
 'English': ['-1SG'],
 'German': None}

In [26]:
print("exists in sentence indexes: ", end='')
def doSearch(query):
    """Return the sentences of ``listOfSentencesOfStory`` that satisfy ``query``.

    Reads the notebook-level list ``listOfSentencesOfStory``; it does not
    modify it.

    Args:
        query: dict with any of the sentence-level keys ``cyr_translit``,
            ``lat_translit``, ``rus_translat``, ``eng_translat``,
            ``ger_translat`` (each a substring to look for) and an optional
            ``wordInfo_dictionary`` dict mapping a word-level key
            (``phonemic``, ``pos_tag``, ``Russian``, ``English``, ``German``)
            to a list of terms. A missing key, ``None`` or an empty value
            leaves that field unconstrained. A plain string given for a
            word-level field is split on whitespace into terms.

    Returns:
        A list of the matching sentence dictionaries, in corpus order. A
        sentence matches when every constrained sentence-level field contains
        its substring and every word-level term equals that field's value in
        at least one of the sentence's word dictionaries.
    """
    # (query key, sentence key) pairs for the substring checks.
    sentence_fields = (
        ('cyr_translit', 'cyrillic_transliteration'),
        ('lat_translit', 'latin_transliteration'),
        ('rus_translat', 'Russian_translation'),
        ('eng_translat', 'English_translation'),
        ('ger_translat', 'German_translation'),
    )
    word_query = query.get('wordInfo_dictionary') or {}

    def matches(sentence):
        # Sentence-level fields: plain substring containment.
        for query_key, sentence_key in sentence_fields:
            wanted = query.get(query_key)
            if wanted and wanted not in sentence[sentence_key]:
                return False

        # Word-level fields: every term must occur as a value of that field.
        for key, terms in word_query.items():
            if not terms:  # None, "" and [] all mean "no constraint"
                continue
            if isinstance(terms, str):
                # A bare string would otherwise be walked character by character.
                terms = terms.split()
            # Built once per field rather than once per term.
            present = {word[key] for word in sentence['wordInfo_dictionary']}
            if any(term not in present for term in terms):
                return False
        return True

    return [sentence for sentence in listOfSentencesOfStory if matches(sentence)]
doSearch(query)

exists in sentence indexes: 

[{'cyrillic_transliteration': 'Няхарʔ яля ңэсоңгана нерута яха нимня ңадимядм.',
  'latin_transliteration': 'Nʼaxarʔ jalʼa ŋæsoŋkana nʼeruta jaxa nʼimnʼa ŋadʼimʼadm.',
  'Russian_translation': 'Потом я дошёл до реки, поросшей ивняком.',
  'English_translation': 'After the third day I arrived at a river that was surrounded by willows.',
  'German_translation': 'Nach dem Verstreichen des dritten Tages gelangte zu einem von Weiden umwachsenen Fluss.',
  'wordInfo_dictionary': [{'phonemic': 'Nʼaxarʔ',
    'pos_tag': 'num',
    'Russian': 'три',
    'English': 'three',
    'German': 'drei'},
   {'phonemic': 'jalʼa',
    'pos_tag': 'n',
    'Russian': 'день',
    'English': 'day',
    'German': 'Tag'},
   {'phonemic': 'ŋæsoŋkana',
    'pos_tag': 'pp',
    'Russian': 'через',
    'English': 'after',
    'German': 'nach'},
   {'phonemic': 'nʼeruta',
    'pos_tag': 'adj',
    'Russian': 'поросший.тальником',
    'English': 'overgrown.with.willows',
    'German': 'mit.Weinen.bewachsen'},
   {'ph

In [27]:
def input_query():
    """Prompt for every search field and return the answers as a query dict.

    Ten prompts are shown, in order: the five sentence-level fields, then the
    five word-level fields. Pressing Enter on a prompt leaves that field
    unconstrained.

    Returns:
        dict with the keys ``cyr_translit``, ``lat_translit``,
        ``rus_translat``, ``eng_translat``, ``ger_translat`` (each a stripped
        string, or ``None`` when left blank) and ``wordInfo_dictionary``, a
        dict with the keys ``phonemic``, ``pos_tag``, ``Russian``, ``English``,
        ``German`` (each a list of whitespace-separated terms, or ``None``
        when left blank).
    """
    def ask_text(prompt):
        # Sentence-level field: a substring to search for, None if blank.
        answer = input(prompt).strip()
        return answer or None

    def ask_terms(prompt):
        # Word-level field: every such field is split into a list of terms
        # (previously only 'phonemic' was), None if blank.
        terms = input(prompt).split()
        return terms or None

    # Prompts name what is actually being asked for: the sentence-level
    # Russian/English/German fields are translations, and the word-level
    # prompts no longer repeat the sentence-level wording.
    query = {
        'cyr_translit': ask_text("Cyrillic transliteration:"),
        'lat_translit': ask_text("Latin transliteration:"),
        'rus_translat': ask_text("Russian translation:"),
        'eng_translat': ask_text("English translation:"),
        'ger_translat': ask_text("German translation:"),
        'wordInfo_dictionary': {
            'phonemic': ask_terms("Phonemic tagging (space-separated):"),
            'pos_tag': ask_terms("POS tagging (space-separated):"),
            'Russian': ask_terms("Russian tagging (space-separated):"),
            'English': ask_terms("English tagging (space-separated):"),
            'German': ask_terms("German tagging (space-separated):")
        }
    }
    return query

In [28]:
def main_screen():
    """Run one interactive search: prompt for a query, search, print the hits.

    Calls ``input_query()`` for the query and ``doSearch()`` for the results,
    then prints the number of hits followed by one block per matching
    sentence: its five sentence-level strings, then one line per word-level
    field with the values of all words joined by spaces. Returns ``None``.
    """
    print("Welcome to Nenets Corpus Search! Enter your query below:")
    query = input_query()
    search_results = doSearch(query)
    print(f"There are {len(search_results)} search results for your query!")

    # (printed label, sentence key); the German translation was previously
    # the only sentence-level field that was never printed.
    sentence_lines = (
        ("Cyrillic transliteration", 'cyrillic_transliteration'),
        ("Latin transliteration", 'latin_transliteration'),
        ("Russian translation", 'Russian_translation'),
        ("English translation", 'English_translation'),
        ("German translation", 'German_translation'),
    )
    # (printed label, word-dictionary key)
    word_lines = (
        ("Phonemic tagging", 'phonemic'),
        ("POS tagging", 'pos_tag'),
        ("Russian tagging", 'Russian'),
        ("English tagging", 'English'),
        ("German tagging", 'German'),
    )

    for search_result in search_results:
        for label, key in sentence_lines:
            print(f"{label}: {search_result[key]}")
        words = search_result['wordInfo_dictionary']
        for label, key in word_lines:
            # str() keeps join() from failing on a non-string cell value.
            print(f"{label}: {' '.join(str(word[key]) for word in words)}")
        print()

In [29]:
main_screen()

Welcome to Nenets Corpus Search! Enter your query below:



KeyboardInterrupt


KeyboardInterrupt



## Write the XML

In [37]:
from xml.etree.ElementTree import Element, ElementTree

In [59]:
import xml.etree.ElementTree

In [47]:
listOfSentencesOfStory[0]

{'cyrillic_transliteration': 'Сэвни чахад мадорота яха вархана илеваʔ.',
 'latin_transliteration': 'Sæwnʼi tʼaxad madorota jaxa warxana jilʼewaʔ ',
 'Russian_translation': 'Как я себя помню, мы жили на берегу извилистой реки.',
 'English_translation': 'As far as I remember, we lived by a winding river bank.',
 'German_translation': 'Soweit ich mich erinnere, wohnten wir am Ufer eines sich schlängelnden Flusses.',
 'wordInfo_dictionary': [{'phonemic': 'Sæwnʼi',
   'pos_tag': 'n',
   'Russian': 'глаз',
   'English': 'eye',
   'German': 'Auge'},
  {'phonemic': 'Sæwnʼi',
   'pos_tag': '-n.case-poss',
   'Russian': '-GEN.1SG.PL',
   'English': '-GEN.1SG.PL',
   'German': '-GEN.1SG.PL'},
  {'phonemic': 'tʼaxad',
   'pos_tag': 'pp',
   'Russian': 'туда',
   'English': 'there',
   'German': 'dorthin'},
  {'phonemic': 'tʼaxad',
   'pos_tag': '-n.case',
   'Russian': '-ABL',
   'English': '-ABL',
   'German': '-ABL'},
  {'phonemic': 'madorota',
   'pos_tag': 'adj',
   'Russian': 'излучинами',
  

In [93]:
root_element = Element("story")
element_tree = ElementTree(root_element)

In [94]:
# Serialise every sentence as a <sentence> child of the <story> root.
# The XML tag names are the dictionary keys themselves, emitted in this order.
sentence_tags = (
    "cyrillic_transliteration",
    "latin_transliteration",
    "Russian_translation",
    "English_translation",
    "German_translation",
)
word_tags = ("phonemic", "pos_tag", "Russian", "English", "German")

for sentence in listOfSentencesOfStory:
    sentence_element = Element("sentence")
    root_element.append(sentence_element)

    # Sentence-level strings become direct children of <sentence>.
    for tag in sentence_tags:
        field_element = Element(tag)
        field_element.text = sentence[tag]
        sentence_element.append(field_element)

    # wordInfo_dictionary is a list of dictionaries, one per word:
    # each becomes a <word> element with one child per word-level field.
    words_element = Element('wordInfo_dictionary')
    for dictionary in sentence["wordInfo_dictionary"]:
        word_element = Element("word")
        for tag in word_tags:
            form_element = Element(tag)
            form_element.text = dictionary[tag]
            word_element.append(form_element)
        words_element.append(word_element)
    sentence_element.append(words_element)

# Pretty-print once, after the tree is complete. This call used to sit inside
# the loop, which re-indented the whole tree after every sentence.
xml.etree.ElementTree.indent(root_element)

In [95]:
# xml_declaration=True makes write() emit the <?xml ...?> header, which it omits
# by default when the encoding is UTF-8; the header states the encoding of this
# non-ASCII file explicitly for whatever reads it next.
element_tree.write('nenets.xml', encoding='utf-8', xml_declaration=True)