In [None]:
import xml.etree.ElementTree as ET
import json
import iquito_dict as iqdict

As new LIFT exports are created, put them in the `flex_export` directory and update the `infile` value in the next cell.

As currently written, the output files are overwritten every time this notebook is run. Look for them in the `tex` directory.

In [None]:
# Input: the LIFT file exported from FLEx (lives in `flex_export/`).
infile = 'flex_export/flex_export.lift'

# Outputs: LaTeX sources written into `tex/` by the cells below.
# Diccionario escolar
outfile_de = 'tex/diccionario_escolar_iquito.tex'
# Academic dictionary (English)
outfile_acad = 'tex/dictionary_academic_iquito.tex'
# Academic dictionary (Spanish)
outfile_acad_es = 'tex/dictionary_academic_iquito_es.tex'

Read the LIFT xml export and create data structures holding the dictionary information. 

In [None]:
# Parse the LIFT export.  `root` and `entries` are reused by later cells.
tree = ET.parse(infile)
root = tree.getroot()
entries = root.findall('entry')


def _sort_reversal_words(reversals):
    """Sort, in place, every Iquito word list held in a reversal table.

    `reversals` maps reversal text -> {POS: [Iquito headwords]}.  Each list is
    sorted with `iqdict.str2sort` as the key function.  A KeyError raised while
    sorting is reported and processing continues with the next list.
    """
    for by_pos in reversals.values():
        for words in by_pos.values():
            try:
                words.sort(key=iqdict.str2sort)
            except KeyError as err:
                # Bind the exception explicitly so the message shows the
                # offending key rather than an unrelated loop variable.
                print(f'Found illegal character {err}')
                msg = f'Could not create sort entries for reversals {words}.\n'
                print(msg)


# English reversals: reversal text -> {POS of the sense: [Iquito headwords]}.
reversalentries_en = {}
for e in entries:
    if iqdict.is_excluded(e) or iqdict.is_suffix(e):
        continue
    iqword = iqdict.get_headword(e)
    for sns in e.findall('sense'):
        graminfo = sns.find('grammatical-info')
        if graminfo is None:
            print(f'Error in sense (id {sns.attrib["id"]}). Could not find grammatical-info (POS).\n')
            # Skip this sense: without a POS its reversals would otherwise be
            # filed under whatever POS the previous sense happened to have.
            continue
        pos = graminfo.attrib['value'].strip()
        for revnode in sns.findall('reversal[@type="en"]'):
            try:
                rev = iqdict.nodetext(revnode.find('form/text')).strip()
            except AttributeError:
                print(f'WARNING: empty reversal for entry {iqword}: {e.attrib["guid"]}')
                continue
            reversalentries_en.setdefault(rev, {}).setdefault(pos, []).append(iqword)
_sort_reversal_words(reversalentries_en)

# Spanish reversals are for diccionario escolar.
reversalentries_es = {}
for e in root.findall('entry/sense/reversal[@type="es"]/../..'):
    if iqdict.is_excluded(e) or iqdict.is_suffix(e):
        continue
    # NOTE: This assumes all reversals of current e entry are same
    # part of speech (the POS of the entry's first sense is used for all).
    iqword = iqdict.get_headword(e)
    graminfo = e.find('sense/grammatical-info')
    if graminfo is None:
        print(f'Error in entry {iqword}. Could not find grammatical-info (POS).\n')
        continue
    pos = graminfo.attrib['value'].strip()
    for revtext in e.findall('sense/reversal[@type="es"]/form/text'):
        rev = iqdict.nodetext(revtext)
        # Reversals containing the \sci macro are left out of this table.
        if r'\sci ' in rev:
            continue
        reversalentries_es.setdefault(rev, {}).setdefault(pos, []).append(iqword)
_sort_reversal_words(reversalentries_es)

# Kept from the original cell: leave `pos` as an empty dict rather than the
# last POS string seen in the loops above.
pos = {}

In [None]:
# Lookup table from variant-type names to LaTeX label macro names.
# Keys are the values of the `variant-type` trait read from the LIFT export,
# after the first word has been capitalized by the cell that builds the
# variant maps.  Values are macro names WITHOUT the leading backslash (the
# backslash is prepended where the "variant of" text is assembled); some
# values already carry a braced argument, e.g. 'dialectlab{Nanay}'.
# Several spellings/capitalizations of the same type are listed on purpose so
# that inconsistent data in the export still resolves to a label.
varmap = {
    'Sociolinguistic variant': 'sociovarlab',
    'Free variant': 'freevarlab', # TODO: for Lev to review
    'Free Variant': 'freevarlab',
    'Dialectal variant': 'dialectvarlab', # TODO: for Lev to review
    'Dialectal variant of': 'dialectvaroflab',
    'Quantificational variant': 'quantvarlab', # TODO: for Lev to review
    'Archaic variant': 'archvarlab', # TODO: for Lev to review
    'Nanay dialect': 'dialectlab{Nanay}',
    'Chambira dialect': 'dialectlab{Chambira}',
    'Maasikuuri dialect': 'dialectlab{Maasikuuri}',
    'Iíjakawɨɨ́raana dialect': 'dialectlab{Iíjakawɨɨ́raana}', # TODO: for Lev to review
    'Inkawɨ́ɨ́rààna dialect': 'dialectlab{Inkawɨɨ́raana}', # TODO: for Lev to review
    'Inkawɨɨ́raana dialect': 'dialectlab{Inkawɨɨ́raana}',
    'Majanakáani dialect': 'dialectlab{Majanakáani}', # TODO: for Lev to review
    'Máájànàkáànì dialect': 'dialectlab{Majanakáani}', # TODO: for Lev to review
    'Maájanakáani dialect': 'dialectlab{Maájanakáani}',
    'Maásikuuri dialect': 'dialectlab{Maásikuuri}', # TODO: for Lev to review
    'Personal variant (JPI)': 'persvarlab{JPI}',
    'Personal variant (ELY)': 'persvarlab{ELY}',
    'Personal variant (HDC)': 'persvarlab{HDC}',
    'Personal variant': 'persvarlabunk',
    'Constructional variant': 'constructvarlab',
    'Archaic variant of': 'archvarlab',
    'Prepausal form': 'prepausallab',
    'Affective variant': 'affectvarlab',
    'Euphemistic variant': 'euphvarlab',
    'Playful variant': 'playvarlab',
    'Irregular 3rd person possessed form': 'irregthirdposs',
    'Irregular 1st person possessed form': 'irregfirstposs',
    'Nickname': 'nicknamelab',
    'Allomorph': 'allomorphlab',
    'Irregular plural': 'irregpllab',  # Not converted to latex macro here; handled separately
    'Irregular Plural': 'irregpllab',  # TODO: for Lev to review
    'Imperfective root': 'impfrtlab',
}
#    'Free variant(s)': 'freevarlabs',
#    'Dialectal variant(s)': 'dialectvarlabs',

In [None]:
# Cross-reference tables built from the `_component-lexeme` relations, i.e.
# links from a variant entry to the main entry it is a variant of.
variantmap = {}    # main entry id -> {variant label: [variant forms]}
mainwdmap_es = {}  # variant entry id -> LaTeX "variant of" note (Spanish)
mainwdmap_en = {}  # variant entry id -> LaTeX "variant of" note (English)
irreg_pl_map = {}  # item yielded by iqdict.get_irreg_pl -> headword of its entry
impf_rt_map = {}   # main entry id -> headword of its imperfective-root entry

# Index the entries by id once instead of running an XPath search over the
# whole tree for every relation.  setdefault keeps the FIRST entry for an id,
# matching what root.find('entry[@id="..."]') would have returned.
entries_by_id = {}
for entry in entries:
    entries_by_id.setdefault(entry.get('id'), entry)

for entry in entries:
    for rel in entry.findall('relation[@type="_component-lexeme"]'):
        refid = rel.attrib['ref']
        if refid == '':
            continue

        # Headword of the main entry this variant points at.
        target = entries_by_id.get(refid)
        if target is None:
            print('Could not find entry {:}'.format(refid))
            continue
        try:
            mainwd = iqdict.get_headword(target)
        except AttributeError:
            print('Could not find entry {:}'.format(refid))
            continue

        # Variant type: a missing trait, a missing value attribute and an
        # empty value are all reported the same way.
        trait = rel.find('trait[@name="variant-type"]')
        parts = trait.attrib.get('value', '').split() if trait is not None else []
        if not parts:
            print('Could not get variant type for entry {:}'.format(refid))
            continue
        parts[0] = parts[0].capitalize()  # capitalize first word only and leave others as capitalized
        vartype = ' '.join(parts)

        # Translate to the LaTeX label.  An unmapped type is reported and the
        # relation skipped (instead of aborting the cell with a KeyError),
        # because the raw type name is not a usable LaTeX macro name.
        label = varmap.get(vartype)
        if label is None:
            print('Unknown variant type "{:}" for entry {:}; add it to varmap'.format(vartype, refid))
            continue
        vartype = label

        entry_id = entry.attrib['id']
        mainwdmap_es[entry_id] = '\n  \\variantof{Variante de: \\textbf{' + mainwd + '}}'
        mainwdmap_en[entry_id] = '\n  \\variantof{\\' + vartype + ' of \\vartext{' + mainwd + '}}'

        if vartype == 'impfrtlab':
            impf_rt_map[refid] = iqdict.get_headword(entry)
            continue   # Do not include in variantmap

        # Citation form if it exists, else lexeme form.
        variant_node = entry.find('citation/form[@lang="iqu"]/text')
        if variant_node is None:
            variant_node = entry.find('lexical-unit/form[@lang="iqu"]/text')
        variant = variant_node.text

        variantmap.setdefault(refid, {}).setdefault(vartype, []).append(variant)

    glosses = entry.findall('sense/gloss[@lang="ga"]/text')
    for ipl in iqdict.get_irreg_pl(glosses):
        irreg_pl_map[ipl] = iqdict.get_headword(entry)

## Diccionario escolar
A full entry consists of the following fields in the following order:

1. Citation Form
2. Gram. info (Abbrev. spn)
3. Glosses (Gae)
4. Variant Form (Iqu) [if present]

A minimal entry is a reduced entry that refers back to the full entry, and consists of the following fields in the following order:

1. Citation Form
2. Variant Type (Reverse name, spn)
3. Variant of

The entries in the 'Reversals' section are relatively simple, and consist of the following fields:

1. Reversal Entries (Spn)
2. Gram. info
3. Citation Form

In [None]:
def _write_de_section(path, mode, items):
    """Write one section of the diccionario escolar to `path`.

    `items` is an already-sorted list of dicts with 'firstletter' and 'tex'
    keys.  A \\chapter heading is emitted whenever 'firstletter' differs from
    that of the previous item; each item's 'tex' is preceded by a blank line.
    `mode` is passed to open(): 'w' starts the file, 'a' appends to it.
    """
    current = ''
    with open(path, mode, encoding='utf-8') as out:
        for item in items:
            letter = item['firstletter']
            if letter != current:
                out.write('\n' + r'\chapter{' + letter + '}\n\n')
                current = letter
            out.write('\n\n' + item['tex'])


# Main section: one TeX entry per lexicon entry; failures are reported.
texentries = []
for e in entries:
    d, err = iqdict.entry2dict_de(e, variantmap, mainwdmap_es, irreg_pl_map)
    if err is not None:
        print('Error in entry. ', str(err))
        continue
    texentries.append(d)
texentries = sorted(texentries, key=lambda item: item['sortword'])
_write_de_section(outfile_de, 'w', texentries)

# Reversal section, appended to the same file.  Results flagged 'SCI' are
# dropped without a message; any other error is reported.
texentries = []
for rev, e in reversalentries_es.items():
    d, err = iqdict.reventry2dict_de(rev, e)
    if err is None:
        texentries.append(d)
    elif err != 'SCI':
        print('Error in reversal entry. ', str(err))
texentries = sorted(texentries, key=lambda item: item['sortword'])
_write_de_section(outfile_de, 'a', texentries)

## Academic dictionary (English)

In [None]:
# Main section of the English academic dictionary.
# The word counters in iqdict are reset first so this cell starts clean.
iqdict.reset_wordcounts()

texentries = []
for e in entries:
    # Excluded entries and suffixes do not appear in the academic dictionary.
    if iqdict.is_excluded(e) or iqdict.is_suffix(e):
        continue
    d, err = iqdict.entry2dict_acad(e, variantmap, mainwdmap_en, irreg_pl_map, impf_rt_map)
    if err is not None:
        print('Error in entry. ', str(err))
        continue
    texentries.append(d)
texentries = sorted(texentries, key=lambda item: item['sortword'])

# Mode 'w' overwrites any earlier output; a \chapter heading is emitted each
# time the first letter changes from one entry to the next.
current_letter = ''
with open(outfile_acad, 'w', encoding='utf-8') as out:
    for item in texentries:
        letter = item['firstletter']
        if letter != current_letter:
            out.write('\n' + r'\chapter{' + letter + '}\n\n')
            current_letter = letter
        out.write(item['tex'] + '\n')

In [None]:
# English reversal section of the academic dictionary.
# NOTE: the file is opened in append mode, so this cell adds to whatever the
# main-section cell wrote; running it again without re-running that cell
# appends a second copy.
iqdict.reset_revwordcounts()  # start the reversal word counters afresh

texentries = []
for rev, by_pos in reversalentries_en.items():
    d, err = iqdict.reventry2dict_acad(rev, by_pos)
    if err is None:
        texentries.append(d)
    elif err != 'SCI':
        # 'SCI' results are dropped silently; everything else is reported.
        print('Error in reversal entry. ', str(err))
texentries = sorted(texentries, key=lambda item: item['sortword'])

previous = ''
with open(outfile_acad, 'a', encoding='utf-8') as out:
    for item in texentries:
        heading = item['firstletter']
        if heading != previous:
            out.write('\n' + r'\chapter{' + heading + '}\n\n')
            previous = heading
        out.write(item['tex'] + '\n')

## Academic dictionary (Spanish)

In [None]:
# Main section of the Spanish academic dictionary.
iqdict.reset_wordcounts()  # word counters start from scratch for this output

texentries = []
for e in entries:
    wanted = not (iqdict.is_excluded(e) or iqdict.is_suffix(e))
    if wanted:
        d, err = iqdict.entry2dict_acad_es(e, variantmap, mainwdmap_es, irreg_pl_map, impf_rt_map)
        if err is None:
            texentries.append(d)
        else:
            print('Error in entry. ', str(err))
texentries = sorted(texentries, key=lambda item: item['sortword'])

# Overwrite the output file, starting a chapter at every new first letter.
chapter = ''
with open(outfile_acad_es, 'w', encoding='utf-8') as out:
    for item in texentries:
        if item['firstletter'] != chapter:
            chapter = item['firstletter']
            out.write('\n' + r'\chapter{' + chapter + '}\n\n')
        out.write(item['tex'] + '\n')

In [None]:
# Spanish reversal section of the academic dictionary.
# NOTE: opened in append mode, so the output lands after the main section
# written by the previous cell; re-running only this cell appends again.
iqdict.reset_revwordcounts()  # reversal word counters start from scratch

texentries = []
for rev, by_pos in reversalentries_es.items():
    d, err = iqdict.reventry2dict_acad_es(rev, by_pos)
    if err is not None:
        # 'SCI' results are skipped quietly; other errors are reported.
        if err != 'SCI':
            print('Error in reversal entry. ', str(err))
        continue
    texentries.append(d)
texentries = sorted(texentries, key=lambda item: item['sortword'])

chapter = ''
with open(outfile_acad_es, 'a', encoding='utf-8') as out:
    for item in texentries:
        first = item['firstletter']
        if first != chapter:
            out.write('\n' + r'\chapter{' + first + '}\n\n')
            chapter = first
        out.write(item['tex'] + '\n')