# Turn BHSA TSV into an XML tree

## Import data

First, read in the TSV file and extract the column headers from the first row.

In [87]:
# path_to_tsv_data = 'tsv/bhsa2017_trunc.csv' # truncated to ~100 lines for testing
# Path of the TSV file that the following cells read.
path_to_tsv_data = 'tsv/bhsa2017.txt'

# Column separator handed to csv.DictReader when the file is read.
delimiter = '\t' # tab-separated input

# Columns that may be copied onto the generated XML nodes as attributes.
# A column that is present in the file but not listed here is never written
# to the XML (create_node only emits headers that appear in this list).
selected_headers = [
    'n',
    'otype',
    'in.lex',
    'in.subphrase',
    'in.phrase_atom',
    'in.phrase',
    'in.clause_atom',
    'in.clause',
    'in.sentence_atom',
    'in.sentence',
    'in.half_verse',
    'in.verse',
    'in.chapter',
    'in.book',
    'crossref',
    'mother',
    'book',
    'chapter',
    'code',
    'det',
    'domain',
    'freq_lex',
    'function',
    'g_cons_utf8',
    'g_lex_utf8',
    'g_word_utf8',
    'gloss',
    'gn',
    'label',
    'language',
    'lex_utf8',
    'ls',
    'nametype',
    'nme',
    'nu',
    'number',
    'osm',
    'osm_sf',
    'pargr',
    'pdp',
    'pfm',
    'phono',
    'phono_trailer',
    'prs',
    'prs_gn',
    'prs_nu',
    'prs_ps',
    'ps',
    'qere_trailer_utf8',
    'qere_utf8',
    'rank_lex',
    'rela',
    'sp',
    'st',
    'tab',
    'trailer_utf8',
    'txt',
    'typ',
    'uvf',
    'vbe',
    'vbs',
    'verse',
    'voc_lex_utf8',
    'vs',
    'vt'
 ]

# Cell values that count as "no value": create_node skips a column whose
# value for the row is one of these instead of writing it as an attribute.
excluded_values = [
    'NA',
    'n/a',
    'none',
    ''
]

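Then, read the file with `csv.DictReader` and print its column headers. Each row is read as a dictionary whose keys are the column headers and whose values are the data for that column; the rows are then stored in a lookup dictionary keyed by the node number in the `n` column.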

In [124]:
import csv
import os

from lxml import etree

# Only the column headers are needed from this cell. The rows themselves are
# loaded into the `data` dictionary by the next cell, so the file is not read
# to the end here (DictReader reads just the first line to fill `fieldnames`).
# The file holds UTF-8 text (e.g. the *_utf8 columns), so the encoding is
# stated explicitly instead of relying on the platform default, and
# newline='' is used as the csv module requires for files it parses.
with open(path_to_tsv_data, 'r', encoding='utf-8', newline='') as tsv_file:
    reader = csv.DictReader(tsv_file, delimiter=delimiter)
    headers = reader.fieldnames
    print(headers)

['n', 'otype', 'in.lex', 'in.subphrase', 'in.phrase_atom', 'in.phrase', 'in.clause_atom', 'in.clause', 'in.sentence_atom', 'in.sentence', 'in.half_verse', 'in.verse', 'in.chapter', 'in.book', 'crossref', 'mother', 'book', 'chapter', 'code', 'det', 'domain', 'freq_lex', 'function', 'g_cons', 'g_cons_utf8', 'g_lex', 'g_lex_utf8', 'g_word', 'g_word_utf8', 'gloss', 'gn', 'label', 'language', 'lex', 'lex_utf8', 'ls', 'nametype', 'nme', 'nu', 'number', 'osm', 'osm_sf', 'pargr', 'pdp', 'pfm', 'phono', 'phono_trailer', 'prs', 'prs_gn', 'prs_nu', 'prs_ps', 'ps', 'qere', 'qere_trailer', 'qere_trailer_utf8', 'qere_utf8', 'rank_lex', 'rela', 'sp', 'st', 'tab', 'trailer', 'trailer_utf8', 'tree', 'treen', 'txt', 'typ', 'uvf', 'vbe', 'vbs', 'verse', 'voc_lex', 'voc_lex_utf8', 'vs', 'vt']


In [131]:
# Load every row of the TSV into a lookup dictionary keyed by the value of
# its `n` column, so that a node can be fetched directly by its number.
# The encoding is explicit because the file contains UTF-8 text, and
# newline='' is what the csv module expects for files it parses.
with open(path_to_tsv_data, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter=delimiter)
    data = {row['n']: row for row in reader}

# `data` is a dict, so the rows are its values (not the dict itself).
sentences = [row for row in data.values() if row['otype'] == 'sentence']

print('rows: ', len(data), 'sentences: ', len(sentences))

rows:  1446831 sentences:  63717


## Parsing functions

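Next, define the functions that parse the `treen` string. `_parseBrackets` takes the bracketed `treen` string and converts it into nested lists; the functions in the following cell then turn those nested lists into an XML tree, looking up the attributes of each node in the data rows.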

In [130]:
# Very slightly adapted from Dirk Roorda code here: https://github.com/ETCBC/bhsa/blob/master/tutorial/utils.py

def _parseTerminal(string):
    if ' ' in string:
        (tag, num) = string.split(' ', 1)
        num = str(int(num) + 1) # Word ids in the table are 1-based
        return tag + '{' + num + '}'
    # elif '{' not in string:
        # print('Warning: no space or curly braces in "{}"'.format(string)) # Note: I think the only time this happens is when the word is a coordination group, with string value 'Ccoor'
    return string


def _parseBrackets(string):
    """Parse the start of a bracketed tree string into nested lists.

    Returns a tuple ``(result, rest, error)``:

    * ``result`` -- for input starting with ``(``, a list holding one entry
      per item found before the matching ``)``; for input starting with a
      label, that label after conversion by ``_parseTerminal``; for empty
      input an empty list; for input starting with ``)`` an empty string.
    * ``rest`` -- the part of ``string`` that was not consumed.
    * ``error`` -- ``None``, or a message when the input ended before the
      closing ``)`` of a group was seen.
    """
    if string == '':
        return ([], '', None)
    if string[0] == '(':
        # A bracketed group: collect items until the matching ')' or the end
        # of the input, parsing each item with a recursive call.
        rest = string[1:]
        result = []
        error = None
        while rest != '' and rest[0] != ')':
            (thisResult, rest, error) = _parseBrackets(rest)
            if error:
                break
            result.append(thisResult)
        if rest == '':
            # Input ran out before this group was closed.
            error = 'Missing ")" in "{}"'.format(string[1:])
        if len(rest):
            # Drop one character from the remainder (the ')' that ended the
            # loop, when the loop stopped on one).
            rest = rest[1:]
        return (result, rest, error)
    if string[0] == ')':
        # A closing bracket at the very start: consume it, return no content.
        return ('', string[1:], None)
    # Otherwise the string starts with a label that runs up to the next
    # bracket of either kind (or to the end when there is no bracket left).
    theOpen = string.find('(')
    theClose = string.find(')')
    if theOpen == -1 and theClose == -1:
        return (_parseTerminal(string), '', None)
    nextPos = None
    if theOpen == -1:
        nextPos = theClose
    else:
        nextPos = theOpen if theClose == -1 else min((theOpen, theClose))
    return (_parseTerminal(string[0:nextPos]), string[nextPos:], None)

In [129]:
def create_node(tag, tag_name, id):
    """Build one XML element and copy its attributes from the matching row.

    Parameters
    ----------
    tag : str
        Label from the parsed tree (e.g. ``'NP'``); stored in the ``class``
        attribute when a data row is found for the node.
    tag_name : str
        Name of the XML element to create.
    id : int
        Node number used to look the row up in ``data`` (as ``str(id)``);
        ``-1`` means the node has no row of its own.

    Returns
    -------
    The new element. With a row it carries ``id``, every selected,
    non-excluded column of the row and ``class``; a ``w`` element also gets
    the row's ``g_word_utf8`` value as text. Without a row it has no
    attributes and no text.

    Raises
    ------
    KeyError
        If ``id`` is not ``-1`` and ``data`` has no row for it.
    """
    node = etree.Element(tag_name)
    if id == -1:
        # No row to copy from. Returning here also keeps a row-less 'w' from
        # subscripting None when its text is looked up.
        # NOTE(review): such nodes get no `class` attribute either, so the
        # tag of a group without an id is not recorded - confirm intended.
        return node

    node.set('id', str(id))
    matching_row = data[str(id)]

    # Iterate the file's headers so attributes keep the column order of the
    # TSV, keeping only the selected columns that hold a real value.
    for header in headers:
        if header not in selected_headers:
            continue
        value = matching_row[header]
        # csv.DictReader fills the missing columns of a short row with None,
        # which cannot be stored as an attribute value.
        if value is None or value in excluded_values:
            continue
        node.set(header, value)
    node.set('class', tag)

    if tag_name == 'w':
        node.text = matching_row['g_word_utf8']

    return node

# Labels of grouping nodes that carry no node number (and so have no data row).
_GROUP_LABELS = ('Ccoor', 'Cattr')


def _split_label(label):
    """Split a label such as ``'NP{651575}'`` into ``('NP', 651575)``."""
    try:
        tag, id_str = label.split('{')
        # Drop the closing '}' before converting the number.
        return tag, int(id_str[:-1])
    except (AttributeError, ValueError) as error:
        raise ValueError('Error parsing: {!r}'.format(label)) from error


def create_tree(node_list):
    """Recursively convert one parsed bracket list into an XML element.

    Parameters
    ----------
    node_list : list
        ``[label, child_list, ...]`` as produced by ``_parseBrackets``.
        A list holding only a label becomes a ``w`` element; a list with
        children becomes a ``sentence`` element when its tag is ``'S'`` and
        a ``wg`` element otherwise, with the converted children appended.

    Returns
    -------
    The element built by ``create_node`` for the head of ``node_list``.

    Raises
    ------
    ValueError
        If ``node_list`` is empty or its label is not of the form
        ``'TAG{number}'`` (and is not one of the group labels).
    """
    if not node_list:
        raise ValueError('Cannot build an XML node from an empty node list')

    head = node_list[0]
    children = node_list[1:]

    if not children:
        # A label without children is a word.
        tag, node_id = _split_label(head)
        return create_node(tag, 'w', node_id)

    if head in _GROUP_LABELS:
        # Group labels have no number, hence no data row (-1).
        tag, node_id = head, -1
    elif isinstance(head, list) and head and head[0] in _GROUP_LABELS:
        # The head is itself a list starting with a group label. Both group
        # labels are checked on head[0]; the original compared 'Cattr'
        # against the whole head, which could never match in this branch.
        # NOTE(review): any further items inside `head` are not converted,
        # as before - confirm that is intended.
        tag, node_id = head[0], -1
    else:
        tag, node_id = _split_label(head)

    tag_name = 'sentence' if tag == 'S' else 'wg'
    node = create_node(tag, tag_name, node_id)
    for child_list in children:
        node.append(create_tree(child_list))
    return node


In [171]:
# Cleanup function to move every node with a 'mother' attribute to the
# node with the matching id. This is necessary because the ETCBC data
# does not necessarily nest these relations, preferring to encode them
# as attributes.

# Note: for many nodes with a mother, the mother node cannot be found based
# on the way I am breaking up sentences, I suspect. It might be worth trying
# to build these trees not based on sentences as the basic unit but based on
# the entire book (more like OpenText 2.0)

def move_node_to_mother(node, element_tree):
    """Append ``node`` to the element whose ``id`` equals its ``mother``.

    Nodes without a ``mother`` attribute are left alone. The mother is
    searched for inside ``element_tree`` only; when it is not found there a
    message is printed and the node stays where it is.

    Parameters
    ----------
    node : element
        The element to relocate.
    element_tree : element or element tree
        The tree that is searched for the mother element.
    """
    mother_id = node.attrib.get('mother')
    if mother_id is None:
        return
    mother_node = element_tree.find('.//*[@id="{}"]'.format(mother_id))
    if mother_node is not None:
        # In lxml an element has a single parent, so append() moves the node
        # out of its old position; its children go along with it.
        mother_node.append(node)
    else:
        # .get() keeps the report itself from failing on a node without an id.
        print('No mother node found for id: ', mother_id, ' Node with mother: ', node.tag, node.attrib.get('id'))


def move_nodes_to_mother(element_tree):
    """Apply ``move_node_to_mother`` once to every element of ``element_tree``."""
    # Walk a snapshot: moving elements while iter() is still traversing the
    # tree can visit a moved subtree again or step over other elements.
    for node in list(element_tree.iter()):
        move_node_to_mother(node, element_tree)

## Create XML trees for each chapter

In [133]:
def _rows_of_type(node_type):
    """Return the rows of ``data`` whose ``otype`` column equals ``node_type``."""
    return [row for row in data.values() if row['otype'] == node_type]


# Rows describing chapter nodes and verse nodes, in the iteration order of `data`.
chapters = _rows_of_type('chapter')
verses = _rows_of_type('verse')

In [136]:
# Maps a book name as it appears in the `book` column of a chapter row to the
# three-character book code used in the chapter labels (e.g. 'GEN 1') and in
# the names of the exported XML files.
book_lookup = {
    'Genesis':'GEN',
    'Exodus':'EXO',
    'Leviticus':'LEV',
    'Numeri':'NUM',
    'Deuteronomium':'DEU',
    'Josua':'JOS',
    'Judices':'JDG',
    'Ruth':'RUT',
    'Samuel_I':'1SA',
    'Samuel_II':'2SA',
    'Reges_I':'1KI',
    'Reges_II':'2KI',
    'Chronica_I':'1CH',
    'Chronica_II':'2CH',
    'Esra':'EZR',
    'Nehemia':'NEH',
    'Esther':'EST',
    'Iob':'JOB',
    'Psalmi':'PSA',
    'Proverbia':'PRO',
    'Ecclesiastes':'ECC',
    'Canticum':'SNG',
    'Jesaia':'ISA',
    'Jeremia':'JER',
    'Threni':'LAM',
    'Ezechiel':'EZK',
    'Daniel':'DAN',
    'Hosea':'HOS',
    'Joel':'JOL',
    'Amos':'AMO',
    'Obadia':'OBA',
    'Jona':'JON',
    'Micha':'MIC',
    'Nahum':'NAM',
    'Habakuk':'HAB',
    'Zephania':'ZEP',
    'Haggai':'HAG',
    'Sacharia':'ZEC',
    'Maleachi':'MAL',
}

In [142]:
# Group the parsed sentence trees by chapter label (e.g. 'GEN 1').

# Index the chapter rows by their `n` value once, so that finding the chapter
# of a sentence is a dictionary lookup instead of a scan over all chapters.
chapter_rows_by_n = {row['n']: row for row in chapters}

chapter_dict = dict()
for sentence in sentences:
    # Parse the treen brackets into a nested list.
    sentence_tree, _unparsed, parse_error = _parseBrackets(sentence['treen'])
    if parse_error:
        # The parser reports problems through its return value; surface them
        # instead of silently storing a possibly incomplete tree.
        print('Warning: problem parsing treen of sentence', sentence['n'], ':', parse_error)

    # `in.chapter` holds the `n` value of the chapter row the sentence is in.
    ch_row = chapter_rows_by_n[sentence['in.chapter']]
    book = book_lookup[ch_row['book']]
    ch_formatted = ' '.join([book, ch_row['chapter']])

    # Append in place; rebuilding the list on every insert copies it each time.
    chapter_dict.setdefault(ch_formatted, []).append(sentence_tree)

In [143]:
# Sanity check: display the parsed sentence trees collected under the 'GEN 1' label
chapter_dict['GEN 1']

[['S{1172308}',
  ['C{427559}',
   ['PP{651573}', ['pp{1}'], ['n{2}']],
   ['VP{651574}', ['vb{3}']],
   ['NP{651575}', ['n{4}']],
   ['PP{651576}',
    ['U{1300539}', ['pp{5}'], ['dt{6}'], ['n{7}']],
    ['cj{8}'],
    ['U{1300540}', ['pp{9}'], ['dt{10}'], ['n{11}']]]]],
 ['S{1172309}',
  ['C{427560}',
   ['CP{651577}', ['cj{1}']],
   ['NP{651578}', ['dt{2}'], ['n{3}']],
   ['VP{651579}', ['vb{4}']],
   ['NP{651580}',
    ['U{1300541}', ['n{5}']],
    ['cj{6}'],
    ['U{1300542}', ['n{7}']]]]],
 ['S{1172310}',
  ['C{427561}',
   ['CP{651581}', ['cj{1}']],
   ['NP{651582}', ['n{2}']],
   ['PP{651583}',
    ['pp{3}'],
    ['U{1300543}', ['n{4}']],
    ['U{1300544}', ['n{5}']]]]],
 ['S{1172311}',
  ['C{427562}',
   ['CP{651584}', ['cj{1}']],
   ['NP{651585}', ['U{1300545}', ['n{2}']], ['U{1300546}', ['n{3}']]],
   ['VP{651586}', ['vb{4}']],
   ['PP{651587}',
    ['pp{5}'],
    ['U{1300547}', ['n{6}']],
    ['U{1300548}', ['dt{7}'], ['n{8}']]]]],
 ['S{1172312}',
  ['C{427563}',
   ['CP{65

In [23]:
# # Parse each sentence's brackets
# parsed_sentences = []
# for i, sentence in enumerate(sentences):
#     treen = sentence['treen']
#     # Parse treen brackets into a nested list
#     parsed = _parseBrackets(treen)
#     parsed_sentences.append(parsed[0])
#     # print(i, parsed[0])

In [152]:
# Book codes in the order used to number the output files.
bb = [
    'GEN', 'EXO', 'LEV', 'NUM', 'DEU',
    'JOS', 'JDG', 'RUT', '1SA', '2SA',
    '1KI', '2KI', '1CH', '2CH', 'EZR',
    'NEH', 'EST', 'JOB', 'PSA', 'PRO',
    'ECC', 'SNG', 'ISA', 'JER', 'LAM',
    'EZK', 'DAN', 'HOS', 'JOL', 'AMO',
    'OBA', 'JON', 'MIC', 'NAM', 'HAB',
    'ZEP', 'HAG', 'ZEC', 'MAL',
]

# Book code -> its 1-based position in `bb`, zero-padded to two digits.
enumerated_books = {
    code: f'{position:02d}' for position, code in enumerate(bb, start=1)
}

In [180]:
# Create one XML tree per chapter and save it to its own file.

# The loop variables deliberately do not reuse the names `sentences` and
# `sentence`: rebinding them here would replace the list of sentence rows
# that the earlier cells build and read, breaking a re-run of those cells.
for chapter_label, chapter_sentences in chapter_dict.items():
    root = etree.Element('root')
    for sentence_tree in chapter_sentences:
        root.append(create_tree(sentence_tree))

    # Optionally, add a stylesheet to the XML
    root.addprevious(etree.PI('xml-stylesheet', 'href="treedown.css"'))

    tree = etree.ElementTree(root)

    ## NOTE: ##
    # Restructure tree. Involves reordering nodes, so comment this line if you want a tree with words in canonical order
    move_nodes_to_mother(tree)

    str_tree = etree.tostring(tree, pretty_print=True, encoding='unicode')

    # example name: 28-HOS-009.xml
    book, ch_num = chapter_label.split(' ')
    save_path = 'xml/chapters/{}-{}-{:03d}.xml'.format(enumerated_books[book], book, int(ch_num))

    # Make sure the output directory exists so a fresh checkout can run this cell.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(str_tree)
            


No mother node found for id:  307  Node with mother:  wg 1300590
No mother node found for id:  319  Node with mother:  wg 1300592
No mother node found for id:  366  Node with mother:  wg 1300602
No mother node found for id:  376  Node with mother:  wg 1300606
No mother node found for id:  377  Node with mother:  wg 1300605
No mother node found for id:  390  Node with mother:  wg 1300611
No mother node found for id:  391  Node with mother:  wg 427630
No mother node found for id:  404  Node with mother:  wg 1300616
No mother node found for id:  460  Node with mother:  wg 1300625
No mother node found for id:  471  Node with mother:  wg 1300628
No mother node found for id:  484  Node with mother:  wg 1300632
No mother node found for id:  485  Node with mother:  wg 1300631
No mother node found for id:  507  Node with mother:  wg 1300634
No mother node found for id:  512  Node with mother:  wg 1300637
No mother node found for id:  521  Node with mother:  wg 1300642
No mother node found for i

In [179]:
# Print the root (ignore all node attributes except id, mother, and rela), and the move_nodes_to_mother version of the root, to see the difference

def _keep_attribs(node):
    node_minimal = etree.Element(node.tag)
    if 'id' in node.attrib:
        node_minimal.attrib['id'] = node.attrib['id']
    if 'mother' in node.attrib:
        node_minimal.attrib['mother'] = node.attrib['mother']
    if 'rela' in node.attrib:
        node_minimal.attrib['rela'] = node.attrib['rela']
    for child in node:
        node_minimal.append(_keep_attribs(child))
    return node_minimal

# Build a stripped copy (id, mother and rela only) of `root`.
# NOTE(review): `root` is not defined in this cell; it appears to be whatever
# the chapter-export loop bound last - confirm which chapter that is.
root_minimal = _keep_attribs(root)
# NOTE(review): despite the name, this picks the child at index 6, not the
# first child - presumably chosen as an example with `mother` attributes.
root_minimal_first_sentence_child = root_minimal[6]
print(etree.tostring(root_minimal_first_sentence_child, pretty_print=True, encoding='unicode'))
print('\n----------------\n')

# Restructure only this one sentence element. Mothers are searched for inside
# the element passed in, so a mother outside this sentence is reported as
# not found.
move_nodes_to_mother(root_minimal_first_sentence_child)
    
# Print the same element again after restructuring, for comparison with the
# version printed above.
print(etree.tostring(root_minimal_first_sentence_child, pretty_print=True, encoding='unicode'))

<sentence id="1235973">
  <wg id="515611">
    <wg id="904539">
      <w id="1"/>
    </wg>
    <wg id="904540">
      <w id="2"/>
    </wg>
    <wg id="904541">
      <wg id="1414205">
        <w id="3"/>
      </wg>
      <wg id="1414206" mother="426122" rela="rec">
        <w id="4"/>
      </wg>
    </wg>
    <wg id="904542">
      <w id="5"/>
      <w id="6"/>
      <w id="7"/>
    </wg>
    <wg id="904543">
      <w id="8"/>
      <wg id="1414207">
        <w id="9"/>
      </wg>
      <w id="10"/>
      <wg id="1414208" mother="1414207" rela="par">
        <w id="11"/>
      </wg>
    </wg>
  </wg>
</sentence>


----------------

No mother node found for id:  426122  Node with mother:  wg 1414206
<sentence id="1235973">
  <wg id="515611">
    <wg id="904539">
      <w id="1"/>
    </wg>
    <wg id="904540">
      <w id="2"/>
    </wg>
    <wg id="904541">
      <wg id="1414205">
        <w id="3"/>
      </wg>
      <wg id="1414206" mother="426122" rela="rec">
        <w id="4"/