In [157]:
from lxml import etree
tree = etree.parse('Akan_release1.xml')
root = tree.getroot()

In [158]:
type(tree)

lxml.etree._ElementTree

In [159]:
root.getchildren()[0].getchildren()

[<Element {http://typecraft.org/typecraft}title at 0x7f783731c240>,
 <Element {http://typecraft.org/typecraft}titleTranslation at 0x7f783731c100>,
 <Element {http://typecraft.org/typecraft}extraMetadata at 0x7f7836983840>,
 <Element {http://typecraft.org/typecraft}body at 0x7f78369832c0>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f78369836c0>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836983a80>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836983280>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836983900>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836916040>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836983940>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836916080>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f78369160c0>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836916100>,
 <Element {http://typecraft.org/typecraft}phrase at 0x7f7836916140>,
 <Element {http://ty


## EDA on XML Tree
41 "chunks" of data 
about 5 unique tags:
- body
- extraMetadata
- phrase
- title
- titleTranslation

In [160]:
print(len(root.getchildren()))
set([x.tag for x in root.getchildren()[0].getchildren()])

41


{'{http://typecraft.org/typecraft}body',
 '{http://typecraft.org/typecraft}extraMetadata',
 '{http://typecraft.org/typecraft}phrase',
 '{http://typecraft.org/typecraft}title',
 '{http://typecraft.org/typecraft}titleTranslation'}

In [161]:
child1 = root.getchildren()[0]

### body tag

Not sure why this portion is so unordered. It appears that most of the information is stored as a text attribute in a singular body tag. 

In [162]:
#store in a list to capture all body tag contents
body_strs = []
for body in child1.iter('{http://typecraft.org/typecraft}body'):
    body_strs.append(etree.tostring(body))

In [163]:
body_root = etree.fromstring(body_strs[0])

In [164]:
body_strs[0]

b'<body xmlns="http://typecraft.org/typecraft">&lt;p&gt;&lt;span class="phrase" id="phr454118"&gt;Ama hy&#603; ataade&#603;&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454119"&gt;Kofi redidi&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454120"&gt;Nkwadaa p&#603; sika&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454121"&gt;W&#596;fa maa me sika&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454122"&gt;Kwame too Esi Nkra&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454123"&gt;Saa abofra yi b&#603;fe&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454124"&gt;&#390;bo dan no y&#603; f&#603;&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454125"&gt;Akura yi awe me sika&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454126"&gt;Mpa k&#603;se&#603; nk&#596; dan no mu&lt;/span&gt;&lt;/p&gt; \n&lt;p&gt;&lt;span class="phrase" id="phr454127"&gt;Nt

### phrase tag
This is probably where the bulk of the morpheme information is located. After iterating through all chunks, we end up with 1312 phrases, which is what was promised on the original Release 1 Documentation. I'm therefore confident that all valuable morpheme information in contained within these tags.

In [165]:
chunks = root.getchildren()
tag = '{http://typecraft.org/typecraft}phrase'

phrases = []

for chunk in chunks:
    for phrase in child1.iter(tag):
        phrases.append(phrase)

In [166]:
len(phrases)

1312

#### A function to help us describe XML elements

In [167]:
test_phrase = phrases[0]
def describe_xml(test_phrase):
    print(f"element: {test_phrase}")
    print(f"text: {test_phrase.text}")
    print(f"attributes: {test_phrase.attrib}")
    print(f"tag: {test_phrase.tag}")
    print(f"children: {test_phrase.getchildren()}")
    print("\n")
describe_xml(test_phrase)

element: <Element {http://typecraft.org/typecraft}phrase at 0x7f78369836c0>
text: None
attributes: {'id': '454118'}
tag: {http://typecraft.org/typecraft}phrase
children: [<Element {http://typecraft.org/typecraft}original at 0x7f7836923400>, <Element {http://typecraft.org/typecraft}translation at 0x7f783691d9c0>, <Element {http://typecraft.org/typecraft}translation2 at 0x7f783691d900>, <Element {http://typecraft.org/typecraft}description at 0x7f783691da80>, <Element {http://typecraft.org/typecraft}word at 0x7f783691dc00>, <Element {http://typecraft.org/typecraft}word at 0x7f783691d600>, <Element {http://typecraft.org/typecraft}word at 0x7f783691da40>]




### Exploring Phrase children

Each phrase element has 5 children: 
- desription
- original
- translation
- translation2
- word

In [168]:
test_children = test_phrase.getchildren()
set([x.tag for x in test_children])

{'{http://typecraft.org/typecraft}description',
 '{http://typecraft.org/typecraft}original',
 '{http://typecraft.org/typecraft}translation',
 '{http://typecraft.org/typecraft}translation2',
 '{http://typecraft.org/typecraft}word'}

In [169]:
test_children

[<Element {http://typecraft.org/typecraft}original at 0x7f7836923440>,
 <Element {http://typecraft.org/typecraft}translation at 0x7f7836916d80>,
 <Element {http://typecraft.org/typecraft}translation2 at 0x7f7836916a40>,
 <Element {http://typecraft.org/typecraft}description at 0x7f7836916f80>,
 <Element {http://typecraft.org/typecraft}word at 0x7f78369169c0>,
 <Element {http://typecraft.org/typecraft}word at 0x7f7836916700>,
 <Element {http://typecraft.org/typecraft}word at 0x7f7836916dc0>]

#### Describing elements of "phrase"

For the purposes of the POS tagger we are only interested in the "word" and "original" elements

In [170]:
for child in test_children:
    describe_xml(child)

element: <Element {http://typecraft.org/typecraft}original at 0x7f7836923440>
text: Ama hyɛ ataadeɛ
attributes: {}
tag: {http://typecraft.org/typecraft}original
children: []


element: <Element {http://typecraft.org/typecraft}translation at 0x7f7836916d80>
text: Ama is wearing a dress.
attributes: {}
tag: {http://typecraft.org/typecraft}translation
children: []


element: <Element {http://typecraft.org/typecraft}translation2 at 0x7f7836916a40>
text: None
attributes: {}
tag: {http://typecraft.org/typecraft}translation2
children: []


element: <Element {http://typecraft.org/typecraft}description at 0x7f7836916f80>
text: None
attributes: {}
tag: {http://typecraft.org/typecraft}description
children: []


element: <Element {http://typecraft.org/typecraft}word at 0x7f78369169c0>
text: None
attributes: {'text': 'Ama'}
tag: {http://typecraft.org/typecraft}word
children: [<Element {http://typecraft.org/typecraft}pos at 0x7f7836983c00>, <Element {http://typecraft.org/typecraft}morpheme at 0x7f78

### Creating a POS dict for all morphemes

After inspecting the "word" element, we find that 
- baseform attribute could be important
- pos tag stores info in text and is a leaf node

In [174]:
#first find out how to access morpheme information
test_words = []
for word in test_phrase.iter("{http://typecraft.org/typecraft}word"):
    test_words.append(word)

test_word = test_words[0].getchildren()

for i in test_word:
    describe_xml(i)
    
#what is gloss? Seems not super important
describe_xml(test_word[1].getchildren()[0])

element: <Element {http://typecraft.org/typecraft}pos at 0x7f7836989140>
text: N
attributes: {}
tag: {http://typecraft.org/typecraft}pos
children: []


element: <Element {http://typecraft.org/typecraft}morpheme at 0x7f78369891c0>
text: None
attributes: {'text': 'ama', 'baseform': 'ama', 'meaning': 'ama'}
tag: {http://typecraft.org/typecraft}morpheme
children: [<Element {http://typecraft.org/typecraft}gloss at 0x7f7836981ac0>]


element: <Element {http://typecraft.org/typecraft}gloss at 0x7f78369815c0>
text: SBJ
attributes: {}
tag: {http://typecraft.org/typecraft}gloss
children: []




1. Isolating POS and Baseform for each word. 

In [184]:
#tag = {http://typecraft.org/typecraft}pos
test_pos = test_word[0].text
print(f"Part of speech: {test_pos}")

#tag = {http://typecraft.org/typecraft}morpheme
test_base_form = test_word[1].attrib['baseform']
print(f"Baseform: {test_base_form}")

Part of speech: N
Baseform: ama
