In [89]:
import json
import xml.etree.ElementTree as ET
from collections import OrderedDict

# Path to the full XML file, which can be downloaded at the bottom of the Perseus Amores page.
# (It could also be fetched with requests instead of being saved by hand.)
ovid_file = "ovid_perseus.xml"

# Parse once at module level; the get_* functions below read `root` as a global.
tree = ET.parse(ovid_file)
root = tree.getroot()

# Plan: loop through the XML, hoovering up the data we want and ignoring what we don't.
# Store the data in a nested Python dictionary, then change it into a JSON file when we've added everything.

# We can't use the same function for every work because the data structure is inconsistent.
# We'll fix that though!

# The Amores: 3 books, multiple poems in each. Books have titles (in the XML) but poems don't.
def get_amores(works_list):
    """Append the Amores to works_list as a nested work/books/poems/lines dict.

    Reads the module-level ``root`` element and looks up the first element
    whose ``n`` attribute is ``'Am.'``. Every descendant with ``type='book'``
    becomes a book, and every descendant of a book with ``type='poem'``
    becomes a poem. Poems get a generated title ("Poem N"), except the
    epigram that opens book 1, which keeps the text of its first child.

    Each poem always carries a "lines" list (empty when the poem holds no
    <l> elements). Odd-numbered lines are tagged "hexameter" and
    even-numbered lines "pentameter".

    Args:
        works_list: list that the finished work dict is appended to.

    Returns:
        None; works_list is mutated in place.

    Raises:
        TypeError: if no element with n='Am.' exists under ``root``
            (``find`` returns None, which cannot be indexed).
    """
    work = root.find(".//*[@n='Am.']")
    # OrderedDict, otherwise the title does not come first in the JSON.
    work_dict = OrderedDict()
    work_dict["title"] = work[0][0].text
    work_dict["abbreviation"] = work.attrib["n"]
    work_dict["slug"] = "amores"

    books_list = []
    for book in work.findall(".//*[@type='book']"):
        book_dict = OrderedDict()
        book_dict["book_index"] = int(book.attrib["n"])
        book_dict["book_title"] = book[0].text

        # Book 1 opens with an epigram; numbering it 0 keeps the real
        # poems of that book starting at 1, like in the other books.
        start_num = 0 if book.attrib["n"] == "1" else 1

        poems_list = []
        poems = book.findall(".//*[@type='poem']")
        for poem_index, poem in enumerate(poems, start=start_num):
            poem_dict = OrderedDict()
            poem_dict["poem_index"] = poem_index
            if poem_index == 0:
                # The epigram is the only poem that keeps its own heading.
                poem_dict["poem_title"] = poem[0].text
            else:
                poem_dict["poem_title"] = "Poem " + str(poem_index)

            # Only <l> elements are collected, so headings are not picked
            # up as lines. Built outside any per-line loop so that "lines"
            # exists even for a poem without <l> elements.
            poem_dict["lines"] = [
                {
                    "line_index": line_index,
                    "meter": "pentameter" if line_index % 2 == 0 else "hexameter",
                    "text": line.text,
                }
                for line_index, line in enumerate(poem.findall(".//l"), start=1)
            ]
            poems_list.append(poem_dict)

        book_dict["poems"] = poems_list
        books_list.append(book_dict)

    work_dict["books"] = books_list
    works_list.append(work_dict)

# The Ars: 3 books, with one long poem in each. Solution: call each poem "book x, poem 1". 
def get_ars(works_list):
    """Append the Ars Amatoria to works_list as a nested work dict.

    Reads the module-level ``root`` element and looks up the first element
    whose ``n`` attribute is ``'Ars'``. Every descendant with ``type='book'``
    becomes a book holding exactly one poem ("Poem 1", index 1) made of all
    <l> elements found anywhere inside that book.

    The single poem always carries a "lines" list (empty when the book holds
    no <l> elements). Odd-numbered lines are tagged "hexameter" and
    even-numbered lines "pentameter".

    Args:
        works_list: list that the finished work dict is appended to.

    Returns:
        None; works_list is mutated in place.

    Raises:
        TypeError: if no element with n='Ars' exists under ``root``
            (``find`` returns None, which cannot be indexed).
    """
    ars = root.find(".//*[@n='Ars']")

    work_dict = OrderedDict()
    work_dict["title"] = ars[0][0].text
    work_dict["abbreviation"] = ars.attrib["n"]
    work_dict["slug"] = "ars"

    books_list = []
    for book in ars.findall(".//*[@type='book']"):
        book_dict = OrderedDict()
        book_dict["book_index"] = int(book.attrib["n"])
        book_dict["book_title"] = book[0].text

        poem_dict = OrderedDict()
        poem_dict["poem_index"] = 1
        poem_dict["poem_title"] = "Poem 1"
        # Only <l> elements are collected, so headings are not picked up as
        # lines. Built outside any per-line loop so that "lines" exists even
        # for a book without <l> elements.
        poem_dict["lines"] = [
            {
                "line_index": line_index,
                "meter": "pentameter" if line_index % 2 == 0 else "hexameter",
                "text": line.text,
            }
            for line_index, line in enumerate(book.findall(".//l"), start=1)
        ]

        book_dict["poems"] = [poem_dict]
        books_list.append(book_dict)

    work_dict["books"] = books_list
    works_list.append(work_dict)

# Heroides: 1 work with multiple poems. Solution: call it book one, poem x anyway. 
# Also, these poems have titles, by common usage, which none of the other poems do. 
def get_heroides(works_list):
    """Append the Heroides to works_list as a nested work dict.

    Reads the module-level ``root`` element and looks up the first element
    whose ``n`` attribute is ``'Ep.'``. The work is wrapped in a single
    synthetic book (index 1, "Liber primus"); every descendant with
    ``type='poem'`` becomes a poem titled with the text of its first child.

    Only <l> elements that are direct children of a poem are collected.
    Each poem always carries a "lines" list (empty when it has no such
    children). Odd-numbered lines are tagged "hexameter" and even-numbered
    lines "pentameter".

    Args:
        works_list: list that the finished work dict is appended to.

    Returns:
        None; works_list is mutated in place.

    Raises:
        TypeError: if no element with n='Ep.' exists under ``root``
            (``find`` returns None, which cannot be indexed).
    """
    heroides = root.find(".//*[@n='Ep.']")

    work_dict = OrderedDict()
    work_dict["title"] = heroides[0][0].text
    work_dict["abbreviation"] = heroides.attrib["n"]
    work_dict["slug"] = "heroides"

    book_dict = OrderedDict()
    book_dict["book_index"] = 1
    # Kept in Latin to be consistent with the other works.
    book_dict["book_title"] = "Liber primus"

    poems_list = []
    poems = heroides.findall(".//*[@type='poem']")
    for poem_index, poem in enumerate(poems, start=1):
        poem_dict = OrderedDict()
        poem_dict["poem_index"] = poem_index
        poem_dict["poem_title"] = poem[0].text

        # Note: the <del> elements in the last poem had to be deleted by
        # hand in the XML source.
        # Built outside any per-line loop so that "lines" exists even for a
        # poem without direct <l> children.
        poem_dict["lines"] = [
            {
                "line_index": line_index,
                "meter": "pentameter" if line_index % 2 == 0 else "hexameter",
                "text": line.text,
            }
            for line_index, line in enumerate(poem.findall("./l"), start=1)
        ]
        poems_list.append(poem_dict)

    book_dict["poems"] = poems_list
    work_dict["books"] = [book_dict]
    works_list.append(work_dict)

# Works that are basically just one longish poem. Guess what? They're now Book One, Poem One too.
def get_etsy(works_list):
    """Append the two single-poem works (Remedia, then Medicamina) to works_list.

    Each work is looked up under the module-level ``root`` by its ``n``
    attribute and wrapped as one book ("Liber primus", index 1) containing
    one poem ("Poem 1", index 1) made of every <l> element inside the work.
    The slug is the lower-cased first word of the work's title. Odd-numbered
    lines are tagged "hexameter" and even-numbered lines "pentameter".

    Args:
        works_list: list that the two finished work dicts are appended to.

    Returns:
        None; works_list is mutated in place.
    """
    # Remedia is emitted before Medicamina.
    for xpath in (".//*[@n='Rem.']", ".//*[@n='Med.']"):
        text_el = root.find(xpath)
        title = text_el[0][0].text

        verses = []
        for number, verse in enumerate(text_el.findall(".//l"), start=1):
            entry = {}
            entry["line_index"] = number
            entry["meter"] = "hexameter" if number % 2 else "pentameter"
            entry["text"] = verse.text
            verses.append(entry)

        only_poem = OrderedDict([
            ("poem_index", 1),
            ("poem_title", "Poem 1"),
            ("lines", verses),
        ])
        only_book = OrderedDict([
            ("book_index", 1),
            ("book_title", "Liber primus"),
            ("poems", [only_poem]),
        ])
        works_list.append(OrderedDict([
            ("title", title),
            ("abbreviation", text_el.attrib["n"]),
            ("slug", title.split()[0].lower()),
            ("books", [only_book]),
        ]))
        

# Now the work: assemble the corpus, print it, and save it as corpus.json.
corpus = OrderedDict()
corpus["author"] = [{"name": "Ovid"}, {"slug": "ovid"}, {"full name": "Publius Ovidius Naso"}, {"abbreviation": "Ov."}]
works_list = []

# Each get_* function appends its work(s) to works_list, so the call order
# here is the order of the works in the JSON.
get_amores(works_list)
get_ars(works_list)
get_heroides(works_list)
get_etsy(works_list)
corpus["works"] = works_list

# Serialize once and reuse the string for both the console and the file.
json_corpus = json.dumps(corpus)
# Parenthesised so the line parses on Python 3 as well as Python 2.
print(json_corpus)

with open('corpus.json', 'w') as outfile:
    outfile.write(json_corpus)

# import requests
# import untangle
# import xmltodict

# I'm leaving these in because I show my work, hoss, and that's what I was using before

# If you want to see how I actually work, this is how I wound my head around what the original XML looked like.
# my tedious way of looping through layers...

# print root[1][0][0].attrib #{'lang': 'la', 'n': 'Am.'}
# print root[1][0][0][0][0].text #Amores
# print root[1][0][0][0][1].attrib #{'sample': 'complete', 'org': 'uniform', 'type': 'book', 'n': '1'}
# print root[1][0][0][0][2].attrib #{'sample': 'complete', 'org': 'uniform', 'type': 'book', 'n': '2'}
# print root[1][0][0][0][3].attrib #{'sample': 'complete', 'org': 'uniform', 'type': 'book', 'n': '3'}
# print root[1][0][0][0][1][0] #'head'
# print root[1][0][0][0][1][1] #div2 - <div2 type="poem" n="ep" org="uniform" sample="complete">
# print root[1][0][0][0][1][1][0] #head
# print root[1][0][0][0][1][1][1].text #the first line of the first poem which is actually the epigram but whatevs

# after that I started making things quicker with XPath
# works = root.findall(".//*[@lang='la']")
# for work in works:
#     print work.attrib
#     print work[0][0].text
#     books = root.findall(".//*[@type='book']")
#     for book in books: 
#         print book.attrib
#         print book[0].text
#         poems = book.findall(".//*[@type='poem']")
#         for poem in poems:
#             print poem.attrib
#             for child in poem:
#                 print child.text
    

# work roots cheatsheet
# <text lang="la" n="Am."> Amores
# <text lang="la" n="Med."> Medicamina Faciei Femineae
# <text lang="la" n="Ep."> Heroides
# <text lang="la" n="Ars"> Ars Amatoria
# <text lang="la" n="Rem."> Remedia Amoris

{"author": [{"name": "Ovid"}, {"slug": "ovid"}, {"full name": "Publius Ovidius Naso"}, {"abbreviation": "Ov."}], "works": [{"title": "Amores", "abbreviation": "Am.", "slug": "amores", "books": [{"book_index": 1, "book_title": "Liber primus", "poems": [{"poem_index": 0, "poem_title": "EPIGRAMMA IPSIUS", "lines": [{"text": "Qui modo Nasonis fueramus quinque libelli,", "line_index": 1, "meter": "hexameter"}, {"text": "Tres sumus; hoc illi praetulit auctor opus.", "line_index": 2, "meter": "pentameter"}, {"text": "Ut iam nulla tibi nos sit legisse voluptas,", "line_index": 3, "meter": "hexameter"}, {"text": "At levior demptis poena duobus erit.", "line_index": 4, "meter": "pentameter"}]}, {"poem_index": 1, "poem_title": "Poem 1", "lines": [{"text": "Arma gravi numero violentaque bella parabam", "line_index": 1, "meter": "hexameter"}, {"text": "Edere, materia conveniente modis.", "line_index": 2, "meter": "pentameter"}, {"text": "Par erat inferior versus:  risisse Cupido", "line_index": 3, 