In [1]:
import xml.dom.minidom  # load the submodule explicitly so `dom.minidom` resolves on a fresh kernel
import xml.etree.ElementTree as ET
from xml import dom, sax
from xml.sax.saxutils import escape

import numpy as np

In [2]:
# Path of the input XML document; read by both the DOM and the SAX cells below.
file = "data/cf79.xml"

# DOM

In [3]:
# extracting all AUTHOR from file
doc = dom.minidom.parse(file)

all_author = doc.getElementsByTagName("AUTHOR")

# Collect the text of each <AUTHOR>. An empty element (<AUTHOR/>) has no child
# node, so `firstChild` is None; such elements are skipped instead of raising
# AttributeError on `.data`.
all_author_list = [
    author.firstChild.data
    for author in all_author
    if author.firstChild is not None
]

# removing duplicates (np.unique returns the distinct values in sorted order)
unique_author = np.unique(all_author_list)

In [4]:
# creating output dom
doc = dom.minidom.Document()

root = doc.createElement("AUTHORS")
doc.appendChild(root)

# Emit each author once: iterate the de-duplicated `unique_author` rather than
# the raw `all_author_list`, which repeats a name for every occurrence.
for value in unique_author:
    tempChild = doc.createElement("AUTHOR")
    root.appendChild(tempChild)

    # Write Text (str() turns the numpy string scalar into a plain str)
    nodeText = doc.createTextNode(str(value))
    tempChild.appendChild(nodeText)

# saving autores.xml file
# The file is opened as UTF-8 and the same encoding is passed to writexml so
# the XML declaration matches the bytes on disk, independent of the platform's
# default text encoding.
with open('autores.xml', 'w', encoding='utf-8') as f:
    doc.writexml(
        f,
        indent="",
        addindent="   ",
        newl='\n',
        encoding='utf-8',
    )

# SAX

In [5]:
# creating handler to extract title
class TitleHandler(sax.handler.ContentHandler):
    """SAX content handler that collects the text of every <TITLE> element.

    After parsing, `titles` holds one string per <TITLE>, in document order,
    with runs of whitespace (including line breaks inside a multi-line title)
    collapsed to single spaces and leading/trailing whitespace removed.
    """

    def __init__(self):
        super().__init__()

        # all titles from file
        self.titles = []

        # aux variables: whether the parser is currently inside a TITLE, and
        # the raw text chunks received for it so far (None outside a TITLE)
        self.isTitle = False
        self.currentTitle = None

    def startElement(self, name, attrs):
        """Start buffering text when a TITLE element opens."""
        if name == 'TITLE':
            self.isTitle = True
            self.currentTitle = []

    def endElement(self, name):
        """Store the buffered text as one title when a TITLE element closes."""
        if name == "TITLE":
            self.isTitle = False
            # Chunks are concatenated as-is and whitespace is normalised once,
            # so chunk boundaries never introduce extra spaces.
            raw_text = "".join(self.currentTitle)
            self.titles.append(" ".join(raw_text.split()))
            self.currentTitle = None

    def characters(self, content):
        """Buffer a chunk of character data if it belongs to a TITLE.

        A parser may report one contiguous run of text in several calls (for
        instance around an entity such as ``&amp;``), so chunks are kept
        verbatim here and only cleaned up in `endElement`.
        """
        if self.isTitle:
            self.currentTitle.append(content)

In [6]:
# Stream the input file through a SAX parser; `handler` gathers the titles.
handler = TitleHandler()
parser = sax.make_parser()
parser.setContentHandler(handler)
parser.parse(file)

In [7]:
# Build titulos.xml by hand: one <TITLE> line per collected title.
# Titles are XML-escaped because the SAX parser hands back decoded text
# (e.g. "&amp;" arrives as "&"); writing it raw would produce malformed XML.
title_lines = [f"   <TITLE>{escape(title)}</TITLE>" for title in handler.titles]

output = '<?xml version="1.0" ?>\n'
output += "<FILE>\n"
output += "\n".join(title_lines)
output += "\n</FILE>"

# The declaration names no encoding, which XML readers treat as UTF-8, so the
# file is written as UTF-8 explicitly rather than in the platform default.
with open('titulos.xml', 'w', encoding='utf-8') as f:
    f.write(output)