In [43]:
# Read an XMI stand-off annotation file and convert it into ordinary XML text with inline tags.
# That XML text can then be converted into an information tree by ignoring all information outside of the nodes.
# Goal: extract NER information in two formats:
#   - short spans (SSRQ-like)
#   - long spans (Königsfelden-like)

from xml.sax.saxutils import escape

In [65]:

# Read all spans in the order of the XMI file. (Should be sorted by start index > end index)
from lxml import etree as et

root = et.Element("info", nsmap={"custom":"http:///custom.ecore", "cas":"http:///uima/cas.ecore"})

infile = et.parse("HGB_1_122_035_006.xmi")
in_root = infile.getroot()

text_node = in_root.find("./cas:Sofa", namespaces={"cas":"http:///uima/cas.ecore"})
document_text = text_node.get("sofaString")
original_length = len(document_text)

entities = in_root.findall(".//custom:Span", namespaces={"custom":"http:///custom.ecore"})

# sort list
# note which entity and which tag, start or end, needs to be inserted at this point
sorted_entities = []
for ent in entities:
    sorted_entities.append((ent, ent.get("begin"), "start"))
    sorted_entities.append((ent, ent.get("end"), "end"))

sorted_entities.sort(key=lambda x: int(x[1]))

# take the document text and enter the tags, count indices backwards so inserted tags will not change the indices
for entity, index, type in sorted_entities:
    if type == "start":
        # enter start tag
        inverse_index = int(entity.get("begin")) - original_length
        document_text = document_text[:inverse_index] + f"<Span label='{entity.get('label')}'>" + document_text[inverse_index:]
    else:
        # enter end tag
        inverse_index = int(entity.get("end")) - original_length
        document_text = document_text[:inverse_index] + "</Span>" + document_text[inverse_index:]

document_text = document_text.replace("&", "&amp;")
out_root = et.fromstring("<XML>" + document_text + "</XML>")
out_tree = et.ElementTree(out_root)

In [64]:
#import pprint as pp

out_tree.write('output.xml', pretty_print=True)

In [None]:
# We still need to fill in some elements that are currently missing:
# TODO: Add head elements where we omitted them for easier tagging.
# TODO: We may need to reorder tags when a head and another element covering the same range are misordered (one head will be missing anyway in that case).

In [78]:
# now collect all NAM entities

#nes = out_root.findall(".//Span")
nes = out_root.xpath(".//Span[contains(@label,'NAM')]")

for ne in nes:
    print(et.tostring(ne))
    print(''.join(ne.itertext()))
    # only collect the head
    head_node = ne.find("./Span[@label='head']")
    print(head_node.text)

alias = out_root.xpath(".//Span[contains(@label,'att.alias')]")
for ne in alias:
    print(et.tostring(ne))
    print(''.join(ne.itertext()))
    # only collect the head
    head_node = ne.find("./Span[@label='head']")
    print(head_node.text)

b'<Span label="NAM.PER.SGL"><Span label="att.title">Frau</Span> <Span label="head">Agnes Hoffassin</Span></Span> u . '
Frau Agnes Hoffassin
Agnes Hoffassin
b'<Span label="NAM.PER.SGL"><Span label="head">E&#252;nnelin</Span> <Span label="attr.rel">urtochter</Span></Span> mit\n'
Eünnelin urtochter
Eünnelin
b'<Span label="NAM.PER.SGL"><Span label="head">Heuruhen Samlin</Span> <Span label="att.job">dem <Span label="head">gewantman</Span></Span> <Span label="att.rel"><Span label="PRO.PER.SGL">derselben</Span> <Span label="head">Eemcoleman</Span></Span></Span>\nDerkaufen an '
Heuruhen Samlin dem gewantman derselben Eemcoleman
Heuruhen Samlin
b'<Span label="NAM.PER.SGL"><Span label="head">Hannsen Jsenlin</Span> <Span label="att.job">den Kramer</Span></Span> , u\n'
Hannsen Jsenlin den Kramer
Hannsen Jsenlin
b'<Span label="NAM.PER.SGL"><Span label="att.rel"><Span label="PRO.PER.SGL">seine</Span> <Span label="head">Frau</Span></Span> <Span label="head">Atgnesen</Span></Span> , '
seine Frau Atgne