In [43]:
# Read XMI standoff annotation file and convert it into a XML file similar to TEI with standoff annotation for relationships
# Export the files from Inception as UIMA CAS XMI (XML 1.1) and put them (unzipped!) in the folder named unter "infiles"
# Important: This code assumes no intersecting entities!

In [6]:
import glob

infiles = glob.glob("testfiles/*.xmi")

In [93]:
from lxml import etree as et
from xml.sax.saxutils import escape

ENTITY_TYPES = ["NAM", "NOM", "PRO", "UNK"]

DEFAULT_VALUES = {
    "ent_num_type": "SGL"
}

def create_node_tree(in_root, document_text):
    
    spans = in_root.findall(".//custom:Span", namespaces={"custom":"http:///custom.ecore"})
    # note which entity and which tag, start or end, needs to be inserted at this point
    sorted_spans = []
    for ent in spans:
        sorted_spans.append((ent, int(ent.get("begin")), int(ent.get("end"))))
    sorted_spans.sort(key=lambda x: (x[1], -x[2]))
    work_root = et.Element("XML", nsmap={"custom":"http:///custom.ecore", "cas":"http:///uima/cas.ecore"})
    parent_node = work_root
    for entity, start, end in sorted_spans:
        # classify if span is entity, attribute or description
        label = entity.get('label')
        label = label.split(".")
        span_type = None
        if label[0] in ENTITY_TYPES:
            span_type = "ent"
        elif label[0] == "att":
            span_type = "att"
        elif label[0] == "desc":
            span_type = "desc"
        elif label[0] == "head":
            span_type = "head"
        else:
            print(f"WARNING: Unrecognized Span Label {label[0]}!")
        # we need to check all parent nodes above if they contain the current node
        while(parent_node != work_root):
            if end <= int(parent_node.get("end")):
                current_node = et.SubElement(parent_node, "Entity", id=entity.get("{http://www.omg.org/XMI}id"), span_type=span_type, label=entity.get('label'), start=str(start), end=str(end), text=document_text[start:end])
                break
            else:
                parent_node = parent_node.getparent()
        else:
            current_node = et.SubElement(work_root, "Entity", id=entity.get("{http://www.omg.org/XMI}id"), span_type=span_type, label=entity.get('label'), start=str(start), end=str(end), text=document_text[start:end])
        parent_node = current_node
    return work_root


def write_entities(out_root, work_root):
    # I renamed Entity to Mention here as it's a more accurate descriptor
    entities_node = et.SubElement(out_root, "Mentions")

    for entity in work_root.findall(".//Entity[@span_type='ent']"):
        label = entity.get('label')
        label = label.split(".")
        if len(label) == 2:
            mention_type, entity_type = label
            num_type = DEFAULT_VALUES["ent_num_type"]  # default value
        elif len(label) == 3:
            mention_type, entity_type, num_type = label

        head_elem = entity.find("Entity[@label='head']")
        if head_elem == None:
            # Implizierter Head
            print(f"Warning: Implizierter Head bei {entity.get('id')}.")
            head_start = entity.get("start")
            head_end = entity.get("end")
        else:
            head_start = head_elem.get("start")
            head_end = head_elem.get("end")

        et.SubElement(entities_node, 
            "Reference",
            mention_type=mention_type,
            entity_type=entity_type,
            num_type=num_type,
            start=entity.get("start"),
            end=entity.get("end"),
            head_start=head_start,
            head_end=head_end
            )
    
    for entity in work_root.findall(".//Entity[@span_type='att']"):
        # TODO: Können PRO auch ATT sein oder nicht? Oder dann immer als DESC taggen?
        parent = entity.getparent()
        label = parent.get('label')
        label = label.split(".")
        if len(label) == 2:
            mention_type, entity_type = label
            num_type = DEFAULT_VALUES["ent_num_type"]  # default value
        elif len(label) == 3:
            mention_type, entity_type, num_type = label
        # overwrite mention type
        mention_type = "NOM"

        head_elem = entity.find("Entity[@label='head']")
        if head_elem == None:
            # Implizierter Head
            print(f"Warning: Implizierter Head bei {entity.get('id')}.")
            head_start = entity.get("start")
            head_end = entity.get("end")
        else:
            head_start = head_elem.get("start")
            head_end = head_elem.get("end")

        et.SubElement(entities_node, 
            "Attribute",
            mention_type=mention_type,
            entity_type=entity_type,
            num_type=num_type,
            start=entity.get("start"),
            end=entity.get("end"),
            head_start=head_start,
            head_end=head_end
            )


def process_xmi(xmi_file):
    infile = et.parse(xmi_file)
    in_root = infile.getroot()

    text_node = in_root.find("./cas:Sofa", namespaces={"cas":"http:///uima/cas.ecore"})
    document_text = text_node.get("sofaString")

    work_root = create_node_tree(in_root, document_text)

    work_tree = et.ElementTree(work_root)
    work_tree.write('debug.xml', xml_declaration=True, pretty_print=True, encoding="utf8")

    out_root = et.Element("XML")

    # TODO: Write DocumentMetaData
    out_text = et.SubElement(out_root, "Text")
    out_text.text = document_text
    write_entities(out_root, work_root)
    # TODO: Write Value Annotations
    # TODO: Write Relations
    # TODO: Write Events

    #document_text = document_text.replace("&", "&amp;")
    #work_root = et.fromstring("<XML>" + document_text + "</XML>")
    

    out_tree = et.ElementTree(out_root)
    out_tree.write('output.xml', xml_declaration=True, pretty_print=True, encoding="utf8")


for infile in infiles:
    xml_file = process_xmi(infile)
    break
    #write_xml(xml_file)




In [None]:
"""
        label = entity.get('label')
        label = label.split(".")
        # classify if span is entity, attribute or description
        
        span_type = None
        if label[0] in entity_types:
            span_type = "ent"
        elif label[0] == "att":
            span_type = "att"
        elif label[0] == "desc":
            span_type = "desc"
        else:
            print(f"WARNING: Unrecognized Span Label {label[0]}!")
        """

In [None]:
def write_xml(xml_file):
    pass

In [1]:

# Read all spans in the order of the XMI file. (Should be sorted by start index > end index)
from lxml import etree as et
from xml.sax.saxutils import escape

root = et.Element("info", nsmap={"custom":"http:///custom.ecore", "cas":"http:///uima/cas.ecore"})

infile = et.parse("testfiles/HGB_1_002_047_004.xmi")
in_root = infile.getroot()

text_node = in_root.find("./cas:Sofa", namespaces={"cas":"http:///uima/cas.ecore"})
document_text = text_node.get("sofaString")
original_length = len(document_text)

entities = in_root.findall(".//custom:Span", namespaces={"custom":"http:///custom.ecore"})

# sort list
# note which entity and which tag, start or end, needs to be inserted at this point
sorted_entities = []
for ent in entities:
    sorted_entities.append((ent, ent.get("begin"), "start"))
    sorted_entities.append((ent, ent.get("end"), "end"))

sorted_entities.sort(key=lambda x: int(x[1]))

# take the document text and enter the tags, count indices backwards so inserted tags will not change the indices
for entity, index, type in sorted_entities:
    if type == "start":
        # enter start tag
        inverse_index = int(entity.get("begin")) - original_length
        document_text = document_text[:inverse_index] + f"<Span label='{entity.get('label')}'>" + document_text[inverse_index:]
    else:
        # enter end tag
        inverse_index = int(entity.get("end")) - original_length
        document_text = document_text[:inverse_index] + "</Span>" + document_text[inverse_index:]

document_text = document_text.replace("&", "&amp;")
work_root = et.fromstring("<XML>" + document_text + "</XML>")
out_tree = et.ElementTree(work_root)

In [2]:
#import pprint as pp

out_tree.write('output.xml', pretty_print=True)

In [None]:
# we need to fill in some elements that are missing at the moment: 
# TODO: Add head elements where we left them away for easier tagging
# TODO: We may need to reorder tags when head and an element that covers the same range are misordered (one head will be missing anyways in that case)

In [3]:
# now collect all NAM entities

#nes = work_root.findall(".//Span")
nes = work_root.xpath(".//Span[contains(@label,'NAM')]")

for ne in nes:
    print(et.tostring(ne))
    print(''.join(ne.itertext()))
    # only collect the head
    head_node = ne.find("./Span[@label='head']")
    print(head_node.text)

alias = work_root.xpath(".//Span[contains(@label,'att.alias')]")
for ne in alias:
    print(et.tostring(ne))
    print(''.join(ne.itertext()))
    # only collect the head
    head_node = ne.find("./Span[@label='head']")
    print(head_node.text)

b'<Span label="NAM.PER"><Span label="head">Grede von Arx</Span> <Span label="att.rel"><Span label="NAM.PER"><Span label="head">Elscmen\nvon Arx</Span> <Span label="desc.dead">selige</Span></Span> <Span label="head">tohter</Span></Span> <Span label="att.rel">in ze ziten <Span label="head">efrow</Span> <Span label="NAM.PER"><Span label="head">Ulrich\nGeburensmt</Span> <Span label="att.occ">dez <Span label="head">zimermans</Span></Span> <Span label="desc.loc">ze <Span label="NAM.GPE.LOC">Nuwemburg</Span>\ngesessen</Span></Span></Span></Span> , '
Grede von Arx Elscmen
von Arx selige tohter in ze ziten efrow Ulrich
Geburensmt dez zimermans ze Nuwemburg
gesessen
Grede von Arx
b'<Span label="NAM.PER"><Span label="head">Elscmen\nvon Arx</Span> <Span label="desc.dead">selige</Span></Span> '
Elscmen
von Arx selige
Elscmen
von Arx
b'<Span label="NAM.PER"><Span label="head">Ulrich\nGeburensmt</Span> <Span label="att.occ">dez <Span label="head">zimermans</Span></Span> <Span label="desc.loc">ze <Spa

AttributeError: 'NoneType' object has no attribute 'text'