In [7]:
import xml
import xml.etree.ElementTree as ET
import os
from os import path
import glob
from collections import defaultdict, OrderedDict

In [94]:
# Corpus root; the documents are the *.xml files exactly one directory level below it.
data_dir = "/home/shtoshni/Research/events/data/ECB+_LREC2014/ECB+"
# glob.glob() yields paths in arbitrary (filesystem-dependent) order. Sorting makes
# the processing order -- and therefore the generated index -- reproducible.
source_files = sorted(glob.glob("{}/*/*.xml".format(data_dir)))

In [95]:
# Destination directory for the generated HTML files.
output_dir = "/home/shtoshni/Research/events/data/ECB+_LREC2014/ECB+_html"


# Opening boilerplate placed at the start of every generated page.
HTML_START = (
    '<!DOCTYPE html><html lang="en">'
    '<head><meta charset="UTF-8"></head>'
    '<body>'
)

# Inline bordered box used to highlight a mention; only the border colour varies.
_BOX_TEMPLATE = (
    '<div style="border:2px; display : inline; border-style: solid; '
    'border-color: {}; padding:10px; padding-right: 3px; padding-left: 3px">'
)

# human_tag is spelled out separately because its whitespace differs slightly from
# the template ("display:inline", "padding: 10px") and the string is kept verbatim.
human_tag = (
    '<div style="border:2px; display:inline; border-style: solid; '
    'border-color: red; padding: 10px; padding-right: 3px; padding-left: 3px">'
)
nonhuman_tag = _BOX_TEMPLATE.format('orange')
action_tag = _BOX_TEMPLATE.format('blue')
nonaction_tag = _BOX_TEMPLATE.format('turquoise')
time_tag = _BOX_TEMPLATE.format('green')
loc_tag = _BOX_TEMPLATE.format('#ADFF2F')
# duplicate_tag = '<div style="border:2px; display : inline; border-style: solid; border-color: brown; padding:10px; padding-right: 3px; padding-left: 3px">'

# Closes any of the boxes opened above.
end_tag = '</div>'


In [123]:
def return_html(source_file):
    """Render one annotated XML document as a complete HTML page.

    Every markable that has at least one ``token_anchor`` is wrapped in a
    coloured inline box chosen by the prefix of its element tag (HUMAN,
    ACTION, LOC, TIME, NON, NEG). Markables that belong to a coreference
    relation additionally get their per-document cluster number as a
    subscript just before the box closes.

    Side effects: increments the module-level counters ``num_time``,
    ``num_loc``, ``num_action``, ``num_human``, ``num_non_human``,
    ``within_doc_coref`` and ``total_within_doc_participants``, and records
    each CROSS_DOC_COREF ``note`` value in ``unique_coref_ids``. All of them
    must already exist when this function is called.

    Args:
        source_file: path (or file object) accepted by ``ET.parse``.

    Returns:
        str: the HTML page, starting with ``HTML_START`` and ending with the
        matching closing ``</div></body></html>``.

    Raises:
        AssertionError: if the document does not contain exactly one
            ``Markables`` element and exactly one ``Relations`` element.
        KeyError: if an expected XML attribute (``t_id``, ``sentence``,
            ``m_id``, ``note``) is missing.
    """
    global within_doc_coref
    global num_time
    global num_loc
    global num_action
    global num_human
    global num_non_human
    global total_within_doc_participants
    global unique_coref_ids

    # Local import keeps this cell self-contained.
    from html import escape

    root = ET.parse(source_file).getroot()

    # Token id -> (surface string, sentence id), kept in document order.
    token_id_to_str = OrderedDict()
    for elem in root.iter('token'):
        # An empty <token/> has text None; use '' so rendering cannot crash.
        token_id_to_str[int(elem.attrib['t_id'])] = (
            elem.text or '', int(elem.attrib['sentence']))

    # Markables mark the boundaries of events, entities, times and locations.
    markable_elems = list(root.iter('Markables'))
    assert(len(markable_elems) == 1)

    class_to_html_tag = {
        'HUMAN': human_tag,
        'ACTION': action_tag,
        'LOC': loc_tag,
        'TIME': time_tag,
        'NON': nonhuman_tag,
        'NEG': nonaction_tag,
    }

    # Token id -> list of (html, 'start' | 'end', m_id) to emit around that token.
    html_tag_list = defaultdict(list)
    for sub_elem in markable_elems[0]:
        m_id = sub_elem.attrib['m_id']
        broad_tag_class = sub_elem.tag.split('_')[0]
        if broad_tag_class == 'UNKNOWN':
            continue
        if broad_tag_class not in class_to_html_tag:
            # Report and skip: falling through would reuse the previous
            # markable's colour (or raise NameError on the first markable).
            print("Sweet Glory: {}".format(broad_tag_class))
            continue
        html_tag = class_to_html_tag[broad_tag_class]

        t_id_list = [int(token_elem.attrib['t_id'])
                     for token_elem in sub_elem.iter('token_anchor')]
        if not t_id_list:
            # No anchors means there is nothing to highlight in this document.
            continue

        # The span runs from the first listed anchor to the last listed one.
        html_tag_list[t_id_list[0]].append((html_tag, 'start', m_id))
        html_tag_list[t_id_list[-1]].append((end_tag, 'end', m_id))

        if broad_tag_class == 'TIME':
            num_time += 1
        elif broad_tag_class == 'LOC':
            num_loc += 1
        elif broad_tag_class in ('ACTION', 'NEG'):
            # Negated actions are counted together with regular actions.
            num_action += 1
        elif broad_tag_class == 'NON':
            num_non_human += 1
        elif broad_tag_class == 'HUMAN':
            num_human += 1

    # Clustering of the markables.
    relation_elems = list(root.iter('Relations'))
    assert(len(relation_elems) == 1)

    cluster_idx = 1
    m_id_to_cluster_id = {}
    for coref_elem in relation_elems[0]:
        if coref_elem.tag == 'CROSS_DOC_COREF':
            unique_coref_ids[coref_elem.attrib['note']] += 1

            num_source_elems = 0
            for source_elem in coref_elem.iter('source'):
                m_id_to_cluster_id[source_elem.attrib['m_id']] = cluster_idx
                num_source_elems += 1

            cluster_idx += 1
            # A cross-document chain counts as within-document coreference only
            # when more than one of its mentions is in this document.
            within_doc_coref += (1 if num_source_elems > 1 else 0)
            total_within_doc_participants += num_source_elems

        elif coref_elem.tag == 'INTRA_DOC_COREF':
            # NOTE(review): this walks *every* direct child, whereas the
            # CROSS_DOC_COREF branch only walks 'source' descendants -- confirm
            # that counting non-'source' children here is intended.
            num_source_elems = 0
            for source_elem in coref_elem:
                m_id_to_cluster_id[source_elem.attrib['m_id']] = cluster_idx
                num_source_elems += 1

            cluster_idx += 1
            within_doc_coref += 1
            total_within_doc_participants += num_source_elems

        else:
            print(coref_elem.tag)

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [HTML_START, '<div style="line-height: 3">']
    prev_sent_id = 0
    for idx, (token, sent_id) in token_id_to_str.items():
        if prev_sent_id != sent_id:
            parts.append("<br/>\n")
        prev_sent_id = sent_id

        # .get() avoids inserting empty entries into the defaultdict.
        tags = html_tag_list.get(idx, ())

        # Boxes that open on this token go before its text ...
        parts.extend(tag for tag, tag_type, _ in tags if tag_type == 'start')

        # Escape &, < and > so token text cannot be parsed as markup.
        parts.append(escape(token, quote=False) + " ")

        # ... and boxes that close on it go after, preceded by the cluster id.
        for tag, tag_type, m_id in tags:
            if tag_type != 'end':
                continue
            if m_id in m_id_to_cluster_id:
                parts.append("<sub>" + str(m_id_to_cluster_id[m_id]) + "</sub>")
            parts.append(tag)

    # Close the line-height wrapper and the elements opened by HTML_START.
    parts.append('</div></body></html>')
    return ''.join(parts)

In [124]:
# Corpus-wide statistics. return_html() updates these through `global`, so they
# are reset here to make re-running this cell start from zero.
within_doc_coref = 0
total_within_doc_participants = 0

num_loc = 0
num_time = 0
num_action = 0
num_human = 0
num_non_human = 0
unique_coref_ids = defaultdict(int)

# open() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

index_html = HTML_START + '<ol type="1">'

for source_file in source_files:
    base_file = path.basename(source_file)
    # Swap only the extension; str.replace("xml", "html") would also rewrite any
    # "xml" that happens to occur inside the file name itself.
    output_file = path.splitext(base_file)[0] + ".html"

    # Attributes are separated by whitespace only (no comma between href and target).
    index_html += '<li> <a href="{}" target="_blank">'.format(output_file) + base_file.split(".")[0] + '</a></li>\n'

    doc_html = return_html(source_file)
    # The page declares charset UTF-8, so write it as UTF-8 regardless of locale.
    with open(path.join(output_dir, output_file), "w", encoding="utf-8") as f:
        f.write(doc_html)


print(within_doc_coref)
print("Unique cross coref chains: {}".format(len(unique_coref_ids)))
print("Total within doc coref participants: %d" %total_within_doc_participants)
print('Num loc: {}'.format(num_loc))
print('Num time: {}'.format(num_time))
print('Num action: {}'.format(num_action))
print('Num human: {}'.format(num_human))
print('Num non-human: {}'.format(num_non_human))


index_html += '</ol>\n</body>\n</html>'
with open(path.join(output_dir, "index.html"), "w", encoding="utf-8") as g:
    g.write(index_html)
    

3745
Unique cross coref chains: 2312
Total within doc coref participants: 18450
Num loc: 2205
Num time: 2412
Num action: 15003
Num human: 9621
Num non-human: 3056
