In [1]:
import glob
import html
import os
import xml
import xml.etree.ElementTree as ET
from collections import defaultdict
from os import path

from utils import get_ent_info, get_all_clusters_from_xml

In [2]:
# Root directory of the raw source documents; files are matched two levels
# down by the "<root>/*/*" glob pattern below.
data_dir = "/home/shtoshni/Research/events/data/red/data/source"
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order and the generated index reproducible across runs.
source_files = sorted(glob.glob("{}/*/*".format(data_dir)))

# Root directory of the annotation files, globbed with the same two-level
# pattern as the source documents.
ann_dir = "/home/shtoshni/Research/events/data/red/data/simp_corr_annotation"
ann_files = sorted(glob.glob("{}/*/*".format(ann_dir)))

In [3]:
# Which relation types are included in the rendered clusters:
#   0 -> just identical coref chains
#   1 -> additionally appositive and bridging
#   2 -> additionally set/member and whole/part
LEVEL = 0

# One output directory per supported LEVEL, so renderings produced at
# different levels never overwrite each other.
_OUTPUT_DIR_BY_LEVEL = {
    0: "/home/shtoshni/Research/events/data/red/coref_red_html",
    1: "/home/shtoshni/Research/events/data/red/ap_br_id_red_html",
    2: "/home/shtoshni/Research/events/data/red/se_wh_ap_br_id_red_html",
}

# Fail right here with a clear message instead of leaving `output_dir`
# undefined (which would only surface later as a NameError).
if LEVEL not in _OUTPUT_DIR_BY_LEVEL:
    raise ValueError(
        "Unsupported LEVEL {!r}; expected one of {}".format(
            LEVEL, sorted(_OUTPUT_DIR_BY_LEVEL)))

output_dir = _OUTPUT_DIR_BY_LEVEL[LEVEL]

# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(output_dir, exist_ok=True)
    
# Fixed preamble that opens every generated HTML page.
HTML_START = (
    '<!DOCTYPE html><html lang="en">'
    '<head><meta charset="UTF-8"></head><body>'
)

# Inline bordered box wrapped around a mention; "{}" receives the border colour.
start_tag_template = (
    '<div style="border:2px; display:inline; border-style: solid; '
    'border-color: {}; padding: 10px; padding-right: 3px; padding-left: 3px">'
)

# Entities and events are told apart by their border colour.
entity_tag, event_tag = (
    start_tag_template.format(colour) for colour in ('#0066CC', 'violet')
)

# Closing counterpart shared by both kinds of opening tag.
end_tag = '</div>'

# Short labels for the relation types.
type_to_start_tag = {
    'IDENTICAL': 'ID',
    'APPOSITIVE': 'APPOS',
    'BRIDGING': 'BRIDG',
    'SET/MEMBER': 'S/M',
    'WHOLE/PART': 'W/P',
}

In [4]:
def _collect_boundary_tags(ent_map, mention_to_cluster_info):
    """Group the opening/closing HTML tags by the character offset they attach to.

    Args:
        ent_map: mapping from mention id to ``(ent_type, (span_start, span_end))``.
        mention_to_cluster_info: mapping from mention id to an iterable of
            ``(cluster_type, cluster_idx)`` pairs.

    Returns:
        dict mapping a character offset to ``{'start': [...], 'end': [...]}``,
        where each list holds ``(mention, html_tag, subscript)`` triples. The
        subscript is only filled in for 'end' entries, because it is printed
        right before the closing tag.
    """
    tag_by_type = {'ENTITY': entity_tag, 'EVENT': event_tag}
    # Neutral colour for any other type, so such a mention neither crashes the
    # run nor silently inherits the tag of the previously processed mention.
    fallback_tag = start_tag_template.format('gray')

    boundary_tags = {}
    for mention, cluster_info in mention_to_cluster_info.items():
        ent_type, (span_start, span_end) = ent_map[mention]
        start_tag = tag_by_type.get(ent_type, fallback_tag)

        # e.g. "IDENTICAL 3, BRIDGING 7"
        subscript = ", ".join(
            "{} {}".format(cluster_type, cluster_idx)
            for cluster_type, cluster_idx in cluster_info)

        boundary_tags.setdefault(span_start, defaultdict(list))['start'].append(
            (mention, start_tag, ''))
        boundary_tags.setdefault(span_end, defaultdict(list))['end'].append(
            (mention, end_tag, subscript))
    return boundary_tags


def _render_document_html(source_str, boundary_tags):
    """Render ``source_str`` as an HTML page with every mention boxed.

    The raw text is shown verbatim: each character is HTML-escaped on its own
    (so ``<``, ``>`` and ``&`` display literally and no other character is
    altered) and every newline is followed by ``<br/>``.

    Args:
        source_str: the raw document text; the offsets in ``boundary_tags``
            index into this string.
        boundary_tags: output of ``_collect_boundary_tags``.

    Returns:
        The complete HTML document as a single string.

    Raises:
        ValueError: if a mention's end offset is reached before the mention was
            opened (e.g. an empty span) -- assumes span_start < span_end.
    """
    pieces = [HTML_START, '<div style="line-height: 3">']

    # Acts like a stack: mentions are pushed when their start tag is emitted
    # and removed from most recent to least recent when they end.
    open_mentions = []

    text_len = len(source_str)
    # Go one position past the last character so that a mention whose
    # (exclusive) end offset equals the document length still gets closed.
    for idx in range(text_len + 1):
        tags_here = boundary_tags.get(idx)
        if tags_here:
            # Close before opening, so a mention ending at this offset does not
            # wrap a mention that only starts here.
            closing = sorted(
                ((open_mentions.index(mention), close_tag, subscript)
                 for mention, close_tag, subscript in tags_here.get('end', ())),
                key=lambda item: item[0],
                reverse=True)  # most recently opened first
            for stack_pos, close_tag, subscript in closing:
                pieces.append("<sub>" + html.escape(subscript, quote=False) + "</sub>")
                pieces.append(close_tag)
                # Highest positions are deleted first, so lower ones stay valid.
                del open_mentions[stack_pos]

            for mention, open_tag, _ in tags_here.get('start', ()):
                open_mentions.append(mention)
                pieces.append(open_tag)

        if idx < text_len:
            char = source_str[idx]
            pieces.append("\n<br/>" if char == "\n" else html.escape(char, quote=False))

    pieces.append("</div></body></html>")
    return "".join(pieces)


# Render one HTML page per source document.
for source_file in source_files:
    # Read the source doc (the context manager closes the handle promptly).
    with open(source_file) as src:
        source_str = src.read()

    # The annotation file lives under the same sub-directory name in ann_dir.
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))
    ann_file = path.join(ann_dir, dir_name, base_name + ".RED-Relation.gold.completed.xml")

    tree = ET.parse(ann_file)
    root = tree.getroot()

    # Get info from the XML file
    ent_map, ent_list = get_ent_info(root)
    mention_to_cluster_info = get_all_clusters_from_xml(root, ent_map, LEVEL=LEVEL)

    html_tag_list = _collect_boundary_tags(ent_map, mention_to_cluster_info)
    html_string = _render_document_html(source_str, html_tag_list)

    # The page declares charset UTF-8, so write it as UTF-8 regardless of locale.
    with open(path.join(output_dir, base_name + ".html"), "w", encoding="utf-8") as f:
        f.write(html_string)

In [5]:
# Build index.html: a numbered list linking to every rendered document page.
index_items = []
# Sort by file name so the index order does not depend on glob's listing order.
for source_file in sorted(source_files, key=path.basename):
    base_file = path.basename(source_file)
    output_file = base_file + ".html"

    # Attributes are separated by whitespace only (no comma), and the file name
    # is escaped so special characters cannot break the markup.
    index_items.append(
        '<li> <a href="{}" target="_blank">{}</a></li>\n'.format(
            html.escape(output_file, quote=True), html.escape(base_file)))

index_html = HTML_START + '<ol type="1">' + "".join(index_items) + '</ol>\n</body>\n</html>'

# HTML_START declares charset UTF-8, so write the file as UTF-8 explicitly.
with open(path.join(output_dir, "index.html"), "w", encoding="utf-8") as g:
    g.write(index_html)