In [81]:
import os
from os import path
import json
import sys
from collections import defaultdict

sys.path.append("../src/")

from red_utils.constants import IDX_TO_ELEM_TYPE

## Set input and output directories

In [82]:
# Input/output locations.
# NOTE: despite its name, `input_file` is a *directory*; the per-split file
# names are built from it with `suffix` further down. The name is kept because
# later cells refer to it.
#
# Alternative (non-truecased) locations, kept for quick switching:
# input_file = "/home/shtoshni/Research/events/proc_data/red/independent/"
# output_dir = "/home/shtoshni/Research/events/data/red/bert_html"
input_file = "/home/shtoshni/Research/events/proc_data/red/independent_truecase/"
output_dir = "/home/shtoshni/Research/events/data/red/bert_html_truecase"

# File-name template of a split inside `input_file`; `{}` receives the split name.
suffix = "{}.512.jsonlines"

# Splits to process, in this order.
splits = ["train", "dev", "test"]

# `exist_ok=True` makes the cell idempotent on re-run and avoids the
# check-then-create race of `if not path.exists(...): os.makedirs(...)`.
os.makedirs(output_dir, exist_ok=True)

## HTML Setup

In [83]:
# Opening markup shared by every generated page (closed by the cells that use it).
HTML_START = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'

# Inline bordered box drawn around a mention; `{}` receives the border colour.
start_tag_template = (
    '<div style="border:2px; display:inline; border-style: solid; '
    'border-color: {}; padding: 10px; padding-right: 3px; padding-left: 3px">'
)

# One opening tag per element type: blue boxes for entities, violet boxes for events.
entity_tag, event_tag = map(start_tag_template.format, ('#0066CC', 'violet'))

# Closing tag matching either opening tag.
end_tag = '</div>'


In [84]:
def _build_document_html(instance):
    """Render one parsed jsonlines record as a standalone HTML page.

    Args:
        instance: dict with
            * "sentences": list of token lists (segments); they are flattened
              into a single token sequence,
            * "clusters": list of clusters, each a list of mentions
              ``(span_start, span_end, type_idx)`` where both offsets index the
              flattened token sequence, ``span_end`` is inclusive and
              ``type_idx`` is looked up in ``IDX_TO_ELEM_TYPE``.

    Returns:
        The HTML string. Every mention is wrapped in a coloured box and
        followed by a subscript made of the first three letters of its element
        type and its cluster index.

    Note:
        Boxes are nested ``<div>`` elements, so only properly nested mentions
        can be labelled exactly; crossing (partially overlapping) mentions
        cannot be represented faithfully by this markup.
    """
    # Flatten the segments and remember the token index at which each one ends
    # (i.e. the index of the first token of the following segment).
    doc_list = []
    bert_seg_idx = set()
    for sentence in instance["sentences"]:
        doc_list.extend(sentence)
        bert_seg_idx.add(len(doc_list))

    tag_for_type = {'ENTITY': entity_tag, 'EVENT': event_tag}
    # Any other element type gets a neutral box instead of silently reusing
    # the colour of whichever mention happened to be processed before it.
    fallback_tag = start_tag_template.format('gray')

    # token index -> markup to emit right before that token
    opening = defaultdict(list)  # entries: (span_end, cluster_idx, start_tag)
    closing = defaultdict(list)  # entries: (span_start, cluster_idx, subscript)

    for cluster_idx, cluster in enumerate(instance["clusters"]):
        for span_start, span_end, ent_type in cluster:
            span_end = span_end + 1  # now exclusive: first token after the mention
            ent_type = IDX_TO_ELEM_TYPE[ent_type]
            start_tag = tag_for_type.get(ent_type, fallback_tag)
            subscript = ent_type[:3] + " " + str(cluster_idx)

            opening[span_start].append((span_end, cluster_idx, start_tag))
            closing[span_end].append((span_start, cluster_idx, subscript))

    def closing_markup(position):
        # Stack discipline: the mention opened last (largest start, then
        # largest cluster index) is the innermost box and must be closed
        # first, so that each subscript lands inside its own box.
        ordered = sorted(
            closing.get(position, ()),
            key=lambda item: (item[0], item[1]),
            reverse=True,
        )
        return ["<sub>" + subscript + "</sub>" + end_tag for _, _, subscript in ordered]

    def opening_markup(position):
        # Mirror image of `closing_markup`: the longest mention opens first
        # (outermost box); ties are opened in cluster order.
        ordered = sorted(
            opening.get(position, ()),
            key=lambda item: (-item[0], item[1]),
        )
        return [start_tag for _, _, start_tag in ordered]

    pieces = [HTML_START + '<div style="line-height: 3">']
    for token_idx, token in enumerate(doc_list):
        # Close the mentions that ended on the previous token *before* the
        # segment break, so their labels stay on the line they belong to.
        pieces.extend(closing_markup(token_idx))
        if token_idx in bert_seg_idx:
            pieces.append("\n<br/>")
        pieces.extend(opening_markup(token_idx))
        pieces.append(" " + token)

    # Mentions ending on the very last token have an exclusive end equal to
    # len(doc_list), which the loop above never visits.
    pieces.extend(closing_markup(len(doc_list)))
    pieces.append("</div></body></html>")

    html_string = "".join(pieces)
    # Every newline becomes a visible line break. This also applies to the
    # "\n<br/>" emitted at segment boundaries, which therefore render as a
    # double break (unchanged from the previous output).
    html_string = html_string.replace("\n", "\n<br/>")
    # Presumably "~" and "^" are placeholders for "<" and ">" introduced by the
    # upstream preprocessing -- TODO confirm against the data pipeline.
    html_string = html_string.replace("~", "&lt;")
    html_string = html_string.replace("^", "&gt;")
    return html_string


# Paths of all generated pages, in processing order (used to build the index).
html_files = []

for split in splits:
    print(f"Processing {split.capitalize()}")
    # One JSON document per line
    split_file = path.join(input_file, suffix.format(split))

    with open(split_file, encoding="utf-8") as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            instance = json.loads(line)

            html_string = _build_document_html(instance)

            # "/" in the doc key would create sub-directories, hence the replacement
            base_name = f"({split}) " + instance["doc_key"].replace("/", "-")
            file_name = path.join(output_dir, base_name + ".html")
            html_files.append(file_name)
            # The page declares charset UTF-8, so write it as UTF-8 regardless
            # of the platform's default encoding.
            with open(file_name, "w", encoding="utf-8") as writer:
                writer.write(html_string)

Processing Train
Processing Dev
Processing Test


In [85]:
# Build index.html: a numbered list linking to every generated page.
index_file_path = path.join(output_dir, "index.html")

index_entries = []
for html_file in html_files:
    # The pages live in the same directory as index.html, so a relative link
    # keeps the whole output directory movable / servable as-is.
    link_target = path.basename(html_file)
    # Display name: undo the "/" -> "-" replacement applied to the doc key when
    # the page was named. Hyphens that were part of the original doc key are
    # turned into slashes as well; this only affects the displayed text.
    base_name = path.splitext(link_target)[0]
    base_name = base_name.replace("-", "/")
    # Attributes inside an HTML tag are separated by whitespace, not commas.
    index_entries.append(
        '<li> <a href="{}" target="_blank">'.format(link_target) + base_name + '</a></li>\n'
    )

index_html = HTML_START + '<ol type="1">' + "".join(index_entries) + '</ol>\n</body>\n</html>'

print(index_file_path)
# The page declares charset UTF-8, so write it as UTF-8 explicitly.
with open(index_file_path, "w", encoding="utf-8") as g:
    g.write(index_html)

/home/shtoshni/Research/events/data/red/bert_html_truecase/index.html
