In [236]:
import xml
import xml.etree.ElementTree as ET
import os
from os import path
import glob
from collections import defaultdict, OrderedDict
import spacy
import json

In [154]:
spacy_nlp = spacy.load('en_core_web_lg')

In [199]:
def get_ent_info(xml_root):
    ent_map = {}
    ent_list = []

    for elem in xml_root.iter('entity'):
        span_str = list(elem.iter('span'))[0].text
        span_start, span_end = [int(endpoint) for endpoint in span_str.split(",")]
        elem_id = list(elem.iter('id'))[0].text
        elem_type = list(elem.iter('type'))[0].text

        if elem_type == 'ENTITY' or elem_type == 'EVENT':
            ent_map[elem_id] = (elem_type, (span_start, span_end))
            ent_list.append([(span_start, span_end), elem_id])
            
    # Sort entity list on the basis of span start index
    ent_list = sorted(ent_list, key=lambda x: x[0][0])  
    
    return ent_map, ent_list

In [200]:
def get_clusters_from_xml(xml_root, ent_map):
    clusters = []
    for elem in xml_root.iter('relation'):
        elem_type = list(elem.iter('type'))[0].text
        if elem_type == 'IDENTICAL':
            # Initiate new cluster
            new_cluster = []
            elem_props = list(elem.iter('properties'))[0]

            for sub_elem in elem_props.iter():
                if sub_elem.tag == 'FirstInstance' or sub_elem.tag == 'Coreferring_String':
                    ent_id = sub_elem.text
                    assert(ent_id in ent_map)
                    new_cluster.append(ent_id)

            clusters.append(new_cluster)
            
    return clusters

In [201]:
def tokenize_string(string):
    string = string.strip()
    if string == "":
        return []
    else:
        doc = spacy_nlp(string)
        tokenized_sent = [token.text for token in doc if token.text.strip() != '']
        return tokenized_sent

In [190]:
def tokenize_doc(doc_str, ent_list):
    """Tokenizes a document given in string format.
    doc_str: Document string
    ent_list: List of entities with each entry being ((span_start, span_end), ent_id) 
        where spans are provided in the character space.
    
    Returns:
    tokenized_doc: List of tokens
    ent_id_to_token_spans: Entity ID to span indices in token space.
    """
    tokenized_doc = []
    token_counter = 0  
    char_offset = 0  # Till what point has the document been processed
    ent_id_to_token_spans = OrderedDict()

    for (span_start, span_end), ent_id in ent_list:
        # Tokenize the string before the span and after the last span
        before_span_str = source_str[char_offset: span_start]
        before_span_tokens = tokenize_string(before_span_str)
        tokenized_doc.extend(before_span_tokens)
        token_counter += len(before_span_tokens)

        # Tokenize the span
        span_tokens = tokenize_string(source_str[span_start: span_end])
        ent_id_to_token_spans[ent_id] = (token_counter, token_counter + len(span_tokens))
        tokenized_doc.extend(span_tokens)
        char_offset = span_end
        token_counter += len(span_tokens)

    # Add the tokens after the last span
    rem_doc = doc_str[char_offset:]
    rem_tokens = tokenize_string(rem_doc)
    token_counter += len(rem_tokens)

    tokenized_doc.extend(rem_tokens)
    
    return tokenized_doc, ent_id_to_token_spans

In [202]:
def get_cluster_spans(clusters_ent_id, ent_id_to_token_spans):
    clusters = []
    for cluster in clusters_ent_id:
        cluster_spans = []
        for ent_id in cluster:
            cluster_spans.append(ent_id_to_token_spans[ent_id])
        clusters.append(cluster_spans)
        
    return clusters

In [203]:
def get_dummy_speaker(tokenized_sents):
    speakers = []
    for sent in tokenized_sents:
        speakers.append(["spk1"] * len(sent))
    return speakers

In [251]:
source_file = "/home/shtoshni/Research/events/data/red/data/source/deft/NYT_ENG_20130426.0143"
source_str  = "".join(open(source_file).readlines())
# source_str  = source_str.replace("\n", " ")

base_name = path.basename(source_file)
dir_name = path.basename(path.dirname(source_file))

ann_file = path.join(path.join(ann_dir, dir_name), base_name + ".RED-Relation.gold.completed.xml")
tree = ET.parse(ann_file)
root = tree.getroot()

ent_map, ent_list = get_ent_info(root)
clusters_ent_id = get_clusters_from_xml(root, ent_map)

tokenized_doc, ent_id_to_token_spans = tokenize_doc(source_str, ent_list)
tokenized_str = " ".join(tokenized_doc)

# TODO (ReTokenize the sentence using spacy)
# Match the # of before and after tokens

cluster_spans = get_cluster_spans(clusters_ent_id, ent_id_to_token_spans)

doc_info = {}
doc_info["doc_key"] = base_name
doc_info["sentences"] = tokenized_sents
doc_info["clusters"] = cluster_spans
doc_info["speakers"] = get_dummy_speaker(tokenized_sents)

# print(doc_info)

In [252]:
# print(len(tokenized_sents))
tokenized_str = tokenized_str.strip()
print(tokenized_str)

< DOC id="NYT_ENG_20130426.0143 " type="story " from_file="/newswire / daily_process / nyt / english / source_data/20130426/20130426,90f99e6a52944168fef63819315c0b2e.xml " > < HEADLINE > GEORGE JONES IN REAL LIFE AND REAL TIME < /HEADLINE > < TEXT > < P > Early in 1977 , a couple of years after he and Tammy Wynette had divorced , the country music star George Jones played a show at the Stardust Inn in Waldorf , Md. , his first in the Washington area in several years . I was then a young pop music critic at The Washington Post , so it fell to me to write a profile of the notoriously hard- drinking singer . < /P > < P > What followed was an experience so intense , so remarkable for its raw emotional force , that I could not help but think of it Friday , when it was announced that Jones had died in a Nashville , Tenn. , hospital at the age of 81 . That weekend I learned two important things about Jones : that there was very little distance between him and his songs , and that the same qua

In [250]:
retokenized_str = ""
doc = spacy_nlp(tokenized_str)
for sent in doc.sents:
    retokenized_str += sent.text + " "

retokenized_str = retokenized_str.strip()
print(retokenized_str)
retokenized_tokens = retokenized_str.split()
# print(retokenized_tokens)
print(len(retokenized_tokens))

< DOC id="NYT_ENG_20130426.0143 " type="story " from_file="/newswire / daily_process / nyt / english / source_data/20130426/20130426,90f99e6a52944168fef63819315c0b2e.xml " > < HEADLINE > GEORGE JONES IN REAL LIFE AND REAL TIME < /HEADLINE > < TEXT > < P > Early in 1977 , a couple of years after he and Tammy Wynette had divorced , the country music star George Jones played a show at the Stardust Inn in Waldorf , Md. , his first in the Washington area in several years . I was then a young pop music critic at The Washington Post , so it fell to me to write a profile of the notoriously hard- drinking singer . < /P > < P > What followed was an experience so intense , so remarkable for its raw emotional force , that I could not help but think of it Friday , when it was announced that Jones had died in a Nashville , Tenn. , hospital at the age of 81 . That weekend I learned two important things about Jones : that there was very little distance between him and his songs , and that the same qua

In [None]:
def load_splits_file(list_file):
    return set([file_name.strip() for file_name in open(list_file).readlines()])

In [241]:
data_dir = "/home/shtoshni/Research/events/data/red/data/source"
source_files = glob.glob("{}/*/*".format(data_dir))

ann_dir = "/home/shtoshni/Research/events/data/red/data/annotation"
ann_files = glob.glob("{}/*/*".format(ann_dir))

# Load the file splits
dev_list_file = "/home/shtoshni/Research/events/data/red/docs/dev.txt"
dev_set = load_splits_file(dev_list_file)

test_list_file = "/home/shtoshni/Research/events/data/red/docs/test.txt"
test_set = load_splits_file(test_list_file)

# Output directory
output_dir = "/home/shtoshni/Research/events/data/red/split"

In [245]:
print(len(source_files[-2]))

85


In [256]:
train_data = []
dev_data = []
test_data = []

for source_file in source_files[:2]:
    # Read the source doc
    source_lines = open(source_file).readlines()
    doc_str = "".join(source_lines)
    
    # Read the annotation file
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))
    red_file_name = path.join(dir_name, base_name)
    print(red_file_name)
    
    ann_file = path.join(path.join(ann_dir, dir_name), base_name + ".RED-Relation.gold.completed.xml")    
    tree = ET.parse(ann_file)
    root = tree.getroot()
    
    # Get entity and cluster information from the annotation file
    ent_map, ent_list = get_ent_info(root)
    clusters_ent_id = get_clusters_from_xml(root, ent_map)
    
    # Tokenize the doc
    tokenized_doc, ent_id_to_token_spans = tokenize_doc(doc_str, ent_list)
    
    # Break the document into sentences.
    tokenized_sents = []
    tokenized_doc_str = " ".join(tokenized_doc)
    reproc_doc = spacy_nlp(tokenized_doc_str)
    for sent in reproc_doc.sents:
        sent_text = sent.text
#         tokenized_sents.append(sent_text.split())
        tokenized_sents.append(sent_text)
    cluster_spans = get_cluster_spans(clusters_ent_id, ent_id_to_token_spans)
    
#     try:
#         # Check the retokenized doc is same as tokenized doc
#         retokenized_doc = []
#         for sent in tokenized_sents:
#             retokenized_doc.extend(sent)
#         assert(tokenized_doc == retokenized_doc)
#     except AssertionError:
#         print(len(tokenized_doc))
#         print(len(retokenized_doc))
#         break

    doc_info = {}
    doc_info["doc_key"] = red_file_name
    doc_info["sentences"] = tokenized_sents
    doc_info["clusters"] = cluster_spans
    doc_info["speakers"] = get_dummy_speaker(tokenized_sents)
    
    file_name = path.join(dir_name, base_name)
    if red_file_name in dev_set:
        dev_data.append(doc_info)
    elif red_file_name in test_set:
        test_data.append(doc_info)
    else:
        train_data.append(doc_info)

deft/NYT_ENG_20130426.0143
1394
6451


In [257]:
for split, data in zip(['train', 'dev', 'test'], [train_data, dev_data, test_data]):
    with open(path.join(output_dir, "{}.english.jsonlines".format(split)), 'w') as f:
        for instance in data:
            f.write(json.dumps(instance) + "\n")