In [1]:
import xml
import xml.etree.ElementTree as ET
import os
from os import path
import glob
from collections import defaultdict, Counter
import numpy as np

from utils import get_ent_info, get_clusters_from_xml

In [2]:
# Source documents: every entry two levels below data_dir is globbed.
data_dir = "/home/shtoshni/Research/events/data/red/data/source"
source_files = glob.glob(data_dir + "/*/*")

# Annotation files: globbed with the same two-level pattern under ann_dir.
ann_dir = "/home/shtoshni/Research/events/data/red/data/mod_annotation"
ann_files = glob.glob(ann_dir + "/*/*")

# Directory that receives the per-file issue logs; created if it is missing.
output_dir = "/home/shtoshni/Research/events/data/red/imp_error_logs"
if not path.exists(output_dir):
    os.makedirs(output_dir)

# Search For Problematic Spans Marked as Multiple Entities

In [3]:
# Maps "<dir_name>-<base_name>" of a source document to the list of issue
# tuples recorded for it. Later cells append to this same mapping.
files_with_issues = defaultdict(list)


for source_file in source_files:
    # Read the source doc; the context manager closes the handle instead of leaking it.
    with open(source_file) as source_f:
        source_str = source_f.read()

    # Locate and parse the annotation file that belongs to this source doc
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))
    ann_file = path.join(ann_dir, dir_name, base_name + ".RED-Relation.gold.completed.xml")
    root = ET.parse(ann_file).getroot()

    # span -> list of (elem_type, elem_id) pairs accepted for that span so far
    span_to_elem_id = {}
    for elem in root.iter('entity'):
        span_str = next(elem.iter('span')).text
        span_start, span_end = [int(endpoint) for endpoint in span_str.split(",")]
        elem_id = next(elem.iter('id')).text
        elem_type = next(elem.iter('type')).text

        # Only ENTITY and EVENT annotations take part in this check
        if elem_type not in ('ENTITY', 'EVENT'):
            continue

        span = (span_start, span_end)
        if span not in span_to_elem_id:
            span_to_elem_id[span] = [(elem_type, elem_id)]
            continue

        # The span was seen before: it is an issue when an earlier element on
        # the same span shares this element's type or its ID.
        same_something = False
        for prev_elem_type, prev_elem_id in span_to_elem_id[span]:
            if prev_elem_type == elem_type or prev_elem_id == elem_id:
                same_something = True
                # Clamp at 0: a negative start would index from the end of the
                # string and yield a wrong (usually empty) context window.
                context_start = max(0, span_start - 15)
                # Report the element that actually clashed. Indexing the list
                # with [0]/[1] raised IndexError when only one element had
                # been recorded for the span.
                files_with_issues[dir_name + "-" + base_name].append(
                    ('Multiple Entities', span, source_str[span_start: span_end],
                     source_str[context_start: span_end + 15].replace("\n", " "),
                     prev_elem_type, prev_elem_id,
                     elem_type, elem_id))
        if not same_something:
            span_to_elem_id[span].append((elem_type, elem_id))


# Files with the most issues first (sorting on the issue lists themselves would
# order files by the content of their first issue instead).
file_issues_list = sorted(files_with_issues.items(), key=lambda item: len(item[1]), reverse=True)
print("Total files with issues: {}\n".format(len(files_with_issues)))

Total files with issues: 0



# Search For Elements That Are Part of Multiple Chains

In [4]:
for source_file in source_files:
    # Read the source doc; the context manager closes the handle instead of leaking it.
    with open(source_file) as source_f:
        source_str = source_f.read()

    # Locate and parse the annotation file that belongs to this source doc
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))
    ann_file = path.join(ann_dir, dir_name, base_name + ".RED-Relation.gold.completed.xml")
    root = ET.parse(ann_file).getroot()

    # First build the element ID to span mapping because relations refer to
    # their members by element ID
    elem_id_to_span = {}
    for elem in root.iter('entity'):
        span_str = next(elem.iter('span')).text
        span_start, span_end = [int(endpoint) for endpoint in span_str.split(",")]
        elem_id = next(elem.iter('id')).text
        elem_type = next(elem.iter('type')).text

        if elem_type not in ('ENTITY', 'EVENT'):
            continue

        span = (span_start, span_end)
        if elem_id not in elem_id_to_span:
            elem_id_to_span[elem_id] = span
        elif span != elem_id_to_span[elem_id]:
            # Same ID annotated on two different spans: show both. The mapping
            # built in this loop is used here; a variable left over from an
            # earlier cell would describe a different file.
            print(elem_id, elem_id_to_span[elem_id], span)

    # element ID -> ID of the first IDENTICAL relation that listed it
    ent_to_cluster = {}
    for elem in root.iter('relation'):
        if elem.find('type').text != 'IDENTICAL':
            continue

        cluster_id = elem.find('id').text
        for sub_elem in elem.find('properties'):
            ent_id = sub_elem.text
            # Raises KeyError if the relation names an ID that is not a known
            # ENTITY/EVENT element
            span = elem_id_to_span[ent_id]

            if ent_id in ent_to_cluster:
                span_start, span_end = span
                # Clamp at 0: a negative start would index from the end of the
                # string and yield a wrong (usually empty) context window.
                context_start = max(0, span_start - 15)
                files_with_issues[dir_name + "-" + base_name].append(
                    ('Multiple Clusters', source_str[span_start: span_end],
                     source_str[context_start: span_end + 15].replace("\n", " "), span, ent_id,
                     ent_to_cluster[ent_id], cluster_id))
            else:
                ent_to_cluster[ent_id] = cluster_id

# Search for chains that don't have elements of the same type

In [5]:
for source_file in source_files:
    # Only the annotation file is needed for this check; the source text is
    # never consulted, so it is not opened here.
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))
    ann_file = path.join(ann_dir, dir_name, base_name + ".RED-Relation.gold.completed.xml")
    root = ET.parse(ann_file).getroot()

    # First build the element ID to type mapping because relations refer to
    # their members by element ID
    elem_id_to_type = {}
    for elem in root.iter('entity'):
        elem_id = next(elem.iter('id')).text
        elem_type = next(elem.iter('type')).text
        if elem_type in ('ENTITY', 'EVENT'):
            elem_id_to_type[elem_id] = elem_type

    for elem in root.iter('relation'):
        if elem.find('type').text != 'IDENTICAL':
            continue

        cluster_id = elem.find('id').text
        # Distinct element types among the members of this relation; raises
        # KeyError if a member ID is not a known ENTITY/EVENT element
        cluster_type = {elem_id_to_type[sub_elem.text] for sub_elem in elem.find('properties')}

        if len(cluster_type) > 1:
            files_with_issues[dir_name + "-" + base_name].append(
                ('Inconsistent Type', cluster_id))

In [6]:
# Order files by how many issues they have, most first. Sorting on the issue
# lists themselves compares issue tuples field by field, which ranks files by
# the content of their first issue and can fail on fields that do not compare.
file_issues_list = sorted(files_with_issues.items(), key=lambda item: len(item[1]), reverse=True)
print("Total files with issues: {}\n".format(len(files_with_issues)))
print("Check {} for issues with each file".format(output_dir))

# One "<file_name>.txt" log per file: an issue count, then one line per issue
for file_name, issues in file_issues_list:
    with open(path.join(output_dir, file_name + ".txt"), 'w') as f:
        f.write("Number of issues: {}\n\n".format(len(issues)))
        for issue in issues:
            issue_str = [str(issue_attrib) for issue_attrib in issue]
            f.write("   ".join(issue_str) + "\n")

Total files with issues: 0

Check /home/shtoshni/Research/events/data/red/imp_error_logs for issues with each file


In [7]:
# Bucket the file names by the number of issues recorded for each file.
num_issues_to_num_files = defaultdict(list)
for name, file_issues in file_issues_list:
    num_issues_to_num_files[len(file_issues)].append(name)

# Walk the buckets in ascending issue count: write them to meta.txt and print
# a one-line summary per bucket.
num_issues = sorted(num_issues_to_num_files)
meta_path = path.join(output_dir, "meta.txt")
with open(meta_path, 'w') as meta_f:
    for count in num_issues:
        names = num_issues_to_num_files[count]
        meta_f.write("Num issues:{}\n".format(count))
        print("Num issues: {}, Num files: {}".format(count, len(names)))
        meta_f.writelines(name + "\n" for name in names)
        meta_f.write("\n")