In [1]:
import xml
import xml.etree.ElementTree as ET
import os
from os import path
import glob
from collections import defaultdict
import numpy as np

from utils import get_ent_info, get_clusters_from_xml

In [2]:
# Source documents are matched two directory levels below `data_dir`
# (<data_dir>/<sub-directory>/<document>).
data_dir = "/home/shtoshni/Research/events/data/red/data/source"
# glob returns matches in arbitrary (filesystem) order; sort so that documents
# are processed in the same order on every run and on every machine.
source_files = sorted(glob.glob("{}/*/*".format(data_dir)))

# Annotation files are matched with the same two-level layout under `ann_dir`.
ann_dir = "/home/shtoshni/Research/events/data/red/data/annotation"
ann_files = sorted(glob.glob("{}/*/*".format(ann_dir)))

In [3]:
# Presumably the directory where computed statistics are meant to be stored;
# none of the cells visible here read from or write to it -- TODO confirm usage.
output_dir = "/home/shtoshni/Research/events/data/red/stats"

In [4]:
# Corpus-level accumulators, keyed by mention type (e.g. 'ENTITY' / 'EVENT').
num_chains = defaultdict(int)       # number of coreference chains
num_mentions = defaultdict(int)     # number of annotated mentions
num_singletons = defaultdict(int)   # mentions that are not part of any chain
chain_lengths = defaultdict(list)   # length of every chain

# Number of annotation problems detected per source file (keyed by base name).
files_with_issues = defaultdict(int)

# Set to True to print each individual problem as it is detected.
show_issue_details = False

for source_file in source_files:
    # Read the source doc; the context manager closes the file handle right away.
    with open(source_file) as source_fh:
        source_str = source_fh.read()

    # Read the annotation file: same sub-directory name and base name as the
    # source document, with the gold-annotation suffix appended.
    base_name = path.basename(source_file)
    dir_name = path.basename(path.dirname(source_file))

    ann_file = path.join(ann_dir, dir_name, base_name + ".RED-Relation.gold.completed.xml")

    tree = ET.parse(ann_file)
    root = tree.getroot()

    # Get info from the XML file.
    # ent_map maps a mention id to (mention type, (span start, span end));
    # clusters is an iterable of sequences of mention ids.
    ent_map, ent_list = get_ent_info(root)
    clusters = get_clusters_from_xml(root, ent_map)

    # Every annotated mention counts towards the totals of its type.
    doc_mentions = defaultdict(int)
    for ent_type, _ in ent_map.values():
        num_mentions[ent_type] += 1
        doc_mentions[ent_type] += 1

    # Distinct mentions of this document that occur in at least one cluster.
    clustered_mentions = defaultdict(int)
    all_clustered_mentions = set()

    for cluster in clusters:
        # The entity type of the first element defines the type of the cluster.
        cluster_type = ent_map[cluster[0]][0]

        num_chains[cluster_type] += 1
        chain_lengths[cluster_type].append(len(cluster))

        for mention in cluster:
            elem_type, (span_start, span_end) = ent_map[mention]

            if mention in all_clustered_mentions:
                # Not supposed to happen: the mention was already seen in a cluster.
                if show_issue_details:
                    print("{}: {} - \"{}\" is not uniquely clustered\n".format(
                        base_name, mention, source_str[span_start: span_end]))
                files_with_issues[base_name] += 1
            else:
                all_clustered_mentions.add(mention)
                # Count each mention only once, so that a mention listed in
                # several clusters does not deflate the singleton count below.
                clustered_mentions[elem_type] += 1

            # Type of cluster should be consistent. A plain comparison is used
            # instead of `assert`, which is stripped when Python runs with -O.
            if elem_type != cluster_type:
                if show_issue_details:
                    print("Type of cluster {} and element {} don't match".format(cluster_type, elem_type))
                    print("{}, Element: {}\n".format(base_name, mention))
                files_with_issues[base_name] += 1

    # Singletons are the mentions of a type that never occur in any cluster.
    for ent_type in ['EVENT', 'ENTITY']:
        num_singletons[ent_type] += doc_mentions[ent_type] - clustered_mentions[ent_type]

# Report the problematic files, most issues first. The name is rebound to a
# list of (file name, issue count) pairs, which is what gets printed below.
files_with_issues = sorted(files_with_issues.items(), key=lambda x: x[1], reverse=True)
for file_name, num_issues in files_with_issues:
    print("{}: Num issues {}".format(file_name, num_issues))

5c0dd992beaff240f732e0fdacbd49e4.mpdf: Num issues 6
NYT_ENG_20130424.0047: Num issues 4
alt.support.divorce_20050113.2451: Num issues 3
NYT_ENG_20131225.0200: Num issues 2
APW_ENG_20101231.0037: Num issues 2
soc.culture.iraq_20050211.0445: Num issues 2
uk.gay-lesbian-bi_20050127.0311: Num issues 2
soc.culture.china_20050203.0639: Num issues 2
PROXY_AFP_ENG_20020404_0305: Num issues 2
NYT_ENG_20130613.0153: Num issues 1
NYT_ENG_20130619.0092: Num issues 1
XIN_ENG_20101125.0137: Num issues 1
4829d3d91263ed9d8801e6d94c3569a5.mpdf: Num issues 1
5c59566e9132c060423cad5b2d1bac1e.mpdf: Num issues 1
57026b7bcb8f855de3e26d572db35285: Num issues 1
alt.sys.pc-clone.dell_20050226.2350: Num issues 1
misc.legal.moderated_20050129.2225: Num issues 1
44b011cd504c9ed71beb851324db886a: Num issues 1
0f03cc5a508d630c6c8c8c61396e31a9: Num issues 1
PROXY_AFP_ENG_20020414_0542: Num issues 1


In [5]:
# Print the corpus-level summary collected by the statistics loop.
print("# of files with issues:", len(files_with_issues))

# One group of lines per metric, each reported for entities and then events.
for metric, metric_str in zip([num_mentions, num_chains, num_singletons],
                              ['{} Mentions', '{} Chains', 'Singleton {} Mentions']):
    for mention_type in ['ENTITY', 'EVENT']:
        print(("Number of " + metric_str + ": {}").format(mention_type.capitalize(), metric[mention_type]))
    print()

# Chain-length statistics per mention type.
for mention_type in ['ENTITY', 'EVENT']:
    mention_str = mention_type.capitalize()
    type_chain_lengths = np.asarray(chain_lengths[mention_type])
    if type_chain_lengths.size == 0:
        # np.max raises ValueError on an empty array (and np.mean yields nan
        # with a warning), so report the absence of chains explicitly.
        print("No {} chains found".format(mention_str))
        continue
    print("Mean {} chain length: {:.2f}".format(mention_str, np.mean(type_chain_lengths)))
    print("Max {} chain length: {}".format(mention_str, np.max(type_chain_lengths)))
    

# of files with issues: 20
Number of Entity Mentions: 10319
Number of Event Mentions: 8731

Number of Entity Chains: 1287
Number of Event Chains: 762

Number of Singleton Entity Mentions: 3880
Number of Singleton Event Mentions: 6609

Mean Entity chain length: 5.00
Max Entity chain length: 82
Mean Event chain length: 2.79
Max Event chain length: 21
