In [1]:
import re
import spacy
import neuralcoref
import pandas as pd
from os import listdir
from collections import defaultdict, OrderedDict

In [2]:
PATH = './qbcoref/data/data-gold/'

In [3]:
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f95a9be9f98>

In [4]:
def get_prediction(string):
    doc = nlp(string)

    prediction = defaultdict(set)
    for cluster in doc._.coref_clusters:
        main = cluster.main
        main_start, main_end = main.start, main.end - 1
        main_pos = tuple([main_start]) if main_start == main_end else (main_start, main_end)
        for coref in cluster:
            start, end = coref.start, coref.end - 1
            prediction[main_pos].add(tuple([start]) if start == end else (start, end))

    return prediction

In [6]:
def get_truth(df):
    temp = defaultdict(list)
    for idx, row in df.iterrows():
        _, coreference = row
        if coreference == '-':
            pass
        else:
            coords = coreference.split('|')
            for coord in coords:
                pos = re.search(r'\d+', coord).group(0)
                if '(' in coord and ')' in coord:
                    temp[pos].append([idx])
                elif '(' in coord:
                    temp[pos].append([idx])
                elif ')' in coord:
                    temp[pos][-1].append(idx)

    truth = defaultdict(set)
    temp_main = None
    for values in temp.values():
        for i, value in enumerate(values):
            coord = value if isinstance(value, int) else tuple(value)
            if i == 0:
                temp_main = coord
            truth[temp_main].add(coord)

    return truth

In [7]:
def get_recall(tp, fn):
    if (not tp and not fn):
        return 0
    return tp / (tp + fn)

def get_precision(tp, fp):
    if (not tp and not fp):
        return 0
    return tp / (tp + fp)

def get_f1(truth, prediction):
    tp = [val for val in truth if val in prediction]
    fp = [val for val in prediction if val not in tp]
    fn = [val for val in truth if val not in prediction]
    
    precision = get_precision(len(tp), len(fp))
    recall = get_recall(len(tp), len(fn))

    if precision + recall == 0:
        return 0
    
    return 2 * (precision * recall) / (precision + recall)

In [16]:
def calc_doc_f1(truth, prediction):
    f1 = [get_f1(value, prediction[key]) for key, value in truth.items()]
    return sum(f1)/(len(f1) + 0.00001)

In [9]:
def parse_doc(doc):
    df = pd.read_csv(pd.compat.StringIO(doc), sep='\t', names=['id', 'token', 'coreference'], usecols=[2, 3, 11], index_col=0)

    prediction = get_prediction(' '.join(df['token'].values))
    truth = get_truth(df)

    return calc_doc_f1(truth, prediction)

In [17]:
def process_conll(filename):
#     print(filename)
    with open(PATH + filename) as f:
        raw_data = f.readlines()[2:-4]
        docs = ''.join(raw_data).split('\n\n')
        file_f1 = [parse_doc(item) for item in docs]
        return sum(file_f1)/(len(file_f1) + 0.00001)

In [18]:
f1_per_file = [process_conll(f) for f in listdir(PATH)]

In [21]:
#corpora-wide f1
sum(f1_per_file)/len(f1_per_file)

0.07160771327486981