# Preliminaries

In [1]:
from nltk import word_tokenize
from util import read_schema, read_collection_annotation, get_question
%load_ext autoreload
%autoreload 2
import metrics
metrics.logger.setLevel("WARNING")

[36m2019-11-28 07:29:31,465[0m [37mINFO    [0m [01;34mailog.classes.setup_logging        [0m Config loaded.[0m


In [2]:
def avg(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Args:
        l: any iterable of numbers. Sized containers (lists, tuples, ...) are
            used as-is; one-shot iterables such as generators are materialised
            first so that their length can be determined.

    Returns:
        ``sum(l) / len(l)``.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    # Generators have no len(); copy them into a list exactly once.
    values = l if hasattr(l, "__len__") else list(l)
    return sum(values) / len(values)


# Load data
This notebook assumes the brat annotation data is available under `/data/brat-data/`.

Load all collections and the annotation schema.

In [3]:
# Directory that holds the annotation schema and one sub-directory per dataset.
path = "/data/brat-data/rc-datasets/"
labels, categories = read_schema(f'{path}annotation.conf')

# Load every collection in the fixed order that the per-dataset printouts in
# this notebook rely on, then expose each one under its own name as well.
all_datasets = [
    read_collection_annotation(f'{path}{dataset_name}')
    for dataset_name in ("msmarco", "hotpotqa", "record", "multirc", "newsqa", "drop")
]
msmarco, hotpotqa, record, multirc, newsqa, drop = all_datasets


/data/brat-data/rc-datasets/msmarco/*.txt
/data/brat-data/rc-datasets/hotpotqa/*.txt
/data/brat-data/rc-datasets/record/*.txt
/data/brat-data/rc-datasets/multirc/*.txt
/data/brat-data/rc-datasets/newsqa/*.txt
/data/brat-data/rc-datasets/drop/*.txt


In [4]:
# A second set of annotations for the same six datasets lives next to the
# first one, in directories carrying a `marco-` prefix. It is compared against
# `all_datasets` in the inter-annotator agreement section.
path_marco = "/data/brat-data/rc-datasets/marco-"

# Same dataset order as `all_datasets`, so both lists can be indexed in parallel.
all_datasets_marco = [
    read_collection_annotation(f'{path_marco}{dataset_name}')
    for dataset_name in ("msmarco", "hotpotqa", "record", "multirc", "newsqa", "drop")
]
(msmarco_marco, hotpotqa_marco, record_marco,
 multirc_marco, newsqa_marco, drop_marco) = all_datasets_marco


/data/brat-data/rc-datasets/marco-msmarco/*.txt
/data/brat-data/rc-datasets/marco-hotpotqa/*.txt
/data/brat-data/rc-datasets/marco-record/*.txt
/data/brat-data/rc-datasets/marco-multirc/*.txt
/data/brat-data/rc-datasets/marco-newsqa/*.txt
/data/brat-data/rc-datasets/marco-drop/*.txt


# Inter-annotator Agreement

In [5]:
from util import interpolated_agreement_tp_fp_fn
# The 'Source' category was discarded in favour of the quantitative lexical
# overlap analysis, so none of its labels take part in the agreement score.
discarded_labels = categories['Source']
correct_labels = [label for label in labels if label not in discarded_labels]
# Supporting facts are left out as well: taking them into consideration biases
# the annotation scores towards 1.
correct_labels.remove("SupportingFact")


def pprint(r):
    """Print the number ``r`` in fixed-point notation with two decimals."""
    formatted = format(r, "0.2f")
    print(formatted)

def micro_f1(tp, fp, fn):
    """Compute the F1 score from true-positive, false-positive and false-negative counts.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        ``2 * p * r / (p + r)`` with ``p = tp / (tp + fp)`` and
        ``r = tp / (tp + fn)``. When there are no true positives the score is
        reported as ``0.0``.
    """
    # Without true positives, precision and recall are either 0 or undefined
    # (0/0), and every division below would raise ZeroDivisionError. Following
    # the usual convention, the score is defined as 0 in that case.
    if tp == 0:
        return 0.0

    p = tp / (tp + fp)
    r = tp / (tp + fn)
    return 2 * p * r / (p + r)
    
# Agreement between the two annotation sets, one score per dataset.
print("results for: msmarco, hotpotqa, record, multirc, newsqa, drop")
for idx, first_annotation in enumerate(all_datasets):
    # Both lists were loaded in the same dataset order, so the index pairs
    # each collection with its counterpart from the second annotation set.
    second_annotation = all_datasets_marco[idx]
    counts = interpolated_agreement_tp_fp_fn(first_annotation, second_annotation, correct_labels)
    pprint(micro_f1(*counts))

from functools import reduce
def tuplesum(t1, t2):
    """Add two sequences element-wise and return the sums as a tuple.

    Args:
        t1: first iterable of addable values.
        t2: second iterable of addable values.

    Returns:
        A tuple whose i-th element is ``t1[i] + t2[i]``. As with ``zip``, the
        result is as long as the shorter of the two inputs.
    """
    # Materialise the sums: a bare generator expression would hand back a lazy,
    # one-shot iterator, and folding many of them with reduce() would stack a
    # generator inside a generator for every element of the sequence.
    return tuple(a + b for a, b in zip(t1, t2))

# Agreement over all datasets at once: the per-dataset (tp, fp, fn) counts are
# summed element-wise before the F1 score is computed.
print("All datasets, all categories:")
r = []
for position, collection in enumerate(all_datasets):
    counterpart = all_datasets_marco[position]
    r.append(interpolated_agreement_tp_fp_fn(collection, counterpart, correct_labels))
pooled_counts = reduce(tuplesum, r)
pprint(micro_f1(*pooled_counts))

results for: msmarco, hotpotqa, record, multirc, newsqa, drop
0.86
0.88
0.73
0.76
0.87
0.85
All datasets, all categories:
0.82


# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

import learn
from learn import cv_the_cv, dataset_from_sample
from metrics import MaxNgramScorer, ContainsUniqueNgramScorer, MaxContainsScorer
from util import split_drop, split_hotpotqa, split_newsqa, split_multirc, split_record, split_msmarco

# Scorers handed to `dataset_from_sample` in the cells below.
# NOTE(review): the meaning of the three positional `True` flags is defined in
# `metrics` and is not visible here - confirm before changing them.
s_max_ngram = MaxNgramScorer(True, True, True)
s_max_contains = MaxContainsScorer(True, True, True)
s_unique_unigram = ContainsUniqueNgramScorer(n=1, remove_stopwords=True)
s_unique_bigram = ContainsUniqueNgramScorer(n=2, remove_stopwords=True)
scorers = [s_max_ngram, s_max_contains, s_unique_unigram, s_unique_bigram]


def l():
    """Return a new, unfitted logistic-regression classifier using the lbfgs solver."""
    return LogisticRegression(solver='lbfgs')


# Only warnings and above from the `learn` module should show up in the output.
learn.logger.setLevel("WARNING")

In [7]:
# Build the DROP dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
drop_ds = dataset_from_sample(drop, scorers, split_drop)
results = cv_the_cv(drop_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("DROP")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


DROP
P: 0.61 (+/- 0.02)
R: 0.79 (+/- 0.01)
F1: 0.64 (+/- 0.02)


In [8]:
# Build the HotpotQA dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
hotpot_ds = dataset_from_sample(hotpotqa, scorers, split_hotpotqa)
results = cv_the_cv(hotpot_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("HOTPOTQA")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


HOTPOTQA
P: 0.21 (+/- 0.02)
R: 0.58 (+/- 0.04)
F1: 0.25 (+/- 0.02)


In [9]:
# Build the MS MARCO dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
msmarco_ds = dataset_from_sample(msmarco, scorers, split_msmarco)
results = cv_the_cv(msmarco_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("MSMARCO")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


MSMARCO
P: 0.07 (+/- 0.03)
R: 0.53 (+/- 0.05)
F1: 0.11 (+/- 0.03)


In [10]:
# Build the ReCoRD dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
record_ds = dataset_from_sample(record, scorers, split_record)
results = cv_the_cv(record_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("RECORD")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))


RECORD
P: 0.30 (+/- 0.04)
R: 0.57 (+/- 0.04)
F1: 0.36 (+/- 0.04)


In [11]:
# Build the MultiRC dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
multirc_ds = dataset_from_sample(multirc, scorers, split_multirc)
results = cv_the_cv(multirc_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("MULTIRC")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49), HTML(value='')))


MULTIRC
P: 0.36 (+/- 0.02)
R: 0.61 (+/- 0.03)
F1: 0.40 (+/- 0.02)


In [12]:
# Build the NewsQA dataset from the annotated sample with the scorers defined
# above, then evaluate the classifier factory `l` on it.
# NOTE(review): the semantics of `k=0, n=5` are defined in `learn.cv_the_cv` - confirm.
newsqa_ds = dataset_from_sample(newsqa, scorers, split_newsqa)
results = cv_the_cv(newsqa_ds, l, k=0, n=5)

# Columns 0, 1 and 2 of `results` are reported as precision, recall and F1.
precision, recall, f_score = results[:, 0], results[:, 1], results[:, 2]
print("NEWSQA")
for metric, scores in (("P", precision), ("R", recall), ("F1", f_score)):
    # The spread is shown as twice the standard deviation of the scores.
    print(f"{metric}: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




HBox(children=(IntProgress(value=0, max=38), HTML(value='')))


NEWSQA
P: 0.19 (+/- 0.02)
R: 0.68 (+/- 0.05)
F1: 0.26 (+/- 0.02)


# Average question lengths

In [13]:
# Mean question length in tokens, one line per dataset, in loading order.
print("results for:  msmarco, hotpotqa, record, multirc, newsqa, drop")
all_lengths = []
for dataset in all_datasets:
    # Each entry of a collection is a pair; only the first element, which
    # carries the raw text, is needed to recover the question.
    questions = (get_question(sample.raw_text) for sample, _ in dataset)
    lengths = [len(tokens) for tokens in map(word_tokenize, questions)]
    all_lengths.append(lengths)
    print(f"{avg(lengths):0.2f}")

results for:  msmarco, hotpotqa, record, multirc, newsqa, drop
6.30
18.30
24.88
11.44
7.30
13.00


In [14]:
# Positions 1, 2, 3 and 5 of `all_lengths` correspond to hotpotqa, record,
# multirc and drop; msmarco (0) and newsqa (4) are left out.
# NOTE(review): the reason for excluding those two datasets is not stated here - confirm.
new_all_lengths = [all_lengths[position] for position in (1, 2, 3, 5)]

# Pool the question lengths of the selected datasets and average over all of them.
pooled_lengths = []
for lengths_of_one_dataset in new_all_lengths:
    pooled_lengths.extend(lengths_of_one_dataset)
print(avg(pooled_lengths))

16.905
