In [1]:
# Reload imported project modules automatically before each cell runs, so
# edits to the local `models` / `utils` / `gap` packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [21]:
from attrdict import AttrDict

import pandas as pd
import spacy

from allennlp.predictors.predictor import Predictor
from allennlp.models.archival import load_archive
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

from utils import CoreNLPServer

from sklearn.metrics import classification_report, log_loss

* models.heuristics
    - Contains all the heuristics models, the code repository contains models in addition to the ones being presented here.
* models.pronoun_resolution
    - Resolves pronoun mentions from the clusters predicted by the coref models
* models.multi_pass_sieve
    - Implements a backoff mechanism
    - The functionality is similar to the multi-pass sieve algorithm proposed for coref resolution by Manning et al.
    - The stack of models is applied in order of decreasing precision from top to bottom (or left to right if seen as arguments to the model) and increasing recall in the same direction.

In [20]:
# Heuristics models implement coref resolution based on heuristics described in the paper
# Pronoun resolution is a simple wrapper to convert coref predictions into class-specific labels
# Multi pass sieve model implements backoff mechanism
from models.heuristics.random_distance import RandomModel
from models.heuristics.token_distance import TokenDistanceModel
from models.heuristics.syntactic_distance import StanfordSyntacticDistanceModel
from models.heuristics.parallelism import AllenNLPParallelismModel as ParallelismModel
from models.heuristics.url_title import StanfordURLTitleModel as URLModel

from models.pronoun_resolution import PronounResolutionModel

from models.multi_pass_sieve import MultiPassSieveModel

In [19]:
# Has minor fixes for py3 and to take df as input instead of filepath
from gap.gap_scorer_ext import read_annotations, calculate_scores, add_to_score_view

In [5]:
# Instantiate the Stanford CoreNLP server used by the constituency-based heuristics.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# CoreNLP installation before running the notebook.
STANFORD_CORENLP_PATH = '/home/sandeep/Downloads/stanford-corenlp-full-2018-10-05/'
server = CoreNLPServer(
    classpath=STANFORD_CORENLP_PATH,
    corenlp_options=AttrDict({
        'port': 9090,
        'timeout': '600000',
        'quiet': 'true',
        # Annotator names must match CoreNLP's own spelling: the sentence
        # splitter is 'ssplit' and the dependency parser is 'depparse'
        # (previously misspelled as 'spplit' / 'deparse').
        # NOTE(review): 'lemma' normally depends on 'pos' in CoreNLP; confirm
        # whether 'pos' should be added to this preload list.
        'preload': 'tokenize,ssplit,lemma,parse,depparse',
    }),
)
server.start()
# URL of the running server; the CoreNLP parser client below connects to it.
STANFORD_SERVER_URL = server.url

In [7]:
# torch is used only to detect whether a GPU is present.
import torch

# Client for the CoreNLP server started above.
STANFORD_MODEL = CoreNLPParser(url=STANFORD_SERVER_URL)
# spaCy pipeline shared by the heuristic models that take SPACY_MODEL.
SPACY_MODEL = spacy.load('en_core_web_lg')

# AllenNLP biaffine dependency parser (used by the parallelism heuristic).
model_url = 'https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz'
# Use the first GPU when one is available; otherwise fall back to CPU (-1)
# so the notebook still runs on machines without CUDA.
cuda_device = 0 if torch.cuda.is_available() else -1
archive = load_archive(model_url, cuda_device=cuda_device)
ALLEN_DEP_MODEL = Predictor.from_archive(archive)

In [86]:
# Read the tab-separated GAP development split.
train = pd.read_csv('data/gap-development.tsv', sep='\t')

# Normalise the column names: lower-case them and turn hyphens into
# underscores so they are easier to use as identifiers.
train.columns = [name.lower().replace('-', '_') for name in train.columns]

# Show a compact preview: at most 10 rows, cell text truncated to 15 characters.
with pd.option_context('display.max_rows', 10, 'display.max_colwidth', 15):
    display(train)

Unnamed: 0,id,text,pronoun,pronoun_offset,a,a_offset,a_coref,b,b_offset,b_coref,url
0,development-1,Zoe Telford...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.w...
1,development-2,He grew up ...,His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.w...
2,development-3,He had been...,his,265,Angeloz,173,False,De la Sota,246,True,http://en.w...
3,development-4,The current...,his,321,Hell,174,False,Henry Rosen...,336,True,http://en.w...
4,development-5,Her Santa F...,She,437,Kitty Oppen...,219,False,Rivera,294,True,http://en.w...
...,...,...,...,...,...,...,...,...,...,...,...
1995,development...,Faye's thir...,her,433,Nicole,255,False,Faye,328,True,http://en.w...
1996,development...,The plot of...,her,246,Doris Chu,111,False,Mei,215,True,http://en.w...
1997,development...,Grant playe...,she,348,Maria,259,True,Imelda Stau...,266,False,http://en.w...
1998,development...,The fashion...,She,284,Helen,145,True,Suzanne Bar...,208,False,http://en.w...


In [63]:
# Build one coref model per heuristic and wrap each in a PronounResolutionModel.
# The two wrappers around Stanford-backed models are created with n_jobs=-1.

# Random baseline.
random_coref_model = RandomModel(SPACY_MODEL)
random_proref_model = PronounResolutionModel(random_coref_model)

# Token-distance heuristic.
token_distance_coref_model = TokenDistanceModel(SPACY_MODEL)
token_distance_proref_model = PronounResolutionModel(token_distance_coref_model)

# Syntactic-distance heuristic. Uses the class actually imported above
# (StanfordSyntacticDistanceModel); the previous name, StanfordConstituencyModel,
# is not imported anywhere and raised NameError on a fresh kernel.
syntactic_distance_coref_model = StanfordSyntacticDistanceModel(STANFORD_MODEL)
syntactic_distance_proref_model = PronounResolutionModel(syntactic_distance_coref_model, n_jobs=-1)

# Parallelism heuristic (AllenNLP dependency parser + spaCy).
parallelism_coref_model = ParallelismModel(ALLEN_DEP_MODEL, SPACY_MODEL)
parallelism_proref_model = PronounResolutionModel(parallelism_coref_model)

# URL/page-title heuristic.
url_title_coref_model = URLModel(STANFORD_MODEL)
url_title_proref_model = PronounResolutionModel(url_title_coref_model, n_jobs=-1)

In [17]:
# Evaluate progressively richer sieve pipelines of heuristic models. Each new
# heuristic is placed in front of the previous stack, which acts as its backoff.
# The order of models passed to MultiPassSieveModel matters:
#    - left to right: recall increases
#    - right to left: precision increases
# Each entry is (row label in the score view, models in sieve order).
SIEVE_CONFIGS = [
    ('Random', (random_proref_model,)),
    ('Token Distance', (token_distance_proref_model,)),
    ('Syntactic Distance', (syntactic_distance_proref_model,
                            token_distance_proref_model)),
    ('Parallelism', (parallelism_proref_model,
                     syntactic_distance_proref_model,
                     token_distance_proref_model)),
    ('Parallelism+URL', (url_title_proref_model,
                         parallelism_proref_model,
                         syntactic_distance_proref_model,
                         token_distance_proref_model)),
]

# The score view starts empty (None) and gains one row per pipeline.
# After the loop, `preds` holds the predictions of the last (fullest) pipeline.
score_df = None
for label, sieve_stack in SIEVE_CONFIGS:
    preds = MultiPassSieveModel(*sieve_stack).predict(train)
    score_df = add_to_score_view(preds, train, score_df, label)

100%|██████████| 2000/2000 [00:40<00:00, 49.98it/s]


Unnamed: 0,M,F,B,O
Random,49.13,51.42,1.05,50.28


100%|██████████| 2000/2000 [00:40<00:00, 49.04it/s]


Unnamed: 0,M,F,B,O
Random,49.13,51.42,1.05,50.28
Token Distance,51.3,47.2,0.92,49.25


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   13.7s


development-217, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  3.2min finished
100%|██████████| 2000/2000 [00:41<00:00, 51.52it/s]


Unnamed: 0,M,F,B,O
Random,49.13,51.42,1.05,50.28
Token Distance,51.3,47.2,0.92,49.25
Syntactic Distance,66.63,66.6,1.0,66.61


  6%|▋         | 126/2000 [00:42<12:26,  2.51it/s]

Dependency parse and tokenizer tokens dont match.


 10%|█         | 206/2000 [01:09<10:20,  2.89it/s]

Dependency parse and tokenizer tokens dont match.


 48%|████▊     | 962/2000 [05:28<06:27,  2.68it/s]

Dependency parse and tokenizer tokens dont match.


 55%|█████▌    | 1104/2000 [06:16<04:11,  3.57it/s]

Dependency parse and tokenizer tokens dont match.


 91%|█████████▏| 1826/2000 [10:27<01:00,  2.89it/s]

Dependency parse and tokenizer tokens dont match.


 98%|█████████▊| 1957/2000 [11:10<00:15,  2.82it/s]

Dependency parse and tokenizer tokens dont match.


100%|██████████| 2000/2000 [11:23<00:00,  3.06it/s]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   14.0s


development-217, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  3.2min finished
100%|██████████| 2000/2000 [00:42<00:00, 47.29it/s]


Unnamed: 0,M,F,B,O
Random,49.13,51.42,1.05,50.28
Token Distance,51.3,47.2,0.92,49.25
Syntactic Distance,66.63,66.6,1.0,66.61
Parallelism,69.16,68.39,0.99,68.77


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   14.0s


development-217, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  3.2min finished
  6%|▋         | 126/2000 [00:42<12:36,  2.48it/s]

Dependency parse and tokenizer tokens dont match.


 10%|█         | 206/2000 [01:09<10:19,  2.90it/s]

Dependency parse and tokenizer tokens dont match.


 48%|████▊     | 962/2000 [05:30<06:29,  2.67it/s]

Dependency parse and tokenizer tokens dont match.


 55%|█████▌    | 1104/2000 [06:18<04:17,  3.48it/s]

Dependency parse and tokenizer tokens dont match.


 91%|█████████▏| 1826/2000 [10:30<01:00,  2.86it/s]

Dependency parse and tokenizer tokens dont match.


 98%|█████████▊| 1957/2000 [11:13<00:15,  2.85it/s]

Dependency parse and tokenizer tokens dont match.


100%|██████████| 2000/2000 [11:26<00:00,  3.14it/s]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   14.1s


development-217, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  3.2min finished
100%|██████████| 2000/2000 [00:43<00:00, 45.60it/s]


Unnamed: 0,M,F,B,O
Random,49.13,51.42,1.05,50.28
Token Distance,51.3,47.2,0.92,49.25
Syntactic Distance,66.63,66.6,1.0,66.61
Parallelism,69.16,68.39,0.99,68.77
Parallelism+URL,74.33,70.35,0.95,72.33


In [16]:
# Turn the two-column (A, B) labels into a three-class indicator frame by
# adding a NEITHER column that is set when neither A nor B is selected.
LABELS = ['A', 'B', 'NEITHER']

# Predictions from the last sieve pipeline that was run.
y_pred = pd.DataFrame(preds, columns=['A', 'B']).assign(
    NEITHER=lambda frame: ~(frame['A'] | frame['B'])
)

# Gold labels taken from the a_coref / b_coref columns of the data.
y_true = pd.DataFrame(train[['a_coref', 'b_coref']].values, columns=['A', 'B']).assign(
    NEITHER=lambda frame: ~(frame['A'] | frame['B'])
)

print(classification_report(y_true, y_pred, target_names=LABELS))

              precision    recall  f1-score   support

           A       0.68      0.75      0.71       874
           B       0.69      0.77      0.73       925
     NEITHER       0.00      0.00      0.00       201

   micro avg       0.69      0.69      0.69      2000
   macro avg       0.46      0.51      0.48      2000
weighted avg       0.62      0.69      0.65      2000
 samples avg       0.69      0.69      0.69      2000



  'precision', 'predicted', average, warn_for)


In [22]:
# The predictions are hard True/False labels; shift every entry by a constant
# before scoring them with log loss.
# NOTE(review): presumably the shift keeps a wrong hard prediction from being
# scored as probability 0 — confirm how log_loss clips/normalises these values.
PREDICTION_SHIFT = 0.25
log_loss(y_true, y_pred + PREDICTION_SHIFT)

0.8393752431386904