In [167]:
#!pip install -U spacy
#!pip install neuralcoref

In [168]:
# Load spaCy's small English pipeline; `nlp` is the shared pipeline object
# used by every later cell that tokenizes or resolves text.
import spacy
nlp = spacy.load("en_core_web_sm")
# Build a NeuralCoref component on the pipeline's vocabulary and register it
# under the name 'neuralcoref'. Later cells read `token._.coref_clusters`,
# which is only populated once this component is part of the pipeline.
# NOTE(review): passing the component object plus `name=` looks like the
# spaCy 2.x `add_pipe` signature -- confirm the installed spaCy version.
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
import pandas as pd
import numpy as np

In [169]:
# Load the tab-separated dataset; each row holds a text, a pronoun with its
# character offset, and two candidate mentions (A and B) with their offsets.
df = pd.read_csv('data.tsv', delimiter='\t')
df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,B,B-offset,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,Pauline,207,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,Bernard Leach,251,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,De la Sota,246,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,Henry Rosenthal,336,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,Rivera,294,http://en.wikipedia.org/wiki/Jessica_Rivera


In [171]:
# Result template: one row per example ID, with placeholder values
# (offset 0, empty mention) that the resolver overwrites later.
spacy_resolved = df[['ID']].assign(mention_offset=0, mention='')
spacy_resolved.head()

Unnamed: 0,ID,mention_offset,mention
0,development-1,0,
1,development-2,0,
2,development-3,0,
3,development-4,0,
4,development-5,0,


In [None]:
def fine_mention():
    """Placeholder for a helper that was never written.

    The original cell contained only the ``def`` line, which is a syntax
    error and prevents the notebook from running top to bottom. The stub is
    kept so the name stays defined, but it fails loudly if it is ever called.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("fine_mention is not implemented")

In [178]:
# Resolve the target pronoun of every row to the main mention of its cluster
def find_main(data, resolved=None, pipeline=None):
    """Resolve each row's pronoun to the main mention of its coreference cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'ID', 'Text' and 'Pronoun-offset' (character offset
        of the pronoun inside 'Text'). Any index is accepted; rows are visited
        in order.
    resolved : pandas.DataFrame, optional
        Result template with an 'ID' column and, optionally, 'mention_offset'
        and 'mention' columns. The template is copied, never modified. When
        omitted, a template is built from ``data['ID']``. Missing result
        columns are created with the placeholders 0 and ''.
    pipeline : callable, optional
        Maps a text to an iterable of tokens exposing ``idx`` and
        ``_.coref_clusters``. Defaults to the notebook-level ``nlp`` object.

    Returns
    -------
    pandas.DataFrame
        The filled-in copy of the template. For every row of ``data`` whose
        pronoun offset coincides with a token start:

        * if the token belongs to a coreference cluster, 'mention_offset' is
          the start character of the first cluster's main mention and
          'mention' is its text;
        * otherwise both fields are set to missing values.

        Rows whose pronoun offset matches no token start keep the template
        values.
    """
    if pipeline is None:
        pipeline = nlp

    # Work on a private copy so the caller's template is left untouched.
    if resolved is None:
        resolved = pd.DataFrame({'ID': data['ID']})
    else:
        resolved = resolved.copy()
    for column, placeholder in (('mention_offset', 0), ('mention', '')):
        if column not in resolved.columns:
            resolved[column] = placeholder

    # First pass: run the pipeline and remember the outcome per example ID.
    outcomes = {}
    for _, row in data.iterrows():
        pronoun_offset = row['Pronoun-offset']
        for token in pipeline(row['Text']):
            if token.idx != pronoun_offset:
                continue
            clusters = token._.coref_clusters
            if clusters:
                main = clusters[0].main
                outcomes[row['ID']] = (main.start_char, str(main))
            else:
                outcomes[row['ID']] = (None, None)
            break  # token start offsets are unique, nothing left to find

    # An integer column cannot hold a missing value; widen it explicitly
    # instead of relying on implicit upcasting during assignment.
    has_unresolved = any(offset is None for offset, _ in outcomes.values())
    if has_unresolved and pd.api.types.is_integer_dtype(resolved['mention_offset']):
        resolved['mention_offset'] = resolved['mention_offset'].astype('float64')

    # Second pass: write the outcomes into the result frame.
    for row_id, (offset, mention) in outcomes.items():
        mask = resolved['ID'] == row_id
        resolved.loc[mask, 'mention_offset'] = float('nan') if offset is None else offset
        resolved.loc[mask, 'mention'] = mention
    return resolved

In [182]:
# Resolve a small sample (index labels 0 through 10, inclusive) into the
# result template, then preview the rows that have no missing fields.
f = find_main(df.loc[0:10], spacy_resolved)
f.dropna().head()

Unnamed: 0,ID,mention_offset,mention
0,development-1,170,Phoebe Thomas
1,development-2,0,He
2,development-3,98,De la Sota
3,development-4,281,Rank
4,development-5,0,Her


In [141]:
# Index the neuralcoref results by example ID and discard the rows whose
# offset holds the 'X' placeholder.
# NOTE(review): `resolved` is not assigned by any visible cell -- presumably
# it is the frame returned by find_main; confirm before a Restart & Run All.
#
# The cell rebinds `resolved` instead of mutating it in place, and skips the
# re-indexing when 'ID' is already the index, so re-running it is harmless
# (the in-place version raised KeyError on a second run).
if 'ID' in resolved.columns:
    resolved = resolved.set_index('ID')
resolved = resolved.drop(index=resolved.index[resolved['mention_offset'] == 'X'])
resolved.shape

(1806, 2)

In [142]:
#resolved.to_csv("spacy_resolved.csv")
resolved.head(10)

Unnamed: 0_level_0,mention_offset,mention
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
development-1,170,Phoebe Thomas
development-2,0,He
development-3,98,De la Sota
development-4,281,Rank
development-5,0,Her
development-6,0,Sandra Collins
development-7,152,Reb Asher's brother Rabbi Shlomo Arieli
development-9,128,Mary
development-10,196,Christina Jennings
development-12,217,Eleanor


In [143]:
# Correct answers
# The submission file flags exactly one of A / B / NEITHER per row. Multiplying
# a string by 1 keeps it and by 0 yields '', so summing the three products
# leaves only the flagged candidate name ('X' stands for NEITHER).
# NOTE(review): the candidate names come from `df` and are matched to the
# submission rows by index, i.e. both files are assumed to be in the same
# order -- confirm.
submission = pd.read_csv("submission.csv")
neither_marker = pd.Series('X', index=submission.index)
gold_mention = (
    df['A'] * submission['A']
    + df['B'] * submission['B']
    + neither_marker * submission['NEITHER']
)
answers = submission.assign(mention=gold_mention).set_index('ID')
# Rows answered NEITHER have no name to compare a resolution against.
answers = answers.drop(index=answers.index[answers['mention'] == 'X'])
answers.head(10)

Unnamed: 0_level_0,A,B,NEITHER,mention
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
development-1,1,0,0,Cheryl Cassidy
development-2,1,0,0,MacKenzie
development-3,0,1,0,De la Sota
development-4,0,1,0,Henry Rosenthal
development-5,0,1,0,Rivera
development-6,1,0,0,Collins
development-8,0,1,0,Robert Christgau
development-9,0,1,0,Kelsey
development-10,1,0,0,Christina Jennings
development-11,0,1,0,David Onley


In [144]:
# Pair each gold answer with the neuralcoref resolution for the same ID
# (inner join: only IDs present in both frames); 'similar' starts as False.
join_answers_neuralcoref = pd.concat(
    [
        answers['mention'].rename('answers'),
        resolved['mention'].rename('resolved'),
    ],
    axis=1,
    join='inner',
).assign(similar=False)
join_answers_neuralcoref.head(10)

Unnamed: 0_level_0,answers,resolved,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Cheryl Cassidy,Phoebe Thomas,False
development-2,MacKenzie,He,False
development-3,De la Sota,De la Sota,False
development-4,Henry Rosenthal,Rank,False
development-5,Rivera,Her,False
development-6,Collins,Sandra Collins,False
development-9,Kelsey,Mary,False
development-10,Christina Jennings,Christina Jennings,False
development-12,Eleanor,Eleanor,False
development-13,Williams,Mary Helen Moses,False


In [145]:
# Answers that match with neuralcoref resolution results
# A pair counts as similar when the two strings are equal or, failing that,
# when their token sets share at least one token.
for row_id in join_answers_neuralcoref.index:
    gold = join_answers_neuralcoref.loc[row_id, 'answers']
    predicted = join_answers_neuralcoref.loc[row_id, 'resolved']
    is_similar = gold == predicted
    if not is_similar:
        gold_tokens = {token.text for token in nlp(gold)}
        predicted_tokens = {token.text for token in nlp(predicted)}
        is_similar = not gold_tokens.isdisjoint(predicted_tokens)
    join_answers_neuralcoref.loc[row_id, 'similar'] = is_similar
            

In [151]:
# Number of similar pairs, followed by the shape of the comparison frame.
# (A bare `.head(20)` used to precede these prints; since it was not the last
# expression of the cell its result was discarded, so it has been removed.)
print(join_answers_neuralcoref['similar'].sum())
print(join_answers_neuralcoref.shape)

833
(1627, 3)


In [153]:
# CoreNLP resolved results
# Index by example ID and drop the leftover positional column of the CSV.
corenlp = (
    pd.read_csv("core_nlp_resolved.csv")
    .set_index('ID')
    .drop(columns=['Unnamed: 0'])
)
corenlp.head()

Unnamed: 0_level_0,Resolution
ID,Unnamed: 1_level_1
development-1,Phoebe Thomas
development-2,MacKenzie
development-3,Governor Angeloz
development-5,Kitty Oppenheimer
development-6,Sandra Collins


In [154]:
# Pair each gold answer with the CoreNLP resolution for the same ID
# (inner join: only IDs present in both frames); 'similar' starts as False.
join_answers_corenlp = pd.concat(
    [
        answers['mention'].rename('answers'),
        corenlp['Resolution'].rename('resolved'),
    ],
    axis=1,
    join='inner',
).assign(similar=False)
join_answers_corenlp.head(10)

Unnamed: 0_level_0,answers,resolved,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Cheryl Cassidy,Phoebe Thomas,False
development-2,MacKenzie,MacKenzie,False
development-3,De la Sota,Governor Angeloz,False
development-5,Rivera,Kitty Oppenheimer,False
development-6,Collins,Sandra Collins,False
development-8,Robert Christgau,Greg Kot of the Chicago Tribune perceived `` f...,False
development-9,Kelsey,Mary,False
development-10,Christina Jennings,Christina Jennings,False
development-11,David Onley,David Onley,False
development-12,Eleanor,Eleanor,False


In [155]:
# Answers that match with corenlp resolution results
# A pair counts as similar when the two strings are equal or, failing that,
# when their token sets share at least one token.
for row_id in join_answers_corenlp.index:
    gold = join_answers_corenlp.loc[row_id, 'answers']
    predicted = join_answers_corenlp.loc[row_id, 'resolved']
    is_similar = gold == predicted
    if not is_similar:
        gold_tokens = {token.text for token in nlp(gold)}
        predicted_tokens = {token.text for token in nlp(predicted)}
        is_similar = not gold_tokens.isdisjoint(predicted_tokens)
    join_answers_corenlp.loc[row_id, 'similar'] = is_similar
 

In [156]:
# Number of similar pairs, followed by the shape of the comparison frame.
# (A bare `.head()` used to precede these prints; since it was not the last
# expression of the cell its result was discarded, so it has been removed.)
print(join_answers_corenlp['similar'].sum())
print(join_answers_corenlp.shape)

879
(1745, 3)


In [157]:
# Put the neuralcoref and CoreNLP resolutions side by side for the IDs both
# systems produced a result for; 'similar' starts as False.
neural_corenlp = pd.concat(
    [
        resolved['mention'].rename('neuralcoref_resolved'),
        corenlp['Resolution'].rename('corenlp_resolved'),
    ],
    axis=1,
    join='inner',
).assign(similar=False)
neural_corenlp.head(10)

Unnamed: 0_level_0,neuralcoref_resolved,corenlp_resolved,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Phoebe Thomas,Phoebe Thomas,False
development-2,He,MacKenzie,False
development-3,De la Sota,Governor Angeloz,False
development-5,Her,Kitty Oppenheimer,False
development-6,Sandra Collins,Sandra Collins,False
development-7,Reb Asher's brother Rabbi Shlomo Arieli,Reb Asher 's brother,False
development-9,Mary,Mary,False
development-10,Christina Jennings,Christina Jennings,False
development-12,Eleanor,Eleanor,False
development-13,Mary Helen Moses,Mary Helen Moses,False


In [158]:
# Compare neuralcoref resolution with corenlp resolution
# Two resolutions count as similar when the strings are equal or, failing
# that, when their token sets share at least one token.
for row_id in neural_corenlp.index:
    from_neuralcoref = neural_corenlp.loc[row_id, 'neuralcoref_resolved']
    from_corenlp = neural_corenlp.loc[row_id, 'corenlp_resolved']
    is_similar = from_neuralcoref == from_corenlp
    if not is_similar:
        neuralcoref_tokens = {token.text for token in nlp(from_neuralcoref)}
        corenlp_tokens = {token.text for token in nlp(from_corenlp)}
        is_similar = not neuralcoref_tokens.isdisjoint(corenlp_tokens)
    neural_corenlp.loc[row_id, 'similar'] = is_similar

In [159]:
print(neural_corenlp['similar'].sum())
print(neural_corenlp.shape)

876
(1764, 3)


In [165]:
neural_corenlp.head(10)

Unnamed: 0_level_0,neuralcoref_resolved,corenlp_resolved,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Phoebe Thomas,Phoebe Thomas,True
development-2,He,MacKenzie,False
development-3,De la Sota,Governor Angeloz,False
development-5,Her,Kitty Oppenheimer,False
development-6,Sandra Collins,Sandra Collins,True
development-7,Reb Asher's brother Rabbi Shlomo Arieli,Reb Asher 's brother,True
development-9,Mary,Mary,True
development-10,Christina Jennings,Christina Jennings,True
development-12,Eleanor,Eleanor,True
development-13,Mary Helen Moses,Mary Helen Moses,True


Unnamed: 0_level_0,Resolution
ID,Unnamed: 1_level_1
development-1,Phoebe Thomas
development-2,MacKenzie
development-3,Governor Angeloz
development-5,Kitty Oppenheimer
development-6,Sandra Collins


In [77]:
# Answers that match with corenlp resolution results
# For every ID CoreNLP resolved, record in answers['corenlp_match'] whether
# its resolution equals the gold mention or shares at least one token with it.
for row_id in corenlp.index:
    predicted = corenlp.loc[row_id, 'Resolution']
    gold = answers.loc[row_id, 'mention']
    is_match = predicted == gold
    if not is_match:
        predicted_tokens = {token.text for token in nlp(predicted)}
        gold_tokens = {token.text for token in nlp(gold)}
        is_match = not predicted_tokens.isdisjoint(gold_tokens)
    answers.loc[row_id, 'corenlp_match'] = is_match

answers.head(20)

Unnamed: 0_level_0,A,B,NEITHER,mention,neuralcoref_match,corenlp_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
development-1,1,0,0,Cheryl Cassidy,False,False
development-2,1,0,0,MacKenzie,False,True
development-3,0,1,0,De la Sota,True,False
development-4,0,1,0,Henry Rosenthal,False,False
development-5,0,1,0,Rivera,False,False
development-6,1,0,0,Collins,True,True
development-7,0,0,1,X,False,False
development-8,0,1,0,Robert Christgau,False,False
development-9,0,1,0,Kelsey,False,False
development-10,1,0,0,Christina Jennings,True,True


In [78]:
answers['neuralcoref_match'].sum()

820

In [79]:
answers['corenlp_match'].sum()

879

In [81]:
# Show the CoreNLP resolution next to the gold mention for matched rows only.
corenlp_hits = answers['corenlp_match'] == True
pd.concat(
    [
        corenlp.loc[corenlp_hits, 'Resolution'],
        answers.loc[corenlp_hits, ['mention', 'corenlp_match']],
    ],
    axis=1,
)

Unnamed: 0_level_0,Resolution,mention,corenlp_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-2,MacKenzie,MacKenzie,True
development-6,Sandra Collins,Collins,True
development-10,Christina Jennings,Christina Jennings,True
development-11,David Onley,David Onley,True
development-12,Eleanor,Eleanor,True
development-14,Kazuki Nakajima,Kazuki Nakajima,True
development-15,Maria 's mother,Maria,True
development-17,Maurice,Maurice,True
development-20,Ramsey 's,Ramsey,True
development-22,Kurt,Kurt,True


In [85]:
(answers['A']+answers['B']+answers['NEITHER'] == 1).sum()

2000

In [90]:
# CoreNLP and neuralcoref resolutions side by side for the IDs present in
# both frames; 'similar' starts as False.
joined_mentions = pd.concat(
    [corenlp['Resolution'], resolved['mention']],
    axis=1,
    join='inner',
).assign(similar=False)
joined_mentions.head()

Unnamed: 0_level_0,Resolution,mention,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Phoebe Thomas,Phoebe Thomas,False
development-2,MacKenzie,He,False
development-3,Governor Angeloz,De la Sota,False
development-5,Kitty Oppenheimer,Her,False
development-6,Sandra Collins,Sandra Collins,False


In [91]:
# Two resolutions count as similar when the strings are equal or, failing
# that, when their token sets share at least one token.
for row_id in joined_mentions.index:
    from_corenlp = joined_mentions.loc[row_id, 'Resolution']
    from_neuralcoref = joined_mentions.loc[row_id, 'mention']
    is_similar = from_corenlp == from_neuralcoref
    if not is_similar:
        corenlp_tokens = {token.text for token in nlp(from_corenlp)}
        neuralcoref_tokens = {token.text for token in nlp(from_neuralcoref)}
        is_similar = not corenlp_tokens.isdisjoint(neuralcoref_tokens)
    joined_mentions.loc[row_id, 'similar'] = is_similar

joined_mentions.head(50)

Unnamed: 0_level_0,Resolution,mention,similar
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
development-1,Phoebe Thomas,Phoebe Thomas,True
development-2,MacKenzie,He,False
development-3,Governor Angeloz,De la Sota,False
development-5,Kitty Oppenheimer,Her,False
development-6,Sandra Collins,Sandra Collins,True
development-7,Reb Asher 's brother,Reb Asher's brother Rabbi Shlomo Arieli,True
development-8,Greg Kot of the Chicago Tribune perceived `` f...,X,False
development-9,Mary,Mary,True
development-10,Christina Jennings,Christina Jennings,True
development-11,David Onley,X,False


In [93]:
joined_mentions['similar'].sum()

863

In [164]:
answers.shape

(1799, 4)

In [177]:
None