In [98]:
import pandas as pd
import numpy as np
import os
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import spacy
nlp = spacy.load('en')

import textsimilarity
from textsimilarity.build_semantic_graph.build_tree import build_tree
from textsimilarity.build_semantic_graph.prune_and_merge_tree import prune
from textsimilarity.build_semantic_graph.rearrange_tree import rearrange
from textsimilarity.build_semantic_graph.build_graph import get_graph
from textsimilarity.build_semantic_graph.merge_graph import merge
from textsimilarity.build_semantic_graph.build_semantic_graph import merge_dp_coref
from textsimilarity.build_semantic_graph import build_semantic_graph
from textsimilarity import get_similarity


from textsimilarity.preprocess import get_coref_and_dp
from scipy.special import softmax

stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiajinghu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
def get_word2vec_embeddings(doc, with_coref=True):
    if with_coref:
        coref = get_coref_and_dp.get_neural_coreference(doc)
        doc = ' '.join(coref)
    nlp_doc = nlp(doc)
    emb = np.array([nlp_doc[i].vector for i in range(len(doc.split()))])
    return emb

In [44]:
def cosine_similarity(vec1, vec2):
    dot = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot/(norm1*norm2)

In [149]:
def get_wms(doc1, doc2, emb1, emb2):
    # we treat doc1 as the query document
    result1 = [(word, emb) for word, emb in zip(doc1,emb1) if (word not in stop_words) and (not np.all(emb==0))]
    document1 = [x[0] for x in result1]
    embedding1 = [x[1] for x in result1]
    
    result2 = [(word, emb) for word, emb in zip(doc2,emb2) if (word not in stop_words) and (not np.all(emb==0))]
    document2 = [x[0] for x in result2]
    embedding2 = [x[1] for x in result2]

    if len(embedding1)==0 or len(embedding2)==0:
        return 0
    else:
        s1 = set(document1)
        s2 = set(document2)
        
        d2 = {}
        for s in s2:
            d2[s] = document2.count(s)/len(document2)
        
        sent_sim = []
        for i2, e2 in enumerate(embedding2):
            word_sim = []
            for e1 in embedding1:
                word_sim.append(cosine_similarity(e1, e2))
            # print(document2[i2], word_sim)
            sent_sim.append(max(word_sim)*d2[document2[i2]])
            print(f'{document2[i2]} matches {document1[np.argmax(word_sim)]}')
    print(sent_sim)
    return sum(sent_sim)

In [92]:
def get_graph_from_sent(sentence):
    dp = get_coref_and_dp.get_spacy_dependency(sentence)
    coref = get_coref_and_dp.get_neural_coreference(sentence)
    w2v = get_word2vec_embeddings(' '.join(coref))
    data = build_semantic_graph.merge_dp_coref([dp], [coref])
    sent = build_tree(data[0])
    sent = {'sequence':sent['words'], 'tree':prune(sent['tree'], sent['words'])}
    sent = {'sequence': sent['sequence'], 'tree': rearrange(sent['tree'], sent['sequence'])}
    return get_graph(sent['tree']), w2v, dp, coref

In [108]:
def get_node_embeddings(graph, embeddings):
    node_features = []
    for node in graph['nodes']:
        node_features.append(embeddings[[node['index']]].mean(axis=0))            
    return np.array(node_features)

def get_graph_similarity(graph1, graph2, emb1, emb2, threshold):
    # we treat graph1 as the query document
    embedding1 = get_node_embeddings(graph1, emb1)
    embedding2 = get_node_embeddings(graph2, emb2)
    
    w_match = 1
    w_nonmatch = 0.5
    
    graph_sim = []
    node_match = []
    for i2, e2 in enumerate(embedding2):
        node_sim = []
        for e1 in embedding1:
            if (np.all(e1==0))|(np.all(e2==0)):
                node_sim.append(0)
            else:
                node_sim.append(get_similarity.cosine_similarity(e1, e2))
        max_sim_node = np.argmax(node_sim)
        if max(node_sim) >= threshold:
            node_match.append((max_sim_node, i2))
        graph_sim.append(max(node_sim))
        print(f"{graph2['nodes'][i2]['word']} matches {graph1['nodes'][max_sim_node]['word']}")
        
    print(np.array(graph_sim)/len(graph2))
#     print('node match: ', node_match)
    if len(node_match) <= 1:
        return sum(graph_sim)*w_nonmatch/len(graph2), None
    else:
        weights = np.zeros(len(graph_sim)) + w_nonmatch
        for i in range(len(node_match)):
            for j in range(i, len(node_match)):
                n1_g1 = node_match[i][0]
                n1_g2 = node_match[i][1]
                n2_g1 = node_match[j][0]
                n2_g2 = node_match[j][1]
                
                n1_n2_g1 = graph1['edges'][n1_g1][n2_g1]
                n2_n1_g1 = graph1['edges'][n2_g1][n1_g1]

                n1_n2_g2 = graph2['edges'][n1_g2][n2_g2]
                n2_n1_g2 = graph2['edges'][n2_g2][n1_g2]

                if ((n1_n2_g1!='')&(n1_n2_g1 == n1_n2_g2))|((n2_n1_g1!='')&(n2_n1_g1 == n2_n1_g2)):
                    weights[n1_g2] += (w_match-w_nonmatch)/len(graph2)
                    weights[n2_g2] += (w_match-w_nonmatch)/len(graph2)
#                     print(n1_g2)
#                     print(n2_g2)
                    print('(', graph2['nodes'][n1_g2]['word'], ',', graph2['nodes'][n2_g2]['word'], ') matches ','(',
                         graph1['nodes'][n1_g1]['word'], ',', graph1['nodes'][n2_g1]['word'], ')')
                    
        return np.dot(graph_sim, weights)/len(graph2), weights

In [147]:
def get_graph_similarity_wms(sent1, sent2, graph1, graph2, emb1, emb2, threshold):
    
    # we treat graph1 as the query document
    embedding1 = get_node_embeddings(graph1, emb1)
    embedding2 = get_node_embeddings(graph2, emb2)
    
    w_match = 1
    w_nonmatch = 0.5
    
    graph_sim = []
    node_match = []
    for i2, n2 in enumerate(graph2['nodes']):
        node_sim = []
        for i1, n1 in enumerate(graph1['nodes']):
            sent_wms = get_wms(n1['word'].split(' '), 
                                   n2['word'].split(' '),
                                   emb1.take(n1['index'], axis=0),
                                   emb2.take(n2['index'], axis=0))
            if sent_wms == None:
                node_sim.append(0)
            else:
                node_sim.append(sent_wms)
        max_sim_node = np.argmax(node_sim)
        if max(node_sim) >= threshold:
            node_match.append((max_sim_node, i2))
        graph_sim.append(max(node_sim))
        print(f"{graph2['nodes'][i2]['word']} matches {graph1['nodes'][max_sim_node]['word']}")
        
    print(np.array(graph_sim)/len(graph2))
#     print('node match: ', node_match)
    if len(node_match) <= 1:
        return sum(graph_sim)*w_nonmatch/len(graph2), None
    else:
        weights = np.zeros(len(graph_sim)) + w_nonmatch
        for i in range(len(node_match)):
            for j in range(i, len(node_match)):
                n1_g1 = node_match[i][0]
                n1_g2 = node_match[i][1]
                n2_g1 = node_match[j][0]
                n2_g2 = node_match[j][1]
                
                n1_n2_g1 = graph1['edges'][n1_g1][n2_g1]
                n2_n1_g1 = graph1['edges'][n2_g1][n1_g1]

                n1_n2_g2 = graph2['edges'][n1_g2][n2_g2]
                n2_n1_g2 = graph2['edges'][n2_g2][n1_g2]

                if ((n1_n2_g1!='')&(n1_n2_g1 == n1_n2_g2))|((n2_n1_g1!='')&(n2_n1_g1 == n2_n1_g2)):
                    weights[n1_g2] += (w_match-w_nonmatch)/len(graph2)
                    weights[n2_g2] += (w_match-w_nonmatch)/len(graph2)
#                     print(n1_g2)
#                     print(n2_g2)
                    print('(', graph2['nodes'][n1_g2]['word'], ',', graph2['nodes'][n2_g2]['word'], ') matches ','(',
                         graph1['nodes'][n1_g1]['word'], ',', graph1['nodes'][n2_g1]['word'], ')')
                    
        return np.dot(graph_sim, weights)/len(graph2), weights

In [101]:
def get_sent_similarity(sent1, sent2, wms_weight, threshold=None):
    g1, w2v1, dp1, coref1 = get_graph_from_sent(sent1)
    g2, w2v2, dp2, coref2 = get_graph_from_sent(sent2)
    
    if wms_weight == 0:
        graph_sim = get_graph_similarity(g1, g2, w2v1, w2v2, threshold)[0]
        return graph_sim
    elif wms_weight == 1:
        wms = get_wms(coref1, coref2, w2v1, w2v2)
        return wms
    else:
        wms = get_wms(coref1, coref2, w2v1, w2v2)
        graph_sim = get_graph_similarity(g1, g2, w2v1, w2v2, threshold)[0]
        return wms_weight*wms + (1-wms_weight)*graph_sim

In [129]:
def get_sent_similarity_embed_wms(sent1, sent2, wms_weight, threshold=None):
    g1, w2v1, dp1, coref1 = get_graph_from_sent(sent1)
    g2, w2v2, dp2, coref2 = get_graph_from_sent(sent2)
    
    if wms_weight == 0:
        graph_sim = get_graph_similarity_wms(sent1, sent2, g1, g2, w2v1, w2v2, threshold)[0]
        return graph_sim
    elif wms_weight == 1:
        wms = get_wms(coref1, coref2, w2v1, w2v2)
        return wms
    else:
        wms = get_wms(coref1, coref2, w2v1, w2v2)
        graph_sim = get_graph_similarity_wms(sent1, sent2, g1, g2, w2v1, w2v2, threshold)[0]
        return wms_weight*wms + (1-wms_weight)*graph_sim

In [200]:
DATA_DIR = 'bbcsports'
train = pd.read_csv(os.path.join('../data', DATA_DIR, 'raw', 'train.csv'))
test = pd.read_csv(os.path.join('../data', DATA_DIR, 'raw', 'test.csv'))
wms = np.load(os.path.join('../data', DATA_DIR, 'processed', f'wms_train.npy'))
wms_test = np.load(os.path.join('../data', DATA_DIR, 'processed', f'wmd_test.npy'))
graph_sim = np.load(os.path.join('../data', DATA_DIR, 'processed', 'sim_train_wms_wmsweight_0.0_threshold0.9.npy'))

In [201]:
wms_diagmax = wms + np.identity(wms.shape[0])*wms.max()
wms_diagmax
wmd = wms.max() - wms_diagmax
wmd

array([[0.        , 8.63328227, 8.66166243, ..., 8.66043857, 7.90423293,
        8.40649062],
       [8.63328227, 0.        , 8.65322615, ..., 8.74688693, 7.91890335,
        8.43698883],
       [8.66166243, 8.65322615, 0.        , ..., 8.69331454, 7.9897584 ,
        8.45857386],
       ...,
       [8.66043857, 8.74688693, 8.69331454, ..., 0.        , 7.83710559,
        8.35932456],
       [7.90423293, 7.91890335, 7.9897584 , ..., 7.83710559, 0.        ,
        8.30595917],
       [8.40649062, 8.43698883, 8.45857386, ..., 8.35932456, 8.30595917,
        0.        ]])

In [202]:
wmd_test = wms.max()-wms_test

In [195]:
graph_distance = graph_sim.max()-graph_sim
graph_distance

array([[0.        , 6.51990106, 6.40122763, ..., 5.92252394, 6.06699425,
        6.99808362],
       [6.51990106, 0.        , 6.31260664, ..., 6.14676593, 6.32416242,
        7.01503861],
       [6.40122763, 6.31260664, 0.        , ..., 6.05052924, 6.34765894,
        7.16051235],
       ...,
       [5.92252394, 6.14676593, 6.05052924, ..., 0.        , 5.99632615,
        7.0123436 ],
       [6.06699425, 6.32416242, 6.34765894, ..., 5.99632615, 0.        ,
        7.0343109 ],
       [6.99808362, 7.01503861, 7.16051235, ..., 7.0123436 , 7.0343109 ,
        0.        ]])

In [212]:
knn = KNeighborsClassifier(metric="precomputed")
knn.fit(wmd, train.label)
pred = knn.predict(wmd)
train['wmd_pred'] = pred
train.head()

Unnamed: 0,label,sentence,wmd_pred,graph_pred
0,cricket,lee who took 2 24 off seven overs on saturday ...,rugby,cricket
1,athletics,radcliffe must make a decision by tuesday the ...,football,cricket
2,cricket,shoaib believes injuries are an inevitable par...,rugby,football
3,athletics,the only time you see british sprinters gettin...,athletics,football
4,tennis,but it pays off and the spaniard edges ahead i...,football,football


In [214]:
knn.score(wmd_test, test.label)

0.33590576766856217

In [219]:
g_knn = KNeighborsClassifier(metric="precomputed", n_neighbors=10)
g_knn.fit(graph_distance, train.label)
pred = g_knn.predict(graph_distance)
train['graph_pred'] = pred
train.head()

Unnamed: 0,label,sentence,wmd_pred,graph_pred
0,cricket,lee who took 2 24 off seven overs on saturday ...,rugby,cricket
1,athletics,radcliffe must make a decision by tuesday the ...,football,cricket
2,cricket,shoaib believes injuries are an inevitable par...,rugby,football
3,athletics,the only time you see british sprinters gettin...,athletics,football
4,tennis,but it pays off and the spaniard edges ahead i...,football,football


In [220]:
g_knn.score(graph_distance, train.label)

0.48777464788732394

In [161]:
wmd_correct_graph_wrong = train[(train.label==train.wmd_pred)&(train.label!=train.graph_pred)]
print(wmd_correct_graph_wrong.shape)
graph_correct_wmd_wrong = train[(train.label!=train.wmd_pred)&(train.label==train.graph_pred)]
print(graph_correct_wmd_wrong.shape)
both_wrong = train[(train.label!=train.wmd_pred)&(train.label!=train.graph_pred)]
print(both_wrong.shape)

(2448, 4)
(1723, 4)
(2534, 4)


In [27]:
wmd[0,:].argsort()

array([   0, 1232,  353, ..., 5983, 1313, 1525])

In [28]:
train.take(wmd[0,:].argsort()[:10])

Unnamed: 0,label,sentence,wmd_pred
0,cricket,lee who took 2 24 off seven overs on saturday ...,football
1232,football,arsenal v stokeswindon notts co v middlesbroug...,cricket
353,rugby,but you talk about certain players and they ll...,football
6227,rugby,dimitri yachvili biarritz pierre mignoni clerm...,rugby
3897,rugby,dimitri yachvili biarritz pierre mignoni clerm...,rugby
4429,football,derby v watford or fulhamman utd or exeter v m...,football
1650,tennis,the 2002 champion thomas johansson fought back...,tennis
1244,football,van nistelrooy hungry for returnmanchester uni...,football
3484,cricket,club spokesman keith cook told bbc sport we ha...,rugby
2544,football,people want to write him off but if he has kep...,football


In [10]:
train.iloc[0].sentence

'lee who took 2 24 off seven overs on saturday and also sent michael papps to hospital with a bouncer was contrite afterwards'

In [29]:
train.iloc[1232].sentence

'arsenal v stokeswindon notts co v middlesbroughman utd v exeterplymouth v evertonleicester v blackpoolderby v wigansunderland v crystal palacewolves v millwallyeading v newcastlehull v colchestertottenham v brightonreading v stockport swanseabirmingham v leedshartlepool v bostonmilton keynes dons v peterborougholdham v man citychelsea v scunthorpecardiff v blackburncharlton v rochdalewest ham v norwichsheff utd v aston villapreston v west bromrotherham v yeovilburnley v liverpoolbournemouth v chestercoventry v crewewatford v fulhamipswich v boltonportsmouth v gillinghamnorthampton v southamptonqpr v nottm forestluton v hinckley brentfordmatches to be played on weekend of 8 9 january'

In [30]:
train.iloc[353].sentence

'but you talk about certain players and they ll say if they think they re up to scratch or that they don t want them in their team'

In [38]:
wmd_correct_graph_wrong

Unnamed: 0,label,sentence,wmd_pred,graph_pred
7,football,gunners keeper manuel almunia who got the nod ...,football,cricket
9,football,so now he says that he is heading for corinthi...,football,athletics
12,rugby,england are second bottom in the six nations t...,rugby,football
14,football,teenager rooney returned to everton after euro...,football,tennis
24,rugby,ireland next face france at lansdowne road in ...,rugby,football
...,...,...,...,...
8848,athletics,world indoor 60m hurdles championalso won impr...,athletics,football
8853,cricket,they batted well in tough conditions and when ...,cricket,football
8859,athletics,what now for kelly holmes last april kelly hol...,athletics,football
8868,rugby,dominici backs lacklustre francewing christoph...,rugby,football


In [41]:
train.take(wmd[7,:].argsort()[:10])

Unnamed: 0,label,sentence,wmd_pred,graph_pred
7,football,gunners keeper manuel almunia who got the nod ...,football,cricket
1232,football,arsenal v stokeswindon notts co v middlesbroug...,cricket,cricket
6227,rugby,dimitri yachvili biarritz pierre mignoni clerm...,rugby,cricket
3897,rugby,dimitri yachvili biarritz pierre mignoni clerm...,rugby,football
1244,football,van nistelrooy hungry for returnmanchester uni...,football,cricket
353,rugby,but you talk about certain players and they ll...,football,football
6708,cricket,salman butt imran farhat younis khan yousuf yo...,cricket,cricket
4429,football,derby v watford or fulhamman utd or exeter v m...,football,football
3484,cricket,club spokesman keith cook told bbc sport we ha...,rugby,cricket
535,football,klinsmann issues lehmann warninggermany coach ...,football,cricket


In [42]:
train.take(graph_distance[7,:].argsort()[:10])

Unnamed: 0,label,sentence,wmd_pred,graph_pred
7,football,gunners keeper manuel almunia who got the nod ...,football,cricket
2816,rugby,bell who featured in the england a side which ...,tennis,football
25,rugby,he was due to have a hospital scan on monday w...,football,football
3964,cricket,having declared and sacrificed the possibility...,cricket,football
7274,cricket,having pulled charl willoughby through mid wic...,rugby,cricket
5248,football,but the chester chairman added after they disc...,football,football
8171,athletics,kenteris 31 who won the men s 200m title at th...,athletics,athletics
2609,cricket,mushtaq who took 185 wickets in a 52 test care...,cricket,cricket
100,rugby,umaga admitted the fear of injury weighed on h...,rugby,cricket
1256,cricket,boje tosses one up from over the wicket and jo...,football,cricket


In [43]:
train.iloc[7].sentence

'gunners keeper manuel almunia who got the nod ahead of jens lehmann did well to save a well struck frank lampard shot'

In [46]:
train.iloc[2816].sentence

'bell who featured in the england a side which beat france 30 20 10 days ago added i recognise that i got into the england a squad because of injuries'

In [102]:
get_sent_similarity(train.iloc[7].sentence, train.iloc[2816].sentence, 1)

bell matches almunia
featured matches got
england matches lehmann
side matches lampard
beat matches got
france matches jens
30 matches lampard
20 matches lehmann
10 matches struck
days matches jens
ago matches ahead
added matches struck
recognise matches got
got matches got
england matches lehmann
squad matches well
injuries matches gunners
[0.03767391162760118, 0.03776483325397267, 0.05978843745063333, 0.021968594368766334, 0.04051180096233592, 0.032288193702697754, 0.017958293942844167, 0.007329944740323459, 0.006678421269444858, 0.014843995080274694, 0.023190691190607408, 0.027043009505552405, 0.026110899799010334, 0.05024772882461548, 0.05914577315835392, 0.021104740745881024, 0.029578229960273292]


0.5132274995831881

In [109]:
get_sent_similarity(train.iloc[7].sentence, train.iloc[2816].sentence, 0, 0.9)

added matches did well
bell matches gunners keeper manuel almunia
featured matches got
who matches who
a side matches a well struck frank lampard shot
beat matches got
which matches who
france matches gunners keeper manuel almunia
in the england matches of jens lehmann
20 10 days ago matches of jens lehmann
recognise matches got
i matches who
got matches got
i matches who
a squad matches a well struck frank lampard shot
that matches who
into the england matches of jens lehmann
because matches ahead
injuries matches gunners keeper manuel almunia
of matches of jens lehmann
[0.24767241 0.3453486  0.32100108 0.4121512  0.24753407 0.3443503
 0.21119106 0.29877204 0.25279742 0.12138194 0.22194265 0.14371303
 0.4271057  0.22383317 0.2374913  0.14017054 0.24941224 0.20708103
 0.15396719 0.14678605]


  node_features.append(embeddings[[node['index']]].mean(axis=0))


2.476851500570774

In [150]:
get_sent_similarity_embed_wms(train.iloc[7].sentence, train.iloc[2816].sentence, 0, 0.9)

added matches well
[0.09606555104255676]
added matches manuel
[0.30098938941955566]
added matches got
[0.4205186069011688]
added matches nod
[-0.10475103557109833]
added matches ahead
[0.017317883670330048]
added matches jens
[0.08117477595806122]
added matches save
[0.19132281839847565]
added matches struck
[0.45973116159439087]
added matches a well struck frank lampard shot
bell matches well
[0.2248062938451767]
bell matches almunia
[0.64045649766922]
bell matches got
[0.13924570381641388]
bell matches nod
[0.1669345200061798]
bell matches ahead
[0.07046061009168625]
bell matches jens
[0.5451347827911377]
bell matches save
[0.3377898633480072]
bell matches lampard
[0.4371160566806793]
bell matches gunners keeper manuel almunia
featured matches well
[-0.01187386829406023]
featured matches manuel
[0.21964016556739807]
featured matches got
[0.6420021653175354]
featured matches nod
[0.03307522088289261]
featured matches ahead
[-0.11111399531364441]
featured matches jens
[-0.0470263622701

  node_features.append(embeddings[[node['index']]].mean(axis=0))


1.671551605220884

In [59]:
train.iloc[7].sentence

'gunners keeper manuel almunia who got the nod ahead of jens lehmann did well to save a well struck frank lampard shot'

In [48]:
train.iloc[1244].sentence

'van nistelrooy hungry for returnmanchester united striker ruud van nistelrooy said he was hungry to play as he returned to training on tuesday'

In [104]:
get_sent_similarity(train.iloc[7].sentence, train.iloc[1244].sentence, 1)

van matches almunia
nistelrooy matches almunia
hungry matches lampard
returnmanchester matches almunia
united matches manuel
striker matches keeper
ruud matches keeper
van matches almunia
nistelrooy matches lehmann
said matches got
van nistelrooy hungry for returnmanchester united striker ruud van nistelrooy matches almunia
hungry matches lampard
play matches almunia
van nistelrooy hungry for returnmanchester united striker ruud van nistelrooy matches keeper
returned matches keeper
training matches lehmann
tuesday matches frank
[0.07479828245499555, 0.05776873756857479, 0.06125042017768411, 0.030854183084824505, 0.03930818333345301, 0.03190899245879229, 0.03175855033537921, 0.06607097036698285, 0.05818829115699319, 0.030330461614272174, 0.07258847881765927, 0.061751576030955595, 0.030854183084824505, 0.06383565594168271, 0.03193635449689977, 0.030714248909669763, 0.024667561054229736]


0.7985851308878731

In [105]:
get_sent_similarity(train.iloc[7].sentence, train.iloc[1244].sentence, 0, 0.9)

said matches got
van nistelrooy matches gunners keeper manuel almunia
hungry matches did well
for returnmanchester united striker ruud van nistelrooy matches gunners keeper manuel almunia
was matches did well
van nistelrooy hungry for returnmanchester united striker ruud van nistelrooy matches gunners keeper manuel almunia
hungry matches gunners keeper manuel almunia
to play matches gunners keeper manuel almunia
returned matches gunners keeper manuel almunia
van nistelrooy hungry for returnmanchester united striker ruud van nistelrooy matches gunners keeper manuel almunia
as matches gunners keeper manuel almunia
to training matches gunners keeper manuel almunia
on tuesday matches did well
[0.51561785, 0.69243675, 0.38015696, 0.7697212, 0.49986517, 0.6966078, 0.38383353, 0.5640406, 0.5345552, 0.5227442, 0.6457718, 0.6753479, 0.5534012]


  node_features.append(embeddings[[node['index']]].mean(axis=0))


1.8585250452160835

In [151]:
get_sent_similarity_embed_wms(train.iloc[7].sentence, train.iloc[1244].sentence, 0, 0.9)

said matches well
[-0.02419951558113098]
said matches manuel
[0.2965957820415497]
said matches got
[0.515617847442627]
said matches nod
[-0.028672778978943825]
said matches ahead
[-0.11177268624305725]
said matches lehmann
[0.14669086039066315]
said matches save
[0.09168779104948044]
said matches struck
[0.5021913647651672]
said matches got
van matches well
nistelrooy matches well
[0.06986905634403229, 0.18034853041172028]
van matches almunia
nistelrooy matches almunia
[0.3178927004337311, 0.24551713466644287]
van matches got
nistelrooy matches got
[0.08234793692827225, 0.13480320572853088]
van matches nod
nistelrooy matches nod
[0.007625386118888855, 0.05281656235456467]
van matches ahead
nistelrooy matches ahead
[0.025330714881420135, 0.1174120381474495]
van matches lehmann
nistelrooy matches lehmann
[0.2160801738500595, 0.23275339603424072]
van matches save
nistelrooy matches save
[0.06995730102062225, 0.07835730165243149]
van matches lampard
nistelrooy matches lampard
[0.2639704942

  node_features.append(embeddings[[node['index']]].mean(axis=0))


1.4610962308943272

In [110]:
g1

{'nodes': [{'type': 'A',
   'dep': 'root',
   'pos': 'AUX',
   'word': 'did well',
   'index': [12, 13]},
  {'type': 'A',
   'dep': 'nsubj',
   'pos': 'PROPN',
   'word': 'gunners keeper manuel almunia',
   'index': [0, 1, 2, 3]},
  {'type': 'V', 'dep': 'relcl', 'pos': 'VERB', 'word': 'got', 'index': [5]},
  {'type': 'A', 'dep': 'nsubj', 'pos': 'PRON', 'word': 'who', 'index': [4]},
  {'type': 'A',
   'dep': 'dobj',
   'pos': 'NOUN',
   'word': 'the nod',
   'index': [6, 7]},
  {'type': 'A', 'dep': 'advmod', 'pos': 'ADV', 'word': 'ahead', 'index': [8]},
  {'type': 'M',
   'dep': 'pobj',
   'pos': 'PROPN',
   'word': 'of jens lehmann',
   'index': [9, 10, 11]},
  {'type': 'V',
   'dep': 'xcomp',
   'pos': 'VERB',
   'word': 'to save',
   'index': [14, 15]},
  {'type': 'V',
   'dep': 'dobj',
   'pos': 'VERB',
   'word': 'a well struck frank lampard shot',
   'index': [16, 17, 18, 19, 20, 21]}],
 'edges': [['', '', '', '', '', '', '', '', ''],
  ['nsubj', '', '', '', '', '', '', '', ''],
 