In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import ndcg_score
from textsimilarity.get_similarity import get_sent_similarity, get_graph_similarity, cosine_similarity
from textsimilarity.preprocess import get_coref_and_dp
from textsimilarity.get_w2v_features import get_word2vec_embeddings
from textsimilarity.build_semantic_graph.build_semantic_graph import get_graph_from_sent
from textsimilarity import draw_graph
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
# from textsimilarity.srl_similarity import *
# Sentence encoder instantiated from a named sentence-transformers checkpoint.
model = SentenceTransformer(
    model_name_or_path='distilbert-base-nli-stsb-mean-tokens',
)

In [3]:
# Load the sentence-pair table: word pairs, one sentence per word, the human
# similarity rating and a precomputed SRL similarity score.
# NOTE(review): the file also yields "Unnamed: 0.1" / "Unnamed: 0" columns —
# presumably leftover saved indexes; confirm and consider dropping them.
dictionary = pd.read_csv(
    filepath_or_buffer='../data/dictionary/processed/Li_srl.csv',
)
dictionary

Unnamed: 0.1,Unnamed: 0,word1,word2,sent1,sent2,human_sim,srl_sim
0,0,cord,smile,"Cord is strong, thick string.",A smile is the expression that you have on you...,0.0100,0.000000
1,1,rooster,voyage,A rooster is an adult male chicken.,A voyage is a long journey on a ship or in a s...,0.0050,0.000000
2,2,noon,string,Noon is 12 o’clock in the middle of the day.,"String is thin rope made of twisted threads, u...",0.0125,0.000000
3,3,fruit,furnace,Fruit or a fruit is something which grows on a...,A furnace is a container or enclosed space in ...,0.0475,2.842466
4,4,autograph,shore,An autograph is the signature of someone famou...,"The shores or shore of a sea, lake, or wide ri...",0.0050,0.000000
...,...,...,...,...,...,...,...
59,59,cushion,pillow,A cushion is a fabric case filled with soft ma...,A pillow is a rectangular cushion which you re...,0.5225,1.055305
60,60,cemetery,graveyard,A cemetery is a place where dead people’s bodi...,"A graveyard is an area of land, sometimes near...",0.7725,1.882579
61,61,automobile,car,An automobile is a car.,A car is a motor vehicle with room for a small...,0.5575,0.000000
62,62,midday,noon,Midday is 12 o’clock in the middle of the day.,Noon is 12 o’clock in the middle of the day.,0.9550,0.000000


In [4]:
# NDCG of the SRL similarity ranking against the human ratings.
# ndcg_score takes 2-D inputs, so each column is lifted to a single row.
ndcg_score(
    y_true=dictionary['human_sim'].to_numpy()[np.newaxis, :],
    y_score=dictionary['srl_sim'].to_numpy()[np.newaxis, :],
)

0.6667961149148282

In [5]:
# Correlation matrix between the human ratings and the SRL similarity.
np.corrcoef(
    dictionary['human_sim'].to_numpy(),
    dictionary['srl_sim'].to_numpy(),
)

array([[1.        , 0.15253372],
       [0.15253372, 1.        ]])

In [6]:
# Pull the two sentence columns out of the table; row i of each forms one pair.
sentences1, sentences2 = dictionary['sent1'].values, dictionary['sent2'].values

In [7]:
# Encode each sentence column with the sentence encoder (first column first).
embeddings1, embeddings2 = (
    model.encode(sentence_batch) for sentence_batch in (sentences1, sentences2)
)

In [8]:
# Pairwise cosine similarity between the i-th embedding of each sentence column.
dictionary['bert_sim'] = [
    cosine_similarity(left_vec, right_vec)
    for left_vec, right_vec in zip(embeddings1, embeddings2)
]

dictionary.head()

Unnamed: 0.1,Unnamed: 0,word1,word2,sent1,sent2,human_sim,srl_sim,bert_sim
0,0,cord,smile,"Cord is strong, thick string.",A smile is the expression that you have on you...,0.01,0.0,0.03138
1,1,rooster,voyage,A rooster is an adult male chicken.,A voyage is a long journey on a ship or in a s...,0.005,0.0,0.008297
2,2,noon,string,Noon is 12 o’clock in the middle of the day.,"String is thin rope made of twisted threads, u...",0.0125,0.0,-0.062238
3,3,fruit,furnace,Fruit or a fruit is something which grows on a...,A furnace is a container or enclosed space in ...,0.0475,2.842466,0.143097
4,4,autograph,shore,An autograph is the signature of someone famou...,"The shores or shore of a sea, lake, or wide ri...",0.005,0.0,0.07334


In [34]:
# NDCG of the BERT cosine-similarity ranking against the human ratings.
# ndcg_score takes 2-D inputs, so each column is lifted to a single row.
ndcg_score(
    y_true=dictionary['human_sim'].to_numpy()[np.newaxis, :],
    y_score=dictionary['bert_sim'].to_numpy()[np.newaxis, :],
)

0.9787456506203578

In [9]:
# Correlation matrix between the human ratings and the BERT cosine similarity.
np.corrcoef(
    dictionary['human_sim'].to_numpy(),
    dictionary['bert_sim'].to_numpy(),
)

array([[1.        , 0.88485346],
       [0.88485346, 1.        ]])

In [5]:
# Recompute the SRL similarity for every sentence pair, replacing the
# `srl_sim` column that was loaded from the CSV.
# NOTE(review): get_wms_srl_recursive is not imported by the visible import
# cell (the `textsimilarity.srl_similarity` star import is commented out), and
# the recorded run of this cell ended in an AttributeError — confirm whether
# this cell should be kept.
dictionary['srl_sim'] = [
    get_wms_srl_recursive(first_sent, second_sent)
    for first_sent, second_sent in zip(sentences1, sentences2)
]

dictionary.head()

SRL got no result on sentence: Cord is strong, thick string.
SRL got no result on sentence: A rooster is an adult male chicken.


AttributeError: 'NoneType' object has no attribute 'text'

In [10]:
# Score every sentence pair three ways and store each score list as a column:
#   wms           - get_sent_similarity on the coref output and word2vec embeddings only
#   graph_sim_0   - the same call plus both semantic graphs, threshold=0
#   graph_sim_0.9 - the same call plus both semantic graphs, threshold=0.9
wms_sents = []
graph_sents0 = []
graph_sents1 = []

# Walk the two sentence columns directly instead of DataFrame.iterrows(); only
# these two fields are read. The unused middle return value of
# get_graph_from_sent is bound to `_ignored` rather than `_`, because assigning
# `_` at notebook top level stops IPython from updating its `_` last-output
# variable.
for sent1, sent2 in zip(dictionary.sent1, dictionary.sent2):
    graph1, _ignored, coref1 = get_graph_from_sent(sent1)
    # coref1 is joined with spaces — presumably a sequence of token strings; TODO confirm.
    w2v1 = get_word2vec_embeddings(' '.join(coref1), with_coref=False)

    graph2, _ignored, coref2 = get_graph_from_sent(sent2)
    w2v2 = get_word2vec_embeddings(' '.join(coref2), with_coref=False)

    wms_sents.append(get_sent_similarity(coref1, coref2, w2v1, w2v2))
    # NOTE(review): the meaning of the positional `0` after the graphs is not
    # visible in this notebook — check get_sent_similarity's signature.
    graph_sents0.append(get_sent_similarity(coref1, coref2, w2v1, w2v2, graph1, graph2, 0, threshold=0))
    graph_sents1.append(get_sent_similarity(coref1, coref2, w2v1, w2v2, graph1, graph2, 0, threshold=0.9))

dictionary['wms'] = wms_sents
dictionary['graph_sim_0'] = graph_sents0
dictionary['graph_sim_0.9'] = graph_sents1

dictionary.head()

  node_features.append(embeddings[[node['index']]].mean(axis=0))


Unnamed: 0.1,Unnamed: 0,word1,word2,sent1,sent2,human_sim,srl_sim,bert_sim,wms,graph_sim_0,graph_sim_0.9
0,0,cord,smile,"Cord is strong, thick string.",A smile is the expression that you have on you...,0.01,0.0,0.03138,0.480549,1.308765,0.873035
1,1,rooster,voyage,A rooster is an adult male chicken.,A voyage is a long journey on a ship or in a s...,0.005,0.0,0.008297,0.66793,1.90845,1.068503
2,2,noon,string,Noon is 12 o’clock in the middle of the day.,"String is thin rope made of twisted threads, u...",0.0125,0.0,-0.062238,0.337889,1.209262,0.904273
3,3,fruit,furnace,Fruit or a fruit is something which grows on a...,A furnace is a container or enclosed space in ...,0.0475,2.842466,0.143097,0.596266,2.364872,1.982638
4,4,autograph,shore,An autograph is the signature of someone famou...,"The shores or shore of a sea, lake, or wide ri...",0.005,0.0,0.07334,0.505734,2.35316,1.350107


In [37]:
# NDCG of the `wms` similarity ranking against the human ratings.
# ndcg_score takes 2-D inputs, so each column is lifted to a single row.
ndcg_score(
    y_true=dictionary['human_sim'].to_numpy()[np.newaxis, :],
    y_score=dictionary['wms'].to_numpy()[np.newaxis, :],
)

0.8385309220090293

In [38]:
# NDCG of the `graph_sim_0` (threshold=0) ranking against the human ratings.
# ndcg_score takes 2-D inputs, so each column is lifted to a single row.
ndcg_score(
    y_true=dictionary['human_sim'].to_numpy()[np.newaxis, :],
    y_score=dictionary['graph_sim_0'].to_numpy()[np.newaxis, :],
)

0.665854436242791

In [39]:
# NDCG of the `graph_sim_0.9` (threshold=0.9) ranking against the human ratings.
# ndcg_score takes 2-D inputs, so each column is lifted to a single row.
ndcg_score(
    y_true=dictionary['human_sim'].to_numpy()[np.newaxis, :],
    y_score=dictionary['graph_sim_0.9'].to_numpy()[np.newaxis, :],
)

0.7225152603858515

In [11]:
# Correlation matrix between the human ratings and the `wms` similarity.
np.corrcoef(
    dictionary['human_sim'].to_numpy(),
    dictionary['wms'].to_numpy(),
)

array([[1.        , 0.39451375],
       [0.39451375, 1.        ]])

In [12]:
# Correlation matrix between the human ratings and `graph_sim_0` (threshold=0).
np.corrcoef(
    dictionary['human_sim'].to_numpy(),
    dictionary['graph_sim_0'].to_numpy(),
)

array([[1.        , 0.11663205],
       [0.11663205, 1.        ]])

In [13]:
# Correlation matrix between the human ratings and `graph_sim_0.9` (threshold=0.9).
np.corrcoef(
    dictionary['human_sim'].to_numpy(),
    dictionary['graph_sim_0.9'].to_numpy(),
)

array([[1.        , 0.24539045],
       [0.24539045, 1.        ]])