In [None]:
import spacy
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the large English model with the parser, tagger and NER pipes disabled;
# the cells below only read `has_vector` / `vector` from the tokens.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['parser', 'tagger', 'ner'],
)

In [None]:
# Gather every purely alphabetic lemma name from the WordNet synsets of the
# three requested parts of speech ('v', 'n', 'a'). Names containing
# underscores, hyphens or digits fail `isalpha()` and are dropped.
all_words = set()
for pos in ('v', 'n', 'a'):
    for synset in wn.all_synsets(pos):
        all_words.update(filter(str.isalpha, synset.lemma_names()))
        

In [None]:
# Map each collected word to the vector of its lower-cased form. The key keeps
# the word's original casing; words whose first token has no vector are skipped.
word_vectors = {}
for lemma in all_words:
    token = nlp(lemma.lower())[0]
    if not token.has_vector:
        continue
    word_vectors[lemma] = token.vector

In [None]:
# Load the word-pair dataset; later cells read its `word1`, `word2` and
# `synonym` columns.
df = pd.read_csv(filepath_or_buffer='../datasets/synonym_dataset.csv')

In [None]:
def pair_has_wordvec(row):
    """Return True when both words of a pair have an entry in `word_vectors`.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``'word1'`` and ``'word2'``
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    bool
        True only if ``row['word1']`` and ``row['word2']`` are both keys of
        the module-level ``word_vectors`` dict.
    """
    return all(row[column] in word_vectors for column in ('word1', 'word2'))

In [None]:
# Flag, row by row, the pairs for which both words have a vector available.
df['has_word_vec'] = df.apply(pair_has_wordvec, axis='columns')

In [None]:
# Keep only the pairs where both words have a vector. The explicit `.copy()`
# makes the result an independent frame, so adding columns to it afterwards
# does not raise pandas' SettingWithCopyWarning.
df = df.loc[df.has_word_vec].copy()

In [None]:
def cosine_similarity(row):
    """Cosine similarity between the vectors of the two words in a pair.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'word1'`` and ``'word2'``; both values must be
        keys of the module-level ``word_vectors`` dict.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``.

    Raises
    ------
    KeyError
        If either word has no entry in ``word_vectors``.
    """
    v1 = word_vectors[row['word1']]
    v2 = word_vectors[row['word2']]
    # Normalise by the norm of *each* vector; dividing by ||v1|| twice gives a
    # value that is not a cosine and depends on the order of the words.
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [None]:
# Score every remaining pair with the cosine similarity of its two word vectors.
df['cosine'] = df.apply(cosine_similarity, axis='columns')

In [None]:
pal = sns.color_palette()

# Overlay the cosine-similarity distributions of synonym and non-synonym pairs.
# `density=True` scales each histogram to unit area, so the two classes are
# comparable regardless of how many rows each has; it replaces the `normed`
# keyword, which has been removed from matplotlib's `hist`.
plt.hist(df.loc[df.synonym==1, 'cosine'], bins=100, density=True, label='synonym')
plt.hist(df.loc[df.synonym==0, 'cosine'], bins=100, density=True, alpha=0.5, label='not synonym')
plt.xlabel('cosine similarity')
plt.ylabel('density')
plt.legend()
plt.show()