## Imports, as always!

In [None]:
import logging
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt

from corpora import *
from my_utils import *
import score

In [None]:
# Shared inputs for the cells below.
# Passed as the third argument to score.spearman in the evaluation cells.
bn2wn_mapping_path = '../resources/bn2wn_mapping.txt'
# InputSentences is not defined in this notebook; presumably it comes from one
# of the star imports above. Each instance is handed to Word2Vec as the
# training corpus further down.
sentences_p = InputSentences('../EuroSense/parsed_low_stp/sentences_precision.txt')
sentences_c = InputSentences('../EuroSense/parsed_low_stp/sentences_coverage.txt')
# Same class, pointed at the directory instead of a single file.
sentences_folder = InputSentences('../EuroSense/parsed_low_stp/')
# Passed as the first argument to score.spearman in the evaluation cells.
test_data_path = "../wordsim353/combined.tab"

# Timestamped INFO-level logging on the root logger.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Parsing Corpora

In [None]:
# Parse the high-precision XML corpus into a sentences file.
corpora_path = '../EuroSense/eurosense.v1.0.high-precision.xml'
bn2wn_mapping_path = '../resources/bn2wn_mapping.txt'
# NOTE(review): this is the only path in the notebook that starts with './'
# instead of '../', and the 'parsed_lower' directory differs from the
# 'parsed_low_stp' directory the training cells read from - confirm both.
outfile_path = './EuroSense/parsed_lower/sentences_precision.txt'

# `parse` is not defined in this notebook; presumably it comes from one of the
# star imports.
parse(corpora_path, bn2wn_mapping_path, outfile_path, "precision")
# Author's recorded count: 1,903,116 sentences

In [None]:
# Parse the high-coverage XML corpus into a sentences file.
# NOTE(review): the directory is doubled here ('EuroSense/EuroSense'), unlike
# the high-precision corpus path used elsewhere in this notebook - confirm.
corpora_path = '../EuroSense/EuroSense/eurosense.v1.0.high-coverage.xml'
bn2wn_mapping_path = '../resources/bn2wn_mapping.txt'
# NOTE(review): the file name is spelled 'sentenses_coverage.txt', while the
# training setup reads 'sentences_coverage.txt' - confirm the intended name.
outfile_path = '../EuroSense/parsed_lower/sentenses_coverage.txt'

# `parse` is not defined in this notebook; presumably it comes from one of the
# star imports.
parse(corpora_path, bn2wn_mapping_path, outfile_path, "coverage")
# Author's recorded count: 1,903,181 sentences

## Training the Model; Parameter Tuning

All tunings are done with the following parameters:

dataset = precision; min_count = 5; workers = 5; score for words not found = -1

### 1. Tuning 'window'
size = 400 (from the reference paper); alpha = 0.025; iter = 5 (default)

In [None]:
# Candidate context-window sizes for the sweep, plus fresh result lists that
# the sweep loop appends to (one entry per window).
windows = [3, 5, 6, 8, 10]
correlations = []
pvals = []

In [None]:
# Window sweep: for each candidate window, train a Word2Vec model on
# `sentences_p`, save its vectors to a per-window file, and score that file
# with score.spearman against the test data.
for windw in windows:
    embeddings_path = "../resources/embeddings_win{}.vec".format(windw)

    # Only the model from the last iteration stays bound to `model_win`.
    model_win = Word2Vec(sentences_p, size=400, workers=5, window=windw)
    save_embeddings(embeddings_path, model_win.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Drop the model left bound by the window sweep; only the collected scores are
# needed for plotting.
del model_win

fig = plt.gcf()
fig.set_size_inches(8.5, 3)

# One (y-values, line format, y-axis label, title) entry per side-by-side panel.
window_panels = (
    (correlations, 'b', 'Correlation', "Correlation graph"),
    (pvals, 'r', 'p-value', "p-value graph"),
)
for panel_no, (y_values, line_fmt, y_label, panel_title) in enumerate(window_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(windows, y_values, line_fmt)
    plt.xlabel('Window')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid()

plt.tight_layout()
plt.show()

# Raw numbers behind the two curves.
for collected in (correlations, pvals):
    print(collected)

### 2. High precision corpus vs High coverage corpus vs Combined (not uniquely)
window = 6; size = 400 (from the reference paper); alpha = 0.025; iter = 5 (default)

In [None]:
# Tick labels for the corpus-comparison bar charts, plus fresh result lists.
# Results are expected to be appended in the same order as `names`.
names = ["Precision", "Coverage", "Combined"]
correlations = []
pvals = []

In [None]:
def _fit_save_score(corpus, vec_path):
    """Train with this section's fixed settings, save the vectors, score them.

    Returns the trained model together with whatever score.spearman returns
    for the saved vector file.
    """
    fitted = Word2Vec(corpus, size=400, workers=5, window=6)
    save_embeddings(vec_path, fitted.wv)
    return fitted, score.spearman(test_data_path, vec_path, bn2wn_mapping_path)


# --- sentences_p ---
embeddings_path = "../resources/embeddings_precision.vec"
model_precision, (corr, p, data) = _fit_save_score(sentences_p, embeddings_path)
correlations.append(corr)
pvals.append(p)

# --- sentences_c ---
embeddings_path = "../resources/embeddings_coverage.vec"
model_coverage, (corr, p, data) = _fit_save_score(sentences_c, embeddings_path)
correlations.append(corr)
pvals.append(p)

# --- sentences_folder ---
embeddings_path = "../resources/embeddings_combined.vec"
model_comb, (corr, p, data) = _fit_save_score(sentences_folder, embeddings_path)
correlations.append(corr)
pvals.append(p)

In [None]:
# Report the vocabulary size of each of the three trained models, then release
# the models.
precision_vocab_size = len(model_precision.wv.vocab)
coverage_vocab_size = len(model_coverage.wv.vocab)
combined_vocab_size = len(model_comb.wv.vocab)

print("Vocab size for high precision data is: {:,d}".format(precision_vocab_size))
print("Vocab size for high coverage data is: {:,d}".format(coverage_vocab_size))
print("Vocab size for the combined data is: {:,d}".format(combined_vocab_size))

del model_precision, model_coverage, model_comb

In [None]:
# Bar charts comparing the entries of `names` on correlation (left panel) and
# p-value (right panel).
bar_width = 0.3
# Bar positions: one slot per collected result; the right panel's bars are
# shifted by one bar width.
pos1 = np.arange(len(correlations))
pos2 = [x + bar_width for x in pos1]

fig = plt.gcf()
fig.set_size_inches(8, 3)

plt.subplot(1, 2, 1)
plt.bar(pos1, correlations, color='blue', width=bar_width)
plt.title("Correlation graph")
plt.xticks(pos1, names)
plt.grid()

plt.subplot(1, 2, 2)
# Must be `pvals`, the list filled in step with `correlations`. `_pvals` is a
# separate scratch list that is only created (empty) much later in the
# notebook, so reading it here fails or plots nothing useful.
plt.bar(pos2, pvals, color='red', width=bar_width)
plt.title("p-value graph")
plt.xticks(pos2, names)
plt.grid()

plt.tight_layout()
plt.show()

# Raw numbers behind the two charts.
print(correlations)
print(pvals)

### 3. Tuning vector 'size'
window = 6; alpha = 0.025; iter = 5 (default); combined dataset

In [None]:
# Candidate vector dimensionalities for the sweep, plus fresh result lists
# that the sweep loop appends to (one entry per size).
sizes = [100, 200, 300, 400, 500, 600]
correlations = []
pvals = []

In [None]:
# Size sweep: for each candidate vector size, train a Word2Vec model on
# `sentences_folder`, save its vectors to a per-size file, and score that file
# with score.spearman against the test data.
for s in sizes:
    embeddings_path = "../resources/embeddings_size{}.vec".format(s)

    # Only the model from the last iteration stays bound to `model_size`.
    model_size = Word2Vec(sentences_folder, workers=5, size=s, window=6)
    save_embeddings(embeddings_path, model_size.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Drop the model left bound by the size sweep; only the collected scores are
# needed for plotting.
del model_size

fig = plt.gcf()
fig.set_size_inches(8.5, 3)

# One (y-values, line format, y-axis label, title) entry per side-by-side panel.
size_panels = (
    (correlations, 'b', 'Correlation', "Correlation graph for vector 'size'"),
    (pvals, 'r', 'p-value', "p-value graph for vector 'size'"),
)
for panel_no, (y_values, line_fmt, y_label, panel_title) in enumerate(size_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(sizes, y_values, line_fmt)
    plt.xlabel('Vector sizes')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid()

plt.tight_layout()
plt.show()

# Raw numbers behind the two curves.
for collected in (correlations, pvals):
    print(collected)

### 4.1. Tuning starting 'alpha'
size = 500; window = 6; iter = 5 (default)

In [None]:
# Candidate initial learning rates for the first alpha sweep, plus fresh
# result lists that the sweep loop appends to (one entry per alpha).
alphas = [0.02, 0.04, 0.06, 0.08, 0.1]
correlations = []
pvals = []

In [None]:
# Alpha sweep: for each candidate initial learning rate, train a Word2Vec
# model on `sentences_folder`, save its vectors to a per-alpha file, and score
# that file with score.spearman against the test data.
for alph in alphas:
    embeddings_path = "../resources/embeddings_alpha{}.vec".format(alph)

    # Only the model from the last iteration stays bound to `model_alph`.
    model_alph = Word2Vec(sentences_folder, workers=5, size=500, window=6, alpha=alph)
    save_embeddings(embeddings_path, model_alph.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Re-declares the same alpha grid and creates two empty, underscore-prefixed
# lists. The plotting cell that follows reads `correlations` and `pvals`, not
# these underscore-prefixed lists.
# NOTE(review): looks like leftover scratch state - confirm it is still needed.
alphas = [0.02, 0.04, 0.06, 0.08, 0.1]
_correlations = []
_pvals = []

In [None]:
# Drop the model left bound by the alpha sweep; only the collected scores are
# needed for plotting.
del model_alph

fig = plt.gcf()
fig.set_size_inches(8.5, 3)

# One (y-values, line format, y-axis label, title) entry per side-by-side panel.
alpha_panels = (
    (correlations, 'b', 'Correlation', "Correlation graph for alpha"),
    (pvals, 'r', 'p-value', "p-value graph for alpha"),
)
for panel_no, (y_values, line_fmt, y_label, panel_title) in enumerate(alpha_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(alphas, y_values, line_fmt)
    plt.xlabel('Alpha')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid()

plt.tight_layout()
plt.show()

# Raw numbers behind the two curves.
for collected in (correlations, pvals):
    print(collected)

### 4.2. Tuning starting 'alpha' (finer sweep over smaller values)
size = 500; window = 6; iter = 5 (default)

In [None]:
# Candidate initial learning rates for the second alpha sweep, plus fresh
# result lists that the sweep loop appends to (one entry per alpha).
alphas = [0.005, 0.007, 0.009, 0.01, 0.02]
correlations = []
pvals = []

In [None]:
# Alpha sweep: for each candidate initial learning rate, train a Word2Vec
# model on `sentences_folder`, save its vectors to a per-alpha file, and score
# that file with score.spearman against the test data.
for alph in alphas:
    # The file name depends only on the alpha value, so a value that was also
    # used by another sweep writes to the same path.
    embeddings_path = "../resources/embeddings_alpha{}.vec".format(alph)

    # Only the model from the last iteration stays bound to `model_alph`.
    model_alph = Word2Vec(sentences_folder, workers=5, size=500, window=6, alpha=alph)
    save_embeddings(embeddings_path, model_alph.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Model cleanup is intentionally left disabled in this cell.
#del model_alph

fig = plt.gcf()
fig.set_size_inches(8.5, 3)

# One (y-values, line format, y-axis label, title) entry per side-by-side panel.
alpha_panels = (
    (correlations, 'b', 'Correlation', "Correlation graph for alpha"),
    (pvals, 'r', 'p-value', "p-value graph for alpha"),
)
for panel_no, (y_values, line_fmt, y_label, panel_title) in enumerate(alpha_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(alphas, y_values, line_fmt)
    plt.xlabel('Alpha')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid()

plt.tight_layout()
plt.show()

# Raw numbers behind the two curves.
for collected in (correlations, pvals):
    print(collected)

### 5. Tuning epochs 'iter'
alpha = 0.009; size = 500; window = 6;

In [None]:
# Epoch counts to try (currently a single value), plus fresh result lists that
# the sweep loop appends to.
epochs = [30]
correlations = []
pvals = []

In [None]:
# Epoch sweep: for each epoch count, train a Word2Vec model on
# `sentences_folder`, save its vectors to a per-epoch file, and score that
# file with score.spearman against the test data.
# NOTE(review): the section heading lists alpha = 0.009, but no `alpha` is
# passed to Word2Vec here, so the library default applies - confirm which is
# intended.
for epoch in epochs:
    embeddings_path = "../resources/embeddings_epochs{}.vec".format(epoch)

    # Only the model from the last iteration stays bound to `model_iter`.
    model_iter = Word2Vec(sentences_folder, workers=5, size=500, window=6, iter=epoch)
    save_embeddings(embeddings_path, model_iter.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Bare expression: shows the collected correlations as the cell output.
correlations

In [None]:
# One-off run with alpha=0.03 and 30 epochs.
#epochs = [40, 50]

epochs = [30]
# The result lists are deliberately not reset here (reset left commented out),
# so this run's scores are appended to whatever the lists already hold.
# correlations = []
# pvals = []

model_iter = Word2Vec(sentences_folder, workers=5, size=500, window=6, alpha=0.03, iter=30)
# `embeddings_path` is not reassigned in this cell: the vectors are written to
# whichever path an earlier cell left in it, overwriting that file.
save_embeddings(embeddings_path, model_iter.wv)

corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
correlations.append(corr)
pvals.append(p)

In [None]:
# Show the path left over from the previous cell, then run the epoch sweep
# again: train on `sentences_folder`, save per-epoch vectors, score them, and
# append the results to the existing lists (they are not reset here).
print(embeddings_path)
for epoch in epochs:
    embeddings_path = "../resources/embeddings_epochs{}.vec".format(epoch)

    # No `alpha` is passed, so the library default learning rate applies.
    model_iter = Word2Vec(sentences_folder, workers=5, size=500, window=6, iter=epoch)
    save_embeddings(embeddings_path, model_iter.wv)

    # The first two returned values are collected; `data` is overwritten on
    # every iteration.
    corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
    correlations.append(corr)
    pvals.append(p)

In [None]:
# Cleanup and plotting for the epoch experiments are disabled; the commented
# code below is kept as the author left it.
# NOTE(review): the disabled plot uses `alphas` on the x-axis while labelling
# it 'Epochs' - fix before re-enabling.
# del model_iter

# plt.plot(alphas, correlations, 'b', alphas, pvals, 'r')
# plt.xlabel('Epochs')
# plt.text(45, .135, 'Correlation')
# plt.text(48, .008, 'p-value')
# plt.show()

# Bare expression: shows the collected correlations as the cell output.
correlations

In [None]:
# Fresh result lists for a single hand-picked configuration.
correlations = []
pvals = []

# Earlier configuration, disabled:
# embeddings_path = "../resources/embeddings_size{}_lr{}_epo{}.vec".format(100, 0.045, 10)

# model_iter = Word2Vec(sentences_folder, workers=5, size=100, window=6, alpha=0.045, iter=10)
# save_embeddings(embeddings_path, model_iter.wv)

# corr, p, data = score.spearman(test_data_path, embeddings_path, bn2wn_mapping_path)
# correlations.append(corr)
# pvals.append(p)


# The values formatted into the file name mirror the keyword arguments passed
# to Word2Vec below; keep the two in sync when changing either.
embeddings_path = "../resources/embeddings_prec_size{}_win{}_neg{}_lr{}_epo{}.vec".format(100, 6, 20, 0.09, 10)

model_iter = Word2Vec(sentences_p, workers=5, size=100, window=6, alpha=0.09, negative=20, iter=10)
save_embeddings(embeddings_path, model_iter.wv)

# NOTE(review): score.spearman is called with two arguments here, while every
# other call in this notebook also passes bn2wn_mapping_path - confirm the
# third parameter is optional and that omitting it is intended.
corr, p, data = score.spearman(test_data_path, embeddings_path)
correlations.append(corr)
pvals.append(p)



In [None]:
# Number of entries in the in-memory model's vocabulary.
print(len(model_iter.wv.vocab))

In [None]:
# Imported in this cell because, when the notebook runs top to bottom, this
# line executes before the later cell that imports KeyedVectors.
from gensim.models import KeyedVectors

# Load the file at `embeddings_path` as text-format word2vec vectors.
senses_only_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

In [None]:
# Number of entries in the vectors loaded back from disk.
print(len(senses_only_model.vocab))

In [None]:
# Bare expression: shows the collected correlations as the cell output.
correlations

In [None]:
# Bare expression: shows the third value returned by the most recent
# score.spearman call as the cell output.
data

## TSNE Plots

In [None]:
from gensim.models import KeyedVectors

# Load previously saved vectors (text word2vec format) for the plots below.
# NOTE(review): no cell in this notebook writes a file with this exact name
# (the training cell above saves an 'embeddings_prec_..._neg20_...' file) -
# confirm the file exists.
#embeddings_path = "../resources/embeddings_size100_win6_neg20_lr0.09_epo10.vec"
embeddings_path = "../resources/embeddings_size100_win6_neg5_lr0.09_epo10.vec"
model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

In [None]:
# Bare expression: shows the 10 nearest neighbours of one sense key as the
# cell output.
model.most_similar("implement_bn:00082712v", topn=10)

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Sense keys whose neighbourhoods are projected to 2-D.
keys = ['believe_bn:00083369v', 'agree_bn:00082476v', 'field_bn:00007985n', 'power_bn:00063940n', 'implement_bn:00082712v']

# For every key, collect its 20 nearest neighbours and their vectors; the two
# lists stay aligned with `keys`.
word_clusters = []
embedding_clusters = []
for word in keys:
    neighbours = [similar_word for similar_word, _ in model.most_similar(word, topn=20)]
    word_clusters.append(neighbours)
    embedding_clusters.append([model[similar_word] for similar_word in neighbours])

# Stack into one array and read its three dimensions.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# Flatten the first two axes for t-SNE, then fold the 2-D result back so each
# key keeps its own group of points.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
flattened = embedding_clusters.reshape(n * m, k)
projected = tsne_model_en_2d.fit_transform(flattened)
embeddings_en_2d = np.array(projected).reshape(n, m, 2)

In [None]:
# Draw the projected neighbourhoods. tsne_similar_words_plot is not defined in
# this notebook (presumably it comes from a star import); the last argument
# looks like an output image name - confirm against the helper.
tsne_similar_words_plot('Similar words to some randomly selected sense embeddings', keys, embeddings_en_2d, word_clusters, 0.7, 'similar_words.png')

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Plain words whose senses are looked up and projected to 2-D.
keys = ["love", "sex", "smart", "student", "plane", "car"]

# One (senses, embeddings) pair per key, split into two lists that stay
# aligned with `keys`.
sense_lookups = [get_first_similar_words(word, model) for word in keys]
word_clusters = [senses for senses, _ in sense_lookups]
embedding_clusters = [embeddings for _, embeddings in sense_lookups]

# Stack into one array and read its three dimensions; this unpacking assumes
# every key produced the same number of vectors.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# Flatten the first two axes for t-SNE, then fold the 2-D result back so each
# key keeps its own group of points.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
flattened = embedding_clusters.reshape(n * m, k)
projected = tsne_model_en_2d.fit_transform(flattened)
embeddings_en_2d = np.array(projected).reshape(n, m, 2)

In [None]:
# Draw the projected sense groups. tsne_similar_words_plot is not defined in
# this notebook (presumably it comes from a star import); the last argument
# looks like an output image name - confirm against the helper.
tsne_similar_words_plot('Senses of some word taken from the combined.tab file', keys, embeddings_en_2d, word_clusters, 0.7, 'similar_words2.png')