In [4]:
import copy
import json
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [5]:
# Fetch the NLTK stopwords corpus; the log below shows the download is skipped
# when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egor_baryshnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing

In [8]:
from preprocessing import clear_sentences

In [6]:
# Path to the dataset JSON, relative to the directory the notebook is run from.
path_data_prepaired = '../dataset/dataset.json'

In [7]:
# Load the dataset. JSON text is UTF-8 by specification, so the encoding is passed
# explicitly instead of relying on the platform's locale default. No pre-assignment
# of `data` is needed: json.load either binds it or raises.
with open(path_data_prepaired, encoding='utf-8') as file_data:
    data = json.load(file_data)

In [9]:
# Run the project's `clear_sentences` helper over the loaded data; the %time line
# magic reports how long the call takes.
%time sentences = clear_sentences(data)

CPU times: user 3.98 s, sys: 372 ms, total: 4.36 s
Wall time: 4.36 s


In [17]:
# NOTE(review): `simtest_data` is first assigned further down (the pd.read_table cell),
# so on a fresh kernel with Run All this cell raises NameError. It also repeats the
# shape check made right after that load - candidate for removal.
simtest_data.shape

(999, 10)

In [66]:
# NOTE(review): executed out of order (prompt 66) and placed before `simtest_data` is
# loaded, so it fails on a fresh kernel with Run All. The displayed index skips values,
# i.e. the output shows the frame *after* the vocabulary filter applied further down.
# Consider moving this cell below that filter.
simtest_data.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
5,fast,rapid,A,8.75,3.32,3.07,2,5.66,1,1.68
6,happy,glad,A,9.17,2.56,2.36,1,5.49,1,1.59
7,short,long,A,1.23,3.61,3.18,2,5.36,1,1.58
9,weird,strange,A,8.93,1.59,1.86,1,4.26,1,1.3
10,wide,narrow,A,1.03,3.06,3.04,2,4.06,1,0.58
11,bad,awful,A,8.42,1.68,1.92,1,3.41,1,1.74
12,easy,difficult,A,0.58,2.07,2.21,1,2.83,1,0.36


## Data Processing

In [14]:
from models import Word2Vec

In [15]:
# Base model built from the cleaned sentences; `Word2Vec` here is the class imported
# from the local `models` module.
model = Word2Vec(sentences)

In [16]:
# Build the vocabulary and the corpus matrix on the base model (each call logs its
# start, see the output below).
# NOTE(review): r=200 and L=2 are magic numbers here; their meaning is defined in
# models.Word2Vec - consider naming them in a config cell.
model.create_vocabulary(r=200)
model.create_corpus_matrix(L=2)

Creating vocabulary
Creating corpus matrix


## Word similarity test preparation

In [18]:
# Path to the SimLex-999 word-similarity file, relative to the directory the notebook
# is run from.
path_data_simtest = '../dataset/SimLex-999/SimLex-999.txt'

In [19]:
# Read the SimLex-999 table; pd.read_table splits on tabs by default.
simtest_data = pd.read_table(path_data_simtest)

In [20]:
# Size of the full benchmark, before any vocabulary filtering.
simtest_data.shape

(999, 10)

In [21]:
# Keep only the word pairs whose two words are both present in the model vocabulary.
def _in_vocab(word):
    return word in model.vocab


_both_known = simtest_data['word1'].map(_in_vocab) & simtest_data['word2'].map(_in_vocab)
simtest_data = simtest_data.loc[_both_known]

In [22]:
# Size after filtering: only pairs with both words in `model.vocab` remain.
simtest_data.shape

(241, 10)

## EMF

In [23]:
# Work on an independent deep copy so the base `model` object is left untouched.
model_EMF = copy.deepcopy(model)

In [24]:
# Compute the EMF word embeddings on the copy; the call prints the objective value.
# NOTE(review): the positional 5 presumably corresponds to the `k` argument passed by
# keyword to compute_embedds_riem below - confirm and pass it by name.
model_EMF.compute_embedds_EMF(5, alpha=.5)

Computing of words embeddings
Value of the objective:  -238119357.39017475


In [25]:
# Shape of the model's `W` attribute - presumably (vocabulary size, embedding dim); confirm.
model_EMF.W.shape

(3723, 200)

In [27]:
# Feature matrix for all sentences, built from the EMF embeddings.
# NOTE(review): the output below echoes `review_vec /= words_count`, which looks like a
# runtime warning raised by that division - presumably for sentences with no
# in-vocabulary words; confirm and guard against it in get_features_matrix.
X_EMF = model_EMF.get_features_matrix(sentences)
X_EMF.shape

  review_vec /= words_count


(194439, 200)

## RO

In [28]:
# Start from a deep copy of the EMF-fitted model (the first value logged below equals
# the EMF objective printed above) and run `compute_embedds_riem` for 20 iterations to
# see how the logged value evolves; the log below is not monotone.
model_RO = copy.deepcopy(model_EMF)
model_RO.compute_embedds_riem(k=5, step=5e-5, max_iter=20, alpha=.5)
# For the sake of pretty pictures, a plot of the loss could be drawn here.

Iteration 1 has started
-238119357.39017475
Iteration 2 has started
-239559767.94034463
Iteration 3 has started
-237104811.64661655
Iteration 4 has started
-237371524.69220865
Iteration 5 has started
-234329757.7393481
Iteration 6 has started
-234056705.22403586
Iteration 7 has started
-231559121.46592265
Iteration 8 has started
-232422736.96238202
Iteration 9 has started
-233076582.00003293
Iteration 10 has started
-237682031.22115788
Iteration 11 has started
-236134480.62144554
Iteration 12 has started
-236505035.55803984
Iteration 13 has started
-231191135.94350466
Iteration 14 has started
-236389022.82785344
Iteration 15 has started
-241556703.0950799
Iteration 16 has started
-242098362.89673612
Iteration 17 has started
-235150970.60118538
Iteration 18 has started
-236814003.18161786
Iteration 19 has started
-237737264.35266206
Iteration 20 has started
-239066270.54043892


In [29]:
# Re-create `model_RO` from the EMF model and rerun with max_iter=7; this replaces the
# 20-iteration model produced by the previous cell.
# NOTE(review): presumably 7 was picked from the 20-iteration log above - confirm and
# record the reasoning; the two copy-pasted cells differ only in max_iter.
model_RO = copy.deepcopy(model_EMF)
model_RO.compute_embedds_riem(k=5, step=5e-5, max_iter=7, alpha=.5)

Iteration 1 has started
-238119357.39017475
Iteration 2 has started
-239559767.94034463
Iteration 3 has started
-237104811.64661655
Iteration 4 has started
-237371524.69220865
Iteration 5 has started
-234329757.7393481
Iteration 6 has started
-234056705.22403586
Iteration 7 has started
-231559121.46592265


In [30]:
# Shape of `W` after the RO run; it matches the shape shown for the EMF model.
model_RO.W.shape

(3723, 200)

In [31]:
# Feature matrix for all sentences, built from the RO embeddings.
# NOTE(review): the same `review_vec /= words_count` line is echoed in the output as
# for the EMF feature matrix - presumably the same division warning; confirm.
X_RO = model_RO.get_features_matrix(sentences)
X_RO.shape

  review_vec /= words_count


(194439, 200)

## Testing

In [67]:
def calculate_spearman(model, simtest_data=None, w1_colname='word1', w2_colname='word2'):
    """Spearman rank correlation between human scores and embedding cosine similarity.

    Every word pair in ``simtest_data`` is embedded with
    ``model.get_word_embedding``; the cosine similarity of each pair is then
    rank-correlated with the ``'SimLex999'`` column.

    Parameters
    ----------
    model : object
        Anything exposing ``get_word_embedding(word)`` that returns a numeric vector.
    simtest_data : pandas.DataFrame, optional
        Word-pair table containing ``w1_colname``, ``w2_colname`` and ``'SimLex999'``.
        When omitted, the notebook-level ``simtest_data`` is looked up at call time,
        so the function always sees the current frame rather than whatever was bound
        when the ``def`` cell was executed.
    w1_colname, w2_colname : str
        Names of the columns holding the first and the second word of each pair.

    Returns
    -------
    The result object of ``scipy.stats.spearmanr`` (correlation, p-value).

    Raises
    ------
    NameError
        If ``simtest_data`` is omitted and no such notebook-level variable exists.
    ValueError
        If ``simtest_data`` has no rows.
    """
    if simtest_data is None:
        try:
            # The parameter shadows the global of the same name, hence globals().
            simtest_data = globals()['simtest_data']
        except KeyError:
            raise NameError(
                "simtest_data was not passed and is not defined at notebook level"
            ) from None

    if len(simtest_data) == 0:
        raise ValueError("simtest_data has no rows to evaluate")

    def _embed(column):
        # One row per word; ravel() accepts both (d,) and (1, d) embeddings.
        return np.vstack([
            np.asarray(model.get_word_embedding(word), dtype=float).ravel()
            for word in simtest_data[column]
        ])

    left = _embed(w1_colname)
    right = _embed(w2_colname)

    # Row-wise cosine similarity. A zero vector gets similarity 0 instead of NaN,
    # matching sklearn's cosine_similarity.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    cos_sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Purely positional, so it does not depend on the frame's index labels.
    return spearmanr(np.asarray(simtest_data['SimLex999']), cos_sim)

In [69]:
# Spearman correlation between the SimLex-999 scores and the cosine similarities of
# the EMF embeddings.
calculate_spearman(model_EMF)

SpearmanrResult(correlation=0.12277144201259625, pvalue=0.057011563144918447)

In [70]:
# Same evaluation for the RO embeddings, for comparison with the EMF result above.
calculate_spearman(model_RO)

SpearmanrResult(correlation=0.13051144877572177, pvalue=0.04294807615727263)