In [1]:
import copy
import json
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egor_baryshnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing

In [3]:
from preprocessing import clear_sentences

In [4]:
path_data_prepaired = '../dataset/dataset.json'

In [5]:
data = None
with open(path_data_prepaired) as file_data:
    data = json.load(file_data)

In [6]:
%time sentences = clear_sentences(data)

CPU times: user 3.93 s, sys: 369 ms, total: 4.3 s
Wall time: 4.31 s


## Data Processing

In [42]:
from models import Word2Vec

In [68]:
model = Word2Vec(sentences)
model.d = 100

In [69]:
model.create_vocabulary(r=200)
model.create_corpus_matrix(L=2)

Creating vocabulary
Creating corpus matrix


## Word similarity test preparation

In [14]:
path_data_simtest = '../dataset/SimLex-999/SimLex-999.txt'

In [15]:
simtest_data = pd.read_table(path_data_simtest)

In [16]:
simtest_data.shape

(999, 10)

In [17]:
for col in 'word1', 'word2':
    simtest_data = simtest_data.loc[simtest_data[col].map(lambda x: x in model.vocab)]

In [18]:
simtest_data.shape

(241, 10)

# MODEL 1: IMF

In [73]:
model_IMF = copy.deepcopy(model)
model_IMF.compute_embedds_IMF(k=5, alpha=0.5)

Computing of words embeddings
Value of the SGNS's objective:  -238309200.59163


In [71]:
model_IMF.W.shape

(3723, 100)

# MODEL 2: RO

In [74]:
model_RO = copy.deepcopy(model_IMF)
model_RO.compute_embedds_riem(k=5, step=1e-4, max_iter=2, alpha=0.5)
#Ради картиночек Богу Картиночек можно построить плот лосса

Value of the SGNS's objective on the 0 iteration: 
 -238309200.59163
Value of the SGNS's objective on the 1 iteration: 
 -236278219.98338166


In [49]:
model_RO = copy.deepcopy(model_IMF)
model_RO.compute_embedds_riem(k=5, step=5e-5, max_iter=6, alpha=0.5)

Value of the SGNS's objective on the 0 iteration: 
 -238119357.39017475
Value of the SGNS's objective on the 1 iteration: 
 -239559767.94034463
Value of the SGNS's objective on the 2 iteration: 
 -237104811.64661655
Value of the SGNS's objective on the 3 iteration: 
 -237371524.69220865
Value of the SGNS's objective on the 4 iteration: 
 -234329757.7393481
Value of the SGNS's objective on the 5 iteration: 
 -234056705.22403586


In [80]:
model_RO.W.shape

(3723, 500)

# MODEL 3: EMF

In [65]:
model_EMF = copy.deepcopy(model)
model_EMF.compute_embedds_EMF(k=15, eps=5e-1, max_iter=20, step=1e-8)

0 iteration
6.175060650128607
6.175060650057614
4.884386198736214
4.867707257555285
3.8351347588889264
3.598991799658367
3.3715093770977504
3.2020585594189166
2.8563735399970316
2.704786232501581
2.4921204104420074
2.3097711310622033
2.2039164510591984
2.0484326852961625
1.9672024841523321
1.9569674853948706
1.9333546369638348
1.8835040134553167
1.8677176199405001
1.8584516938960958
1.8318680070168287
1.803304766868759
1.7812782163047018
1.7538397972507802
1.727175741782687
1.6991114969027075
1.6624902733037243
1.6220782408716954
1.58152286064548
1.541042550145792
1.5030734293146377
1.468694089950367
1.4346006140184826
1.3993654208481145
1.3616533789811693
1.3264798090216492
1.2981320718454172
1.277228838128837
1.26210838354712
1.2483306266617806
1.2320140959754533
1.2138917893632366
1.1979228306381347
1.185341068565012
1.1742037958982612
1.1625504914963365
1.149858168826087
1.1369391697287836
1.1249056748089636
1.1139749225651407
First loop finished
Second loop finished
1 iteration
0.

## Testing

In [21]:
def calculate_spearman(model, simtest_data= simtest_data, w1_colname= 'word1', w2_colname= 'word2'):

    vec = pd.DataFrame()
    for col in w1_colname, w2_colname:
        vec[col] = simtest_data[col].apply(lambda x: model.get_word_embedding(x))

    cosine_sim_lambda = lambda x: np.float64(cosine_similarity(vec.loc[x, w1_colname].reshape(1, -1),
                                                               vec.loc[x, w2_colname].reshape(1, -1)))
    for i in simtest_data.index:
        vec.loc[i, 'cos_sim'] = cosine_sim_lambda(i)
    
    return spearmanr(np.array(simtest_data['SimLex999']), np.array(vec['cos_sim']))

In [82]:
print('IMF Word Similarity test:',
      calculate_spearman(model_IMF))

SpearmanrResult(correlation=0.18780790335995323, pvalue=0.0034281095720956924)

In [83]:
print('RO Word Similarity test:',
      calculate_spearman(model_RO))

SpearmanrResult(correlation=0.18920449961794297, pvalue=0.003191631471541072)

In [40]:
vec = pd.DataFrame()
for col in 'word1', 'word2':
    vec[col] = simtest_data[col].apply(lambda x: model_EMF.get_word_embedding2(x))

cosine_sim_lambda = lambda x: np.float64(cosine_similarity(vec.loc[x, 'word1'].reshape(1, -1),
                                                           vec.loc[x, 'word2'].reshape(1, -1)))
for i in simtest_data.index:
    vec.loc[i, 'cos_sim'] = cosine_sim_lambda(i)

print('EMF Word Similarity test:',
      spearmanr(np.array(simtest_data['SimLex999']), np.array(vec['cos_sim'])))

EMF Word Similarity test: SpearmanrResult(correlation=0.061191320637053695, pvalue=0.3442033838438966)


## Classification tests

In [28]:
# create dataframe from the json file
df = pd.read_json(path_data_prepaired)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

In [86]:
counter = 0
for model in model_EMF, model_RO:
    counter += 1
    model_str = 'EMF'
    if counter % 2 == 0:
        model_str = 'RO'
        
    print('', '#'*10, '\nMF model: {}\n'.format(model_str), '#'*10,'\n')
    #get features matrix
    X = model.get_features_matrix(sentences)
    
    # get indices of rows which contain NaNs
    del_idx = np.argwhere(np.isnan(X))[:, 0]
    
    # delete rows with NaNs
    X = X[~np.isnan(X).any(axis=1)]
    
    y = (df['overall'] > 3).apply(int)
    y = y.drop(del_idx)
    
    #split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)
    
    for clf in LogisticRegression(), LinearSVC():
        print('Classificator: {}\n'.format(clf))
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        f1_sc = f1_score(y_true= y_test, y_pred= y_pred)
        print('F1-score = {}'.format(f1_sc))
        
        conf_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
        print('Confusion matrix:\n',
              pd.DataFrame(conf_m, columns=['True 0', 'True 1'], index=['Predicted 0', 'Predicted 1']))

 ########## 
MF model: EMF
 ########## 



  review_vec /= words_count




Classificator: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)





F1-score = 0.8901212058231699
Confusion matrix:
              True 0  True 1
Predicted 0    5373    9686
Predicted 1    1945   47111


Classificator: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

F1-score = 0.8901555971489611
Confusion matrix:
              True 0  True 1
Predicted 0    5412    9647
Predicted 1    1973   47083
 ########## 
MF model: RO
 ########## 



  review_vec /= words_count




Classificator: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)





F1-score = 0.8901866288684148
Confusion matrix:
              True 0  True 1
Predicted 0    5392    9667
Predicted 1    1954   47102


Classificator: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

F1-score = 0.8902294883553806
Confusion matrix:
              True 0  True 1
Predicted 0    5432    9627
Predicted 1    1982   47074


In [41]:
X = model_EMF.get_features_matrix2(sentences)

# get indices of rows which contain NaNs
del_idx = np.argwhere(np.isnan(X))[:, 0]

# delete rows with NaNs
X = X[~np.isnan(X).any(axis=1)]

y = (df['overall'] > 3).apply(int)
y = y.drop(del_idx)

#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

for clf in LogisticRegression(), LinearSVC():
    print('\n\nClassificator: {}\n'.format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1_sc = f1_score(y_true= y_test, y_pred= y_pred)
    print('F1-score = {}'.format(f1_sc))

    conf_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print('Confusion matrix:\n',
          pd.DataFrame(conf_m, columns=['True 0', 'True 1'], index=['Predicted 0', 'Predicted 1']))

  review_vec /= words_count




Classificator: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)





F1-score = 0.8856753900783488
Confusion matrix:
              True 0  True 1
Predicted 0    5589    9470
Predicted 1    2539   46517


Classificator: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)





F1-score = 0.885803893498592
Confusion matrix:
              True 0  True 1
Predicted 0    4964   10095
Predicted 1    2030   47026
