In [1]:
import json
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [None]:
# nltk.download('stopwords')

## Preprocessing

In [2]:
# Relative path of the dataset JSON file; it is read both with json.load and with pd.read_json below.
path_data_prepaired = '../dataset/dataset.json'

In [3]:
# Load the dataset JSON into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default text encoding.
with open(path_data_prepaired, encoding='utf-8') as file_data:
    data = json.load(file_data)

In [4]:
from preprocessing import clear_sentences

In [5]:
# Run the project-local clear_sentences helper on the loaded data; %time reports how long it takes.
%time sentences = clear_sentences(data)

CPU times: user 4.43 s, sys: 329 ms, total: 4.76 s
Wall time: 4.76 s


In [6]:
# Keep only the first tenth of the cleaned sentences for the rest of the notebook.
# NOTE(review): this cell is not idempotent -- every re-run shrinks `sentences`
# by another factor of 10; re-run the clear_sentences cell first to restore the full list.
l = len(sentences)
sentences = sentences[:l//10]

## 1) Word Embeddings as Matrix Factorization

In [7]:
from models import Word2Vec

In [8]:
model = Word2Vec(sentences)

In [9]:
# Prepare the model (vocabulary, then corpus matrix) and compute the embeddings.
# NOTE(review): the meaning of the positional 5 and of alpha=1 is defined in
# models.Word2Vec, which is not visible here -- confirm before tuning.
model.create_vocabulary()
model.create_corpus_matrix()
model.compute_embedds_EMF(5, alpha=1)

Creating vocabulary
Creating corpus matrix
Computing of words embeddings
-6865082.855317236


In [10]:
# NOTE(review): the values printed per iteration alternate between roughly -6.8e6
# and -7.7e6 instead of changing monotonically -- check the update step / convergence
# of compute_embedds_riem before relying on these embeddings.
model.compute_embedds_riem(1)

Iteration 1 has started
-6865082.855317236
Iteration 2 has started
-7736484.404539731
Iteration 3 has started
-6858655.357490511
Iteration 4 has started
-7726351.684199596
Iteration 5 has started
-6852173.369949247
Iteration 6 has started
-7715350.44717634
Iteration 7 has started
-6844416.531853705
Iteration 8 has started
-7700483.011641293
Iteration 9 has started
-6831350.247145227
Iteration 10 has started
-7671791.640724014
Iteration 11 has started
-6807374.005320498
Iteration 12 has started
-7614856.671224229
Iteration 13 has started
-6824250.237148488
Iteration 14 has started
-7585577.644440516
Iteration 15 has started
-7001465.1020168485
Iteration 16 has started
-7439205.264657776
Iteration 17 has started
-7148220.860857635
Iteration 18 has started
-7344992.325126817
Iteration 19 has started
-7204635.770387191
Iteration 20 has started
-7513962.833973066


In [None]:
model.W.shape

In [None]:
# Product of the model's two factor matrices W and C.
X = model.W @ model.C

# Totals of model.D: overall sum, sums along axis 1 and sums along axis 0.
wc = model.D.sum()
w = np.array(model.D.sum(axis=1))
c = np.array(model.D.sum(axis=0))

# NOTE(review): the literal 1 is the second argument of model.grad; presumably it is
# the same factor written as `1 *` in the manual gradient cell below -- confirm.
model.grad(X, 1, wc, w, c)

In [None]:
def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)), evaluated element-wise.

    x -- scalar or array of real values

    The value is computed as exp(-log(1 + exp(-x))) through np.logaddexp, so
    large negative inputs go smoothly to 0 instead of overflowing inside
    np.exp (which previously emitted a RuntimeWarning).
    """
    return np.exp(-np.logaddexp(0, -x))

In [None]:
# Manual gradient expression built from D, X and the totals w, c, wc.
# NOTE(review): presumably meant to be compared against model.grad(X, 1, wc, w, c) -- confirm.
gr = (
    np.array(model.D.toarray()) * sigmoid(-X)
    - (1 * w * c / wc) * sigmoid(X)
)

In [None]:
gr

In [None]:
type(gr)

In [None]:
##### Compute review embeddings #####
def get_review_embedding(model, review, stops=None):
    """
    Average the embeddings of the in-vocabulary, non-stop words of one review.

    model -- word2vec model instance, which is used; it must provide `d`,
             `vocab` and `get_word_embedding(word)`
    review -- current review to be embedded (an iterable of words)
    stops -- optional collection of words to skip; when omitted, the NLTK
             English stop words are used (loaded once, then reused)

    Returns a vector of length model.d. When no word of the review qualifies,
    the vector is filled with NaN so that callers can detect and drop such
    reviews.
    """
    if stops is None:
        # stopwords.words() used to be called for every single review; the
        # default list is now cached on the function after the first call.
        stops = getattr(get_review_embedding, '_default_stops', None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            get_review_embedding._default_stops = stops

    review_vec = np.zeros(model.d)
    words_count = 0

    for word in review:
        if word in model.vocab and word not in stops:
            review_vec += model.get_word_embedding(word)
            words_count += 1

    if words_count == 0:
        # Same result the former 0 / 0 division produced, without the RuntimeWarning.
        return np.full(model.d, np.nan)

    review_vec /= words_count
    return review_vec

In [None]:
##### Compute the feature matrix for a collection of reviews #####
def get_features_matrix(model, reviews):
    """
    Stack the embeddings of all reviews into one feature matrix.

    model -- word2vec model instance used to embed every review
    reviews -- the whole collection of reviews

    Returns an array of shape (len(reviews), model.d) whose i-th row is the
    embedding of the i-th review.
    """
    features = np.zeros((len(reviews), model.d))
    for row_idx, current_review in enumerate(reviews):
        features[row_idx] = get_review_embedding(model, current_review)
    return features

In [None]:
# Embed every (subsampled) sentence. Note that the name X is rebound here:
# earlier it held the W @ C product, from now on it is the feature matrix.
X = get_features_matrix(model, sentences)
X.shape

### Comparison of embedding models

In [None]:
# import classifiers and necessary functions
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# create dataframe from the json file
df = pd.read_json(path_data_prepaired)

In [None]:
# binary labels: 1 when the 'overall' value is greater than 3, otherwise 0
y = df['overall'].gt(3).astype('int64')

In [None]:
# get indices of rows which contain NaNs
# (np.argwhere yields one entry per NaN element, so a row index repeats once per NaN column)
del_idx = np.argwhere(np.isnan(X))[:, 0]

In [None]:
# delete rows with NaNs
# X has one row per entry of the subsampled `sentences`, while y was built from
# the full dataframe, so the two differ in length. Both are therefore filtered
# by row position with the same mask: y keeps exactly the positions of the
# NaN-free rows of X, which also drops the labels beyond the subsample.
# Assumes row i of X belongs to row i of df (the former label-based drop made
# the same assumption). Re-running the cell leaves X and y unchanged.
valid_rows = ~np.isnan(X).any(axis=1)
y = y.iloc[np.flatnonzero(valid_rows)]
X = X[valid_rows]

In [None]:
y.shape, X.shape

In [None]:
# hold out 33% of the samples; the split is shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, shuffle=True
)

Try RF classifier out of the box

In [None]:
# 5-fold cross-validation of a default random forest (fixed seed) on the training split, using all cores
rf_clf = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(rf_clf, X_train, y_train, n_jobs=-1, cv=5)

In [None]:
print(f'Accuracy score: {cv_scores.mean()} +/- {cv_scores.std()}')

Evaluate model on the holdout set

In [None]:
# cross_val_score works on clones of the estimator, so rf_clf itself is fitted here
# on the whole training split before being scored on the held-out part
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(f'Accuracy on holdout set: {accuracy_score(y_test, y_pred)}')

In [None]:
# Hyper-parameter search space for the random forest.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1,
# so any candidate drawn with 1 would fail to fit.
params = {
    'n_estimators': list(range(5, 51, 5)),
    'max_depth': list(range(5, 100, 5)),
    'min_samples_split': list(range(2, 11, 1)),
}

# The forest is seeded as well, so the whole search is reproducible and not
# only the sampling of the candidates.
clf = RandomizedSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=-1,
                         cv=5, verbose=1, random_state=42)

In [None]:
clf.fit(X_train, y_train)

Note: xgboost is very slow on macOS.

In [None]:
# same 5-fold cross-validation as for the random forest, now with a default XGBClassifier;
# note that cv_scores is overwritten by this cell
xgbm_cls = XGBClassifier(random_state=42)
cv_scores = cross_val_score(xgbm_cls, X_train, y_train, n_jobs=-1, cv=5)

In [None]:
print(f'Accuracy score: {cv_scores.mean()} +/- {cv_scores.std()}')

In [None]:
X_df = pd.DataFrame(X)

In [None]:
# persist the filtered features and labels next to the dataset;
# index=False is not passed, so each file also contains the index
X_df.to_csv('../dataset/X_1.csv')
y.to_csv('../dataset/y_1.csv')

In [None]:
# NOTE(review): this repeats the X_1.csv export already done in the previous cell;
# the cell is redundant and can be removed.
X_df.to_csv('../dataset/X_1.csv')

In [None]:
y.shape