# Word2Vec

In [1]:
# Imports
import pandas as pd
import gensim
import numpy as np
import matplotlib as plt
import ast

from utils.utils import isSubArray

from nltk import pos_tag
from nltk.tokenize import word_tokenize

from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

from sklearn import naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load Data
data = pd.read_csv("data/filtered_reviews_large.csv", names=["Review Text", "Stars", "Polarity", "Processed Review Text"])
# data = pd.read_csv("data/filtered_reviews.csv", names=["Review Text", "Stars", "Polarity"])

### Undersampling to balance dataset

In [3]:
reviews_df = data[["Processed Review Text", "Review Text"]]
reviews_df.head()

Unnamed: 0,Processed Review Text,Review Text
0,be really good course recommend have large pe...,it was really good course i recommend for havi...
1,be really good course recommend have large pe...,it was really good course i recommend for havi...
2,instructions final project project be biggest ...,the instructions for the final project the pro...
3,have like do assignments weren lock payment re...,would have liked to do the assignments if they...
4,coursera issue not course coursera app apple t...,coursera issue not the course itself coursera ...


In [4]:
under = RandomUnderSampler(random_state=1)

In [5]:
X, y_stars, y_pol = (reviews_df, data["Stars"].values.reshape(-1,1), data["Polarity"].values.reshape(-1,1))
X_res_stars, y_res_stars = under.fit_resample(X, y_stars)
print(Counter(y_res_stars))

Counter({1: 12240, 2: 12240, 3: 12240, 4: 12240, 5: 12240})


In [6]:
X_res_pol, y_res_pol = under.fit_resample(X, y_pol)
print(Counter(y_res_pol))

Counter({'negative': 25732, 'neutral': 25732, 'positive': 25732})


In [7]:
stars_data = {
    'Review Text': X_res_stars["Review Text"].ravel(),
    'Processed Review Text': X_res_stars["Processed Review Text"].ravel(),
    'Stars': y_res_stars.ravel()
}

pol_data = {
    'Review Text': X_res_pol["Review Text"].ravel(),
    'Processed Review Text': X_res_pol["Processed Review Text"].ravel(),
    'Polarity': y_res_pol.ravel()
}

res_stars = pd.DataFrame(stars_data)
res_pol = pd.DataFrame(pol_data)

In [8]:
res_pol["Processed Review Text"] = res_pol["Processed Review Text"].apply(str)
res_pol["Processed Review Text"] = res_pol["Processed Review Text"].apply(word_tokenize)
res_pol.head()

Unnamed: 0,Review Text,Processed Review Text,Polarity
0,the instructions for the final project the pro...,"[instructions, final, project, project, be, bi...",negative
1,would have liked to do the assignments if they...,"[have, like, do, assignments, weren, lock, pay...",negative
2,coursera issue not the course itself coursera ...,"[coursera, issue, not, course, coursera, app, ...",negative
3,i liked the course and the material but it was...,"[like, course, material, be, frustrate, work, ...",negative
4,the instructions for the final project the pro...,"[instructions, final, project, project, be, bi...",negative


In [9]:
res_stars["Processed Review Text"] = res_stars["Processed Review Text"].apply(str)
res_stars["Processed Review Text"] = res_stars["Processed Review Text"].apply(word_tokenize)
res_stars.head()

Unnamed: 0,Review Text,Processed Review Text,Stars
0,the technology used is poor the content disjoi...,"[technology, use, be, poor, content, disjoint,...",1
1,very limited to ibm db2 not useful in real wor...,"[very, limit, ibm, db2, not, useful, real, wor...",1
2,this is an introduction to an introduction the...,"[be, introduction, introduction, keep, tell, l...",1
3,hi i completed entire program and received the...,"[hi, complete, entire, program, receive, profe...",1
4,worst course even the first step for ide is no...,"[worst, course, even, first, step, ide, be, no...",1


In [10]:
merged_reviews = [res_pol["Processed Review Text"], res_stars["Processed Review Text"]]
review_texts = [item for sublist in merged_reviews for item in sublist]

### Word2Vec model creation

In [11]:
# Creating a Word2Vec vectorizer
word2vec_model = gensim.models.Word2Vec(review_texts)

In [12]:
word2vec_model.train(review_texts, total_examples=len(review_texts), epochs=10)
word2vec_model.wv.save("models/word2vec")

In [13]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for word in tokens:
        try:
            vec += word2vec_model.wv[word].reshape((1, size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

### Create Word2Vec arrays for star classification

In [14]:
wordvec_star_arrays = np.zeros((len(X_res_stars), 100))
for i in range(len(X_res_stars)):
    wordvec_star_arrays[i, :] = word_vector(res_stars["Processed Review Text"][i], 100)
wordvec_stars_df = pd.DataFrame(wordvec_star_arrays)
wordvec_stars_df.shape

(61200, 100)

In [15]:
wordvec_stars_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.140602,-0.357964,-0.096083,0.110623,0.303417,0.490394,0.430372,0.424608,0.378307,0.208588,...,-0.482405,-0.288498,-0.417408,0.235639,0.155991,0.022233,-0.430419,0.226439,-0.146093,0.008037
1,-0.008228,0.306805,0.339147,-0.833253,-0.090357,0.647605,-0.28193,-0.429494,0.138144,0.382717,...,-0.94169,-0.035862,0.047843,-0.128789,-0.45816,0.59614,0.367128,-0.157394,-0.212761,0.03302
2,0.47392,-0.488011,-0.310095,-0.218749,0.444057,0.652315,0.207375,-0.544526,0.328713,0.513099,...,-0.705119,-0.140125,0.083804,-0.779799,-0.175926,-0.391392,-0.47321,0.599841,0.064138,0.410167
3,1.167873,0.613004,-0.392788,-0.4369,-0.549742,0.786214,0.701737,0.162138,0.488215,-0.063179,...,0.881947,0.393778,0.555483,-0.174147,0.533954,0.390631,1.894485,1.021048,-0.190497,-0.029569
4,-0.239401,-0.435455,-1.312376,0.371424,0.316245,1.109977,-0.013161,0.215644,0.721349,0.404732,...,-0.560232,-0.053142,0.626911,0.643905,-0.432684,-0.765038,-0.268207,0.829686,0.243172,0.34257


### Create Word2Vec arrays for polarity classification

In [16]:
wordvec_pol_arrays = np.zeros((len(X_res_pol), 100))
for i in range(len(X_res_pol)):
    wordvec_pol_arrays[i, :] = word_vector(res_pol["Processed Review Text"][i], 100)
wordvec_pol_df = pd.DataFrame(wordvec_pol_arrays)
wordvec_pol_df.shape

(77196, 100)

In [17]:
wordvec_pol_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.289488,0.102818,-0.599188,-0.65349,-0.067246,0.81198,0.044278,-0.069073,0.517175,0.718169,...,-0.7466,0.199565,0.392061,-0.538661,-0.223359,-0.556851,1.163577,0.682047,-0.128337,-0.222991
1,0.393485,0.968417,-0.600583,-0.053143,-0.561964,0.535617,0.171578,-1.055742,1.171631,0.775926,...,-1.432936,-0.723664,0.241506,-0.097111,0.249231,-0.380946,1.169442,0.844949,0.083944,0.188423
2,0.292691,0.114567,-0.236382,-0.31322,-0.293878,0.440172,-0.100516,-0.178229,0.381006,0.322787,...,-0.668092,0.121995,-0.414142,-0.261001,-0.330022,-0.453942,0.395363,-0.106631,-0.153208,0.18001
3,0.52064,0.040128,-0.392393,-0.660355,-0.080681,0.692697,0.383475,0.188703,0.933029,0.560076,...,-0.859106,-0.242065,0.63536,-0.24483,0.314618,-0.653573,1.12078,0.334444,-0.370891,0.540782
4,0.289488,0.102818,-0.599188,-0.65349,-0.067246,0.81198,0.044278,-0.069073,0.517175,0.718169,...,-0.7466,0.199565,0.392061,-0.538661,-0.223359,-0.556851,1.163577,0.682047,-0.128337,-0.222991


### Create Classifiers

In [18]:
NB_classifier = naive_bayes.GaussianNB()
RF_classifier = RandomForestClassifier()
XGB_classifier = XGBClassifier()
SVM_classifier = svm.SVC()

In [19]:
cv = StratifiedKFold(n_splits=5)

### Run Models for star classification

Multinomial Naive Bayes

In [20]:
# Run Naive Bayes Model for polarity predicition and evaluate results
nb_pred = cross_val_predict(NB_classifier, wordvec_stars_df, res_stars["Stars"], cv=cv)
print(classification_report(y_res_stars, nb_pred))
print(confusion_matrix(y_res_stars, nb_pred))

              precision    recall  f1-score   support

           1       0.46      0.38      0.42     12240
           2       0.31      0.56      0.40     12240
           3       0.35      0.16      0.22     12240
           4       0.34      0.21      0.26     12240
           5       0.53      0.69      0.60     12240

    accuracy                           0.40     61200
   macro avg       0.40      0.40      0.38     61200
weighted avg       0.40      0.40      0.38     61200

[[4648 5882  544  491  675]
 [2293 6891 1244  921  891]
 [1763 5302 1926 1680 1569]
 [ 889 3106 1322 2538 4385]
 [ 452 1116  416 1758 8498]]


Random Forest

In [21]:
# Run Random Forest Model for polarity predicition and evaluate results
rf_pred = cross_val_predict(RF_classifier, wordvec_stars_df, res_stars["Stars"], cv=cv)
print(classification_report(y_res_stars, rf_pred))
print(confusion_matrix(y_res_stars, rf_pred))

              precision    recall  f1-score   support

           1       0.61      0.81      0.70     12240
           2       0.45      0.23      0.30     12240
           3       0.42      0.50      0.46     12240
           4       0.42      0.35      0.38     12240
           5       0.60      0.70      0.65     12240

    accuracy                           0.52     61200
   macro avg       0.50      0.52      0.50     61200
weighted avg       0.50      0.52      0.50     61200

[[9862  889  980  282  227]
 [3492 2813 4015 1284  636]
 [1553 1604 6180 1941  962]
 [ 820  762 2588 4239 3831]
 [ 330  209  790 2347 8564]]


XGBoost

In [22]:
# Run XGBoost Model for polarity predicition and evaluate results
cross_val_score(XGB_classifier, wordvec_stars_df, res_stars["Stars"], cv=cv)





















array([0.51781046, 0.51413399, 0.50923203, 0.51797386, 0.50514706])

Support Vector Machines

In [23]:
# Run SVM Model for polarity predicition and evaluate results
svm_pred = cross_val_predict(SVM_classifier, wordvec_stars_df, res_stars["Stars"], cv=cv, n_jobs=4)
print(classification_report(y_res_stars, svm_pred))
print(confusion_matrix(y_res_stars, svm_pred))

              precision    recall  f1-score   support

           1       0.58      0.63      0.60     12240
           2       0.39      0.40      0.40     12240
           3       0.40      0.38      0.39     12240
           4       0.46      0.35      0.40     12240
           5       0.63      0.76      0.69     12240

    accuracy                           0.50     61200
   macro avg       0.49      0.50      0.49     61200
weighted avg       0.49      0.50      0.49     61200

[[7653 3153  941  242  251]
 [3384 4938 2909  673  336]
 [1449 3277 4595 2023  896]
 [ 496 1117 2444 4264 3919]
 [ 233  274  489 1981 9263]]


### Run Models for polarity classification

Multinomial Naive Bayes

In [24]:
# Run Naive Bayes Model for polarity predicition and evaluate results
nb_pred = cross_val_predict(NB_classifier, wordvec_pol_df, res_pol["Polarity"], cv=cv)
print(classification_report(y_res_pol, nb_pred))
print(confusion_matrix(y_res_pol, nb_pred))

              precision    recall  f1-score   support

    negative       0.61      0.45      0.51     25732
     neutral       0.46      0.55      0.50     25732
    positive       0.69      0.74      0.71     25732

    accuracy                           0.58     77196
   macro avg       0.59      0.58      0.58     77196
weighted avg       0.59      0.58      0.58     77196

[[11466 11405  2861]
 [ 5770 14203  5759]
 [ 1579  5217 18936]]


Random Forest

In [25]:
# Run Random Forest Model for polarity predicition and evaluate results
rf_pred = cross_val_predict(RF_classifier, wordvec_pol_df, res_pol["Polarity"], cv=cv)
print(classification_report(y_res_pol, rf_pred))
print(confusion_matrix(y_res_pol, rf_pred))

              precision    recall  f1-score   support

    negative       0.75      0.56      0.64     25732
     neutral       0.59      0.75      0.66     25732
    positive       0.79      0.78      0.78     25732

    accuracy                           0.70     77196
   macro avg       0.71      0.70      0.70     77196
weighted avg       0.71      0.70      0.70     77196

[[14445  8905  2382]
 [ 3448 19289  2995]
 [ 1425  4238 20069]]


XGBoost

In [26]:
# Run XGBoost Model for polarity predicition and evaluate results
cross_val_score(XGB_classifier, wordvec_pol_df, res_pol["Polarity"], cv=cv)





















array([0.69611399, 0.68832178, 0.68462983, 0.68819224, 0.68948766])

Support Vector Machines

In [27]:
# Run SVM Model for polarity predicition and evaluate results
svm_pred = cross_val_predict(SVM_classifier, wordvec_pol_df, res_pol["Polarity"], cv=cv, n_jobs=4)
print(classification_report(y_res_pol, svm_pred))
print(confusion_matrix(y_res_pol, svm_pred))

              precision    recall  f1-score   support

    negative       0.69      0.67      0.68     25732
     neutral       0.56      0.61      0.58     25732
    positive       0.82      0.77      0.79     25732

    accuracy                           0.68     77196
   macro avg       0.69      0.68      0.69     77196
weighted avg       0.69      0.68      0.69     77196

[[17156  7591   985]
 [ 6603 15677  3452]
 [ 1196  4658 19878]]
