## BOW - Bag of Words

In [1]:
# Imports
import pandas as pd
import ast
import numpy as np

from utils.utils import isSubArray

from nltk import pos_tag
from nltk.tokenize import word_tokenize

from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

from scipy import sparse

from sklearn import feature_extraction, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score, train_test_split, GridSearchCV

In [2]:
# Load Data
data = pd.read_csv("data/filtered_reviews_large.csv", names=["Review Text", "Stars", "Polarity", "Processed Review Text"])
# data = pd.read_csv("data/filtered_reviews.csv", names=["Review Text", "Stars", "Polarity"])

### Undersampling to balance dataset

In [3]:
reviews_df = data[["Processed Review Text", "Review Text"]]
reviews_df.head()

Unnamed: 0,Processed Review Text,Review Text
0,be really good course recommend have large pe...,it was really good course i recommend for havi...
1,be really good course recommend have large pe...,it was really good course i recommend for havi...
2,instructions final project project be biggest ...,the instructions for the final project the pro...
3,have like do assignments weren lock payment re...,would have liked to do the assignments if they...
4,coursera issue not course coursera app apple t...,coursera issue not the course itself coursera ...


In [4]:
under = RandomUnderSampler(random_state=1)

In [5]:
X, y_stars, y_pol = (reviews_df, data["Stars"].values.reshape(-1,1), data["Polarity"].values.reshape(-1,1))
X_res_stars, y_res_stars = under.fit_resample(X, y_stars)
print(Counter(y_res_stars))

Counter({1: 12240, 2: 12240, 3: 12240, 4: 12240, 5: 12240})


In [6]:
X_res_pol, y_res_pol = under.fit_resample(reviews_df, y_pol)
print(Counter(y_res_pol))

Counter({'negative': 25732, 'neutral': 25732, 'positive': 25732})


In [7]:
stars_data = {
    'Review Text': X_res_stars["Review Text"].ravel(),
    'Processed Review Text': X_res_stars["Processed Review Text"].ravel(),
    'Stars': y_res_stars.ravel()
}

pol_data = {
    'Review Text': X_res_pol["Review Text"].ravel(),
    'Processed Review Text': X_res_pol["Processed Review Text"].ravel(),
    'Polarity': y_res_pol.ravel()
}

res_stars = pd.DataFrame(stars_data)
res_pol = pd.DataFrame(pol_data)

### Create Bow Dataframe

In [8]:
merged_reviews = [res_pol["Processed Review Text"], res_stars["Processed Review Text"]]
review_texts = [item for sublist in merged_reviews for item in sublist]

In [9]:
# Creating a BOW vectorizer using Tf_idf - a more advanced version of the classic BOW algorithm
vectorizer = feature_extraction.text.TfidfVectorizer()

In [10]:
# Creating BOW vectors for dataset vocabulary
corpus = pd.Series(review_texts).values.astype('U')
vectorizer.fit(corpus)

TfidfVectorizer()

In [11]:
bow_stars_array = vectorizer.transform(res_stars["Processed Review Text"].values.astype('U'))
bow_stars_array.shape

(61200, 30766)

In [12]:
bow_pol_array = vectorizer.transform(res_pol["Processed Review Text"].values.astype('U'))
bow_pol_array.shape

(77196, 30766)

### Create Classifiers

In [13]:
NB_classifier = naive_bayes.MultinomialNB()
RF_classifier = RandomForestClassifier(random_state=48)
XGB_classifier = XGBClassifier()
SVM_classifier = svm.SVC(random_state=24)

### Run Models with cross validation

In [14]:
cv = StratifiedKFold(n_splits=5)

Multinomial Naive Bayes

In [15]:
# Run Naive Bayes Model for polarity predicition and evaluate results
nb_pred = cross_val_predict(NB_classifier, bow_stars_array, res_stars["Stars"], cv=cv)
print(classification_report(res_stars["Stars"], nb_pred))
print(confusion_matrix(res_stars["Stars"], nb_pred))

              precision    recall  f1-score   support

           1       0.60      0.54      0.57     12240
           2       0.35      0.43      0.39     12240
           3       0.38      0.38      0.38     12240
           4       0.43      0.31      0.36     12240
           5       0.62      0.71      0.66     12240

    accuracy                           0.47     61200
   macro avg       0.48      0.47      0.47     61200
weighted avg       0.48      0.47      0.47     61200

[[6623 3917 1109  323  268]
 [2574 5265 3249  787  365]
 [1148 3808 4658 1744  882]
 [ 423 1608 2655 3804 3750]
 [ 182  499  676 2186 8697]]


Random Forest

In [16]:
# Run Random Forest Model for polarity predicition and evaluate results
rf_pred = cross_val_predict(RF_classifier, bow_stars_array, res_stars["Stars"], cv=cv)
print(classification_report(res_stars["Stars"], rf_pred))
print(confusion_matrix(res_stars["Stars"], rf_pred))

              precision    recall  f1-score   support

           1       0.60      0.83      0.69     12240
           2       0.48      0.20      0.28     12240
           3       0.45      0.52      0.49     12240
           4       0.46      0.37      0.41     12240
           5       0.61      0.76      0.68     12240

    accuracy                           0.54     61200
   macro avg       0.52      0.54      0.51     61200
weighted avg       0.52      0.54      0.51     61200

[[10215   784   802   214   225]
 [ 4287  2427  3849  1028   649]
 [ 1669  1224  6381  2012   954]
 [  680   463  2471  4528  4098]
 [  316   139   542  1963  9280]]


XGBoost

In [17]:
# Run XGBoost Model for polarity predicition and evaluate results
cross_val_score(XGB_classifier, bow_stars_array, res_stars["Stars"], cv=cv)





















array([0.51143791, 0.50547386, 0.50620915, 0.50375817, 0.50326797])

Support Vector Machines

In [18]:
# Run SVM Model for polarity predicition and evaluate results
svm_pred = cross_val_predict(SVM_classifier, bow_stars_array, res_stars["Stars"], cv=cv, n_jobs=4)
print(classification_report(res_stars["Stars"], svm_pred))
print(confusion_matrix(res_stars["Stars"], svm_pred))

              precision    recall  f1-score   support

           1       0.63      0.79      0.70     12240
           2       0.46      0.28      0.35     12240
           3       0.45      0.52      0.48     12240
           4       0.49      0.42      0.45     12240
           5       0.66      0.75      0.70     12240

    accuracy                           0.55     61200
   macro avg       0.54      0.55      0.54     61200
weighted avg       0.54      0.55      0.54     61200

[[9610 1393  848  228  161]
 [3871 3409 3849  802  309]
 [1273 1897 6367 2065  638]
 [ 387  533 2553 5085 3682]
 [ 184  128  473 2221 9234]]


### Run models for polarity classification

Multinomial Naive Bayes

In [19]:
# Run Naive Bayes Model for polarity predicition and evaluate results
nb_pred = cross_val_predict(NB_classifier, bow_pol_array, res_pol["Polarity"], cv=cv)
print(classification_report(res_pol["Polarity"], nb_pred))
print(confusion_matrix(res_pol["Polarity"], nb_pred))

              precision    recall  f1-score   support

    negative       0.67      0.60      0.63     25732
     neutral       0.54      0.62      0.57     25732
    positive       0.79      0.76      0.77     25732

    accuracy                           0.66     77196
   macro avg       0.66      0.66      0.66     77196
weighted avg       0.66      0.66      0.66     77196

[[15324  8881  1527]
 [ 6068 15866  3798]
 [ 1338  4903 19491]]


Random Forest

In [20]:
# Run Random Forest Model for polarity predicition and evaluate results
rf_pred = cross_val_predict(RF_classifier, bow_pol_array, res_pol["Polarity"], cv=cv, n_jobs=4)
print(classification_report(res_pol["Polarity"], rf_pred))
print(confusion_matrix(res_pol["Polarity"], rf_pred))

              precision    recall  f1-score   support

    negative       0.77      0.62      0.69     25732
     neutral       0.63      0.74      0.68     25732
    positive       0.81      0.82      0.81     25732

    accuracy                           0.73     77196
   macro avg       0.74      0.73      0.73     77196
weighted avg       0.74      0.73      0.73     77196

[[16039  7618  2075]
 [ 3628 19144  2960]
 [ 1165  3537 21030]]


XGBoost

In [21]:
# Run XGBoost Model for polarity predicition and evaluate results
cross_val_score(XGB_classifier, bow_pol_array, res_pol["Polarity"], cv=cv, n_jobs=4)

array([0.69022021, 0.68670251, 0.68696159, 0.68307533, 0.68735022])

Support Vector Machines

In [22]:
# Run SVM Model for polarity predicition and evaluate results
svm_pred = cross_val_predict(SVM_classifier, bow_pol_array, res_pol["Polarity"], cv=cv, n_jobs=4)
print(classification_report(res_pol["Polarity"], svm_pred))
print(confusion_matrix(res_pol["Polarity"], svm_pred))

              precision    recall  f1-score   support

    negative       0.77      0.63      0.69     25732
     neutral       0.61      0.74      0.67     25732
    positive       0.84      0.81      0.83     25732

    accuracy                           0.73     77196
   macro avg       0.74      0.73      0.73     77196
weighted avg       0.74      0.73      0.73     77196

[[16218  8277  1237]
 [ 4000 18945  2787]
 [  851  3934 20947]]


### Splitting dataset into train and test for pattern matching

In [14]:
X_res_stars_train, X_res_stars_test, y_res_stars_train, y_res_stars_test = train_test_split(
    X_res_stars, y_res_stars, test_size=0.2, random_state=1
)

In [15]:
X_res_pol_train, X_res_pol_test, y_res_pol_train, y_res_pol_test = train_test_split(
    X_res_pol, y_res_pol, test_size=0.2, random_state=1
)

In [16]:
res_stars_train = X_res_stars_train.copy()
res_pol_train = X_res_pol_train.copy()

In [17]:
res_stars_train["Stars"] = y_res_stars_train
res_stars_train.head()

Unnamed: 0,Processed Review Text,Review Text,Stars
9145,buddhism conversion course,buddhism conversion course,1
35999,much respect richard shell talk captivate rea...,much respect for richard shell he talks and ca...,3
13667,nothing practical just talk lot talk,nothing practical just talking and lots of tal...,2
13322,enjoy videos not assignments want have fun l...,i enjoyed the videos but not the assignments i...,2
24514,course be well structure enough content provid...,the course is well structured with enough cont...,3


In [18]:
res_pol_train["Polarity"] = y_res_pol_train
res_pol_train.head()

Unnamed: 0,Processed Review Text,Review Text,Polarity
19725,be not able complete peer grade assignment fr...,i was not able to complete the peer graded ass...,negative
38147,language be understandable enough add subtitle...,language was understandable enough but adding ...,neutral
55579,thorough yet concise introduction specializati...,a thorough yet concise introduction to the spe...,positive
15314,poor labs labs do not teach match lecture slow...,poor labs labs do not teach match lectures slo...,negative
21273,be certificate,where is my certificate,negative


In [28]:
res_stars_train.to_csv('data/train_stars.csv', index=False, header=False)
res_pol_train.to_csv('data/train_polarity.csv', index=False, header=False)

### Increasing accuracy using star pattern matching

In [19]:
res_stars_test = X_res_stars_test.copy()
res_stars_test["Stars"] = y_res_stars_test
res_stars_test.head()

Unnamed: 0,Processed Review Text,Review Text,Stars
41297,have just complete online course write scienc...,i have just completed the online course writin...,4
36934,great information start zero,great information for those who start from zero,4
22370,too long video lecture least useful exercise s...,too long video lectures and least useful exerc...,2
50589,excellent teacher,excellent teacher,5
19174,course be teach instructors most monotone voic...,this course is taught by instructors with the ...,2


In [20]:
# Tokenize all reviews for pre-processing purposes
res_stars_train["Review Text"] = res_stars_train["Review Text"].apply(word_tokenize)
res_stars_train["Review Text"].head()

9145                        [buddhism, conversion, course]
35999    [much, respect, for, richard, shell, he, talks...
13667    [nothing, practical, just, talking, and, lots,...
13322    [i, enjoyed, the, videos, but, not, the, assig...
24514    [the, course, is, well, structured, with, enou...
Name: Review Text, dtype: object

In [21]:
res_stars_test["Review Text"] = res_stars_test["Review Text"].apply(word_tokenize)
res_stars_test["Review Text"].head()

41297    [i, have, just, completed, the, online, course...
36934    [great, information, for, those, who, start, f...
22370    [too, long, video, lectures, and, least, usefu...
50589                                 [excellent, teacher]
19174    [this, course, is, taught, by, instructors, wi...
Name: Review Text, dtype: object

In [22]:
# Tag words with their POS Tag
res_stars_train["Tagged Text"] = res_stars_train["Review Text"].apply(pos_tag)
res_stars_train["Tagged Text"].head()

9145      [(buddhism, NN), (conversion, NN), (course, NN)]
35999    [(much, JJ), (respect, NN), (for, IN), (richar...
13667    [(nothing, NN), (practical, JJ), (just, RB), (...
13322    [(i, NN), (enjoyed, VBD), (the, DT), (videos, ...
24514    [(the, DT), (course, NN), (is, VBZ), (well, RB...
Name: Tagged Text, dtype: object

In [23]:
res_stars_test["Tagged Text"] = res_stars_test["Review Text"].apply(pos_tag)
res_stars_test["Tagged Text"].head()

41297    [(i, NNS), (have, VBP), (just, RB), (completed...
36934    [(great, JJ), (information, NN), (for, IN), (t...
22370    [(too, RB), (long, RB), (video, JJ), (lectures...
50589                     [(excellent, NN), (teacher, NN)]
19174    [(this, DT), (course, NN), (is, VBZ), (taught,...
Name: Tagged Text, dtype: object

In [24]:
train_reviews_tags = []

for items in res_stars_train["Tagged Text"]:
    review_tags = []
    for item in items:
        review_tags.append(item[1])
    train_reviews_tags.append(review_tags)

res_stars_train["POS Tags"] = train_reviews_tags
res_stars_train["POS Tags"].head()

9145                                          [NN, NN, NN]
35999    [JJ, NN, IN, NN, NN, PRP, VBZ, CC, VBZ, PRP, P...
13667                  [NN, JJ, RB, VBG, CC, NNS, IN, VBG]
13322    [NN, VBD, DT, NNS, CC, RB, DT, NNS, RB, VBD, T...
24514    [DT, NN, VBZ, RB, VBN, IN, JJ, NN, VBN, IN, DT...
Name: POS Tags, dtype: object

In [25]:
test_reviews_tags = []

for items in res_stars_test["Tagged Text"]:
    review_tags = []
    for item in items:
        review_tags.append(item[1])
    test_reviews_tags.append(review_tags)

res_stars_test["POS Tags"] = test_reviews_tags
res_stars_test["POS Tags"].head()

41297    [NNS, VBP, RB, VBN, DT, JJ, NN, NN, IN, DT, NN...
36934                    [JJ, NN, IN, DT, WP, VBP, IN, NN]
22370    [RB, RB, JJ, NNS, CC, JJS, JJ, NNS, RB, RB, IN...
50589                                             [NN, NN]
19174    [DT, NN, VBZ, VBN, IN, NNS, IN, DT, RBS, NN, I...
Name: POS Tags, dtype: object

In [26]:
star_patterns = pd.read_csv("data/star_patterns.csv", names=["One Star", "Two Star", "Three Star", "Four Star", "Five Star"])
star_patterns.head()

Unnamed: 0,One Star,Two Star,Three Star,Four Star,Five Star
0,"['NN', 'CD', 'RB', 'CD', 'NNS']","['CD', 'CD', 'CD', 'CD', 'CD']","['JJ', 'NN', 'IN', 'VBN', 'TO']","['VB', 'DT', 'NN', 'DT', 'CD']","['FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW']"
1,"['CD', 'RB', 'CD', 'NNS', 'TO']","['CD', 'CD', 'CD', 'CD', 'CD', 'CD']","['NN', 'MD', 'VB', 'NNS', 'VB']","['TO', 'VB', 'DT', 'NNS', 'PRP', 'VBP']","['NNP', 'NNP', 'NNP', 'NNP', 'NNP']"
2,"['NN', 'CD', 'RB', 'CD', 'NNS', 'TO']","['CD', 'CD', 'CD', 'CD', 'CD', 'CD', 'CD']","['VBP', 'NN', 'CC', 'DT', 'NN']","['VB', 'JJ', 'TO', 'VB', 'JJR', 'NNS']","['VBD', 'CD', 'IN', 'DT', 'JJS']"
3,"['CD', 'RB', 'CD', 'NNS', 'TO', 'VB']","['VB', 'NN', 'RB', 'RB', 'IN']","['NN', 'NNS', 'CC', 'VBP', 'TO']","['NNS', 'RB', 'VBP', 'RB', 'VBN']","['JJ', 'NN', 'NN', 'PRP', 'TO']"
4,"['NN', 'CD', 'RB', 'CD', 'NNS', 'TO', 'VB']","['VBD', 'VBN', 'RB', 'IN', 'PRP']","['NN', 'RB', 'IN', 'PRP', 'PRP']","['PRP', 'CD', 'NNS', 'RB', 'IN']","['NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP']"


In [27]:
star_patterns["One Star"] = star_patterns["One Star"].apply(ast.literal_eval)
star_patterns["Two Star"] = star_patterns["Two Star"].apply(ast.literal_eval)
star_patterns["Three Star"] = star_patterns["Three Star"].apply(ast.literal_eval)
star_patterns["Four Star"] = star_patterns["Four Star"].apply(ast.literal_eval)
star_patterns["Five Star"] = star_patterns["Five Star"].apply(ast.literal_eval)

In [28]:
bow_occurences_array_train = np.zeros((res_stars_train.shape[0], 5))
bow_occurences_array_train.shape

(48960, 5)

In [29]:
bow_occurences_array_test = np.zeros((res_stars_test.shape[0], 5))
bow_occurences_array_test.shape

(12240, 5)

In [30]:
def check_occurrences_stars(data, star_grams, col_index, stars, occ_array):
    count = 0
    for index, row in data.iterrows():
        if row["Stars"] == stars:
            for n_gram in star_grams:
                if isSubArray(row["POS Tags"], n_gram, len(row["POS Tags"]), len(n_gram)):
                    occ_array[count, col_index] = 1
        count += 1

In [31]:
check_occurrences_stars(res_stars_test, star_patterns["One Star"], 0, 1, bow_occurences_array_test)
check_occurrences_stars(res_stars_train, star_patterns["One Star"], 0, 1, bow_occurences_array_train)

In [32]:
check_occurrences_stars(res_stars_train, star_patterns["Two Star"], 1, 2, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Two Star"], 1, 2, bow_occurences_array_test)

In [33]:
check_occurrences_stars(res_stars_train, star_patterns["Three Star"], 2, 3, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Three Star"], 2, 3, bow_occurences_array_test)

In [34]:
check_occurrences_stars(res_stars_train, star_patterns["Four Star"], 3, 4, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Four Star"], 3, 4, bow_occurences_array_test)

In [35]:
check_occurrences_stars(res_stars_train, star_patterns["Five Star"], 4, 5, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Five Star"], 4, 5, bow_occurences_array_test)

In [36]:
bow_stars_array_train = vectorizer.transform(res_stars_train["Processed Review Text"].values.astype('U'))
bow_stars_array_train.shape

(48960, 30766)

In [37]:
bow_stars_array_test = vectorizer.transform(res_stars_test["Processed Review Text"].values.astype('U'))
bow_stars_array_test.shape

(12240, 30766)

In [38]:
bow_occurrences_matrix_train = sparse.csr_matrix(bow_occurences_array_train)
bow_occurrences_matrix_train.shape

(48960, 5)

In [39]:
bow_occurrences_matrix_test = sparse.csr_matrix(bow_occurences_array_test)
bow_occurrences_matrix_test.shape

(12240, 5)

In [40]:
bow_stars_array_train_opt = sparse.hstack([bow_stars_array_train, bow_occurrences_matrix_train])
bow_stars_array_train_opt.shape

(48960, 30771)

In [41]:
bow_stars_array_test_opt = sparse.hstack([bow_stars_array_test, bow_occurrences_matrix_test])
bow_stars_array_test_opt.shape

(12240, 30771)

Run GridSearch for hyperparameter optimization

In [42]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
}

clf_rf = GridSearchCV(RF_classifier, param_grid=rf_param_grid, n_jobs=-1, cv=3)
clf_rf.fit(bow_stars_array_train_opt, res_stars_train["Stars"])

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=48), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [50, 100, 200]})

In [53]:
rf_results = pd.DataFrame(clf_rf.cv_results_)
rf_results.iloc[clf_rf.best_index_]

mean_fit_time                                                978.395013
std_fit_time                                                   2.589243
mean_score_time                                                4.650207
std_score_time                                                 0.036278
param_criterion                                                    gini
param_max_features                                                 auto
param_n_estimators                                                  200
params                {'criterion': 'gini', 'max_features': 'auto', ...
split0_test_score                                              0.716728
split1_test_score                                              0.712623
split2_test_score                                              0.718444
mean_test_score                                                0.715931
std_test_score                                                 0.002442
rank_test_score                                                 

In [43]:
svm_param_grid = {
    'C': [0.1, 1.0, 10.0],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': [0.01, 0.1, 1],
}

clf_svm = GridSearchCV(SVM_classifier, param_grid=svm_param_grid, n_jobs=-1, cv=3)
clf_svm.fit(bow_stars_array_train_opt, res_stars_train["Stars"])

GridSearchCV(cv=3, estimator=SVC(random_state=24), n_jobs=-1,
             param_grid={'C': [0.1, 1.0, 10.0], 'gamma': [0.01, 0.1, 1],
                         'kernel': ['poly', 'rbf', 'sigmoid']})

In [54]:
svm_results = pd.DataFrame(clf_svm.cv_results_)
svm_results.iloc[clf_svm.best_index_]

mean_fit_time                                      853.794333
std_fit_time                                        27.047666
mean_score_time                                     83.823999
std_score_time                                       14.40211
param_C                                                  10.0
param_gamma                                                 1
param_kernel                                              rbf
params               {'C': 10.0, 'gamma': 1, 'kernel': 'rbf'}
split0_test_score                                    0.712377
split1_test_score                                     0.71299
split2_test_score                                     0.71636
mean_test_score                                      0.713909
std_test_score                                       0.001751
rank_test_score                                             1
Name: 25, dtype: object

Predict to evaluate performance

In [44]:
rf_pred = clf_rf.predict(bow_stars_array_test_opt)

print(classification_report(y_res_stars_test, rf_pred))
print(confusion_matrix(y_res_stars_test, rf_pred))

              precision    recall  f1-score   support

           1       0.77      0.86      0.81      2547
           2       0.77      0.72      0.74      2434
           3       0.62      0.59      0.61      2418
           4       0.58      0.50      0.54      2371
           5       0.70      0.78      0.74      2470

    accuracy                           0.69     12240
   macro avg       0.69      0.69      0.69     12240
weighted avg       0.69      0.69      0.69     12240

[[2181  170  130   35   31]
 [ 266 1756  272   86   54]
 [ 221  262 1436  345  154]
 [ 114   77  399 1195  586]
 [  44   19   83  391 1933]]


In [45]:
svm_pred = clf_svm.predict(bow_stars_array_test_opt)

print(classification_report(y_res_stars_test, svm_pred))
print(confusion_matrix(y_res_stars_test, svm_pred))

              precision    recall  f1-score   support

           1       0.81      0.83      0.82      2547
           2       0.74      0.74      0.74      2434
           3       0.60      0.61      0.60      2418
           4       0.56      0.56      0.56      2371
           5       0.75      0.71      0.73      2470

    accuracy                           0.69     12240
   macro avg       0.69      0.69      0.69     12240
weighted avg       0.69      0.69      0.69     12240

[[2119  232  142   42   12]
 [ 214 1809  313   77   21]
 [ 188  294 1474  378   84]
 [  77   86  425 1323  460]
 [  28   28  110  555 1749]]


### Increasing accuracy using polarity pattern matching

In [None]:
# Tokenize all reviews for pre-processing purposes
res_pol_train["Review Text"] = res_pol_train["Review Text"].apply(word_tokenize)
res_pol_train["Review Text"].head()

In [None]:
X_res_pol_test["Review Text"] = X_res_pol_test["Review Text"].apply(word_tokenize)
X_res_pol_test["Review Text"].head()

In [None]:
# Tag words with their POS Tag
res_pol_train["Tagged Text"] = res_pol_train["Review Text"].apply(pos_tag)
res_pol_train["Tagged Text"].head()

In [None]:
X_res_pol_test["Tagged Text"] = X_res_pol_test["Review Text"].apply(pos_tag)
X_res_pol_test["Tagged Text"].head()

In [None]:
train_reviews_tags = []

for index in range(len(res_pol_train["Tagged Text"])):
    review_tags = []
    for item in res_pol_train["Tagged Text"][index]:
        review_tags.append(item[1])
    train_reviews_tags.append(review_tags)

res_pol_train["POS Tags"] = train_reviews_tags
res_pol_train["POS Tags"].head()

In [None]:
test_reviews_tags = []

for index in range(len(X_res_pol_test["Tagged Text"])):
    review_tags = []
    for item in X_res_pol_test["Tagged Text"][index]:
        review_tags.append(item[1])
    test_reviews_tags.append(review_tags)

X_res_pol_test["POS Tags"] = test_reviews_tags
X_res_pol_test["POS Tags"].head()

In [None]:
pol_patterns = pd.read_csv("data/patterns.csv", names=["Neutral", "Negative"])
pol_patterns.head()

In [None]:
pol_patterns["Neutral"] = pol_patterns["Neutral"].apply(ast.literal_eval)
pol_patterns["Negative"] = pol_patterns["Negative"].apply(ast.literal_eval)

In [None]:
bow_occurences_array_train_pol = np.zeros((res_pol_train.shape[0], 2))
bow_occurences_array_train_pol.shape

In [None]:
bow_occurences_array_test_pol = np.zeros((X_res_pol_test.shape[0], 2))
bow_occurences_array_test_pol.shape

In [None]:
def check_occurrences_pol(data, polarity_grams, col_index, polarity, occ_array):
    count = 0
    for index, row in data.iterrows():
        if row["Polarity"] == polarity:
            for n_gram in polarity_grams:
                if isSubArray(row["POS Tags"], n_gram, len(row["POS Tags"]), len(n_gram)):
                    occ_array[count, col_index] = 1
        count += 1

In [None]:
check_occurrences_pol(res_pol_train, pol_patterns["Neutral"], 0, 1, bow_occurences_array_train_pol)
check_occurrences_pol(X_res_pol_test, pol_patterns["Neutral"], 0, 1, bow_occurences_array_test_pol)

In [None]:
check_occurrences_pol(res_pol_train, pol_patterns["Negative"], 1, 2, bow_occurences_array_train_pol)
check_occurrences_pol(X_res_pol_test, pol_patterns["Negative"], 1, 2, bow_occurences_array_test_pol)

In [None]:
bow_pol_array_train = vectorizer.transform(res_pol_train["Processed Review Text"].values.astype('U'))
bow_pol_array_train.shape

In [None]:
bow_pol_array_test = vectorizer.transform(X_res_pol_test["Processed Review Text"].values.astype('U'))
bow_pol_array_test.shape

In [None]:
bow_occurrences_matrix_train_pol = sparse.csr_matrix(bow_occurences_array_train_pol)
bow_occurrences_matrix_train_pol.shape

In [None]:
bow_occurrences_matrix_test_pol = sparse.csr_matrix(bow_occurences_array_test_pol)
bow_occurrences_matrix_test_pol.shape

In [None]:
bow_pol_array_train_opt = sparse.hstack([bow_pol_array_train, bow_occurrences_matrix_train_pol])
bow_pol_array_train_opt.shape

In [None]:
bow_pol_array_test_opt = sparse.hstack([bow_pol_array_test, bow_occurrences_matrix_test_pol])
bow_pol_array_test_opt.shape

#### Test model efficiency again using pattern matching

In [None]:
RF_classifier.fit(bow_pol_array_train_opt, res_pol_train["Polarity"])
rf_pred = RF_classifier.predict(bow_pol_array_test_opt)

print(classification_report(y_res_pol_test, rf_pred))
print(confusion_matrix(y_res_pol_test, rf_pred))

In [None]:
SVM_classifier.fit(bow_pol_array_train_opt, res_pol_train["Polarity"])
svm_pred = SVM_classifier.predict(bow_pol_array_test_opt)

print(classification_report(y_res_pol_test, svm_pred))
print(confusion_matrix(y_res_pol_test, svm_pred))