In [1]:
# Imports
import pandas as pd
import ast
import numpy as np
import pickle

from utils.utils import isSubArray

from nltk import pos_tag
from nltk.tokenize import word_tokenize

from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

from scipy import sparse

from sklearn import feature_extraction, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load Data
data = pd.read_csv("data/filtered_reviews_large.csv", names=["Review Text", "Stars", "Polarity", "Processed Review Text"])
# data = pd.read_csv("data/filtered_reviews.csv", names=["Review Text", "Stars", "Polarity"])

### Undersampling to balance dataset

In [3]:
reviews_df = data[["Processed Review Text", "Review Text"]]
reviews_df.head()

Unnamed: 0,Processed Review Text,Review Text
0,be really good course recommend have large pe...,it was really good course i recommend for havi...
1,be really good course recommend have large pe...,it was really good course i recommend for havi...
2,instructions final project project be biggest ...,the instructions for the final project the pro...
3,have like do assignments weren lock payment re...,would have liked to do the assignments if they...
4,coursera issue not course coursera app apple t...,coursera issue not the course itself coursera ...


In [4]:
under = RandomUnderSampler(random_state=1)

In [5]:
X, y_stars = (reviews_df, data["Stars"].values.reshape(-1,1))
X_res_stars, y_res_stars = under.fit_resample(X, y_stars)
print(Counter(y_res_stars))

Counter({1: 12240, 2: 12240, 3: 12240, 4: 12240, 5: 12240})


In [6]:
stars_data = {
    'Review Text': X_res_stars["Review Text"].ravel(),
    'Processed Review Text': X_res_stars["Processed Review Text"].ravel(),
    'Stars': y_res_stars.ravel()
}

res_stars = pd.DataFrame(stars_data)

### Create Bow Dataframe

In [7]:
# Creating a BOW vectorizer using Tf_idf - a more advanced version of the classic BOW algorithm
vectorizer = pickle.load(open("models/tfidf.pickle", "rb"))

### Splitting dataset into train and test for pattern matching

In [8]:
X_res_stars_train, X_res_stars_test, y_res_stars_train, y_res_stars_test = train_test_split(
    X_res_stars, y_res_stars, test_size=0.2, random_state=1
)

In [9]:
res_stars_train = X_res_stars_train.copy()

In [10]:
res_stars_train["Stars"] = y_res_stars_train
res_stars_train.head()

Unnamed: 0,Processed Review Text,Review Text,Stars
9145,buddhism conversion course,buddhism conversion course,1
35999,much respect richard shell talk captivate rea...,much respect for richard shell he talks and ca...,3
13667,nothing practical just talk lot talk,nothing practical just talking and lots of tal...,2
13322,enjoy videos not assignments want have fun l...,i enjoyed the videos but not the assignments i...,2
24514,course be well structure enough content provid...,the course is well structured with enough cont...,3


### Increasing accuracy using star pattern matching

In [11]:
res_stars_test = X_res_stars_test.copy()
res_stars_test["Stars"] = y_res_stars_test
res_stars_test = res_stars_test.sample(n=10)

res_stars_test

Unnamed: 0,Processed Review Text,Review Text,Stars
39906,lecture be amaze specially lecture dr robert p...,the lectures were amazing specially the lectur...,4
18302,much text text have use difficult english,to much text that text has used difficult english,2
31809,be definitely informative help medical diction...,it was definitely informative but it would hel...,3
16259,very difficult understand content lab be good...,it s very difficult to understand the content ...,2
46068,be good course term tivity exercise majority ...,it is a good course in terms of the e tivity e...,4
30395,ll be very honest give opinion give star cou...,i ll be very honest and give my opinion i gave...,3
47149,course be great only problem be way teach prof...,the course was great but only the problem was ...,4
37781,overall solid seem assignments end chapter be ...,overall solid but it seems like the assignment...,4
34094,thank course however find have say be so much...,thank you for this course however i find mysel...,3
9177,course need lot work good run down ui ux need...,this course needs a lot of work it s a good ru...,1


In [12]:
# Tokenize all reviews for pre-processing purposes
res_stars_train["Review Text"] = res_stars_train["Review Text"].apply(word_tokenize)
res_stars_train["Review Text"].head()

9145                        [buddhism, conversion, course]
35999    [much, respect, for, richard, shell, he, talks...
13667    [nothing, practical, just, talking, and, lots,...
13322    [i, enjoyed, the, videos, but, not, the, assig...
24514    [the, course, is, well, structured, with, enou...
Name: Review Text, dtype: object

In [13]:
res_stars_test["Review Text"] = res_stars_test["Review Text"].apply(word_tokenize)
res_stars_test["Review Text"].head()

39906    [the, lectures, were, amazing, specially, the,...
18302    [to, much, text, that, text, has, used, diffic...
31809    [it, was, definitely, informative, but, it, wo...
16259    [it, s, very, difficult, to, understand, the, ...
46068    [it, is, a, good, course, in, terms, of, the, ...
Name: Review Text, dtype: object

In [14]:
# Tag words with their POS Tag
res_stars_train["Tagged Text"] = res_stars_train["Review Text"].apply(pos_tag)
res_stars_train["Tagged Text"].head()

9145      [(buddhism, NN), (conversion, NN), (course, NN)]
35999    [(much, JJ), (respect, NN), (for, IN), (richar...
13667    [(nothing, NN), (practical, JJ), (just, RB), (...
13322    [(i, NN), (enjoyed, VBD), (the, DT), (videos, ...
24514    [(the, DT), (course, NN), (is, VBZ), (well, RB...
Name: Tagged Text, dtype: object

In [15]:
res_stars_test["Tagged Text"] = res_stars_test["Review Text"].apply(pos_tag)
res_stars_test["Tagged Text"].head()

39906    [(the, DT), (lectures, NNS), (were, VBD), (ama...
18302    [(to, TO), (much, JJ), (text, NN), (that, IN),...
31809    [(it, PRP), (was, VBD), (definitely, RB), (inf...
16259    [(it, PRP), (s, VBZ), (very, RB), (difficult, ...
46068    [(it, PRP), (is, VBZ), (a, DT), (good, JJ), (c...
Name: Tagged Text, dtype: object

In [16]:
train_reviews_tags = []

for items in res_stars_train["Tagged Text"]:
    review_tags = []
    for item in items:
        review_tags.append(item[1])
    train_reviews_tags.append(review_tags)

res_stars_train["POS Tags"] = train_reviews_tags
res_stars_train["POS Tags"].head()

9145                                          [NN, NN, NN]
35999    [JJ, NN, IN, NN, NN, PRP, VBZ, CC, VBZ, PRP, P...
13667                  [NN, JJ, RB, VBG, CC, NNS, IN, VBG]
13322    [NN, VBD, DT, NNS, CC, RB, DT, NNS, RB, VBD, T...
24514    [DT, NN, VBZ, RB, VBN, IN, JJ, NN, VBN, IN, DT...
Name: POS Tags, dtype: object

In [17]:
test_reviews_tags = []

for items in res_stars_test["Tagged Text"]:
    review_tags = []
    for item in items:
        review_tags.append(item[1])
    test_reviews_tags.append(review_tags)

res_stars_test["POS Tags"] = test_reviews_tags
res_stars_test["POS Tags"].head()

39906     [DT, NNS, VBD, VBG, RB, DT, NNS, IN, NN, NN, NN]
18302               [TO, JJ, NN, IN, NN, VBZ, VBN, JJ, JJ]
31809    [PRP, VBD, RB, JJ, CC, PRP, MD, VB, IN, DT, JJ...
16259    [PRP, VBZ, RB, JJ, TO, VB, DT, NN, NN, VBZ, JJ...
46068    [PRP, VBZ, DT, JJ, NN, IN, NNS, IN, DT, NN, NN...
Name: POS Tags, dtype: object

In [18]:
star_patterns = pd.read_csv("data/star_patterns.csv", names=["One Star", "Two Star", "Three Star", "Four Star", "Five Star"])
star_patterns.head()

Unnamed: 0,One Star,Two Star,Three Star,Four Star,Five Star
0,"['NN', 'CD', 'RB', 'CD', 'NNS']","['CD', 'CD', 'CD', 'CD', 'CD']","['JJ', 'NN', 'IN', 'VBN', 'TO']","['VB', 'DT', 'NN', 'DT', 'CD']","['FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW']"
1,"['CD', 'RB', 'CD', 'NNS', 'TO']","['CD', 'CD', 'CD', 'CD', 'CD', 'CD']","['NN', 'MD', 'VB', 'NNS', 'VB']","['TO', 'VB', 'DT', 'NNS', 'PRP', 'VBP']","['NNP', 'NNP', 'NNP', 'NNP', 'NNP']"
2,"['NN', 'CD', 'RB', 'CD', 'NNS', 'TO']","['CD', 'CD', 'CD', 'CD', 'CD', 'CD', 'CD']","['VBP', 'NN', 'CC', 'DT', 'NN']","['VB', 'JJ', 'TO', 'VB', 'JJR', 'NNS']","['VBD', 'CD', 'IN', 'DT', 'JJS']"
3,"['CD', 'RB', 'CD', 'NNS', 'TO', 'VB']","['VB', 'NN', 'RB', 'RB', 'IN']","['NN', 'NNS', 'CC', 'VBP', 'TO']","['NNS', 'RB', 'VBP', 'RB', 'VBN']","['JJ', 'NN', 'NN', 'PRP', 'TO']"
4,"['NN', 'CD', 'RB', 'CD', 'NNS', 'TO', 'VB']","['VBD', 'VBN', 'RB', 'IN', 'PRP']","['NN', 'RB', 'IN', 'PRP', 'PRP']","['PRP', 'CD', 'NNS', 'RB', 'IN']","['NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP']"


In [19]:
star_patterns["One Star"] = star_patterns["One Star"].apply(ast.literal_eval)
star_patterns["Two Star"] = star_patterns["Two Star"].apply(ast.literal_eval)
star_patterns["Three Star"] = star_patterns["Three Star"].apply(ast.literal_eval)
star_patterns["Four Star"] = star_patterns["Four Star"].apply(ast.literal_eval)
star_patterns["Five Star"] = star_patterns["Five Star"].apply(ast.literal_eval)

In [20]:
bow_occurences_array_train = np.zeros((res_stars_train.shape[0], 5))
bow_occurences_array_train.shape

(48960, 5)

In [21]:
bow_occurences_array_test = np.zeros((res_stars_test.shape[0], 5))
bow_occurences_array_test.shape

(10, 5)

In [22]:
def check_occurrences_stars(data, star_grams, col_index, stars, occ_array):
    count = 0
    for index, row in data.iterrows():
        if row["Stars"] == stars:
            for n_gram in star_grams:
                if isSubArray(row["POS Tags"], n_gram, len(row["POS Tags"]), len(n_gram)):
                    occ_array[count, col_index] = 1
        count += 1

In [23]:
check_occurrences_stars(res_stars_test, star_patterns["One Star"], 0, 1, bow_occurences_array_test)
check_occurrences_stars(res_stars_train, star_patterns["One Star"], 0, 1, bow_occurences_array_train)

In [24]:
check_occurrences_stars(res_stars_train, star_patterns["Two Star"], 1, 2, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Two Star"], 1, 2, bow_occurences_array_test)

In [25]:
check_occurrences_stars(res_stars_train, star_patterns["Three Star"], 2, 3, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Three Star"], 2, 3, bow_occurences_array_test)

In [26]:
check_occurrences_stars(res_stars_train, star_patterns["Four Star"], 3, 4, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Four Star"], 3, 4, bow_occurences_array_test)

In [27]:
check_occurrences_stars(res_stars_train, star_patterns["Five Star"], 4, 5, bow_occurences_array_train)
check_occurrences_stars(res_stars_test, star_patterns["Five Star"], 4, 5, bow_occurences_array_test)

In [28]:
bow_stars_array_train = vectorizer.transform(res_stars_train["Processed Review Text"].values.astype('U'))
bow_stars_array_train.shape

(48960, 30766)

In [29]:
bow_stars_array_test = vectorizer.transform(res_stars_test["Processed Review Text"].values.astype('U'))
bow_stars_array_test.shape

(10, 30766)

In [30]:
bow_occurrences_matrix_train = sparse.csr_matrix(bow_occurences_array_train)
bow_occurrences_matrix_train.shape

(48960, 5)

In [31]:
bow_occurrences_matrix_test = sparse.csr_matrix(bow_occurences_array_test)
bow_occurrences_matrix_test.shape

(10, 5)

In [32]:
bow_stars_array_train_opt = sparse.hstack([bow_stars_array_train, bow_occurrences_matrix_train])
bow_stars_array_train_opt.shape

(48960, 30771)

In [33]:
bow_stars_array_test_opt = sparse.hstack([bow_stars_array_test, bow_occurrences_matrix_test])
bow_stars_array_test_opt.shape

(10, 30771)

### Model Evaluation

In [34]:
RF_classifier = RandomForestClassifier(n_estimators=200, criterion='gini', max_features='auto')

In [35]:
RF_classifier.fit(bow_stars_array_train_opt, res_stars_train["Stars"])

RandomForestClassifier(n_estimators=200)

In [36]:
rf_pred = RF_classifier.predict(bow_stars_array_test_opt)
rf_pred_proba = RF_classifier.predict_proba(bow_stars_array_test_opt)

In [39]:
res_stars_test[["Review Text", "Stars"]]

Unnamed: 0,Review Text,Stars
39906,"[the, lectures, were, amazing, specially, the,...",4
18302,"[to, much, text, that, text, has, used, diffic...",2
31809,"[it, was, definitely, informative, but, it, wo...",3
16259,"[it, s, very, difficult, to, understand, the, ...",2
46068,"[it, is, a, good, course, in, terms, of, the, ...",4
30395,"[i, ll, be, very, honest, and, give, my, opini...",3
47149,"[the, course, was, great, but, only, the, prob...",4
37781,"[overall, solid, but, it, seems, like, the, as...",4
34094,"[thank, you, for, this, course, however, i, fi...",3
9177,"[this, course, needs, a, lot, of, work, it, s,...",1


In [40]:
rf_pred

array([4, 2, 3, 2, 5, 3, 4, 3, 2, 1], dtype=int64)

In [41]:
rf_pred_proba

array([[0.02     , 0.05     , 0.035    , 0.6465625, 0.2484375],
       [0.1      , 0.709    , 0.106    , 0.05     , 0.035    ],
       [0.24     , 0.16     , 0.465    , 0.085    , 0.05     ],
       [0.05     , 0.69     , 0.12     , 0.075    , 0.065    ],
       [0.115    , 0.185    , 0.22     , 0.23     , 0.25     ],
       [0.045    , 0.06     , 0.795    , 0.07     , 0.03     ],
       [0.13     , 0.115    , 0.09     , 0.3375   , 0.3275   ],
       [0.065    , 0.11     , 0.36     , 0.345    , 0.12     ],
       [0.2      , 0.23     , 0.22     , 0.195    , 0.155    ],
       [0.83     , 0.05     , 0.06     , 0.035    , 0.025    ]])