In [5]:
# Dataframe manipulation
import pandas as pd
import numpy as np

# Text preprocessing
import re
import string
import spacy
from gensim.utils import simple_preprocess

# Text Vectolization
from gensim.models import Word2Vec

# Model Selection, Training and Testing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [6]:
raw = pd.read_csv("Datasets/kindle_reviews.csv", index_col=0)

In [7]:
raw.shape

(12000, 10)

In [8]:
raw.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [9]:
raw.isnull().sum()

Unnamed: 0         0
asin               0
helpful            0
rating             0
reviewText         0
reviewTime         0
reviewerID         0
reviewerName      38
summary            0
unixReviewTime     0
dtype: int64

The 2 columns we need are 'rating' and 'reviewText'.

We are going to use the 'rating' column to identify sentiment in the following way:

* rating > 3 --> Good(1)
* rating <=3   --> Bad(0)

#### Here is how we are going to proceed:

1. Extract 'reviewText' column and 'rating' column as our dataset. Let's call 'reviewText' column as reviews and 'rating' column as ratings from now on for convenience.

2. Preprocess reviews:
    * Remove HTML tags, URLs, punctuation, stop words, numbers, special characters, extra whitespace, 
      non-ASCII characters and emojis.
    * Convert to lower case.
    * Lemmatize
    
3. Convert ratings column to labels 0 (Bad) and 1 (Good).

4. Vectorize our reviews using Word2Vec model.

5. Train, Test split and find the best model.

# Extracting required columns

### The information in the 'summary' column is really important. Concatenating it with the 'reviewText' column.

In [10]:
raw['reviewText'] = raw['reviewText'] + raw['summary']

In [11]:
df = raw.loc[:, ['reviewText', 'rating']]

In [12]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [13]:
# Changing column names
df.columns = ['Reviews', 'Ratings']

In [14]:
df.head()

Unnamed: 0,Reviews,Ratings
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


# Pre-processing Reviews

* Remove HTML tags, URLs, punctuation, stop words, numbers, special characters, extra whitespace, 
  non-ASCII characters and emojis.
* Convert to lower case.
* Lemmatize

In [15]:
# Loading spaCy's English model
nlp = spacy.load('en_core_web_sm')

In [16]:
def preprocessor(text: str):
    """
    Takes in a single review(sentences) as input; 
    returns the review as a list of tokens after -
    removing elements not needed for sentiment analysis,
    lemmatization, and
    converting to lower case.
    """
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove all punctuations except apostrophes to preserve words like "I'll", which can be removed later using spaCy stopwords
    text = text.translate(str.maketrans('', '', string.punctuation[:6] + string.punctuation[7:]))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove non-word characters except apostrophes
    text = re.sub(r"[^\w\s']", ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Lemmatization, tokenization and lower casing using spaCy
    doc = nlp(text)
    text = [token.lemma_ for token in doc if not token.is_stop]
    
    
    return text 

In [17]:
# Applying preprocessing to reviews, row-by-row
df['Cleaned_Reviews'] = df['Reviews'].apply(preprocessor)

In [18]:
df.head()

Unnamed: 0,Reviews,Ratings,Cleaned_Reviews
0,"Jace Rankin may be short, but he's nothing to ...",3,"[Jace, Rankin, short, mess, man, haul, saloon,..."
1,Great short read. I didn't want to put it dow...,5,"[great, short, read, want, read, sit, sex, sce..."
2,I'll start by saying this is the first of four...,3,"[start, say, book, expect, conclude, center, c..."
3,Aggie is Angela Lansbury who carries pocketboo...,3,"[aggie, Angela, Lansbury, carry, pocketbook, i..."
4,I did not expect this type of book to be in li...,4,"[expect, type, book, library, pleased, find, p..."


# Converting Ratings to Binary Labels

In [19]:
df["Sentiment"] = df["Ratings"].apply(lambda x: 1 if x >3 else 0) 

In [20]:
df.head()

Unnamed: 0,Reviews,Ratings,Cleaned_Reviews,Sentiment
0,"Jace Rankin may be short, but he's nothing to ...",3,"[Jace, Rankin, short, mess, man, haul, saloon,...",0
1,Great short read. I didn't want to put it dow...,5,"[great, short, read, want, read, sit, sex, sce...",1
2,I'll start by saying this is the first of four...,3,"[start, say, book, expect, conclude, center, c...",0
3,Aggie is Angela Lansbury who carries pocketboo...,3,"[aggie, Angela, Lansbury, carry, pocketbook, i...",0
4,I did not expect this type of book to be in li...,4,"[expect, type, book, library, pleased, find, p...",1


# Checking Balance of Dataset

In [21]:
df.Sentiment.value_counts()

0    6000
1    6000
Name: Sentiment, dtype: int64

# Extracting Independent Feature and Label

In [22]:
X = df["Cleaned_Reviews"]
y = df["Sentiment"]

In [23]:
X.head()

0    [Jace, Rankin, short, mess, man, haul, saloon,...
1    [great, short, read, want, read, sit, sex, sce...
2    [start, say, book, expect, conclude, center, c...
3    [aggie, Angela, Lansbury, carry, pocketbook, i...
4    [expect, type, book, library, pleased, find, p...
Name: Cleaned_Reviews, dtype: object

In [24]:
y[:5]

0    0
1    1
2    0
3    0
4    1
Name: Sentiment, dtype: int64

# Train, Test Split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train.shape, X_test.shape

((9600,), (2400,))

In [27]:
type(X_train)

pandas.core.series.Series

In [28]:
X_train

9182     [look, forward, book, come, double, space, par...
11091    [own, book, spouse, forget, library, book, unf...
6428     [cool, forgot, request, rate, come, make, unre...
288      [short, short, story, basically, scene, party,...
2626     [Secret, Service, agent, secrest, long, servic...
                               ...                        
11964    [download, book, read, review, usually, read, ...
5191     [far, hot, book, get, hand, ondon't, miss, Jac...
5390     [book, free, reservation, base, majority, revi...
860      [little, mushy, care, woman, folk, Characters,...
7270     [book, good, good, set, charaterswith, backgro...
Name: Cleaned_Reviews, Length: 9600, dtype: object

In [29]:
y_train

9182     1
11091    0
6428     1
288      0
2626     1
        ..
11964    0
5191     1
5390     0
860      0
7270     1
Name: Sentiment, Length: 9600, dtype: int64

# Vectorizing Text Reviews using Word2Vec

### Training Word2Vec model on Training set

In [30]:
w2v = Word2Vec(sentences=X_train.to_numpy(), vector_size=100, min_count=1)

In [31]:
# w2v.wv.key_to_index contains a dictionary of mappings from words to indices, where indices contain word meanings in the
# form of vectors inside 'w2v.wv'. See next line
list(w2v.wv.key_to_index.items())[:5]

[('book', 0), ('story', 1), ('read', 2), ('like', 3), ('character', 4)]

In [32]:
# 'book' is at 0th index inside the vetor space
w2v.wv[0]

array([-0.6005872 ,  0.83495295,  0.2699319 ,  1.0515556 ,  0.18396023,
       -1.9382683 ,  0.8509073 ,  2.779263  , -1.6269401 , -1.7756336 ,
        0.46466863, -1.8905969 , -0.73405206, -0.01776789,  0.10365278,
       -0.1963745 ,  1.5463321 , -0.67331904, -0.40553442, -2.6815524 ,
        0.43623888,  0.4356722 ,  1.0008947 , -0.94268316,  0.0540268 ,
       -0.14716877, -0.27938306,  0.32173848, -0.8121111 ,  0.11412724,
        1.1730609 , -0.0760606 ,  1.447395  , -1.6058823 , -0.6453135 ,
        0.76690054,  1.4554222 , -0.42311049, -0.08500042, -0.96553206,
        0.05180233, -1.1643429 , -1.0829024 ,  0.8755547 ,  0.6356789 ,
       -0.23974024, -0.82011986, -0.41640964,  1.0566709 ,  0.93209577,
        0.74078584, -0.38644564, -1.1123279 , -0.6704703 , -0.16635056,
       -0.35706902,  0.8471692 , -0.25027153, -1.1030922 ,  1.4460597 ,
       -0.6719116 ,  0.50084907,  0.16478167, -0.85446835, -1.1648436 ,
        1.355765  ,  0.45013866,  1.2163371 , -1.1545004 ,  0.49

In [33]:
w2v.wv['book']

array([-0.6005872 ,  0.83495295,  0.2699319 ,  1.0515556 ,  0.18396023,
       -1.9382683 ,  0.8509073 ,  2.779263  , -1.6269401 , -1.7756336 ,
        0.46466863, -1.8905969 , -0.73405206, -0.01776789,  0.10365278,
       -0.1963745 ,  1.5463321 , -0.67331904, -0.40553442, -2.6815524 ,
        0.43623888,  0.4356722 ,  1.0008947 , -0.94268316,  0.0540268 ,
       -0.14716877, -0.27938306,  0.32173848, -0.8121111 ,  0.11412724,
        1.1730609 , -0.0760606 ,  1.447395  , -1.6058823 , -0.6453135 ,
        0.76690054,  1.4554222 , -0.42311049, -0.08500042, -0.96553206,
        0.05180233, -1.1643429 , -1.0829024 ,  0.8755547 ,  0.6356789 ,
       -0.23974024, -0.82011986, -0.41640964,  1.0566709 ,  0.93209577,
        0.74078584, -0.38644564, -1.1123279 , -0.6704703 , -0.16635056,
       -0.35706902,  0.8471692 , -0.25027153, -1.1030922 ,  1.4460597 ,
       -0.6719116 ,  0.50084907,  0.16478167, -0.85446835, -1.1648436 ,
        1.355765  ,  0.45013866,  1.2163371 , -1.1545004 ,  0.49

In [34]:
w2v.wv['book'] == w2v.wv[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [35]:
w2v.wv

<gensim.models.keyedvectors.KeyedVectors at 0x23b726cd810>

#### Now we need to convert each review in our train and test sets into vectors. Here is how we proceed:

* Take each review, ie., each row in our review data.
* Find the vector representation of each word in the review - w2v.wv['each_word']
* Find mean of all vector representations - Average Word2Vec - so that all reviews/rows are the same dimension when feeding into our ML model; in other words do np.mean on all vectors, in each row.

The vectors coming out of Word2Vec will be vertical, ie., as a column. We will have to make it horizontal.

In [36]:
def vectorize(token_list):
    """
    Takes in a list of tokens(strings) from each row.
    Returns a single list of numbers, which is the 
    average of all vectors of each word in the list of tokens.
    """
    # Performing in multiple steps to make it clearer.
    
    # List of all vectors, where each vector represents a word
    list_of_vectors = [w2v.wv[word] for word in token_list if word in w2v.wv]
    
    # A single vector, which is the mean of all word vectors,
    # representing the whole review
    # Vector becomes an array of zeroes if none of the words in our test dataset were trained earlier, ie.,
    # does not exist in our vocabulary
    review_vector = np.mean(list_of_vectors, axis=0) if len(list_of_vectors) > 0 else np.zeros(w2v.vector_size)
    
    return review_vector.reshape((1,-1))[0]

In [37]:
w2v.vector_size

100

In [38]:
X_train

9182     [look, forward, book, come, double, space, par...
11091    [own, book, spouse, forget, library, book, unf...
6428     [cool, forgot, request, rate, come, make, unre...
288      [short, short, story, basically, scene, party,...
2626     [Secret, Service, agent, secrest, long, servic...
                               ...                        
11964    [download, book, read, review, usually, read, ...
5191     [far, hot, book, get, hand, ondon't, miss, Jac...
5390     [book, free, reservation, base, majority, revi...
860      [little, mushy, care, woman, folk, Characters,...
7270     [book, good, good, set, charaterswith, backgro...
Name: Cleaned_Reviews, Length: 9600, dtype: object

In [39]:
X_train_vec = X_train.apply(vectorize)
X_test_vec = X_test.apply(vectorize)

In [40]:
X_train_vec

9182     [-0.3713054, 0.6018393, 0.22446223, 0.45861426...
11091    [-0.261011, 0.4449302, 0.17890199, 0.41641265,...
6428     [-0.18843205, 0.3427687, 0.13637412, 0.1806732...
288      [-0.30452594, 0.5226672, 0.2623439, 0.3741243,...
2626     [-0.18806612, 0.32699624, 0.1499323, 0.1895907...
                               ...                        
11964    [-0.43158367, 0.8618006, 0.23146859, 0.6002357...
5191     [-0.15268582, 0.3892723, 0.23438805, 0.3143143...
5390     [-0.20790936, 0.49102977, 0.21573013, 0.320955...
860      [-0.22723317, 0.360711, 0.23147929, 0.22446719...
7270     [-0.1813082, 0.44273442, 0.21476181, 0.2576689...
Name: Cleaned_Reviews, Length: 9600, dtype: object

In [41]:
X_train_vec.shape

(9600,)

#### The resulting vector has 9600 rows(reviews) and only 1 column(independent feature). We need to convert in to shape (9600, 100).

In [42]:
X_train_vec = np.stack(X_train_vec.values)
X_test_vec = np.stack(X_test_vec.values)

In [43]:
X_train_vec

array([[-0.3713054 ,  0.6018393 ,  0.22446223, ..., -0.93489087,
         0.37932247,  0.05190236],
       [-0.261011  ,  0.4449302 ,  0.17890199, ..., -0.7900749 ,
         0.32882077,  0.05879445],
       [-0.18843205,  0.3427687 ,  0.13637412, ..., -0.72063047,
         0.20092858, -0.1364762 ],
       ...,
       [-0.20790936,  0.49102977,  0.21573013, ..., -0.822804  ,
         0.2511471 , -0.07002713],
       [-0.22723317,  0.360711  ,  0.23147929, ..., -0.9003039 ,
         0.10208084, -0.19857061],
       [-0.1813082 ,  0.44273442,  0.21476181, ..., -0.82287806,
         0.20454086, -0.12464349]], dtype=float32)

In [44]:
X_train_vec.shape

(9600, 100)

#### All of our reviews have been vectorized

# Model Creation

### Finding the best model

In [45]:
# Models to train
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [46]:
def evaluate_models(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report, model

In [47]:
# Evaluating different models
results = {}
for model_name, model in models.items():
    accuracy, report, m = evaluate_models(model, X_train_vec, X_test_vec, y_train, y_test)
    results[model_name] = {'Model': m, 'Accuracy': accuracy, 'Report': report}

In [48]:
# Printing results
for model_name, result in results.items():
    print(model_name, result['Accuracy'], sep=": ")

Logistic Regression: 0.7333333333333333
Support Vector Machine: 0.7308333333333333
Random Forest: 0.7208333333333333
Gradient Boosting: 0.7245833333333334


In [49]:
model = results['Logistic Regression']['Model']
model.fit(X_train_vec, y_train)

In [50]:
def predict_sentiment(review):
    processed_tokens = preprocessor(review)
    vector = vectorize(processed_tokens)
    print("Good Review") if model.predict([vector]) else print("Bad Review")

In [51]:
predict_sentiment("Very good")

Good Review


In [52]:
predict_sentiment("Very Bad")

Bad Review


In [53]:
predict_sentiment("Could have been better")

Bad Review


In [54]:
predict_sentiment("Best trimmer I have ever used")

Good Review


In [55]:
predict_sentiment("5 stars")
# Wrong Output

Bad Review
