In [77]:
# Dataframe manipulation
import pandas as pd
import numpy as np

# Text preprocessing
import re
import string
import spacy
from gensim.utils import simple_preprocess

# Text Vectolization
from gensim.models import Word2Vec

# Model Selection, Training and Testing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [78]:
raw = pd.read_csv("Datasets/kindle_reviews.csv", index_col=0)

In [79]:
raw.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [80]:
raw.isnull().sum()

Unnamed: 0         0
asin               0
helpful            0
rating             0
reviewText         0
reviewTime         0
reviewerID         0
reviewerName      38
summary            0
unixReviewTime     0
dtype: int64

The 2 columns we need are 'rating' and 'reviewText'.

We are going to use the 'rating' column to identify sentiment in the following way:

* rating >= 3 --> Good(1)
* rating <3   --> Bad(0)

#### Here is how we are going to proceed:

1. Extract 'reviewText' column and 'rating' column as our dataset. Let's call 'reviewText' column as reviews and 'rating' column as ratings from now on for convenience.

2. Preprocess reviews:
    * Remove HTML tags, URLs, punctuation, stop words, numbers, special characters, extra whitespace, 
      non-ASCII characters and emojis.
    * Convert to lower case.
    * Lemmatize
    
3. Convert ratings column to labels 0 (Bad) and 1 (Good).

4. Vectorize our reviews using Word2Vec model.

5. Train, Test split and find the best model.

# Extracting required columns

In [81]:
df = raw.loc[:, ['reviewText', 'rating']]

In [82]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [83]:
# Changing column names
df.columns = ['Reviews', 'Ratings']

In [84]:
df.head()

Unnamed: 0,Reviews,Ratings
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


# Pre-processing Reviews

* Remove HTML tags, URLs, punctuation, stop words, numbers, special characters, extra whitespace, 
  non-ASCII characters and emojis.
* Convert to lower case.
* Lemmatize

In [85]:
# Loading spaCy's English model
nlp = spacy.load('en_core_web_sm')

In [107]:
def preprocessor(text: str) -> str:
    """
    Takes in a single review(sentence) as input; 
    returns the review after -
    removing elements not needed for sentiment analysis and
    converting to lower case.
    """
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove all punctuations except apostrophes to preserve words like "I'll", which can be removed later using spaCy stopwords
    text = text.translate(str.maketrans('', '', string.punctuation[:6] + string.punctuation[7:]))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove non-word characters except apostrophes
    text = re.sub(r"[^\w\s']", ' ', text)
    
    # Lemmatization and lower casing using spaCy
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text 

In [109]:
# Applying preprocessing to reviews, row-by-row
df['Cleaned_Reviews'] = df['Reviews'].apply(preprocessor)

In [110]:
df.head()

Unnamed: 0,Reviews,Ratings,Cleaned_Reviews
0,"Jace Rankin may be short, but he's nothing to ...",3,Jace Rankin short mess man haul saloon underta...
1,Great short read. I didn't want to put it dow...,5,great short read want read sit sex scene great...
2,I'll start by saying this is the first of four...,3,start say book expect conclude center child Al...
3,Aggie is Angela Lansbury who carries pocketboo...,3,aggie Angela Lansbury carry pocketbook instead...
4,I did not expect this type of book to be in li...,4,expect type book library pleased find price right


# Converting Ratings to Binary Labels

In [112]:
df["Sentiment"] = df["Ratings"].apply(lambda x: 1 if x >=3 else 0) 

In [113]:
df.head()

Unnamed: 0,Reviews,Ratings,Cleaned_Reviews,Sentiment
0,"Jace Rankin may be short, but he's nothing to ...",3,Jace Rankin short mess man haul saloon underta...,1
1,Great short read. I didn't want to put it dow...,5,great short read want read sit sex scene great...,1
2,I'll start by saying this is the first of four...,3,start say book expect conclude center child Al...,1
3,Aggie is Angela Lansbury who carries pocketboo...,3,aggie Angela Lansbury carry pocketbook instead...,1
4,I did not expect this type of book to be in li...,4,expect type book library pleased find price right,1


# Checking Balance of Dataset

In [114]:
df.Sentiment.value_counts()

1    8000
0    4000
Name: Sentiment, dtype: int64

# Extracting Independent Feature and Label

In [115]:
X = df["Cleaned_Reviews"]
y = df["Sentiment"]

In [116]:
X.head()

0    Jace Rankin short mess man haul saloon underta...
1    great short read want read sit sex scene great...
2    start say book expect conclude center child Al...
3    aggie Angela Lansbury carry pocketbook instead...
4    expect type book library pleased find price right
Name: Cleaned_Reviews, dtype: object

In [117]:
y[:5]

0    1
1    1
2    1
3    1
4    1
Name: Sentiment, dtype: int64

# Vectorizing Text Reviews using Word2Vec