## The Libraries Used

In [2]:
#PANDAS, REGULAR EXPRESSION and NUMPY
import pandas as pd
import re
import numpy

#SKLEARN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

In [3]:
pwd

'/Users/siddharthmandgi/Desktop/Amazon_Reviews_to_Rating'

In [4]:
ls

[34mAmazon_Datasets [m[m/                  amazon_reviews_to_Sentiment.ipynb
DASK.ipynb                         amazon_reviews_to_ratings.ipynb


In [5]:
cd '/Users/siddharthmandgi/Desktop/Amazon_Reviews_to_Rating/Amazon_Datasets /'

/Users/siddharthmandgi/Desktop/Amazon_Reviews_to_Rating/Amazon_Datasets 


## Importing The Dataset

In [34]:
# Load the Amazon "Books" review dump (tab-separated).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` reproduces the old `error_bad_lines=False` behaviour:
# malformed rows are dropped and a warning is emitted for each of them.
books = pd.read_table('/Users/siddharthmandgi/Desktop/Amazon_Reviews_to_Rating/Amazon_Datasets /amazon_reviews_us_Books_v1_00.tsv', on_bad_lines='warn')
books.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,25933450,RJOVP071AVAJO,0439873800,84656342,There Was an Old Lady Who Swallowed a Shell!,Books,5,0.0,0.0,N,Y,Five Stars,I love it and so does my students!,2015-08-31
1,US,1801372,R1ORGBETCDW3AI,1623953553,729938122,I Saw a Friend,Books,5,0.0,0.0,N,Y,"Please buy ""I Saw a Friend""! Your children wil...",My wife and I ordered 2 books and gave them as...,2015-08-31
2,US,5782091,R7TNRFQAOUTX5,142151981X,678139048,"Black Lagoon, Vol. 6",Books,5,0.0,0.0,N,Y,Shipped fast.,Great book just like all the others in the ser...,2015-08-31
3,US,32715830,R2GANXKDIFZ6OI,014241543X,712432151,If I Stay,Books,5,0.0,0.0,N,N,Five Stars,So beautiful,2015-08-31
4,US,14005703,R2NYB6C3R8LVN6,1604600527,800572372,Stars 'N Strips Forever,Books,5,2.0,2.0,N,Y,Five Stars,Enjoyed the author's story and his quilts are ...,2015-08-31


In [8]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10236850 entries, 0 to 10236849
Data columns (total 15 columns):
marketplace          object
customer_id          int64
review_id            object
product_id           object
product_parent       int64
product_title        object
product_category     object
star_rating          object
helpful_votes        float64
total_votes          float64
vine                 object
verified_purchase    object
review_headline      object
review_body          object
review_date          object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.1+ GB


In [9]:
books.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [10]:
books.isnull().sum()

marketplace             0
customer_id             0
review_id               0
product_id              0
product_parent          0
product_title           0
product_category        0
star_rating            60
helpful_votes          93
total_votes            93
vine                   93
verified_purchase      93
review_headline       163
review_body           287
review_date          1128
dtype: int64

In [11]:
books = books.dropna()

In [12]:
books.shape

(10235459, 15)

In [13]:
ratings = books['star_rating'].unique().tolist() #non uniform datatype
ratings

['5', '2', '4', '3', '1', 5, 1, 3, 4, 2]

In [14]:
books['star_rating'] = books['star_rating'].astype(int)

In [15]:
ratings = books['star_rating'].unique().tolist() #uniform datatype
ratings

[5, 2, 4, 3, 1]

## Creating Sentiments

In [16]:
def partition(x):
    """Map a numeric star rating to a sentiment label.

    Ratings above 3 are 'Positive', ratings below 3 are 'Negative',
    and anything else (i.e. exactly 3) is 'Neutral'.
    """
    if x > 3:
        return 'Positive'
    if x < 3:
        return 'Negative'
    return 'Neutral'

# Replace the numeric star_rating column with its sentiment label
# ('Negative' / 'Neutral' / 'Positive') produced by partition().
# NOTE: star_rating is overwritten in place, so this cell is not re-runnable:
# on a second run partition() would receive the label strings instead of numbers.
actualScore = books['star_rating']
positiveNegative = actualScore.map(partition)
books['star_rating'] = positiveNegative

In [17]:
books.head(10)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,25933450,RJOVP071AVAJO,0439873800,84656342,There Was an Old Lady Who Swallowed a Shell!,Books,Positive,0.0,0.0,N,Y,Five Stars,I love it and so does my students!,2015-08-31
1,US,1801372,R1ORGBETCDW3AI,1623953553,729938122,I Saw a Friend,Books,Positive,0.0,0.0,N,Y,"Please buy ""I Saw a Friend""! Your children wil...",My wife and I ordered 2 books and gave them as...,2015-08-31
2,US,5782091,R7TNRFQAOUTX5,142151981X,678139048,"Black Lagoon, Vol. 6",Books,Positive,0.0,0.0,N,Y,Shipped fast.,Great book just like all the others in the ser...,2015-08-31
3,US,32715830,R2GANXKDIFZ6OI,014241543X,712432151,If I Stay,Books,Positive,0.0,0.0,N,N,Five Stars,So beautiful,2015-08-31
4,US,14005703,R2NYB6C3R8LVN6,1604600527,800572372,Stars 'N Strips Forever,Books,Positive,2.0,2.0,N,Y,Five Stars,Enjoyed the author's story and his quilts are ...,2015-08-31
5,US,36205738,R13U5PBJI1H94K,0399170863,559876774,The Liar,Books,Negative,1.0,1.0,N,N,PREDICTABLE ALMOST FROM PAGE 1,Two or three pages into the book I suspected h...,2015-08-31
6,US,44121167,R1H8UVH990F8VE,1517007240,299984591,Devil in the Details (Book 2: The Monastery Mu...,Books,Positive,2.0,2.0,N,N,The Monastery Murders - Book 2: Devil in the D...,"&#34;Secrets in the Shallows,&#34; Book 1 of &...",2015-08-31
7,US,16519255,R2MC0N30WZMRQ5,0671728725,821650353,Knowing When to Stop: A Memoir,Books,Positive,0.0,0.0,N,Y,Five Stars,I love it!,2015-08-31
8,US,49361350,R2NO2HXK16Y4J,1111349533,419457767,The American Pageant,Books,Positive,0.0,0.0,N,Y,Five Stars,It was a great purchase.,2015-08-31
9,US,11423253,R245YIAVJK82ZL,812211637X,285887177,Punjabi C.L. Bible / Common Language Version,Books,Positive,0.0,0.0,N,Y,Five Stars,Quality product fast shipping.,2015-08-31


## EDA With spaCy

In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
nlp = en_core_web_sm.load()

In [19]:
stopwords = list(STOP_WORDS) 

In [20]:
import string
punctuations = string.punctuation

In [21]:
from spacy.lang.en import English
parser = English()

In [22]:
def spacy_tokenizer(sentence):
    """Tokenize a sentence with the module-level spaCy `parser`.

    Each token is reduced to its lower-cased lemma; tokens whose lemma is the
    "-PRON-" placeholder keep their lower-cased surface form instead.
    Tokens found in `stopwords` or in `punctuations` are discarded.

    Returns a list of strings.
    """
    normalized = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            # The pronoun placeholder carries no information; keep the actual word.
            normalized.append(token.lower_)
        else:
            normalized.append(token.lemma_.lower())
    return [term for term in normalized
            if not (term in stopwords or term in punctuations)]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [24]:
#Custom Transformer using spaCy
class predictors(TransformerMixin):
    """Stateless pipeline step that normalises raw text with clean_text()."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Apply clean_text() to every item of X and return the results as a list."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this step exposes no parameters."""
        return dict()
    
# Basic function to clean the text
def clean_text(text):
    """Return `text` with surrounding whitespace removed and all characters lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [25]:
tfdifVect = TfidfVectorizer(tokenizer = spacy_tokenizer)

##  Visualization with spaCy

In [10]:
from spacy import displacy

### Breaking down a review into dependencies

In [32]:
# Render the dependency parse of one review.
# Select the review by position: `books` was filtered with dropna() without
# resetting the index, so a label lookup (`[8]`) is not guaranteed to find a row.
doc = nlp(books['review_body'].iloc[8])
displacy.render(doc,style='dep',jupyter=True)

## MACHINE LEARNING ALGORITHMS

In [250]:
data = books.head(100000)

In [251]:
#X = data.drop('star_rating',axis=1)
X = data['review_body']
y = data['star_rating']

In [252]:
X.shape[0]

100000

In [253]:
y.shape[0]

100000

In [27]:
from sklearn.model_selection import train_test_split

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [255]:
X_train

75232    My daughter loves these books!  We buy them al...
48963    Jeff Shavitz’s book, “Size Doesn’t Matter: Why...
44972                    old book, old info, in bad shape.
13572                                            very good
92740                                            thank you
                               ...                        
6267                                    abc in a cute way.
54895    Cute idea of a story but the way the sentences...
76832    Susan May Warren in her new book “Always on My...
860      Wonderful, uplifting books for self and for gi...
15800    Excellent thesis on Senator Warren's ideas and...
Name: review_body, Length: 80000, dtype: object

In [256]:
y_train

75232    Positive
48963    Positive
44972    Negative
13572    Positive
92740    Positive
           ...   
6267     Positive
54895    Negative
76832    Positive
860      Positive
15800    Positive
Name: star_rating, Length: 80000, dtype: object

### Support Vector Machines

In [47]:
from sklearn.svm import LinearSVC
classifier = LinearSVC()

In [48]:
pipe = Pipeline([('cleaner', predictors()),
                 ('tfidfVect', tfdifVect),
                ('classifier',classifier),])

In [49]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x1273df550>),
                ('tfidfVect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x138358290>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight

In [50]:
sample_prediction = pipe.predict(X_test)

In [51]:
# Show the first ten test reviews next to the label the pipeline predicted for them.
# enumerate(..., start=1) supplies the 1-based counter that was previously kept by hand.
for count, (sample, pred) in enumerate(zip(X_test[0:10], sample_prediction[0:10]), start=1):
    print('\n',count,')')
    print(sample,"'PREDICTION':", pred)


 1 )
Here I am.  Now I am here.  I am.  Yes.  Enjoy being. 'PREDICTION': Positive

 2 )
Great book, simply love it! It was a pleasure coloring, made me feel so much more relaxed. I highly recommend it and can't wait for the next one to be published. 'PREDICTION': Positive

 3 )
Phenomenal read for anyone in transition! This book daily takes you through spiritual disciplines that help you stay focused on the fullest life in Christ and how to handle the post grad experience.  So thankful for Mrs. Robin and her words of encouragement.  Would recommend to the closest of friends! 'PREDICTION': Positive

 4 )
very abstract... did nothing for me... waste of money do not buy!!!! no trading advise what so ever! 'PREDICTION': Negative

 5 )
My review is strictly for The Hobbit & The Lord of the Rings Deluxe Pocket set.<br /><br />I love this set of books. Before I purchased, I took some time to read reviews on Amazon. Most reviewers mentioned that the font size was small. I have no issue with b

In [52]:
#Test Accuracy
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.89605


In [53]:
pipe.predict(["I love this book"])

array(['Positive'], dtype=object)

In [54]:
pipe.predict(["Waste of Money!"])

array(['Negative'], dtype=object)

In [55]:
pipe.predict(["Fake! got stones instead!"])

array(['Negative'], dtype=object)

### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
classifier =  LogisticRegression()

In [68]:
pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier',classifier)])

In [69]:
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x17c67dd90>),
                ('tfidfVect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x138358290>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                         

In [60]:
sample_prediction = pipe.predict(X_test)

In [70]:
#Test Accuracy
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.89605


### Decision Trees

In [71]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
classifier = DecisionTreeClassifier()

In [73]:
pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier',classifier)])

In [74]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x17a050fd0>),
                ('tfidfVect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                 tokenizer=<function spacy_tokenizer at 0x138358290>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
               

In [76]:
sample_prediction = pipe.predict(X_test)

In [75]:
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.84955


## ENSEMBLE ALGORITHMS

In [196]:
from sklearn import model_selection

In [197]:
seed = 7

### Bagging

In [245]:
from sklearn.ensemble import BaggingClassifier

In [246]:
model = BaggingClassifier(n_estimators=100, random_state=seed)

In [247]:
pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier', model)])

In [248]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x2a3f03d10>),
                ('tfidfVect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x126c0db00>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 BaggingClassifier(base_estima

In [249]:
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.877


### Boosting

In [220]:
from sklearn.ensemble import AdaBoostClassifier

In [221]:
model = AdaBoostClassifier(n_estimators=100, random_state=seed)

In [222]:
pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier', model)])

In [223]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x2d90bcb50>),
                ('tfidfVect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x126c0db00>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 AdaBoost

In [224]:
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.872


## Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42)

In [87]:
pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier',classifier)])

In [89]:
pipe.fit(X_train,y_train)

In [68]:
sample_prediction = pipe.predict(X_test)

In [86]:
print("Accuracy:", pipe.score(X_test,y_test))

Accuracy: 0.88475


## Generating Batch-Wise Predictions for our Big Data (1+ GB)

We will use **LOGISTIC REGRESSION** for each batch, since it achieved the highest test accuracy among the models above (0.89605, tied with the linear SVM).

In [64]:
import numpy as np
data_list = np.array_split(books, 5)

In [65]:
data_list[0]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,25933450,RJOVP071AVAJO,0439873800,84656342,There Was an Old Lady Who Swallowed a Shell!,Books,Positive,0.0,0.0,N,Y,Five Stars,I love it and so does my students!,2015-08-31
1,US,1801372,R1ORGBETCDW3AI,1623953553,729938122,I Saw a Friend,Books,Positive,0.0,0.0,N,Y,"Please buy ""I Saw a Friend""! Your children wil...",My wife and I ordered 2 books and gave them as...,2015-08-31
2,US,5782091,R7TNRFQAOUTX5,142151981X,678139048,"Black Lagoon, Vol. 6",Books,Positive,0.0,0.0,N,Y,Shipped fast.,Great book just like all the others in the ser...,2015-08-31
3,US,32715830,R2GANXKDIFZ6OI,014241543X,712432151,If I Stay,Books,Positive,0.0,0.0,N,N,Five Stars,So beautiful,2015-08-31
4,US,14005703,R2NYB6C3R8LVN6,1604600527,800572372,Stars 'N Strips Forever,Books,Positive,2.0,2.0,N,Y,Five Stars,Enjoyed the author's story and his quilts are ...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047383,US,35585422,R39HG02Y8V7LWE,0310435307,260370434,NIV Giant Print Compact Bible,Books,Positive,0.0,1.0,N,Y,Excellent.,Perfect print size and easy to handle. Too bad...,2015-02-24
2047384,US,8699723,R2X6DNUCOZV015,0979278031,313033861,Jetty Man,Books,Positive,0.0,0.0,N,Y,Five Stars,Great series of books based in our area,2015-02-24
2047385,US,48359513,R3PZ4X28BR5289,1596435828,513092252,Giants Beware! (The Chronicles of Claudette),Books,Positive,1.0,1.0,N,N,Four Stars,Amazing book- great story with incredible illu...,2015-02-24
2047386,US,7032112,R198Y7OVTU9IQ4,0764143573,455553794,Barron's Law Dictionary: Mass Market Edition (...,Books,Positive,0.0,0.0,N,Y,Five Stars,Great,2015-02-24


In [66]:
import pickle

In [67]:
# Train, persist and evaluate one logistic-regression pipeline per batch.
#   sample_predictions_list_final: test-set predictions of each batch, in batch order
#   loaded_models_final:           the fitted pipelines, as reloaded from disk
sample_predictions_list_final = []
loaded_models_final = []
for count, batch in enumerate(data_list, start=1):
    X = batch['review_body']
    y = batch['star_rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)
    classifier = LogisticRegression(multi_class='auto')
    pipe = Pipeline([('cleaner', predictors()),
                ('tfidfVect', tfdifVect),
                ('classifier', classifier)])
    pipe.fit(X_train,y_train)

    # Persist the fitted pipeline and read it back. The `with` blocks close the
    # file handles, which the previous bare open() calls never did.
    filename = str(count) + 'sentimental_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(pipe, model_file)
    with open(filename, 'rb') as model_file:
        # Loading a file written a moment ago by this notebook; do not reuse this
        # pattern for pickles from untrusted sources.
        loaded_model = pickle.load(model_file)

    # Keep the reloaded copy: `pipe` references the shared `tfdifVect` object, which
    # is fitted again on the next iteration, while the unpickled model is independent.
    loaded_models_final.append(loaded_model)
    sample_predictions = loaded_model.predict(X_test)
    sample_predictions_list_final.append(sample_predictions)
    print(count)
    print("Accuracy for the Batch:", pipe.score(X_test,y_test))



1
Accuracy for the Batch: 0.9007813511341681




2
Accuracy for the Batch: 0.902984473119225




3
Accuracy for the Batch: 0.8923327935440222




4
Accuracy for the Batch: 0.8882807099817058




5
Accuracy for the Batch: 0.8849491596628393


In [71]:
loaded_models_final #All 5 Models for all 5 batches

[Pipeline(memory=None,
          steps=[('cleaner', <__main__.predictors object at 0x12e0fbcd0>),
                 ('tfidfVect',
                  TfidfVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.float64'>,
                                  encoding='utf-8', input='content',
                                  lowercase=True, max_df=1.0, max_features=None,
                                  min_df=1, ngram_range=(1, 1), norm='l2',
                                  preprocessor=None, smooth_idf=True,
                                  stop_words=N...
                                  token_pattern='(?u)\\b\\w\\w+\\b',
                                  tokenizer=<function spacy_tokenizer at 0x126c0db00>,
                                  use_idf=True, vocabulary=None)),
                 ('classifier',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
         

In [72]:
len(sample_predictions_list_final)

5

In [73]:
# One column per batch ('Batch1', 'Batch2', ...) holding that batch's test-set predictions.
# Wrapping each array in a Series lets batches of different length coexist (shorter
# columns are padded with NaN) instead of raising, and the comprehension covers
# however many batches were produced rather than exactly five.
Predictions_DataFrame = pd.DataFrame({
    'Batch' + str(batch_no): pd.Series(batch_predictions)
    for batch_no, batch_predictions in enumerate(sample_predictions_list_final, start=1)
})

In [74]:
Predictions_DataFrame.to_csv('Predictions_DataFrame') # Ratings match the size of X_test
Predictions_DataFrame

Unnamed: 0,Batch1,Batch2,Batch3,Batch4,Batch5
0,Positive,Positive,Neutral,Positive,Positive
1,Positive,Positive,Negative,Positive,Positive
2,Positive,Positive,Positive,Positive,Positive
3,Positive,Positive,Positive,Positive,Positive
4,Positive,Positive,Positive,Positive,Positive
...,...,...,...,...,...
409414,Positive,Positive,Positive,Positive,Positive
409415,Positive,Positive,Positive,Positive,Positive
409416,Positive,Positive,Positive,Positive,Positive
409417,Positive,Positive,Positive,Positive,Positive


In [77]:
# List the distinct predicted labels in Batch1, to check a batch that seems
# to have only 1 unique value in the truncated preview.
ratings = Predictions_DataFrame['Batch1'].unique().tolist()
ratings

['Positive', 'Negative', 'Neutral']

In [78]:
# List the distinct predicted labels in Batch4, to check a batch that seems
# to have only 1 unique value in the truncated preview.
ratings = Predictions_DataFrame['Batch4'].unique().tolist()
ratings

['Positive', 'Negative', 'Neutral']