## The following python program is using TF-IDF text representations and performing text classication for sentiment analysis on Amazon Review data.

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import bs4
import contractions
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shravanvasista/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shravanvasista/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shravanvasista/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [3]:
#data = pd.read_csv('amazon_reviews_us_Kitchen_v1_00.tsv', sep='\t', usecols=['star_rating','review_body'])
df = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz', compression='gzip', sep='\t', warn_bad_lines=False, error_bad_lines=False)
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5.0,0.0,0.0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5.0,0.0,1.0,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5.0,0.0,0.0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5.0,0.0,1.0,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5.0,0.0,0.0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31


## Keep Reviews and Ratings

In [4]:
df = df.dropna(axis = 0)
df = df[['star_rating','review_body']]
print(df.head())

   star_rating                                        review_body
0          5.0                Beautiful.  Looks great on counter.
1          5.0  I personally have 5 days sets and have also bo...
2          5.0  Fabulous and worth every penny. Used for clean...
3          5.0  A must if you love garlic on tomato marinara s...
4          5.0  Worth every penny! Buy one now and be a pizza ...


In [5]:
print(df.shape)

(4874562, 2)


# Labelling Reviews:
## The reviews with rating 4,5 are labelled to be 1 and 1,2 are labelled as 0. Discard the reviews with rating 3'

In [6]:
print(df.star_rating.value_counts())

5.0    3124553
4.0     731693
1.0     426852
3.0     349533
2.0     241931
Name: star_rating, dtype: int64


In [7]:
pos_label = df[df["star_rating"].isin([4,5])]
pos_label["label"]=1
print(pos_label.head(3),"\n",pos_label.shape)

   star_rating                                        review_body  label
0          5.0                Beautiful.  Looks great on counter.      1
1          5.0  I personally have 5 days sets and have also bo...      1
2          5.0  Fabulous and worth every penny. Used for clean...      1 
 (3856246, 3)


In [8]:
neutral_label = df[df["star_rating"].isin([3])]
print(neutral_label.head(3),"\n",neutral_label.shape)

    star_rating                                        review_body
9           3.0  Should have come with a kit to install drain t...
28          3.0  Was ok....case was good for price...arrived on...
34          3.0  Not equal to the brand, soft and thin, bust wh... 
 (349533, 2)


In [9]:
neg_label = df[df["star_rating"].isin([1,2])]
neg_label["label"]=0
print(neg_label.head(3),"\n",neg_label.shape)

    star_rating                                        review_body  label
5           1.0  The description says &#34;Suitable for all typ...      0
24          1.0                    I hate it I cook in regular pot      0
25          2.0  The velcro does not hold well.  Does stat cold...      0 
 (668783, 3)


 ## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.



In [10]:
rnd_pos_labels = pos_label.sample(n=100000)
rnd_neg_labels = neg_label.sample(n=100000)

In [11]:
exp_data = pd.concat([rnd_pos_labels,rnd_neg_labels]).reset_index(drop=True)
print(exp_data.head(10),"\n",exp_data.tail(10))

   star_rating                                        review_body  label
0          5.0  The price was perfect, its and solid and there...      1
1          5.0  Our toaster oven pan was worn out. So we purch...      1
2          5.0  These coffee urns are a great buy. Well worth ...      1
3          5.0  Beautiful craftsmanship. Unbeliveable detail i...      1
4          5.0  I bought this mug for my wife for Xmas in 2012...      1
5          5.0  great treat jar, have several in various sizes...      1
6          5.0  This has been my go-to vegetable peeler for ne...      1
7          5.0  great product.  Second time to purchase.  gave...      1
8          5.0  These are great like all the reviews state. Gr...      1
9          5.0  great product! super fancy! well made!! bought...      1 
         star_rating                                        review_body  label
199990          1.0  Crap.  Not even good enough for a kids Hallowe...      0
199991          1.0  I purchased this t

# Data Cleaning

## Convert the all reviews into the lower case.

In [12]:
## Stats Data Frame stores the count of characters after each pre-processing task
stats = pd.DataFrame()
stats['count_before_clean'] = exp_data['review_body'].str.len()
print(stats.head(),"\n",stats['count_before_clean'].sum(),"\n",stats['count_before_clean'].mean())

   count_before_clean
0                 116
1                 125
2                 143
3                 144
4                 372 
 64614217 
 323.071085


In [13]:
exp_data['review_body'] = exp_data['review_body'].str.lower()
print(exp_data.head())

   star_rating                                        review_body  label
0          5.0  the price was perfect, its and solid and there...      1
1          5.0  our toaster oven pan was worn out. so we purch...      1
2          5.0  these coffee urns are a great buy. well worth ...      1
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1
4          5.0  i bought this mug for my wife for xmas in 2012...      1


## Remove the HTML and URLs from the reviews

In [14]:
exp_data['after_url_clean'] = exp_data['review_body'].apply(lambda x: bs4.BeautifulSoup(x, 'lxml').get_text())
exp_data['after_url_clean'] = exp_data['after_url_clean'].apply(lambda x: re.sub(r'http\S+', '', x))
stats['count_after_url_clean'] = exp_data['after_url_clean'].str.len()
print(exp_data.head(),"\n",stats['count_after_url_clean'].sum(),"\n",stats['count_after_url_clean'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  
0  the price was perfect, its and solid and there...  
1  our toaster oven pan was worn out. so we purch...  
2  these coffee urns are a great buy. well worth ...  
3  beautiful craftsmanship. unbeliveable detail i...  
4  i bought this mug for my wife for xmas in 2012...   
 63855211 
 319.276055


## Strip Whitespaces

In [15]:
exp_data['after_space_clean'] = exp_data['after_url_clean'].str.strip()
stats['count_after_spaces_clean'] = exp_data['after_space_clean'].str.len()
print(exp_data.head(),"\n",stats['count_after_spaces_clean'].sum(),"\n",stats['count_after_spaces_clean'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  
0  the price was perfect, its and solid and there...  
1  our toaster oven pan was worn out. so we purch...  
2  these coffee urns are a great buy. wel

## Perform contractions on the reviews.

In [16]:
exp_data['after_contraction_fix'] = exp_data['after_space_clean'].apply(lambda x: [contractions.fix(word) for word in x.split()])
exp_data['after_contraction_fix'] = [' '.join(map(str, l)) for l in exp_data['after_contraction_fix']]
stats['count_after_expanding_contractions'] = exp_data['after_contraction_fix'].str.len()
print(exp_data.head(),"\n",stats['count_after_expanding_contractions'].sum(),"\n",stats['count_after_expanding_contractions'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. 

## Remove non-alphabetical characters

In [17]:
exp_data['after_nonalpha_clean'] = exp_data.after_contraction_fix.str.replace(r'[^a-zA-Z]\s?',r' ',regex=True)
stats['count_after_nonalpha_clean'] = exp_data['after_nonalpha_clean'].str.len()
print(exp_data.head(),"\n",stats['count_after_nonalpha_clean'].sum(),"\n",stats['count_after_nonalpha_clean'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. 

## Remove the extra spaces between the words

In [18]:
exp_data['after_nonalpha_clean'] = exp_data['after_nonalpha_clean'].str.strip()
stats['count_after_nonalpha_clean'] = exp_data['after_nonalpha_clean'].str.len()
print(exp_data.head(),"\n",stats['count_after_nonalpha_clean'].sum(),"\n",stats['count_after_nonalpha_clean'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. 

# Pre-processing

## remove the stop words 

In [19]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
exp_data['after_stopwords_removal'] = exp_data['after_nonalpha_clean'].apply(lambda x: ' '.join([x for x in x.split() if x not in stop]))
stats['count_after_removing_stopwords'] = exp_data['after_stopwords_removal'].str.len()
print(exp_data.head(),"\n",stats['count_after_removing_stopwords'].sum(),"\n",stats['count_after_removing_stopwords'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. 

## Perform Lemmatization  

In [20]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
exp_data['after_lemmatization'] =  exp_data['after_stopwords_removal'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
stats['count_after_lemmatization'] = exp_data['after_lemmatization'].str.len()
print(exp_data.head(),"\n",stats['count_after_lemmatization'].sum(),"\n",stats['count_after_lemmatization'].mean())

   star_rating                                        review_body  label  \
0          5.0  the price was perfect, its and solid and there...      1   
1          5.0  our toaster oven pan was worn out. so we purch...      1   
2          5.0  these coffee urns are a great buy. well worth ...      1   
3          5.0  beautiful craftsmanship. unbeliveable detail i...      1   
4          5.0  i bought this mug for my wife for xmas in 2012...      1   

                                     after_url_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. well worth ...   
3  beautiful craftsmanship. unbeliveable detail i...   
4  i bought this mug for my wife for xmas in 2012...   

                                   after_space_clean  \
0  the price was perfect, its and solid and there...   
1  our toaster oven pan was worn out. so we purch...   
2  these coffee urns are a great buy. 

# TF-IDF Feature Extraction

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(exp_data['after_lemmatization'], exp_data['label'], test_size=0.2, random_state=30)
print(X_train.shape,Y_train.shape,"\n",X_test.shape,Y_test.shape)

(160000,) (160000,) 
 (40000,) (40000,)


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)
print(tf_x_train,"\n",tf_x_test)

  (0, 37185)	0.16412963075389586
  (0, 11707)	0.21129962797950347
  (0, 31537)	0.1069937751028401
  (0, 6901)	0.49596453999485274
  (0, 26365)	0.1231476144817881
  (0, 41777)	0.4086859246376329
  (0, 17102)	0.18208612005233038
  (0, 10054)	0.15596926375422737
  (0, 29264)	0.2572252268631152
  (0, 3121)	0.22940357339763728
  (0, 35473)	0.12475602403727826
  (0, 36276)	0.15348870599383874
  (0, 26970)	0.13599310668959044
  (0, 159)	0.23081670639911528
  (0, 4468)	0.2555402405836904
  (0, 5289)	0.31713255034884197
  (0, 31814)	0.18909294298846485
  (1, 51829)	0.09250212934074889
  (1, 42428)	0.20390386576925912
  (1, 168)	0.32536864882549227
  (1, 51882)	0.23088744478475887
  (1, 34497)	0.2756534446598014
  (1, 45026)	0.35627056943154756
  (1, 48637)	0.3992269351206103
  (1, 8613)	0.17925205547070538
  :	:
  (159999, 51510)	0.11490017065546101
  (159999, 31356)	0.07434579855271252
  (159999, 7351)	0.07816338115084633
  (159999, 41351)	0.05908218726144503
  (159999, 21294)	0.06344057395828

# Perceptron

In [23]:
from sklearn.linear_model import Perceptron
perceptron = Perceptron(tol=1e-3, random_state=0)
perceptron.fit(tf_x_train,Y_train)
y_test_pred=perceptron.predict(tf_x_test)
y_train_pred=perceptron.predict(tf_x_train)
test_report=classification_report(Y_test,y_test_pred,output_dict=True)
train_report=classification_report(Y_train,y_train_pred,output_dict=True)

In [24]:
train_metrics = [train_report['accuracy'],train_report['1']['precision'],train_report['1']['recall'],train_report['1']['f1-score']]
print(*train_metrics,sep="\n")
test_metrics = [test_report['accuracy'],test_report['1']['precision'],test_report['1']['recall'],test_report['1']['f1-score']]
print(*test_metrics,sep="\n")

0.902575
0.9163555084691016
0.8861305113757043
0.9009895959044195
0.8567
0.8714949610986371
0.8361304543860528
0.8534465125792596


# SVM

In [25]:
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=0)
svm.fit(tf_x_train,Y_train)
y_test_pred=svm.predict(tf_x_test)
y_train_pred=svm.predict(tf_x_train)
test_report=classification_report(Y_test,y_test_pred,output_dict=True)
train_report=classification_report(Y_train,y_train_pred,output_dict=True)

In [26]:
train_metrics = [train_report['accuracy'],train_report['1']['precision'],train_report['1']['recall'],train_report['1']['f1-score']]
print(*train_metrics,sep="\n")
test_metrics = [test_report['accuracy'],test_report['1']['precision'],test_report['1']['recall'],test_report['1']['f1-score']]
print(*test_metrics,sep="\n")

0.93375
0.9344553588187449
0.9330076587663514
0.9337309476474486
0.896075
0.9004662477194405
0.8901357647412454
0.8952712065099638


# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000,solver='saga')
lr.fit(tf_x_train,Y_train)
y_test_pred=lr.predict(tf_x_test)
y_train_pred=lr.predict(tf_x_train)
test_report=classification_report(Y_test,y_test_pred,output_dict=True)
train_report=classification_report(Y_train,y_train_pred,output_dict=True)

In [28]:
train_metrics = [train_report['accuracy'],train_report['1']['precision'],train_report['1']['recall'],train_report['1']['f1-score']]
print(*train_metrics,sep="\n")
test_metrics = [test_report['accuracy'],test_report['1']['precision'],test_report['1']['recall'],test_report['1']['f1-score']]
print(*test_metrics,sep="\n")

0.91381875
0.916782002566748
0.910356201351841
0.9135578026166491
0.899225
0.9054673182651192
0.8910876208606783
0.8982199217270546


# Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB
mul_model = MultinomialNB()
mul_model.fit(tf_x_train,Y_train)
y_train_pred = mul_model.predict(tf_x_train)
y_test_pred = mul_model.predict(tf_x_test)
test_report=classification_report(Y_test, y_test_pred,output_dict=True)
train_report=classification_report(Y_train, y_train_pred,output_dict=True)

In [30]:
train_metrics = [train_report['accuracy'],train_report['1']['precision'],train_report['1']['recall'],train_report['1']['f1-score']]
print(*train_metrics,sep="\n")
test_metrics = [test_report['accuracy'],test_report['1']['precision'],test_report['1']['recall'],test_report['1']['f1-score']]
print(*test_metrics,sep="\n")\

0.8879375
0.8897451022226684
0.8857432001899074
0.8877396411174695
0.87015
0.8747398873268031
0.863433695706628
0.8690500201694231
