# Sentiment Analysis of Amazon Product Reviews

## Import libraries and download data

In [1]:
import os
import json
import gzip
from urllib.request import urlopen
import numpy as np
import pandas as pd
import random
import string
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk import classify
from nltk import NaiveBayesClassifier
import string
import re
import itertools

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Kindle_Store_5.json.gz

--2021-08-19 22:13:08--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Kindle_Store_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 538673481 (514M) [application/octet-stream]
Saving to: ‘Kindle_Store_5.json.gz.1’


2021-08-19 22:13:33 (20.4 MB/s) - ‘Kindle_Store_5.json.gz.1’ saved [538673481/538673481]



In [3]:
# load the data
data = []
with gzip.open('Kindle_Store_5.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
# total length of list, this number equals total number of products
print(len(data))
# first row of the list
print(data[0])

2222983
{'overall': 4.0, 'verified': True, 'reviewTime': '07 3, 2014', 'reviewerID': 'A2LSKD2H9U8N0J', 'asin': 'B000FA5KK0', 'style': {'Format:': ' Kindle Edition'}, 'reviewerName': 'sandra sue marsolek', 'reviewText': 'pretty good story, a little exaggerated, but I liked it pretty well.  liked the characters, the plot..it had mystery, action, love, all of the main things. I think most western lovers would injoy this book', 'summary': 'pretty good story', 'unixReviewTime': 1404345600}


##Preprocess data

In [4]:
# convert list into pandas dataframe
df = pd.DataFrame.from_dict(data)
print(len(df))

2222983


In [5]:
# save df to drive
df.to_csv('/content/drive/MyDrive/Colab Notebooks/res_df.csv')

In [7]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"07 3, 2014",A2LSKD2H9U8N0J,B000FA5KK0,{'Format:': ' Kindle Edition'},sandra sue marsolek,"pretty good story, a little exaggerated, but I...",pretty good story,1404345600,,
1,5.0,True,"05 26, 2014",A2QP13XTJND1QS,B000FA5KK0,{'Format:': ' Kindle Edition'},Tpl,"If you've read other max brand westerns, you k...",A very good book,1401062400,,
2,5.0,True,"09 16, 2016",A8WQ7MAG3HFOZ,B000FA5KK0,{'Format:': ' Kindle Edition'},Alverne F. Anderson,"Love Max, always a fun twist",Five Stars,1473984000,,
3,5.0,True,"03 3, 2016",A1E0MODSRYP7O,B000FA5KK0,{'Format:': ' Kindle Edition'},Jeff,"As usual for him, a good book",a good,1456963200,,
4,5.0,True,"09 10, 2015",AYUTCGVSM1H7T,B000FA5KK0,{'Format:': ' Kindle Edition'},DEHS - EddyRapcon,MB is one of the original western writers and ...,A Western,1441843200,2.0,


In [5]:
# create a new df with only necessary columns
new_df = df[['overall','reviewText']].copy()
new_df.head()

Unnamed: 0,overall,reviewText
0,4.0,"pretty good story, a little exaggerated, but I..."
1,5.0,"If you've read other max brand westerns, you k..."
2,5.0,"Love Max, always a fun twist"
3,5.0,"As usual for him, a good book"
4,5.0,MB is one of the original western writers and ...


In [6]:
# split df into positive and negative reviews based on the overall star rating. 
#omitting 3 stars as they could be either + or -
pos_reviews = new_df.loc[new_df['overall'] > 3]
neg_reviews = new_df.loc[new_df['overall'] < 3]

In [7]:
# add column to label review type and combine df back to one 
pos_reviews.insert(2, 'reaction', 'positive')
neg_reviews.insert(2, 'reaction', 'negative')

all_reviews = pd.concat([pos_reviews, neg_reviews], ignore_index=True)
all_reviews.head()

Unnamed: 0,overall,reviewText,reaction
0,4.0,"pretty good story, a little exaggerated, but I...",positive
1,5.0,"If you've read other max brand westerns, you k...",positive
2,5.0,"Love Max, always a fun twist",positive
3,5.0,"As usual for him, a good book",positive
4,5.0,MB is one of the original western writers and ...,positive


In [8]:
# remove reviews that do not contain text
all_reviews['reviewText'].isnull().sum()
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2024661 entries, 0 to 2025033
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   overall     float64
 1   reviewText  object 
 2   reaction    object 
dtypes: float64(1), object(2)
memory usage: 61.8+ MB


In [None]:
# send word arrays to a txt file, save in drive (for use in text gen)
wordsTxt = sample_df["review.words"].values.flatten()
wordsArr = itertools.chain(wordsTxt)
temp=list(wordsArr)
arrOut=np.concatenate(temp, axis=None)
np.savetxt('/content/drive/MyDrive/Colab Notebooks/KindlePreprocessed.txt', arrOut, delimiter=None, fmt="%s",encoding='utf8')

In [9]:
# create a sample df with equal number of + and - reviews 
sample_df = reviews.groupby('reaction').apply(lambda x: x.sample(n=10000)).reset_index(drop = True)
sample_df['reaction'].value_counts()

positive    10000
negative    10000
Name: reaction, dtype: int64

In [10]:
# create lists of + and - reviews from the sample df
pos_df = sample_df.loc[sample_df['reaction'] == 'positive']
pos_list = pos_df['reviewText'].tolist()

neg_df = sample_df.loc[sample_df['reaction'] == 'negative']
neg_list = neg_df['reviewText'].tolist()

In [11]:
# clean the lists (make all lowercase, convert to strings, and remove punctuation)
pos_list_lowered = [word.lower() for word in pos_list]
neg_list_lowered = [word.lower() for word in neg_list]

pos_list_to_string = ' '.join([str(elem) for elem in pos_list_lowered]) 
neg_list_to_string = ' '.join([str(elem) for elem in neg_list_lowered])

nltk.download('stopwords')

stop = set(stopwords.words('english') + list(string.punctuation))

tokenizer = WhitespaceTokenizer()

filtered_pos_list1 = [w for w in tokenizer.tokenize(pos_list_to_string) if w not in stop]
filtered_pos_list = [w.strip(string.punctuation) for w in filtered_pos_list1]

filtered_neg_list1 = [w for w in tokenizer.tokenize(neg_list_to_string) if w not in stop]
filtered_neg_list = [w.strip(string.punctuation) for w in filtered_neg_list1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# view the frequency distribution of the most common + and - words
freq_dist_pos = nltk.FreqDist(filtered_pos_list)
freq_dist_neg = nltk.FreqDist(filtered_neg_list)

print(freq_dist_pos.most_common(15))
print(freq_dist_neg.most_common(15))

[('book', 10369), ('story', 6557), ('read', 6177), ('love', 4814), ('one', 4084), ('series', 3679), ('characters', 3046), ('great', 2870), ('good', 2842), ('like', 2621), ('really', 2616), ('books', 2308), ('loved', 2223), ('well', 2179), ('get', 2101)]
[('book', 11325), ('story', 6370), ('like', 4728), ('read', 4408), ('one', 4286), ('would', 3225), ('really', 2968), ('author', 2812), ('characters', 2740), ('get', 2585), ('it', 2507), ('good', 2408), ('time', 2349), ('even', 2301), ('books', 2285)]


In [13]:
# convert the sets of words to feature sets
def word_features(words):
    return dict([(word, True) for word in words.split()])

##Train model

In [14]:
# combine into one set for training and testing
positive_features = [(word_features(f), 'pos') for f in filtered_pos_list]
negative_features = [(word_features(f), 'neg') for f in filtered_neg_list]
labeledwords = positive_features + negative_features

In [15]:
# shuffle the words and train and test the model
random.shuffle(labeledwords)
train_set, test_set = labeledwords[2000:], labeledwords[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

##Evaluate model

In [16]:
# test the model w examples
print(classifier.classify(word_features('I loved this book, the story was great')))
print(classifier.classify(word_features('I did not like this book, the characters were bland')))

pos
neg


In [17]:
# calculate the accuracy of the model
print(nltk.classify.accuracy(classifier, test_set))

0.638


In [18]:
# show the model's most informative features from the set
classifier.show_most_informative_features(15)

Most Informative Features
                    blah = True              neg : pos    =     82.1 : 1.0
                  poorly = True              neg : pos    =     57.3 : 1.0
                 deleted = True              neg : pos    =     43.3 : 1.0
            heartwarming = True              pos : neg    =     36.1 : 1.0
                skimming = True              neg : pos    =     36.0 : 1.0
               unlikable = True              neg : pos    =     35.2 : 1.0
                   waste = True              neg : pos    =     33.5 : 1.0
              disjointed = True              neg : pos    =     33.2 : 1.0
                 useless = True              neg : pos    =     32.5 : 1.0
                juvenile = True              neg : pos    =     32.3 : 1.0
                   bland = True              neg : pos    =     31.9 : 1.0
               contrived = True              neg : pos    =     29.8 : 1.0
            inconsistent = True              neg : pos    =     28.5 : 1.0

## Classify generated product reviews

In [35]:
# obtain the generated reviews and convert to array
path_to_file = '/content/drive/MyDrive/Colab Notebooks/edited_words.txt'
example = open(path_to_file, 'rb').read().decode(encoding='utf-8')
example = example.replace('\n', ' ')
ex_str = str(example)
ex_arr = ex_str.split("\"")

In [36]:
# loop through reviews array and input each into classifier
n = 1
for i in ex_arr:
  print("Review ", n, ":", classifier.classify(word_features(i)))
  n = n+1

Review  1 : pos
Review  2 : pos
Review  3 : neg
Review  4 : neg
Review  5 : neg
Review  6 : pos
Review  7 : neg
Review  8 : neg
Review  9 : neg
Review  10 : pos
