# Sentiment Analysis 
#### by Robby Jeffries
#### 02-24-2022

## Import Data

In [1]:
import os
import numpy as np
import pandas as pd

In [3]:
# set working directory
os.chdir('/Users/robbyjeffries/MSEACapstone/Data')

In [4]:
df = pd.read_csv('CSV_completed/Electronics_clean.csv', sep='\t')

In [5]:
df.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,sentiment,year
0,4.0,True,"08 12, 2014",70524076,LINDSAY BUENA,thought book fine use much thought exampl basi...,colleg physic,1,2014
1,4.0,True,"05 13, 2014",70524076,Hassan,return book two week ago know got messag say t...,return,1,2014
2,5.0,False,"02 23, 2014",70524076,Nena,book not perfect condit expect rent textbook t...,rent book expect,1,2014
3,3.0,True,"10 13, 2014",151004714,Sande,keep read messud know seem click charact write...,ok,1,2014
4,2.0,True,"10 2, 2014",151004714,MSY,sorri could not get rambl bore book time thoug...,disappoint,0,2014


***

# Import Metadata

In [6]:
df_meta = pd.read_csv('Metadata_completed/meta_Electronics_clean.csv', sep = '\t')

In [7]:
df_meta.head()

Unnamed: 0,asin,title,brand
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision
1,43396828,"Books ""Handbook of Astronomical Image Processi...",33 Books Co.
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page


***

# Merge Data and Metadata on asin

In [8]:
joined = pd.merge(df, df_meta, on='asin')
joined.shape

(3023754, 11)

In [10]:
joined.shape[0]-df.shape[0]

101323

In [11]:
joined.sample(10)

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,sentiment,year,title,brand
1107040,2.0,True,"12 27, 2014",B0071V3B0W,Lucretia Parker,constantli loos bluetooth earpiec hate someth ...,good bad ugli,0,2014,Azeca AZM04 Clip On Bluetooth Headset with Ret...,Azeca
2945579,5.0,True,"03 11, 2014",B00FJW12TM,Patricia Nugent,love took month receiv china orang instead pur...,good news bad news,1,2014,Semi-Purple Ultra Thin Silicone Gel Keyboard P...,Live2Pedal
1764824,5.0,True,"10 29, 2014",B00CP4J7OY,gagirl,great leappad love recharg pack must,five star,1,2014,"LeapFrog LeapPad2 Power Learning Tablet, Green",LeapFrog
748249,5.0,True,"07 29, 2014",B004G7D0EG,Kenneth Barnes,got exactli order perfect,awesom,1,2014,SanDisk 32GB MicroSDHC High Speed Class 4 Card...,SanDisk
2320875,3.0,False,"09 7, 2014",B00HQ883QW,hassan albaghli,unfortun work expect alway need restart work p...,unfortun,1,2014,NETGEAR AC1200 Wireless WiFi Range Extender (E...,NETGEAR
682005,5.0,True,"03 6, 2014",B003ZJ7ETI,Clarence Powers,cabl purchas work well whole experi process po...,purchas f optic digit cabl,1,2014,TOSLink Optical Digital Audio Cable SPDIF Dolb...,Cmple
151082,5.0,True,"08 8, 2014",B000219896,mcgman,work great travel daughter friend like watch m...,perfect trip,1,2014,Boostaroo Portable Amplifier - Clear,ThinkGeek
1976737,5.0,False,"06 11, 2014",B00ECQUY2M,icykyl,walkman e decid die one day year half honestli...,somewhat downgrad not decis,1,2014,Sony NWZE385 16 GB Walkman MP3 Video Player (B...,Sony
554112,4.0,True,"12 9, 2014",B0028N6VN2,Morghanna,usual linksi instal easili work well,good router,1,2014,Linksys Wireless-N PCI Adapter with Dual-Band ...,Linksys
2302280,5.0,True,"06 28, 2014",B00HI154M8,Bryan,advertis great product,great product,1,2014,"Loopilops Lightning Cable, Apple Certified MFI...",Loopilops


In [12]:
# total the number of NaN rows
nan_in_df = joined.isnull().sum().sum()
  
# printing the number of NaN values present in the whole dataframe
print('Number of NaN values present: ' + str(nan_in_df))

Number of NaN values present: 18303


In [13]:
# Count the number of NaN in each column
for row in joined.columns:
    nan_in_df = joined[row].isnull().sum().sum()
    print('NaN in ' + row + ': ' + str(nan_in_df))

NaN in overall: 0
NaN in verified: 0
NaN in reviewTime: 0
NaN in asin: 0
NaN in reviewerName: 0
NaN in reviewText: 3527
NaN in summary: 7204
NaN in sentiment: 0
NaN in year: 0
NaN in title: 12
NaN in brand: 7560


In [14]:
# drop NaN rows
joined = joined.dropna()

In [15]:
# total the number of NaN rows
nan_in_df = joined.isnull().sum().sum()
  
# printing the number of NaN values present in the whole dataframe
print('Number of NaN values present: ' + str(nan_in_df))

Number of NaN values present: 0


In [64]:
products = pd.read_csv('CSV_completed/product_match.csv', sep='\t')

In [65]:
products.head()

Unnamed: 0,sams_name,amazon_name,score
0,CG PORT CHARGER LRG,Charger,90
1,INCIPIO FEATHER,Incipio Feather Case for Microsoft Surface Pro...,90
2,LG LUCID,Black Micro Bluetooth Hands Free Heaphone For ...,90
3,SOUND RUSH,Panasonic RP-HXS200M-K Sound Rush On-Ear Headp...,90
4,DELL INSPIRON 11,Dell Inspiron 1150,94


In [66]:
products.shape

(31, 3)

## Filter joined dataset using matching products list

In [67]:
prod_match = joined[joined['title'].isin(products['amazon_name'])]

In [68]:
prod_match.shape

(189, 11)

In [69]:
prod_match.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,sentiment,year,title,brand
1745593,4.0,True,"10 17, 2014",B00CKX7VMK,Jeffrey Danowitz,surpris easi use clear speech even outdoor win...,better expect,1,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research
1745594,3.0,False,"10 13, 2014",B00CKX7VMK,Suzan T.,not comfort ear,three star,1,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research
1745595,5.0,True,"09 19, 2014",B00CKX7VMK,LSB16,good better two sieman year ago,five star,1,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research
1745596,1.0,True,"09 3, 2014",B00CKX7VMK,S. Murphy,bought mom good review disappoint tri first on...,bought mom good review disappoint,0,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research
1745597,2.0,True,"07 30, 2014",B00CKX7VMK,JED,disappoint bean person sound amplifi hard get ...,disappoint bean person sound amplifi,0,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research


## Data Cleaning

In [70]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robbyjeffries/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
def clean(x):
    review = re.sub('[^a-zA-Z]', ' ', x)
    #review = re.sub('~', ' ', df['reviewText'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    return review

In [72]:
clean(prod_match['reviewText'].values[0])

'surpri easi use clear speech even outdoor wind level allow hear tabl crowd restaur parti satisfi bean left ear batteri last use time drawback phone'

In [73]:
prod_match['reviewText'] = prod_match['reviewText'].apply(lambda x: clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_match['reviewText'] = prod_match['reviewText'].apply(lambda x: clean(x))


In [74]:
prod_match['summary'] = prod_match['summary'].apply(lambda x: clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_match['summary'] = prod_match['summary'].apply(lambda x: clean(x))


In [75]:
prod_match.sample(10)

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,sentiment,year,title,brand
2222967,5.0,True,"12 25, 2014",B00GK6DMW6,Axel,ivdont know bad review cover mine fit perfectl...,ivdont know bad review cover,1,2014,Incipio Feather Case for Microsoft Surface Pro...,Incipio
2970621,5.0,True,"01 4, 2014",B00HJBX8NE,Holly Odenbaugh,love comput end get charger anoth person work ...,great,1,2014,Dell Inspiron 1150,Dell
1745594,3.0,False,"10 13, 2014",B00CKX7VMK,Suzan T.,not comfort ear,three star,1,2014,"Etymotic QSA Personal Sound Amplifier, Platinu...",Etymotic Research
1746294,5.0,True,"05 29, 2014",B00CL6TKQ6,JMR,exactli order deliv quickli work perfectli wou...,order,1,2014,Charger,ZhiZhu
2223019,1.0,True,"03 29, 2014",B00GK6DMW6,darok2,would love give product good rate clear use ma...,warp kickstand surfac pro,0,2014,Incipio Feather Case for Microsoft Surface Pro...,Incipio
1746326,5.0,False,"08 16, 2014",B00CL6TKQ6,Jose,good,five star,1,2014,Charger,ZhiZhu
3016090,1.0,True,"11 30, 2014",B00NKEUHEW,Kevin Wagoner,junk,one star,0,2014,inDigi Bluetooth Smart Watch Phone For iPhone ...,inDigi
2275514,3.0,True,"10 4, 2014",B00H4FFHYS,MaryEl,easi chang middl space bar terribl hit right p...,easi chang,1,2014,Dell Inspiron 15 (3521 / 5521) Laptop Keyboard...,Dell
2275521,1.0,True,"05 16, 2014",B00H4FFHYS,Amazon Customer,unlik origin keyboard replac issu space bar re...,bewar,0,2014,Dell Inspiron 15 (3521 / 5521) Laptop Keyboard...,Dell
2965997,5.0,False,"08 15, 2014",B00H76ENFI,Rami,amazingli fast gb flash drive happier get soli...,great flash drive,1,2014,ULTRA 64GB Flash Drive,Ultra


In [76]:
prod_match.to_csv('CSV_completed/Electronics_clean.csv', sep='\t', index=False)

## Model Building

We will use a Support Vector Machine

A support vector machine (SVM) is a supervised machine learning model that uses classification algorithms for two-group classification problems. After giving an SVM model sets of labeled training data for each category, they’re able to categorize new text.

The basics of Support Vector Machines and how it works are best understood with a simple example. Let’s imagine we have two tags: red and yellow, and our data has two features: x and y. We want a classifier that, given a pair of (x,y) coordinates, outputs if it’s either red or yellow. We plot our already labeled training data on a plane:

**TF-IDF (term frequency-inverse document frequency)** is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [87]:
# Tokenizing Raw text reviews
tfidf = TfidfVectorizer(max_features=5000)
X=prod_match['reviewText']
y=prod_match['sentiment']
# y=df['star_rating']

X = tfidf.fit_transform(X)

In [88]:
X

<189x1303 sparse matrix of type '<class 'numpy.float64'>'
	with 4453 stored elements in Compressed Sparse Row format>

In [89]:
print(X[:2,]) # Text Reviews got recoded in numbers

  (0, 831)	0.19767480761675008
  (0, 320)	0.23673124514823543
  (0, 1174)	0.11990429988460971
  (0, 607)	0.1776411362647969
  (0, 93)	0.1776411362647969
  (0, 332)	0.17262069284617454
  (0, 618)	0.20718619070651617
  (0, 94)	0.18990344177634533
  (0, 979)	0.21944849621806461
  (0, 813)	0.23673124514823543
  (0, 950)	0.21944849621806461
  (0, 255)	0.20718619070651617
  (0, 1143)	0.21944849621806461
  (0, 506)	0.1776411362647969
  (0, 34)	0.18990344177634533
  (0, 624)	0.23673124514823543
  (0, 1275)	0.23673124514823543
  (0, 793)	0.23673124514823543
  (0, 364)	0.1640672040799107
  (0, 1084)	0.21944849621806461
  (0, 192)	0.18990344177634533
  (0, 1230)	0.222451990998135
  (0, 334)	0.15084700424485997
  (0, 1137)	0.23673124514823543
  (1, 202)	0.7547622975684325
  (1, 757)	0.2790137505874156
  (1, 332)	0.5937046413431214


In [90]:
from random import sample, seed

seed(2022)
# Random Sample of features 
sample(tfidf.get_feature_names(), 10)

['spill',
 'keyboard',
 'read',
 'stud',
 'lip',
 'trip',
 'book',
 'soft',
 'plea',
 'xp']

In [91]:
# Partion Data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

# SVC Model

In [92]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC # support vector machines for classification (SVR is for regression)

tuned_parameters = [{'kernel': ['linear', 'poly'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(random_state = 2022), tuned_parameters, cv=5, scoring='%s_macro' % score).fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on train set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The scores are computed on test set.")
    print()
    print(classification_report(y_test, clf.predict(X_test)))
    print()

# Tuning hyper-parameters for precision



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best parameters set found on train set:

{'C': 10, 'kernel': 'linear'}

Grid scores on train set:

0.640 (+/-0.479) for {'C': 1, 'kernel': 'linear'}
0.579 (+/-0.504) for {'C': 1, 'kernel': 'poly'}
0.690 (+/-0.298) for {'C': 10, 'kernel': 'linear'}
0.579 (+/-0.504) for {'C': 10, 'kernel': 'poly'}
0.690 (+/-0.298) for {'C': 100, 'kernel': 'linear'}
0.579 (+/-0.504) for {'C': 100, 'kernel': 'poly'}
0.690 (+/-0.298) for {'C': 1000, 'kernel': 'linear'}
0.579 (+/-0.504) for {'C': 1000, 'kernel': 'poly'}

Detailed classification report:

The scores are computed on test set.

              precision    recall  f1-score   support

           0       0.43      0.23      0.30        13
           1       0.68      0.84      0.75        25

    accuracy                           0.63        38
   macro avg       0.55      0.54      0.53        38
weighted avg       0.59      0.63      0.60        38


# Tuning hyper-parameters for recall

Best parameters set found on train set:

{'C': 10, 'kernel'

***

## Vader Model

https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664

In [93]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/robbyjeffries/nltk_data...


In [95]:
prod_match['score'] = prod_match['reviewText'].apply(lambda review: sid.polarity_scores(review))

prod_match['score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_match['score'] = prod_match['reviewText'].apply(lambda review: sid.polarity_scores(review))


1745593    {'neg': 0.0, 'neu': 0.836, 'pos': 0.164, 'comp...
1745594    {'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'comp...
1745595    {'neg': 0.0, 'neu': 0.408, 'pos': 0.592, 'comp...
1745596    {'neg': 0.116, 'neu': 0.748, 'pos': 0.136, 'co...
1745597    {'neg': 0.134, 'neu': 0.784, 'pos': 0.082, 'co...
                                 ...                        
3016090    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
3019978    {'neg': 0.064, 'neu': 0.709, 'pos': 0.226, 'co...
3019979    {'neg': 0.169, 'neu': 0.708, 'pos': 0.123, 'co...
3019980    {'neg': 0.203, 'neu': 0.65, 'pos': 0.146, 'com...
3020104    {'neg': 0.094, 'neu': 0.836, 'pos': 0.07, 'com...
Name: score, Length: 189, dtype: object