# Sentiment Analysis 
#### by Robby Jeffries
#### 02-24-2022

## Import Data

In [1]:
import os
import numpy as np
import pandas as pd

In [None]:
# Work from the project data folder; the relative paths used below are
# resolved against it.
os.chdir(path='/Users/robbyjeffries/MSEACapstone/Data')

# Load the raw review data for the Electronics category.
raw = pd.read_csv(filepath_or_buffer='CSV_cleaned/Electronics.csv')

In [None]:
# Create a dataframe with the relevant columns from the raw dataframe.
# The explicit copy makes `df` independent of `raw`, so the columns added to
# `df` further down are not written to a slice of `raw`.
df = raw[['overall', 'verified', 'reviewTime', 'asin', 'reviewerName', 'reviewText', 'summary']].copy()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# (rows, columns) of df before any NaN handling.
df.shape

In [None]:
# Count every NaN cell in df (individual missing values, not rows).
nan_in_df = df.isna().to_numpy().sum()

# Report the total number of NaN values in the whole dataframe.
print(f'Number of NaN values present: {nan_in_df}')

In [None]:
# Discard every row that has a NaN in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-count the NaN cells in df to confirm that none remain.
nan_in_df = df.isna().to_numpy().sum()

# Report the total number of NaN values in the whole dataframe.
print(f'Number of NaN values present: {nan_in_df}')

In [None]:
# Binary sentiment label: 1 when the star rating is 3 or higher, otherwise 0.
df['sentiment'] = (df['overall'] >= 3).astype('int64')
df.head()

In [None]:
# The review year is the last four characters of the reviewTime string;
# store it (as a string) in its own column.
df['year'] = df['reviewTime'].str[-4:]
df.head()

In [None]:
# Keep only the reviews written in 2014 ('year' holds strings, hence '2014').
# Copy the filtered rows so that the column assignments made to `df` later on
# operate on an independent frame rather than on a slice, which triggers
# SettingWithCopyWarning.
df = df[df['year'] == '2014'].copy()
df.shape

In [None]:
# Preview df after the year filter.
df.head()

In [None]:
# Save df without its index. The file is tab-delimited despite the .csv extension.
df.to_csv(path_or_buf='CSV_completed/Electronics_clean.csv', index=False, sep='\t')

In [None]:
# Preview the frame that was just written to disk.
df.head()

***

# Import Metadata

In [None]:
# Load the product metadata for the Electronics category.
meta_raw = pd.read_csv(filepath_or_buffer='Metadata_completed/meta_Electronics.csv')

In [None]:
# Keep only the product id, title and brand columns of the raw metadata.
df_meta = meta_raw.loc[:, ['asin', 'title', 'brand']]

In [None]:
# Preview the metadata subset.
df_meta.head()

In [None]:
# Count how often each distinct combination of df_meta's columns occurs,
# most frequent first, to expose duplicated metadata rows.
df_meta_dup = (
    df_meta
    .groupby(list(df_meta.columns), as_index=False)
    .size()
    .sort_values(by='size', ascending=False)
)

In [None]:
# Shape of the subset of combinations that occur exactly twice.
df_meta_dup.loc[df_meta_dup['size'] == 2].shape

In [None]:
# Save the metadata subset without its index (tab-delimited despite the .csv extension).
df_meta.to_csv(path_or_buf='Metadata_completed/meta_Electronics_clean.csv', index=False, sep='\t')


***

# Merge Data and Metadata on asin

In [None]:
# Join every raw review to its product metadata on the shared 'asin' key
# (pandas' default inner join).
joined = raw.merge(meta_raw, on='asin')
joined.shape

In [None]:
# Difference in row count between the merged frame and the raw reviews.
len(joined) - len(raw)

In [None]:
# Inspect 10 randomly chosen rows of the merged frame.
joined.sample(10)

In [None]:
# Draw another random sample of 10 merged rows.
joined.sample(10)

In [None]:
# Count every NaN cell in the merged frame (individual values, not rows).
nan_in_df = joined.isna().to_numpy().sum()

# Report the total number of NaN values in the whole dataframe.
print(f'Number of NaN values present: {nan_in_df}')

In [None]:
# Report the number of NaN values in each column of the merged frame.
for column in joined.columns:
    nan_in_df = joined[column].isnull().sum()
    print(f'NaN in {column}: {nan_in_df}')

In [None]:
# Discard every merged row that has a NaN in any column.
joined = joined.dropna(axis=0, how='any')

In [None]:
# Re-count the NaN cells in the merged frame to confirm that none remain.
nan_in_df = joined.isna().to_numpy().sum()

# Report the total number of NaN values in the whole dataframe.
print(f'Number of NaN values present: {nan_in_df}')

In [None]:
# (rows, columns) of the metadata subset.
df_meta.shape

In [None]:
# Number of distinct product ids in the metadata subset.
df_meta['asin'].nunique()

In [None]:
# (rows, columns) of the merged frame at this point.
joined.shape

In [None]:
# Show the product title column of the merged frame.
joined['title']

In [None]:
# Print the product id and title of every merged row, one row per line.
for asin, title in zip(joined['asin'], joined['title']):
    print(asin, title)

## Data Cleaning

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')

# Porter stemmer applied to each word during text cleaning.
ps = PorterStemmer()

# English stop words, with 'not' taken out of the list so that it is kept
# in the cleaned text.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

In [None]:
# Build a list of cleaned review texts: keep letters only, lower-case,
# drop stop words and stem what remains.
corpus = []

# Build the stop-word set once instead of once per word.
stop_words = set(all_stopwords)

# Iterate over the column's values instead of looking up df['reviewText'][i]:
# after dropna() and the year filter, df's index labels are no longer
# 0..len(df)-1, so using positions as labels raises KeyError.
for i, text in enumerate(df['reviewText']):
    print(i)  # progress indicator
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

In [None]:
# Collects the cleaned text produced by every call to clean().
corpus = []


def clean(x):
    """Return a normalised, stemmed version of the text ``x``.

    Every character other than a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, words found in ``all_stopwords`` are
    dropped, and the remaining words are stemmed with ``ps``.

    Side effect: the cleaned string is also appended to the module-level
    ``corpus`` list on every call.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    # Build the stop-word set once per call rather than once per word.
    stop_words = set(all_stopwords)
    words = re.sub('[^a-zA-Z]', ' ', x).lower().split()
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)
    return review

In [None]:
# Try the cleaner on the first review text (this call also appends its
# result to `corpus`).
clean(df['reviewText'].iloc[0])

In [None]:
# Replace every review text with its cleaned version.
df['reviewText'] = df['reviewText'].apply(clean)

In [None]:
# Replace every review summary with its cleaned version.
df['summary'] = df['summary'].map(clean)

In [None]:
# Show the row at position 366 as a one-row DataFrame.
df.iloc[[366]]

In [None]:
# Inspect 10 randomly chosen rows of df.
df.sample(10)

In [None]:
# Write df (tab-delimited, no index) to the same path used by the earlier
# export, replacing that file.
df.to_csv(path_or_buf='CSV_completed/Electronics_clean.csv', index=False, sep='\t')

# Create dataframe showing each unique product with the date of its first review

### Clean data

In [None]:
# Review columns taken from the raw data, with no year filter applied.
all_reviews = raw.loc[:, ['overall', 'verified', 'reviewTime', 'asin', 'reviewerName', 'reviewText', 'summary']]

In [None]:
# Attach the product metadata to every review via the shared 'asin' key.
df2 = all_reviews.merge(meta_raw, on='asin')
df2.shape

In [None]:
# Keep the review columns plus the product title and brand.
df2 = df2.loc[:, ['overall', 'verified', 'reviewTime', 'asin', 'reviewerName', 'reviewText', 'summary', 'title', 'brand']]

In [None]:
# Count every NaN cell in df2 (individual values, not rows).
nan_in_df = df2.isna().to_numpy().sum()

# Report the total number of NaN values in the whole dataframe.
print(f'Number of NaN values present: {nan_in_df}')

In [None]:
# Discard every row of df2 that has a NaN in any column.
df2 = df2.dropna(axis=0, how='any')

In [None]:
# Report the number of NaN values in each column of df2.
for column in df2.columns:
    nan_in_df = df2[column].isnull().sum()
    print(f'NaN in {column}: {nan_in_df}')

In [None]:
# Inspect 5 randomly chosen rows of df2.
df2.sample(5)

In [None]:
# Convert reviewTime from strings such as '12 31, 2009' to datetimes.
# pd.to_datetime is used because `datetime` is only imported in a later cell
# (calling datetime.strptime here raises NameError), and it converts the whole
# column in a single vectorised call.
df2['reviewTime'] = pd.to_datetime(df2['reviewTime'], format='%m %d, %Y')

In [None]:
# Preview df2 after the reviewTime conversion.
df2.head()

### Create df with each unique product and its earliest review date

In [None]:
# Build a pandas Series indexed by product title; each value is the array of
# distinct reviewTime values recorded for that product.
# Selecting the column before aggregating avoids running groupby.apply over
# the whole frame (grouping column included).
df3 = df2.groupby('title')['reviewTime'].unique()

In [None]:
# Check what kind of object the groupby step produced.
type(df3)

In [None]:
# Turn the title -> dates Series into a two-column frame.
df4 = pd.DataFrame(
    {
        'title': df3.index,
        'reviewDate': df3.to_numpy(),
    }
)

In [None]:
# Preview the per-product date lists.
df4.head()

### For each list of dates, keep only the oldest date

In [None]:
# Add an empty-string placeholder column that will hold each product's
# earliest review date.
df4 = df4.assign(firstReview='')

In [None]:
# Inspect the first row of df4.
df4.iloc[0]

In [None]:
# Reduce each product's collection of review dates to its earliest date.
# The column is assigned in one step instead of element by element through
# df4['firstReview'][i] = ..., a chained assignment that pandas warns about
# and that leaves df4 unchanged when copy-on-write is enabled.
df4['firstReview'] = [min(dates) for dates in df4['reviewDate']]

In [None]:
# Order the products from earliest to latest first-review date.
df4 = df4.sort_values('firstReview')
df4.head()

In [None]:
# (rows, columns) of df4.
df4.shape

In [None]:
# Keep only the product title and first-review date from df4. No date filter
# is applied here; the 2014-2015 restriction is done in a later cell.
# Copy the selection so that the later assignment to filtered_df['firstReview']
# does not act on a slice of df4 (SettingWithCopyWarning).
filtered_df = df4[['title', 'firstReview']].copy()
filtered_df.shape

In [None]:
# Type of the firstReview value stored under index label 0 (a label lookup,
# so after sorting this is not necessarily the first row).
type(filtered_df.loc[0, 'firstReview'])

In [None]:
# Preview filtered_df before the datetime conversion.
filtered_df.head()

In [None]:
# Make sure firstReview is a proper datetime column so it can be compared
# against date bounds.
filtered_df = filtered_df.assign(firstReview=pd.to_datetime(filtered_df['firstReview']))

In [None]:
# Preview filtered_df after the datetime conversion.
filtered_df.head()

In [None]:
# Keep the products whose first review falls between 2014-01-01 and
# 2015-12-31, both bounds included.
in_range = filtered_df['firstReview'].between('2014-01-01', '2015-12-31')
date_df = filtered_df[in_range]

In [None]:
# Save the filtered products and their first-review dates, without the index.
date_df.to_csv(path_or_buf='CSV_completed/product_with_first_review_date.csv', index=False)

### Convert reviewTime to date format

In [None]:
from datetime import datetime

# Example reviewTime value in the 'month day, year' layout used by the data.
date_string = '12 31, 2009'

# strptime is a method of the imported `datetime` class; the name `date` does
# not exist yet at this point, so it cannot be the receiver of the call.
date = datetime.strptime(date_string, '%m %d, %Y')
print(date)

## Model Building

We will use a Support Vector Machine

A support vector machine (SVM) is a supervised machine learning model that uses classification algorithms for two-group classification problems. After an SVM model is given sets of labeled training data for each category, it is able to categorize new text.

The basics of Support Vector Machines and how they work are best understood with a simple example. Let’s imagine we have two tags, red and yellow, and our data has two features, x and y. We want a classifier that, given a pair of (x, y) coordinates, outputs whether the point is red or yellow. We plot our already labeled training data on a plane:

**TF-IDF (term frequency-inverse document frequency)** is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
# Turn the review text into TF-IDF features (max_features=5000).
tfidf = TfidfVectorizer(max_features=5000)

# The cleaned text lives in 'reviewText' (cleaned in place earlier in this
# notebook). No 'reviewClean' column is ever created, so df['reviewClean']
# raises KeyError.
X = df['reviewText']
# Target: the star rating of each review.
y = df['overall']

X = tfidf.fit_transform(X)

In [None]:
# Show the summary representation of the TF-IDF feature matrix.
X

In [None]:
# The first two reviews, now encoded as numeric TF-IDF weights.
print(X[:2])

In [None]:
from random import sample, seed

seed(2022)
# Random sample of 10 vocabulary terms learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older releases. The result is
# turned into a list because random.sample() requires a sequence.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()
sample(feature_names, 10)

In [None]:
# Partition the data into train (80%) and test (20%) sets with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
    test_size=0.2,
)

In [None]:
# Model: linear support vector classifier with hinge loss.
# Tweak the parameters here to make it better (or worse).
# random_state is pinned to the same seed used for the train/test split so
# that repeated runs of the notebook fit the same model.
clf = LinearSVC(loss='hinge', random_state=2022)

# Train the model on the training partition.
clf.fit(X_train, y_train)

In [None]:
# Predict the rating of every review in the held-out test partition.
y_pred = clf.predict(X=X_test)

In [None]:
# Per-class precision, recall and F1 on the test partition.
print(classification_report(y_true=y_test, y_pred=y_pred))

### Testing the model on few reviews

In [None]:
# Let's understand how the algorithm works.
# Use the 'reviewText' column: no 'reviewClean' column is ever created in
# this notebook, so df['reviewClean'] raises KeyError.
x = df['reviewText']
x

In [None]:
# Encode the texts with the vocabulary that was already fitted above
# (transform only; the vectorizer is not re-fitted).
vec = tfidf.transform(raw_documents=x)
vec

In [None]:
# Show the model's predicted rating for every encoded review.
clf.predict(vec)

In [None]:
# Store the model's prediction for every row of df.
# The array is assigned directly (by position). Wrapping it in pd.Series would
# give it a fresh 0..n-1 index, and pandas would then align it on index labels,
# which no longer match df's index after the NaN drop and the year filter.
df['prediction'] = clf.predict(vec)


In [None]:
# Preview df with the new prediction column.
df.head()

In [None]:
# Binary sentiment of the prediction: 1 when the predicted rating is 3 or
# higher, otherwise 0 (same rule as the 'sentiment' column).
df['predictionSentiment'] = (df['prediction'] >= 3).astype('int64')
df.head()

In [None]:
# Mean squared error between the actual and the predicted star ratings,
# computed over every row of df.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=df['overall'], y_pred=df['prediction'])
print(mse)

In [None]:
# Mean squared error between the actual and the predicted sentiment labels.
# Both columns hold 0/1 values, so this equals the share of rows whose
# labels differ.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=df['sentiment'], y_pred=df['predictionSentiment'])
print(mse)

In [None]:
# Row-by-row check of whether the predicted sentiment equals the actual one.
df['predictionSentiment'].eq(df['sentiment'].to_numpy())