# Amazon Reviews Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Importing the Data
%cd ..
df = pd.read_csv('/Users/robbyjeffries/MSEACapstone/Data/CSV_cleaned/Grocery_and_Gourmet_Food.csv')
df = df[['reviewText', 'overall']]
df.head()

Tentatively, I will classify reviews that scored 3 and above as positive and the rest as negative.

In [None]:
# Creating Sentiment Column
# Binary label: 1 (positive) when the star rating is 3 or higher, otherwise 0 (negative).
# The comparison is vectorised over the whole column instead of calling a Python
# lambda once per row; a missing rating compares as False and is labelled 0,
# exactly as the row-wise version did.
df['sentiment'] = (df['overall'] >= 3).astype(int)
df.head()

### Text Preprocessing

Before we start building our model, we need to remove special characters, collapse repeated characters, and standardize the formatting of the reviews.

In [None]:
# Dependencies

if False: # Change it to true if you haven't installed it
    !pip install text-preprocessing


In [None]:
from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word

In [None]:
# Pull the review text column out of the frame and display it
text_to_process = df.loc[:, 'reviewText']

text_to_process

In [None]:
text_to_process = df['reviewText']

# Preprocess text using custom preprocess functions in the pipeline
preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word]
# Per the package README, preprocess_text works on one string at a time, so it is
# applied to each review rather than being handed the whole Series.
# Missing reviews are replaced by '' first so every element really is a string.
preprocessed_text = (
    text_to_process
    .fillna('')
    .astype(str)
    .apply(lambda review: preprocess_text(review, preprocess_functions))
)
# Show a few rows only
preprocessed_text.head()

In [21]:
from text_preprocessing import preprocess_text
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import DataFrame as SparkDataFrame


def preprocess_text_spark(df: SparkDataFrame,
                          target_column: str,
                          preprocessed_column_name: str = 'preprocessed_text'
                         ) -> SparkDataFrame:
    """Add a preprocessed version of a text column to a PySpark DataFrame.

    ``preprocess_text`` is wrapped in a PySpark UDF declared to return strings,
    and that UDF is applied to ``df[target_column]``.

    Args:
        df: PySpark DataFrame that holds the text.
        target_column: Name of the column passed to the UDF.
        preprocessed_column_name: Name of the column that receives the UDF output.

    Returns:
        The DataFrame returned by ``withColumn``.
    """
    text_cleaner_udf = udf(preprocess_text, StringType())
    cleaned_column = text_cleaner_udf(df[target_column])
    return df.withColumn(preprocessed_column_name, cleaned_column)

ModuleNotFoundError: No module named 'pyspark'

***

In [None]:
import preprocess_kgptalkie as ps
import re

In [None]:
def get_clean(x):
    '''
    Normalise one piece of review text before it is vectorised.

    Steps, in order: lower-case; drop backslashes; turn underscores into
    spaces; expand contractions; strip e-mail addresses, URLs, HTML tags,
    accented characters and special characters (all via preprocess_kgptalkie);
    finally collapse every run of three or more identical characters into a
    single character.

    e.g: you're -> you are
         i'm -> i am
         I looove iiit -> i love it
         white_dog -> white dog

    Only runs of three or more are collapsed, so legitimate double letters
    survive ("good" stays "good") and a doubled typo such as "lovvee" is NOT
    repaired. The collapse applies to any character, digits included
    ("1000" becomes "10").

    x: value to clean. Non-string values are converted with str().
       None and float NaN (a missing review) return '' instead.
    Returns the cleaned string.
    '''
    # Without this guard str() would turn a missing review into the literal
    # word "none"/"nan", which would then reach the vectoriser as a real token.
    if x is None or (isinstance(x, float) and x != x):  # x != x holds only for NaN
        return ''
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x) # you're -> you are; i'm -> i am
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    # A character followed by two or more copies of itself -> that character once
    x = re.sub(r"(.)\1{2,}", r"\1", x)
    return x

In [None]:
# Overwrite the review column with its cleaned version; get_clean is passed
# directly since it already takes exactly one value.
df['reviewText'] = df['reviewText'].apply(get_clean)

In [None]:
# Preview the first rows now that reviewText has been overwritten with cleaned text
df.head()

### Model Building

We will use a Support Vector Machine.

A support vector machine (SVM) is a supervised machine learning model that uses classification algorithms for two-group classification problems. Once an SVM model has been given sets of labeled training data for each category, it is able to categorize new text.

The basics of Support Vector Machines and how they work are best understood with a simple example. Let’s imagine we have two tags, red and yellow, and our data has two features, x and y. We want a classifier that, given a pair of (x, y) coordinates, outputs whether the point is red or yellow. We plot our already labeled training data on a plane:

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline


# Two synthetic clusters of 50 points in total, one cluster per class label
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

# Unlabelled Point
# The output-suppressing semicolon has to end the statement itself; written inside
# a trailing comment it is just comment text and does not belong to the statement.
plt.plot([0.6], [2.5], 'x', color='black', markeredgewidth=2, markersize=10);

A support vector machine takes these data points and outputs the hyperplane (which in two dimensions is simply a line) that best separates the tags. This line is the decision boundary: anything that falls on one side of it we will classify as red, and anything that falls on the other side as yellow.

In [None]:
# Same scatter again, plus a straight line whose slope and intercept are
# fixed constants written here by hand (nothing is fitted in this cell).
xfit = np.linspace(-1, 3.5)
yfit = 0.17 * xfit + 2.2

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plt.plot([0.6], [2.5], 'x', color='red', markeredgewidth=2, markersize=10) # Unlabelled Point
plt.plot(xfit, yfit, '-k')

plt.xlim(-1, 3.5);

Now, we want to apply this algorithm for text classification, and the first thing we need is a way to transform a piece of text into a vector of numbers so we can run SVM with them. In other words, which features do we have to use in order to classify texts using SVM?

The most common answer is word frequencies.

**TF-IDF (term frequency-inverse document frequency)** is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
# Tokenizing Raw text reviews
tfidf = TfidfVectorizer(max_features=5000)
# df only carries 'reviewText', 'overall' and 'sentiment'; there is no
# 'review_body' column, so looking that name up raises KeyError.
X = df['reviewText']
y = df['sentiment']

# NOTE(review): the vectoriser is fitted on every review before the train/test
# split, so its vocabulary and IDF weights also see the test rows. Fitting on
# the training rows only would give a cleaner evaluation.
X = tfidf.fit_transform(X)

In [None]:
# Display the repr of the TF-IDF matrix returned by fit_transform
X

In [None]:
# Print the first two rows of X to show how the review text is now encoded as numbers
print(X[:2,]) # Text Reviews got recoded in numbers

In [None]:
from random import sample, seed

seed(2022)
# Random Sample of features
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on releases that predate it. random.sample needs a real sequence,
# hence the conversion of the returned array to a list.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    feature_names = tfidf.get_feature_names()
sample(feature_names, 10)

In [None]:
# Partition Data into Train and Test
# test_size=0.2 holds out 20% of the rows for testing; random_state pins the split so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

In [None]:
# Model
# random_state fixes LinearSVC's internal random number generation so that
# re-running the notebook trains the same model.
clf = LinearSVC(loss='hinge', random_state=2022) # tweak parameters here to make it better (or worse)

# Training Model
clf.fit(X_train, y_train)

In [None]:
# Testing Model
# Predict a sentiment label for every row of the held-out test split
y_pred = clf.predict(X_test)

In [None]:
# Text summary comparing the predicted labels with the true test labels, per class
print(classification_report(y_test, y_pred))

### Testing the model on a few reviews

In [None]:
# Let's understand how the algorithm works
sample_reviews = ('I love this phone, will definitely recommend',
                  'This PHONE is terrible, I want a refund!',
                  'I believe this phone fulfill his purpose, but it could have been better')

# Run each hand-written review through the same cleaning applied to the dataset
x = [get_clean(review) for review in sample_reviews]
x

In [None]:
# Encode the cleaned sample reviews with the already-fitted vectoriser
# (transform, not fit_transform, so the features learned earlier are reused)
vec = tfidf.transform(x) # tokenizing using previously created features
vec

In [None]:
# Predicted sentiment label for each sample review (1 = positive, 0 = negative)
clf.predict(vec)

In [None]:
# Exporting the model
import pickle
from pathlib import Path


model_path = Path('Model/amazon_svc')
# Create the target folder when it is missing so the export does not fail
model_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager closes the file even if pickling raises
with model_path.open('wb') as model_file:
    pickle.dump(clf, model_file)
# NOTE(review): only the classifier is saved here; scoring new text later also
# needs the fitted `tfidf` vectoriser, which this cell does not export.