# IBM Advanced Data Science Capstone Project
## Sentiment Analysis of Amazon Customer Reviews
### Harsh V Singh, Apr 2021

## Extract, Transform, Load (ETL)

This notebook contains the comprehensive step-by-step process used for cleaning and preparing the raw data. 

1. The data that we are using for this project is available to us in the form of two csv files (train.csv / test.csv). We will read these files into memory and then store them in parquet files with the same names. *The Spark csv reader is not able to handle commas within the quoted text of the reviews. Hence, we will first read the files into Pandas dataframes and then export them to parquet files*.

2. Since the training data is quite large, we will conduct the initial data exploration and analysis on a sample set of ~10,000 rows. Once we have finalized the ETL steps, we will apply them to the entire train and test sets.

3. As part of data exploration, we will look at the distribution of heading and review text lengths and number of words. We will also look at the most common words in the review texts, both for stopwords and other words.

4. As part of data processing, we will use the **nltk** package to remove stopwords, clean and tokenize the text, and lemmatize the token words. 

5. Our target variable will be based on a transformation of the review ratings. Ratings above 3 (i.e. 4/5) will be categorized as positive while ratings below 3 will be categorized as negative. *For the purpose of sentiment analysis, we will ignore all reviews with rating 3 as their categorization is ambiguous*.

6. Lastly, we will convert the tokenized arrays into count-based sparse vectors and TFIDF-based sparse vectors which will be used as our final feature sets.

In [None]:
import numpy as np
import pandas as pd
import math
import csv
import time
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import pyarrow

import string
from langdetect import detect, detect_langs

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from collections import defaultdict
from collections import Counter

nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
ENGLISH_STOP_WORDS = set(stopwords.words("english"))

import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.functions import udf, rand
# Spark runs in local mode on all available cores with the memory limits below.
sparkSettings = [
    ("spark.driver.memory", "16g"),
    ("spark.executor.memory", "8g"),
    ("spark.driver.maxResultSize", "16g"),
]
conf = SparkConf().setMaster("local[*]").setAll(sparkSettings)
sc = SparkContext.getOrCreate(conf=conf)
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Number of training rows drawn for the exploratory part of the notebook.
ETL_SAMPLE_SIZE = 10000

In [None]:
#spark.sparkContext.stop()

In [None]:
def getElapsedTime(startTime, endTime):
    """Format the duration between two timestamps as a printable message.

    Args:
        startTime: Start timestamp, in seconds.
        endTime: End timestamp, in seconds.

    Returns:
        A string of the form ``"Process time = X.XX seconds."``.
    """
    return "Process time = %.2f seconds." % (endTime - startTime)

In [None]:
def savePandasDFToParquet(csvPath, parqPath, rawSchema, printTime=False):
    """Convert a header-less csv file into a parquet file using Pandas.

    Args:
        csvPath: Location of the csv file to read (read without a header row).
        parqPath: Location the parquet file is written to.
        rawSchema: Object whose ``names`` attribute supplies the column names.
        printTime: When true, print how long the conversion took.

    Returns:
        None.
    """
    tic = time.time()
    frame = pd.read_csv(csvPath, header=None)
    # The csv carries no header row, so the column names come from the schema.
    frame.columns = rawSchema.names
    frame.to_parquet(parqPath, engine="pyarrow")
    toc = time.time()
    if not printTime:
        return
    print(getElapsedTime(startTime=tic, endTime=toc))


In [None]:
def readSparkDFFromParquet(csvPath, parqPath, rawSchema, printTime=False):
    """Load a parquet file into a Spark dataframe, creating it from csv if absent.

    Args:
        csvPath: csv file used as the source when the parquet file is missing.
        parqPath: Parquet file to read (and to create when missing).
        rawSchema: Schema object passed through to the csv-to-parquet conversion.
        printTime: Passed through to the conversion to print its duration.

    Returns:
        The Spark dataframe read from ``parqPath``.
    """
    if not Path(parqPath).is_file():
        print("Parquet file not found... converting %s to parquet!" % (csvPath))
        savePandasDFToParquet(
            csvPath=csvPath,
            parqPath=parqPath,
            rawSchema=rawSchema,
            printTime=printTime,
        )
    return spark.read.parquet(parqPath)


In [None]:
# Column layout of the raw csv files; every field is declared nullable.
rawSchema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in (
        ("rating", IntegerType()),
        ("review_heading", StringType()),
        ("review_text", StringType()),
    )
])

In [None]:
# Load the train and test sets (converted to parquet on first use) and
# print a quick overview of what was read.
trainRaw = readSparkDFFromParquet(
    csvPath="data/train.csv",
    parqPath="data/train.parquet",
    rawSchema=rawSchema,
    printTime=True,
)
testRaw = readSparkDFFromParquet(
    csvPath="data/test.csv",
    parqPath="data/test.parquet",
    rawSchema=rawSchema,
    printTime=True,
)
trainRaw.show(5)
trainCount = trainRaw.count()
testCount = testRaw.count()
print("There are %d/ %d samples in the training/ test data." % (trainCount, testCount))
firstTrainRow = trainRaw.take(1)[0]
print("Sample review text: %s" % (firstTrainRow["review_text"]))

In [None]:
# Randomly order the training data and pull the first ETL_SAMPLE_SIZE rows
# into a Pandas dataframe for exploration.
shuffledTrain = trainRaw.orderBy(rand())
sampleRaw = shuffledTrain.limit(ETL_SAMPLE_SIZE).toPandas()
sampleRaw.head()

In [None]:
def detectTextLanguage(text):
    """Return the language code detected for ``text``.

    Detection is best-effort: any error raised by ``detect`` is reported
    through the sentinel value ``"error"`` instead of propagating, so one
    undetectable review does not abort a whole pass over the data.

    Args:
        text: The text whose language should be detected.

    Returns:
        The value returned by ``detect(text)``, or ``"error"`` when
        detection raised an exception.
    """
    try:
        lang = detect(text)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop a long-running detection loop.
        lang = "error"
    return lang

# Spark UDF wrapper around detectTextLanguage; yields a string column.
langDetectUDF = udf(detectTextLanguage, StringType())

In [None]:
# Detect the language of every review text and drop, in place, the rows
# that were not detected as English.
detectedLanguages = sampleRaw["review_text"].map(detectTextLanguage)
nonEnglishRows = detectedLanguages[detectedLanguages != "en"].index
sampleRaw.drop(index=nonEnglishRows, inplace=True)

print("There are %d samples left after dropping non-english language reviews." % (sampleRaw.shape[0]))

In [None]:
def plotHistograms(datasets, titles, figTitle, figSize=(18,6), numCols=1):
    """Draw one histogram per dataset on a grid of subplots and show the figure.

    Args:
        datasets: Sequence of datasets, each passed to ``sns.histplot``.
        titles: Subplot titles, indexed in parallel with ``datasets``.
        figTitle: Title placed above the whole figure.
        figSize: Figure size passed to ``plt.figure``.
        numCols: Number of subplot columns; rows are derived from it.
    """
    fig = plt.figure(figsize=figSize)
    sns.set_theme()
    sns.set_style("white")

    # Enough rows to fit every dataset given the requested column count.
    numRows = math.ceil(len(datasets) / numCols)
    for position, dataset in enumerate(datasets, start=1):
        ax = fig.add_subplot(numRows, numCols, position)
        sns.histplot(data=dataset, ax=ax)
        # Blank axis labels keep the grid uncluttered.
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_title(titles[position - 1])

    fig.suptitle(figTitle)
    plt.show()

In [None]:
# Character-length distributions of the headings and the review texts.
lengthColumns = ["review_heading", "review_text"]
plotHistograms(
    datasets=[sampleRaw[column].str.len() for column in lengthColumns],
    titles=["Review Headings", "Review Text"],
    figTitle="Distribution of String Lengths (Sample Data)",
    figSize=(18,6),
    numCols=2,
)

In [None]:
# Whitespace-separated word-count distributions of headings and review texts.
headingWordCounts = sampleRaw["review_heading"].str.split().map(len)
textWordCounts = sampleRaw["review_text"].str.split().map(len)
plotHistograms(
    datasets=[headingWordCounts, textWordCounts],
    titles=["Review Headings", "Review Text"],
    figTitle="Distribution of Word Counts (Sample Data)",
    figSize=(18,6),
    numCols=2,
)

In [None]:
def getSortedWordCounts(wordCounts, topN=0):
    """Turn a word-to-count mapping into a dataframe sorted by descending count.

    Args:
        wordCounts: Mapping from word to its count.
        topN: Maximum number of rows to keep; values <= 0 keep every row.

    Returns:
        A dataframe with columns ``word`` and ``count``, highest counts first.
        Words with equal counts keep the mapping's iteration order.
    """
    ranked = sorted(wordCounts.items(), key=lambda pair: pair[1], reverse=True)
    table = pd.DataFrame(ranked, columns=["word", "count"])
    return table.head(topN) if topN > 0 else table


In [None]:
def getWordTokensFromText(textData):
    """Split text into lower-cased, punctuation-free, alphabetic word tokens.

    Args:
        textData: The text to tokenize with ``word_tokenize``.

    Returns:
        A list of tokens, lower-cased with ``string.punctuation`` characters
        removed; tokens that are not purely alphabetic afterwards are dropped.
    """
    # Build the punctuation-stripping table once instead of once per token.
    punctuationTable = str.maketrans('', '', string.punctuation)
    rawTokens = word_tokenize(textData)
    cleanTokens = [w.lower().translate(punctuationTable) for w in rawTokens]
    # isalpha() also discards tokens left empty after stripping punctuation.
    wordList = [word for word in cleanTokens if word.isalpha()]
    return wordList

In [None]:
def getTopWords(wordList, stopWords, topN=25):
    """Count words separately for stop words and all other words.

    Args:
        wordList: Iterable of words to count.
        stopWords: Collection of stop words; membership decides the bucket.
        topN: Number of most frequent words kept per bucket.

    Returns:
        A dict with keys ``"stopWords"`` and ``"otherWords"``, each holding the
        result of ``getSortedWordCounts`` for that bucket.
    """
    stopTally = Counter()
    otherTally = Counter()
    for token in wordList:
        tally = stopTally if token in stopWords else otherTally
        tally[token] += 1

    return {
        "stopWords": getSortedWordCounts(stopTally, topN),
        "otherWords": getSortedWordCounts(otherTally, topN),
    }

In [None]:
# Tokenize headings and review texts on a deep copy so sampleRaw stays untouched.
sampleTokenized = sampleRaw.copy(deep=True)
sampleTokenized["review_heading"] = [getWordTokensFromText(text) for text in sampleTokenized["review_heading"]]
sampleTokenized["review_text"] = [getWordTokensFromText(text) for text in sampleTokenized["review_text"]]

# Flatten the per-review token lists into one word list per column. A direct
# flatten avoids materialising a (rows x longest review) frame via
# apply(pd.Series).stack() and yields the words in the same order.
headingWords = [word for tokens in sampleTokenized["review_heading"] for word in tokens]
textWords = [word for tokens in sampleTokenized["review_text"] for word in tokens]

topHeadingWords = getTopWords(wordList=headingWords, stopWords=ENGLISH_STOP_WORDS, topN=25)
topTextWords = getTopWords(wordList=textWords, stopWords=ENGLISH_STOP_WORDS, topN=25)

print("There are %d words in the review texts of %d samples."%(len(textWords), sampleTokenized.shape[0]))


In [None]:
def plotBars(datasets, titles, x, y, figTitle, figSize=(12,6), numCols=1):
    """Draw one bar plot per dataset on a grid of subplots and show the figure.

    Args:
        datasets: Sequence of datasets, each passed to ``sns.barplot``.
        titles: Subplot titles, indexed in parallel with ``datasets``.
        x: Name of the variable plotted on the x axis.
        y: Name of the variable plotted on the y axis.
        figTitle: Title placed above the whole figure.
        figSize: Figure size passed to ``plt.figure``.
        numCols: Number of subplot columns; rows are derived from it.
    """
    fig = plt.figure(figsize=figSize)
    sns.set_theme()
    sns.set_style("white")

    # Enough rows to fit every dataset given the requested column count.
    numRows = math.ceil(len(datasets) / numCols)
    for position, dataset in enumerate(datasets, start=1):
        ax = fig.add_subplot(numRows, numCols, position)
        sns.barplot(data=dataset, x=x, y=y, ax=ax)
        # Blank axis labels keep the grid uncluttered.
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_title(titles[position - 1])
    fig.suptitle(figTitle)
    plt.show()

In [None]:
# One panel per (column, word class) pair, laid out on a 2x2 grid.
wordCountPanels = [
    ("Headings - Stop Words", topHeadingWords["stopWords"]),
    ("Headings - Other Words", topHeadingWords["otherWords"]),
    ("Text - Stop Words", topTextWords["stopWords"]),
    ("Text - Other Words", topTextWords["otherWords"]),
]
plotBars(
    datasets=[panelData for _, panelData in wordCountPanels],
    titles=[panelTitle for panelTitle, _ in wordCountPanels],
    x="count",
    y="word",
    figTitle="Count of Top Words in Headings and Review Texts (Sample Data)",
    figSize=(20,12),
    numCols=2,
)

In [None]:
sampleProcessed = sampleTokenized.copy()

# Heading tokens followed by text tokens form the full review content.
sampleProcessed["review_content"] = sampleProcessed["review_heading"] + sampleProcessed["review_text"]

# Ratings below 3 are negative (0) and above 3 positive (1); a rating of
# exactly 3 gets no label and is removed by the dropna below.
ratings = sampleProcessed["rating"]
sampleProcessed.loc[ratings.lt(3), "review_sentiment"] = 0
sampleProcessed.loc[ratings.gt(3), "review_sentiment"] = 1

consumedColumns = ["review_heading", "review_text", "rating"]
sampleProcessed.drop(columns=consumedColumns, inplace=True)
sampleProcessed.dropna(axis=0, inplace=True)
sampleProcessed.head()

In [None]:
def removeStopWordsFromText(textData, stopWords):
    """Return the words of ``textData`` that are not stop words.

    Args:
        textData: Iterable of words.
        stopWords: Collection of words to remove.

    Returns:
        A new list holding the remaining words in their original order.
    """
    return list(filter(lambda token: token not in stopWords, textData))

In [None]:
# Drop English stop words from every review's token list.
sampleProcessed["review_content"] = sampleProcessed["review_content"].map(
    lambda tokens: removeStopWordsFromText(tokens, ENGLISH_STOP_WORDS)
)
sampleProcessed.head()

In [None]:
def getWordnetPos(word):
    """Map a single word to the WordNet part-of-speech constant for it.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the tag selects the WordNet constant.

    Args:
        word: The word to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the tag's first letter is not J, N, V or R.
    """
    taggedWords = nltk.pos_tag([word])
    posTag = taggedWords[0][1]
    wordnetByInitial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnetByInitial.get(posTag[0].upper(), wordnet.NOUN)

def getLemmatizedText(textData, lemmatizer):
    """Lemmatize every word using the part of speech from ``getWordnetPos``.

    Args:
        textData: Iterable of words.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.

    Returns:
        A list with the lemmatized words in their original order.
    """
    lemmas = []
    for token in textData:
        lemmas.append(lemmatizer.lemmatize(token, getWordnetPos(token)))
    return lemmas

In [None]:
# Lemmatize every review's tokens and report how long the pass took.
startTime = time.time()
lemmatizer = nltk.stem.WordNetLemmatizer()
sampleProcessed["review_content"] = sampleProcessed["review_content"].map(
    lambda tokens: getLemmatizedText(tokens, lemmatizer)
)
endTime = time.time()
print(getElapsedTime(startTime=startTime, endTime=endTime))
sampleProcessed.head()


In [None]:
# CountVectorizer is fed one space-joined string per review.
countVect = CountVectorizer()
joinedReviewsForCounts = sampleProcessed["review_content"].map(" ".join)
reviewCounts = countVect.fit_transform(joinedReviewsForCounts)
print("Review content is transformed into a %s with %s elements." % (type(reviewCounts), reviewCounts.shape))

In [None]:
# Hold out 30% of the sample for evaluation; the seed is fixed at 123.
sentimentLabels = sampleProcessed["review_sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    reviewCounts,
    sentimentLabels,
    test_size=0.3,
    random_state=123,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Fit a multinomial Naive Bayes model on the count features and report its
# accuracy on the held-out split.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
nbAccuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", nbAccuracy)

In [None]:
# TfidfVectorizer is fed one space-joined string per review.
tfVect = TfidfVectorizer()
joinedReviewsForTfidf = sampleProcessed["review_content"].map(" ".join)
reviewTF = tfVect.fit_transform(joinedReviewsForTfidf)
print("Review texts are transformed into a %s with %s elements." % (type(reviewTF), reviewTF.shape))

In [None]:
# Apply the language filter to the full training set: tag every review text
# with its detected language, keep only the English rows, then drop the
# helper column again.
startTime = time.time()
# langDetectUDF is the Spark UDF defined for detectTextLanguage; the previous
# name langDetectFunc was never defined.
trainClean = trainRaw.withColumn("lang", langDetectUDF("review_text"))
# Filter on the dataframe that carries the "lang" column (trainDF did not exist).
trainClean = trainClean.filter(trainClean["lang"] == "en")
trainClean = trainClean.drop("lang")
trainClean.show(5)
endTime = time.time()
print(getElapsedTime(startTime=startTime, endTime=endTime))
#print("There are %d samples left after dropping non-english language reviews."%(trainClean.count()))