In [27]:
!pip install contractions
! pip install statistics
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz



In [28]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import contractions
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [29]:
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

##### Defining stop words to be removed from the review dataset

In [30]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'to', 'from', 'on', 'here', 'there', 'when', 'where', 'why', 'how', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain']

## Read Data

##### Reading data from https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz using read_csv function from pandas. Here this function will first uncompress the gzip file using gzip tool and then read the data from tsv file into pandas dataFrame.

In [31]:
data = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz', compression='gzip', sep='\t', on_bad_lines='skip')

  data = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz', compression='gzip', sep='\t', on_bad_lines='skip')


## Keep Reviews and Ratings

##### Extracting just the star_rating and review_body columns from the entire dataFrame. 
##### Also dropped all the nan values from the dataset.

In [32]:
data = data.loc[:, ["star_rating", "review_body"]]
data = data.dropna()
data = data.reset_index(drop = True)

##### Divided the dataset into 3 classes such that star_rating 1&2 belong to class 1, star_rating 3 belongs to class 2, and star_rating 4&5 belong to class 3.

In [33]:
def classifyRatingGroup(x):
    if x==1 or x==2 or x=='1' or x=='2':
        return 1
    elif x==3 or x=='3':
        return 2
    elif x==4 or x==5 or x=='4' or x=='5':
        return 3
data['rating_group'] = data['star_rating'].apply(classifyRatingGroup)


 ## We form three classes and select 20000 reviews randomly from each class.



##### Grouped the dataframe using the new rating_group class and extracted 20,000 random reviews from each class.

In [34]:
rgData = data.groupby('rating_group')

temp=[]
for group, data in rgData:
    temp.append(data.sample(20000, random_state=0))
    
fData = pd.concat([temp[0], temp[1], temp[2]])
fData = fData.sample(frac=1)

In [35]:
x1 = mean([len(w) for w in list(fData["review_body"])])

# Data Cleaning



# Pre-Processing

##### ExpandContraction is used to remove contractions from the reviews (eg: won't => will not).
##### CleanData will remove all the HTML tags, URL tokens, any special symbol other than alphabets and numbers, and numbers that occur alone.

In [36]:
def expandContraction(text):
    expandedWords = []
    for w in text.split():
        expandedWords.append(contractions.fix(w))
    return ' '.join(expandedWords)

def cleanData(review):
    review = re.sub(r'<.*>', ' ', review)            #Removes HTML tags
    review = re.sub(r'http[s]?://\S+', ' ', review)  #Removes URL tokens
    review = re.sub(r'[^a-zA-Z0-9\s]', ' ', review)  #Removes all characters other than alphabets and numbers
    review = re.sub(r'\s[0-9]*', ' ', review)        #Removes all numbers occuring independently
    review = re.sub(r'^[0-9]*\s', '', review)        #Removes all numbers at the start of a string
    return review

##### 1) Convert the reviews into lowercase
##### 2) Expand Contractions
##### 3) Clean data

In [37]:
fData['review_body'] = fData['review_body'].apply(lambda x: x.lower())
fData['review_body'] = fData['review_body'].apply(expandContraction)
fData['review_body'] = fData['review_body'].apply(cleanData)

In [38]:
x2 = mean([len(word) for word in list(fData["review_body"])])
print("Average length of review before and after data cleaning: "+str(x1)+", "+str(x2))

Average length of review before and after data cleaning: 270.2979833333333, 238.55773333333335


## Perform Lemmatization  & Remove the Stop Words


##### 1) removeStopWords will remove stop words that are present in nltk stop_words from the review
##### 2) posTagger assigns part of speech to tokens in review
##### 3) lemmatizeText performs lemmatization on the review based on POS tags assigned by posTagger

In [39]:
lemmatizer = WordNetLemmatizer()

def removeStopWords(text):
    tmp=[]
    for w in text:
        if w not in stop_words:
            tmp.append(w)
    return tmp

def joinList(text):
    return ' '.join(text)

def posTagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

def lemmatizeText(text):
    posTagged = nltk.pos_tag(nltk.word_tokenize(text)) 
    wordnetTagged = list(map(lambda x: (x[0], posTagger(x[1])), posTagged))

    lemmatizedSentence = []
    for word, tag in wordnetTagged:
        if tag is None:
            lemmatizedSentence.append(word)
        else:       
            lemmatizedSentence.append(lemmatizer.lemmatize(word, tag))
    
    return lemmatizedSentence

##### Perform lemmatization, remove stop words and join the pre-processed tokens into sentences to pass into tf-idf vectorizer

In [40]:
fData['review_body'] = fData['review_body'].apply(lemmatizeText)
fData['review_body'] = fData['review_body'].apply(removeStopWords)
fData['review_body'] = fData['review_body'].apply(joinList)

In [41]:
x3 = mean([len(word) for word in list(fData["review_body"])])
print("Average length of review before and after data pre-processing: "+str(x2)+", "+str(x3))

Average length of review before and after data pre-processing: 238.55773333333335, 153.46318333333335


# TF-IDF Feature Extraction

##### Split the dataset into train and test data. Stratify is used to create a balanced test data consisting of equal data of all the classes. I have splitted the 80% data into training set and 20% into testing set.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(fData['review_body'], fData['rating_group'], stratify=fData['rating_group'], test_size=0.2)

##### Creating TF-IDF vector from review_body using TfidfVectorizer. We have set the max features to be taken into account as 10,000 and ngram_range represents the lower and upper boundary of the range of ngrams to be extracted in the feature vector.
##### Next I have converted the TF-IDF features of training data and test data into pandas dataframe.

In [43]:
tfidf = TfidfVectorizer(ngram_range = (1,3), max_features = 10000)
tfidf.fit(fData['review_body'].values)
trainFeatures = tfidf.transform(X_train)
testFeatures = tfidf.transform(X_test)

trainFeatures = pd.DataFrame(trainFeatures.toarray(), columns=tfidf.get_feature_names_out())
testFeatures = pd.DataFrame(testFeatures.toarray(), columns=tfidf.get_feature_names_out())

In [44]:
def printVal(f1, recall, precision):
    print("For class 1 Precision, Recall, and F1 score is "+str(precision[0])+", "+str(recall[0])+", and "+str(f1[0])+" respectively.")
    print("For class 2 Precision, Recall, and F1 score is "+str(precision[1])+", "+str(recall[1])+", and "+str(f1[1])+" respectively.")
    print("For class 3 Precision, Recall, and F1 score is "+str(precision[2])+", "+str(recall[2])+", and "+str(f1[2])+" respectively.")
    print("Average Precision, Recall, and F1 score are "+str(mean(f1))+", "+str(mean(recall))+", and "+str(mean(precision))+" respectively")

# Perceptron

##### Created Perceptron model using scikit learn to train model based on tf-idf features of training set reviews and rating_group of the respective reviews

In [45]:
from sklearn.linear_model import Perceptron
perceptronModel = Perceptron()
perceptronModel.fit(trainFeatures, y_train)

Perceptron()

##### Calculating F1 score, recall and precision for Perceptron model

In [46]:
PPrediction = perceptronModel.predict(testFeatures)
f1 = f1_score(PPrediction, y_test, average=None)
recall = recall_score(PPrediction, y_test, average=None)
precision = precision_score(PPrediction, y_test, average=None)
printVal(f1, recall, precision)

For class 1 Precision, Recall, and F1 score is 0.64175, 0.7286403633267102, and 0.6824405157516948 respectively.
For class 2 Precision, Recall, and F1 score is 0.51825, 0.5963751438434983, and 0.5545746388443018 respectively.
For class 3 Precision, Recall, and F1 score is 0.843, 0.6742651469706059, and 0.7492500833240752 respectively.
Average Precision, Recall, and F1 score are 0.6620884126400239, 0.6664268847136048, and 0.6676666666666666 respectively


# SVM

##### Created Linear Support Vector Classifier model using scikit learn to train model based on tf-idf features of training set reviews and rating_group of the respective reviews

In [47]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC(C=0.1)
LSVC.fit(trainFeatures, y_train)

LinearSVC(C=0.1)

##### Calculating F1 score, recall and precision for Linear SVC model

In [48]:
LSVCPrediction = LSVC.predict(testFeatures)
f1 = f1_score(LSVCPrediction, y_test, average=None)
recall = recall_score(LSVCPrediction, y_test, average=None)
precision = precision_score(LSVCPrediction, y_test, average=None)
printVal(f1, recall, precision)

For class 1 Precision, Recall, and F1 score is 0.74925, 0.7230398069963812, and 0.7359116022099447 respectively.
For class 2 Precision, Recall, and F1 score is 0.5945, 0.6567246616956642, and 0.6240650833223987 respectively.
For class 3 Precision, Recall, and F1 score is 0.8295, 0.783656117146906, and 0.8059266456157396 respectively.
Average Precision, Recall, and F1 score are 0.721967777049361, 0.7211401952796505, and 0.7244166666666667 respectively


# Logistic Regression

##### Created Logistic Regression model using scikit learn to train model based on tf-idf features of training set reviews and rating_group of the respective reviews

In [49]:
from sklearn.linear_model import LogisticRegression
logisticRegressionModel = LogisticRegression(max_iter=5000, solver = 'saga')
logisticRegressionModel.fit(trainFeatures, y_train)

LogisticRegression(max_iter=5000, solver='saga')

##### Calculating F1 score, recall and precision for Logistic Regression model

In [50]:
LRPrediction = logisticRegressionModel.predict(testFeatures)
f1 = f1_score(LRPrediction, y_test, average=None)
recall = recall_score(LRPrediction, y_test, average=None)
precision = precision_score(LRPrediction, y_test, average=None)
printVal(f1, recall, precision)

For class 1 Precision, Recall, and F1 score is 0.73625, 0.7344139650872819, and 0.735330836454432 respectively.
For class 2 Precision, Recall, and F1 score is 0.62225, 0.6401748971193416, and 0.6310851926977687 respectively.
For class 3 Precision, Recall, and F1 score is 0.8185, 0.7981472452462214, and 0.8081955072821526 respectively.
Average Precision, Recall, and F1 score are 0.7248705121447845, 0.7242453691509483, and 0.7256666666666667 respectively


# Naive Bayes

##### Created Multinomial Naive Bayes model using scikit learn to train model based on tf-idf features of training set reviews and rating_group of the respective reviews

In [51]:
from sklearn.naive_bayes import MultinomialNB
naiveBayesModel = MultinomialNB()
naiveBayesModel.fit(trainFeatures, y_train)

MultinomialNB()

##### Calculating F1 score, recall and precision for Multinomial Naive Bayes model

In [52]:
NBPrediction = naiveBayesModel.predict(testFeatures)
f1 = f1_score(NBPrediction, y_test, average=None)
recall = recall_score(NBPrediction, y_test, average=None)
precision = precision_score(NBPrediction, y_test, average=None)
printVal(f1, recall, precision)

For class 1 Precision, Recall, and F1 score is 0.70375, 0.7255154639175257, and 0.7144670050761421 respectively.
For class 2 Precision, Recall, and F1 score is 0.6355, 0.6195466731659761, and 0.6274219424904356 respectively.
For class 3 Precision, Recall, and F1 score is 0.79225, 0.7888971869554394, and 0.7905700386678309 respectively.
Average Precision, Recall, and F1 score are 0.7108196620781362, 0.7113197746796471, and 0.7105 respectively
