In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup


 

[nltk_data] Downloading package wordnet to C:\Users\Sai Kumar
[nltk_data]     Peddholla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# ! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

## Read Data

In [2]:
 dataframe = pd.read_table('amazon_reviews_us_Beauty_v1_00.tsv', on_bad_lines='skip'); 

  dataframe = pd.read_table('amazon_reviews_us_Beauty_v1_00.tsv', on_bad_lines='skip');


## Keep Reviews and Ratings

In [5]:
print(list(dataframe)) 
df = dataframe[['star_rating','review_body']]

['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']


 ## We form three classes and select 20000 reviews randomly from each class.



In [6]:
class1 = df.loc[df['star_rating'].isin([1,2])]
class2 = df.loc[df['star_rating'].isin([3])]
class3 = df.loc[df['star_rating'].isin([4,5])]

class1 = class1.sample(n=20000)
class2 = class2.sample(n=20000)
class3 = class3.sample(n=20000)

class1['star_rating'] = class1['star_rating'].apply(lambda x: 1)
class2['star_rating'] = class2['star_rating'].apply(lambda x: 2)
class3['star_rating'] = class3['star_rating'].apply(lambda x: 3)

sample_data = pd.concat([class1, class2, class3], axis=0)
print(sample_data)


         star_rating                                        review_body
4605644            1  The tube is tiny, about half the size of a reg...
2455833            1  I bought this as a travel kit, used it once an...
2920492            1  I DO NOT RECOMMEND<br />I WANT MY MONEY BACK<b...
941661             1  My boyfriend got the PERFUME for me, for Chris...
648648             1                               Does not stay in .-.
...              ...                                                ...
2812603            3  I absolutely love this flat iron. It works bet...
3406498            3  These mitts were  purchased for my hubby for C...
3445375            3  I have thick, frizz prone, color treated hair ...
2570781            3                                              Great
3646551            3  This is my fourth order of this paper. Its per...

[60000 rows x 2 columns]


# Data Cleaning



# Pre-processing

In [7]:
import contractions

# https://stackoverflow.com/questions/45999415/removing-html-tags-in-pandas
# https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe

average_length_before_cleaning = sample_data['review_body'].str.len().mean()

sample_data['review_body'] = sample_data['review_body'].apply(lambda x: str(x).lower())
sample_data['review_body'] = sample_data['review_body'].apply(lambda x: str(x).strip())
sample_data['review_body'] = sample_data['review_body'].str.replace(r'<[^<>]*>', '', regex=True)
sample_data['review_body'] = sample_data['review_body'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
sample_data['review_body'] = sample_data['review_body'].str.replace(r'[^a-zA-Z]',' ',regex=True)
sample_data['review_body'] = sample_data['review_body'].apply(lambda x: contractions.fix(str(x)))

average_length_after_cleaning = sample_data['review_body'].str.len().mean()

print(str(average_length_before_cleaning)+","+str(average_length_after_cleaning))


287.49901660110675,285.05855


## remove the stop words 

In [8]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stop_words = stopwords.words('english')

average_length_before_preprocessing = sample_data['review_body'].str.len().mean()

sample_data['review_body'] = sample_data['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

print(sample_data)

[nltk_data] Downloading package stopwords to C:\Users\Sai Kumar
[nltk_data]     Peddholla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


         star_rating                                        review_body
4605644            1  tube tiny half size regular chapstick color su...
2455833            1  bought travel kit used threw rather pack bring...
2920492            1  recommendi want money backit cut whole skin wa...
941661             1  boyfriend got perfume christmas ran ordered th...
648648             1                                               stay
...              ...                                                ...
2812603            3  absolutely love flat iron works better chi str...
3406498            3  mitts purchased hubby christmas couples massag...
3445375            3  thick frizz prone color treated hair natural w...
2570781            3                                              great
3646551            3  fourth order paper perfect gets job done trans...

[60000 rows x 2 columns]


## perform lemmatization  

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
# https://stackoverflow.com/questions/47557563/lemmatization-of-all-pandas-cells

lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)

sample_data['review_body'] = sample_data['review_body'].apply(lemmatize_words)
print(sample_data)


average_length_after_preprocessing = sample_data['review_body'].str.len().mean()


print(str(average_length_before_preprocessing)+","+str(average_length_after_preprocessing))


# TF-IDF Feature Extraction

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
# https://stackoverflow.com/questions/37593293/how-to-get-tfidf-with-pandas-dataframe

vectorizer = TfidfVectorizer()
feature_vector = vectorizer.fit_transform(sample_data['review_body'])
print(feature_vector)


# Perceptron

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html

# splitting the data into train and test
rating_data = np.array(sample_data['star_rating'])
rating_data = rating_data.astype('int')
review_train_data,review_test_data,rating_train_data,rating_test_data = train_test_split(feature_vector, rating_data, test_size=0.20)


# https://python-course.eu/machine-learning/perceptron-class-in-sklearn.php
perceptron = Perceptron(random_state=7,max_iter=1000,tol=0.001)

perceptron_model = perceptron.fit(review_train_data,rating_train_data)
rating_predicted_data = perceptron_model.predict(review_test_data)


target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names, output_dict=True)

print(str(report['Rating 1']['precision'])+","+str(report['Rating 1']['recall'])+","+str(report['Rating 1']['f1-score']))
print(str(report['Rating 2']['precision'])+","+str(report['Rating 2']['recall'])+","+str(report['Rating 2']['f1-score']))
print(str(report['Rating 3']['precision'])+","+str(report['Rating 3']['recall'])+","+str(report['Rating 3']['f1-score']))
print(str(report['weighted avg']['precision'])+","+str(report['weighted avg']['recall'])+","+str(report['weighted avg']['f1-score']))



0.6037473976405274,0.6447628458498024,0.6235814120176801
0.49827265479670474,0.47831632653061223,0.4880905896134322
0.6727133367399081,0.6530257936507936,0.6627233828341303
0.5924648705156038,0.5931666666666666,0.5924727788932864


# SVM

In [123]:
from sklearn import svm

# https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

svm_classifier = svm.SVC(kernel='linear')

svm_model = svm_classifier.fit(review_train_data,rating_train_data)
rating_predicted_data = svm_model.predict(review_test_data)

target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names,output_dict=True)


# print(report)

print(str(report['Rating 1']['precision'])+","+str(report['Rating 1']['recall'])+","+str(report['Rating 1']['f1-score']))
print(str(report['Rating 2']['precision'])+","+str(report['Rating 2']['recall'])+","+str(report['Rating 2']['f1-score']))
print(str(report['Rating 3']['precision'])+","+str(report['Rating 3']['recall'])+","+str(report['Rating 3']['f1-score']))
print(str(report['weighted avg']['precision'])+","+str(report['weighted avg']['recall'])+","+str(report['weighted avg']['f1-score']))


0.6907216494845361,0.678606719367589,0.6846105919003115
0.5698040164529398,0.6007653061224489,0.5848752017881536
0.7562982005141388,0.7296626984126984,0.7427417318858874
0.6732556105068278,0.6703333333333333,0.6715624274988268


# Logistic Regression

In [124]:
from sklearn.linear_model import LogisticRegression

# https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

logisticRegr = LogisticRegression(max_iter=10000)

logisticRegression_model = logisticRegr.fit(review_train_data,rating_train_data)
rating_predicted_data = logisticRegression_model.predict(review_test_data)

target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names,output_dict=True)

# print(report)

print(str(report['Rating 1']['precision'])+","+str(report['Rating 1']['recall'])+","+str(report['Rating 1']['f1-score']))
print(str(report['Rating 2']['precision'])+","+str(report['Rating 2']['recall'])+","+str(report['Rating 2']['f1-score']))
print(str(report['Rating 3']['precision'])+","+str(report['Rating 3']['recall'])+","+str(report['Rating 3']['f1-score']))
print(str(report['weighted avg']['precision'])+","+str(report['weighted avg']['recall'])+","+str(report['weighted avg']['f1-score']))



0.6951128752170678,0.692193675889328,0.6936502042331971
0.5834601725012684,0.5867346938775511,0.5850928516916816
0.7514278619319593,0.7504960317460317,0.7509616577739173
0.6775614945327769,0.6773333333333333,0.6774447841259841


# Naive Bayes

In [125]:

from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.metrics import classification_report

# https://scikit-learn.org/stable/modules/naive_bayes.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

naiveBayes_model = MultinomialNB(alpha=0.85).fit(review_train_data,rating_train_data)
rating_predicted_data = naiveBayes_model.predict(review_test_data)


target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names, output_dict=True)

print(str(report['Rating 1']['precision'])+","+str(report['Rating 1']['recall'])+","+str(report['Rating 1']['f1-score']))
print(str(report['Rating 2']['precision'])+","+str(report['Rating 2']['recall'])+","+str(report['Rating 2']['f1-score']))
print(str(report['Rating 3']['precision'])+","+str(report['Rating 3']['recall'])+","+str(report['Rating 3']['f1-score']))
print(str(report['weighted avg']['precision'])+","+str(report['weighted avg']['recall'])+","+str(report['weighted avg']['f1-score']))


0.6969368392518298,0.6351284584980237,0.6645986816595579
0.5412556053811659,0.6158163265306122,0.5761336515513126
0.7289015840041547,0.6961805555555556,0.7121654192566281
0.6568211237575274,0.6493333333333333,0.6516825289901468
