In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to C:\Users\Sai Kumar
[nltk_data]     Peddholla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# ! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

## Read Data

In [2]:
 dataframe = pd.read_table('amazon_reviews_us_Beauty_v1_00.tsv', on_bad_lines='skip'); 

  dataframe = pd.read_table('amazon_reviews_us_Beauty_v1_00.tsv', on_bad_lines='skip');


## Keep Reviews and Ratings

In [3]:
print(list(dataframe)) 
df = dataframe[['star_rating','review_body']]

['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']


 ## We form three classes and select 20000 reviews randomly from each class.



In [4]:
class1 = df.loc[df['star_rating'].isin([1,2])]
class2 = df.loc[df['star_rating'].isin([3])]
class3 = df.loc[df['star_rating'].isin([4,5])]

class1 = class1.sample(n=20000)
class2 = class2.sample(n=20000)
class3 = class3.sample(n=20000)

class1['star_rating'] = class1['star_rating'].apply(lambda x: 1)
class2['star_rating'] = class2['star_rating'].apply(lambda x: 2)
class3['star_rating'] = class3['star_rating'].apply(lambda x: 3)

sample_data = pd.concat([class1, class2, class3], axis=0)
print(sample_data)


         star_rating                                        review_body
1769386            1  Ordered two; one for me, and one for my wife. ...
3430167            1  These hair claws are absolutely no better (or ...
4882575            1  Tried this a few years ago. Unfortunately if y...
3568322            1  I have had this toothbrush for 3 years and thi...
3125499            1  These sounded good but were not.  They made th...
...              ...                                                ...
3368356            3  These towelettes do an excellent job of removi...
234398             3  Warmth is a great product for every season of ...
1606247            3  Smells great but not any better than the oil a...
3478897            3  Very big bottle that lasts me for months.  Hel...
3108499            3  MY FRIENDS LET ME BUY IT FOR HER<br />AND WHEN...

[60000 rows x 2 columns]


# Data Cleaning



In [5]:
import contractions

# https://stackoverflow.com/questions/45999415/removing-html-tags-in-pandas
# https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe

average_length_before_cleaning = sample_data['review_body'].str.len().mean()

sample_data['review_body'] = sample_data['review_body'].apply(lambda x: str(x).lower())
sample_data['review_body'] = sample_data['review_body'].apply(lambda x: str(x).strip())
sample_data['review_body'] = sample_data['review_body'].str.replace(r'<[^<>]*>', '', regex=True)
sample_data['review_body'] = sample_data['review_body'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
sample_data['review_body'] = sample_data['review_body'].str.replace(r'[^a-zA-Z]',' ',regex=True)
sample_data['review_body'] = sample_data['review_body'].apply(lambda x: contractions.fix(str(x)))

average_length_after_cleaning = sample_data['review_body'].str.len().mean()

print(str(average_length_before_cleaning)+","+str(average_length_after_cleaning))


287.0545693951364,284.68385


# Pre-processing

## remove the stop words 

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stop_words = stopwords.words('english')

average_length_before_preprocessing = sample_data['review_body'].str.len().mean()

sample_data['review_body'] = sample_data['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

print(sample_data)

[nltk_data] Downloading package stopwords to C:\Users\Sai Kumar
[nltk_data]     Peddholla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


         star_rating                                        review_body
1769386            1  ordered two one one wife color use marker pen ...
3430167            1  hair claws absolutely better worse waythey wor...
4882575            1  tried years ago unfortunately mildly active we...
3568322            1  toothbrush years morning finally died brushed ...
3125499            1  sounded good made bath water greasy ugh left t...
...              ...                                                ...
3368356            3  towelettes excellent job removing make whether...
234398             3  warmth great product every season make wear we...
1606247            3                smells great better oil oil cheaper
3478897            3  big bottle lasts months helped hair feel nice ...
3108499            3          friends let buy herand got like naturural

[60000 rows x 2 columns]


## perform lemmatization  

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
# https://stackoverflow.com/questions/47557563/lemmatization-of-all-pandas-cells

lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    words = [lemmatizer.lemmatize(word,pos='a') for word in words]
    words = [lemmatizer.lemmatize(word,pos='n') for word in words]
    words = [lemmatizer.lemmatize(word,pos='r') for word in words]
    return ' '.join(words)

sample_data['review_body'] = sample_data['review_body'].apply(lemmatize_words)
print(sample_data)


average_length_after_preprocessing = sample_data['review_body'].str.len().mean()


print(str(average_length_before_preprocessing)+","+str(average_length_after_preprocessing))


         star_rating                                        review_body
1769386            1  order two one one wife color use marker pen te...
3430167            1  hair claw absolutely good bad waythey work app...
4882575            1  try year ago unfortunately mildly active wear ...
3568322            1  toothbrush year morning finally die brush twic...
3125499            1  sound good make bath water greasy ugh leave tu...
...              ...                                                ...
3368356            3  towelettes excellent job remove make whether l...
234398             3  warmth great product every season make wear we...
1606247            3                     smell great good oil oil cheap
3478897            3  big bottle last month help hair feel nice soft...
3108499            3           friend let buy herand get like naturural

[60000 rows x 2 columns]
284.68385,157.93593333333334


# TF-IDF Feature Extraction

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# https://stackoverflow.com/questions/37593293/how-to-get-tfidf-with-pandas-dataframe

vectorizer = TfidfVectorizer()
feature_vector = vectorizer.fit_transform(sample_data['review_body'])
print(feature_vector)


  (0, 3183)	0.1661193897967145
  (0, 1846)	0.32528702680852406
  (0, 14084)	0.19487302484500882
  (0, 1240)	0.1760571939080441
  (0, 5626)	0.33640280180082055
  (0, 3102)	0.23793841931135845
  (0, 14000)	0.14128720647438744
  (0, 11226)	0.18540427056707065
  (0, 1161)	0.2409944107852801
  (0, 24228)	0.196355264790905
  (0, 17804)	0.31340545131461117
  (0, 14579)	0.35308307980160436
  (0, 26152)	0.26030635075912834
  (0, 4667)	0.1480498615062033
  (0, 27182)	0.23133645789227278
  (0, 16860)	0.2364721310256525
  (0, 25473)	0.1729098768311362
  (0, 17027)	0.15800876412698858
  (1, 15434)	0.17643417993347774
  (1, 21240)	0.5106791632127007
  (1, 4053)	0.19202397384801748
  (1, 1204)	0.298419556660133
  (1, 27399)	0.11956501386181118
  (1, 26900)	0.5106791632127007
  (1, 1875)	0.17493038795413465
  :	:
  (59997, 4053)	0.39177406241699975
  (59997, 10167)	0.22963505534832307
  (59998, 5027)	0.5006304915002902
  (59998, 5811)	0.4930722317532508
  (59998, 23436)	0.22643431625458987
  (59998, 2

# Perceptron

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html

# splitting the data into train and test
rating_data = np.array(sample_data['star_rating'])
rating_data = rating_data.astype('int')
review_train_data,review_test_data,rating_train_data,rating_test_data = train_test_split(feature_vector, rating_data, test_size=0.20)


# https://python-course.eu/machine-learning/perceptron-class-in-sklearn.php
perceptron = Perceptron(random_state=42,max_iter=10000,alpha=0.001,penalty='l1')

perceptron_model = perceptron.fit(review_train_data,rating_train_data)
rating_predicted_data = perceptron_model.predict(review_test_data)


target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names, output_dict=True)

rating1_data = report['Rating 1']
rating2_data = report['Rating 2']
rating3_data = report['Rating 3']
weighted_avg_data = report['weighted avg']


print(str(rating1_data['precision'])+","+str(rating1_data['recall'])+","+str(rating1_data['f1-score']))
print(str(rating2_data['precision'])+","+str(rating2_data['recall'])+","+str(rating2_data['f1-score']))
print(str(rating3_data['precision'])+","+str(rating3_data['recall'])+","+str(rating3_data['f1-score']))
print(str(weighted_avg_data['precision'])+","+str(weighted_avg_data['recall'])+","+str(weighted_avg_data['f1-score']))


0.801909307875895,0.08345752608047691,0.15118110236220475
0.3582069699388135,0.6805660854182461,0.4693681917211328
0.49372384937238495,0.49937764500871296,0.4965346534653466
0.5524333797070924,0.4195833333333333,0.371710396310088


# SVM

In [10]:
from sklearn import svm

# https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

svm_classifier = svm.SVC(kernel='linear')

svm_model = svm_classifier.fit(review_train_data,rating_train_data)
rating_predicted_data = svm_model.predict(review_test_data)

target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names,output_dict=True)


# print(report)

rating1_data = report['Rating 1']
rating2_data = report['Rating 2']
rating3_data = report['Rating 3']
weighted_avg_data = report['weighted avg']


print(str(rating1_data['precision'])+","+str(rating1_data['recall'])+","+str(rating1_data['f1-score']))
print(str(rating2_data['precision'])+","+str(rating2_data['recall'])+","+str(rating2_data['f1-score']))
print(str(rating3_data['precision'])+","+str(rating3_data['recall'])+","+str(rating3_data['f1-score']))
print(str(weighted_avg_data['precision'])+","+str(weighted_avg_data['recall'])+","+str(weighted_avg_data['f1-score']))

0.6756422924901185,0.6851202404809619,0.6803482587064676
0.5825147347740668,0.587565023532326,0.5850289801455174
0.7507731958762887,0.7335683706874843,0.742071073748567
0.6691746980606835,0.6683333333333333,0.6687063729549361


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

logisticRegr = LogisticRegression(max_iter=10000)

logisticRegression_model = logisticRegr.fit(review_train_data,rating_train_data)
rating_predicted_data = logisticRegression_model.predict(review_test_data)

target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names,output_dict=True)

# print(report)

rating1_data = report['Rating 1']
rating2_data = report['Rating 2']
rating3_data = report['Rating 3']
weighted_avg_data = report['weighted avg']


print(str(rating1_data['precision'])+","+str(rating1_data['recall'])+","+str(rating1_data['f1-score']))
print(str(rating2_data['precision'])+","+str(rating2_data['recall'])+","+str(rating2_data['f1-score']))
print(str(rating3_data['precision'])+","+str(rating3_data['recall'])+","+str(rating3_data['f1-score']))
print(str(weighted_avg_data['precision'])+","+str(weighted_avg_data['recall'])+","+str(weighted_avg_data['f1-score']))


0.687221396731055,0.6951402805611222,0.6911581569115816
0.5899823810722375,0.5806291800842209,0.5852684144818976
0.7402857859112559,0.7436414001510955,0.741959798994975
0.6720684619993826,0.6726666666666666,0.672346192788625


# Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# https://scikit-learn.org/stable/modules/naive_bayes.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

naiveBayes_model = MultinomialNB(alpha=0.85).fit(review_train_data,rating_train_data)
rating_predicted_data = naiveBayes_model.predict(review_test_data)


target_names = ['Rating 1', 'Rating 2', 'Rating 3']
report = classification_report(rating_test_data, rating_predicted_data, target_names=target_names, output_dict=True)

rating1_data = report['Rating 1']
rating2_data = report['Rating 2']
rating3_data = report['Rating 3']
weighted_avg_data = report['weighted avg']


print(str(rating1_data['precision'])+","+str(rating1_data['recall'])+","+str(rating1_data['f1-score']))
print(str(rating2_data['precision'])+","+str(rating2_data['recall'])+","+str(rating2_data['f1-score']))
print(str(rating3_data['precision'])+","+str(rating3_data['recall'])+","+str(rating3_data['f1-score']))
print(str(weighted_avg_data['precision'])+","+str(weighted_avg_data['recall'])+","+str(weighted_avg_data['f1-score']))

0.6840249525359371,0.6317635270541082,0.6568563615054043
0.5577742876374243,0.6158038147138964,0.5853543677890276
0.7261410788381742,0.7051120624527827,0.7154720838124441
0.6554890528151778,0.6506666666666666,0.6521988185594244
