In [48]:
# Imports the necessary libraries for data manipulation, visualization, and text processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [49]:
# Load the dataset
kindle = pd.read_csv('/content/all_kindle_review.csv')

In [50]:
# Display the first few rows of the DataFrame
kindle.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [51]:
# Get information about the DataFrame
kindle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0.1    12000 non-null  int64 
 1   Unnamed: 0      12000 non-null  int64 
 2   asin            12000 non-null  object
 3   helpful         12000 non-null  object
 4   rating          12000 non-null  int64 
 5   reviewText      12000 non-null  object
 6   reviewTime      12000 non-null  object
 7   reviewerID      12000 non-null  object
 8   reviewerName    11962 non-null  object
 9   summary         11998 non-null  object
 10  unixReviewTime  12000 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 1.0+ MB


In [52]:
# Select relevant columns
kindle = kindle[['reviewText' , 'rating']]

In [53]:
# Check the shape of the DataFrame
kindle.shape

(12000, 2)

In [54]:
# Display the first few rows of the modified DataFrame
kindle.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [55]:
# Get unique values in the 'rating' column
kindle['rating'].unique()

array([3, 5, 4, 2, 1])

In [56]:
# Get the value counts of the 'rating' column
kindle['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


In [57]:
# Convert ratings to binary classification
kindle['rating'] = kindle['rating'].apply(lambda x: 0 if x<3 else 1)

In [58]:
# Get the value counts of the binary 'rating' column
kindle['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,8000
0,4000


In [59]:
# Convert 'reviewText' to lowercase
kindle['reviewText'] = kindle['reviewText'].str.lower()

In [60]:
# Display the 'reviewText' column
kindle['reviewText']

Unnamed: 0,reviewText
0,"jace rankin may be short, but he's nothing to ..."
1,great short read. i didn't want to put it dow...
2,i'll start by saying this is the first of four...
3,aggie is angela lansbury who carries pocketboo...
4,i did not expect this type of book to be in li...
...,...
11995,valentine cupid is a vampire- jena and ian ano...
11996,i have read all seven books in this series. ap...
11997,this book really just wasn't my cuppa. the si...
11998,"tried to use it to charge my kindle, it didn't..."


In [61]:
# Import libraries for text processing and download stopwords
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
# Import BeautifulSoup for HTML parsing
from bs4 import BeautifulSoup

In [63]:
# Perform text cleaning
## Removing special characters
kindle['reviewText']=kindle['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))
## Remove the stopswords
kindle['reviewText']=kindle['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Remove url
kindle['reviewText']=kindle['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
kindle['reviewText']=kindle['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
## Remove any additional spaces
kindle['reviewText']=kindle['reviewText'].apply(lambda x: " ".join(x.split()))

In [64]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

In [65]:
# Initialize WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [66]:
# Define function for lemmatization
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [67]:
# Download wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [68]:
# Apply lemmatization
kindle['reviewText']=kindle['reviewText'].apply(lambda x:lemmatize_words(x))

In [69]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(kindle['reviewText'],kindle['rating'], test_size=0.20)

In [70]:
# Vectorize text data using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()

In [71]:
# Vectorize text data using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()

In [72]:
# Display the Bag-of-Words representation of the training data
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [73]:
# Train Gaussian Naive Bayes models
from sklearn.naive_bayes import GaussianNB
nb_model_bow=GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)

In [74]:
# Import evaluation metrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [75]:
# Make predictions using the BOW model
y_pred_bow=nb_model_bow.predict(X_test_bow)

In [76]:
# Make predictions using the TFIDF model
y_pred_tfidf=nb_model_bow.predict(X_test_tfidf)

In [77]:
# Display confusion matrix for the BOW model
confusion_matrix(y_test,y_pred_bow)

array([[548, 280],
       [708, 864]])

In [78]:
# Print accuracy for the BOW model
print("BOW accuracy: ",accuracy_score(y_test,y_pred_bow))

BOW accuracy:  0.5883333333333334


In [79]:
# Display confusion matrix for the TFIDF model
confusion_matrix(y_test,y_pred_tfidf)

array([[540, 288],
       [702, 870]])

In [80]:
# Print accuracy for the TFIDF model
print("TFIDF accuracy: ",accuracy_score(y_test,y_pred_tfidf))

TFIDF accuracy:  0.5875
