In [65]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [66]:
# Load dataset
df = pd.read_csv(r'dataset/all_kindle_review.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [67]:
df.shape

(12000, 11)

In [68]:
df.duplicated().sum()

0

In [69]:
# Drop unnecessary columns
df = df.drop(['Unnamed: 0.1', 'Unnamed: 0', 'asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'unixReviewTime'], axis=1)
df.head()

Unnamed: 0,rating,reviewText,summary
0,3,"Jace Rankin may be short, but he's nothing to ...",Entertaining But Average
1,5,Great short read. I didn't want to put it dow...,Terrific menage scenes!
2,3,I'll start by saying this is the first of four...,Snapdragon Alley
3,3,Aggie is Angela Lansbury who carries pocketboo...,very light murder cozy
4,4,I did not expect this type of book to be in li...,Book


In [70]:
# Check for null values
df.isna().sum()

rating        0
reviewText    0
summary       2
dtype: int64

In [71]:
df.dropna(subset=['summary'], inplace=True)

In [72]:
df.isna().sum()

rating        0
reviewText    0
summary       0
dtype: int64

In [73]:
df.shape

(11998, 3)

In [74]:
df.head()

Unnamed: 0,rating,reviewText,summary
0,3,"Jace Rankin may be short, but he's nothing to ...",Entertaining But Average
1,5,Great short read. I didn't want to put it dow...,Terrific menage scenes!
2,3,I'll start by saying this is the first of four...,Snapdragon Alley
3,3,Aggie is Angela Lansbury who carries pocketboo...,very light murder cozy
4,4,I did not expect this type of book to be in li...,Book


In [75]:
df['rating'].value_counts()

rating
5    2999
4    2999
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [76]:
# Positive review -> 1
# Negative review -> 0
df['rating'] = np.where(df['rating']<3, 0, 1)

In [77]:
df['rating'].value_counts()

rating
1    7998
0    4000
Name: count, dtype: int64

In [78]:
df.head(20)

Unnamed: 0,rating,reviewText,summary
0,1,"Jace Rankin may be short, but he's nothing to ...",Entertaining But Average
1,1,Great short read. I didn't want to put it dow...,Terrific menage scenes!
2,1,I'll start by saying this is the first of four...,Snapdragon Alley
3,1,Aggie is Angela Lansbury who carries pocketboo...,very light murder cozy
4,1,I did not expect this type of book to be in li...,Book
5,1,Aislinn is a little girl with big dreams. Afte...,A story of a little girl with big dreams.
6,0,This has the makings of a good story... unfort...,This story has potential but ultimately disapp...
7,1,I got this because I like collaborated short s...,Good thriller
8,1,"Loved this book, I am hooked on this series an...",Loved it!
9,1,"And that's a good thing. Short, sweet tease th...",I was scared...


In [79]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
# Initialize lemmatizer and stopwords set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [81]:
# Preprocessing pipeline
def preprocess_text(text):
    text = text.lower() # Lowercasing
    text = re.sub('[^a-zA-Z0-9\s-]', '', text) # Remove special characters except letters, numbers, spaces and hyphens
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text) # Remove urls
    text = BeautifulSoup(text, 'lxml').get_text() # Remove HTML tags
    tokens = word_tokenize(text) # Tokenization
    tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Lemmatization
    text = ' '.join(tokens) 
    return text

In [82]:
df['text'] = df['reviewText'] + ' ' + df['summary']
df['text'] = df['text'].apply(preprocess_text)

In [83]:
df.head()

Unnamed: 0,rating,reviewText,summary,text
0,1,"Jace Rankin may be short, but he's nothing to ...",Entertaining But Average,jace rankin may short he nothing mess man haul...
1,1,Great short read. I didn't want to put it dow...,Terrific menage scenes!,great short read didnt want put read one sitti...
2,1,I'll start by saying this is the first of four...,Snapdragon Alley,ill start saying first four book wasnt expecti...
3,1,Aggie is Angela Lansbury who carries pocketboo...,very light murder cozy,aggie angela lansbury carry pocketbook instead...
4,1,I did not expect this type of book to be in li...,Book,expect type book library pleased find price ri...


In [84]:
df.drop(['reviewText', 'summary'], axis=1, inplace=True)

In [85]:
df.head()

Unnamed: 0,rating,text
0,1,jace rankin may short he nothing mess man haul...
1,1,great short read didnt want put read one sitti...
2,1,ill start saying first four book wasnt expecti...
3,1,aggie angela lansbury carry pocketbook instead...
4,1,expect type book library pleased find price ri...


In [86]:
# Separate features (X) and target (y)
X = df.drop(['rating'], axis=1)
y = df['rating']

In [87]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (9598, 1), y_train shape: (9598,)
X_test shape: (2400, 1), y_test shape: (2400,)


In [88]:
# Convert text data into TF-IDF features
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train['text']).toarray()
X_test = tfidf.transform(X_test['text']).toarray()

In [89]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [90]:
# Train an XGBoost Classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [91]:
# Predict on test data
y_pred = xgb.predict(X_test)

In [92]:
# Evaluate model performance
accuracy_score(y_test, y_pred)

0.8325