In [None]:
import pandas as pd

# Load the raw Amazon reviews CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="datasets/amazon_reviews.csv")

In [6]:
# Peek at the first five rows to confirm the columns parsed as expected.
data.head(n=5)

Unnamed: 0,class_index,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [7]:
# Check how many reviews fall into each label before modelling.
class_counts = data['class_index'].value_counts()
class_counts

class_index
2    200000
1    200000
Name: count, dtype: int64

In [8]:
# Count missing values per column.
missing_per_column = data.isnull().sum()
missing_per_column

class_index      0
review_title    24
review_text      0
dtype: int64

In [9]:
# Fill missing values only in the free-text columns. A blanket fillna('') would
# also turn a missing class_index into an empty string and hide the problem;
# this way any such gap still shows up in the null count below.
data = data.fillna({'review_title': '', 'review_text': ''})
data.isnull().sum()

class_index     0
review_title    0
review_text     0
dtype: int64

In [10]:
# Shift the raw 1/2 labels down to 0/1.
# The guard keeps this cell idempotent: re-running it without reloading `data`
# must not subtract 1 a second time (which would yield -1/0 labels).
if data['class_index'].max() > 1:
    data['class_index'] = data['class_index'] - 1

# Fail loudly if the labels are not the expected binary encoding.
assert data['class_index'].isin([0, 1]).all(), "class_index should contain only 0 and 1"
data.head()


Unnamed: 0,class_index,review_title,review_text
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [11]:
# Combine title and body into one text field, separated by a single space.
titles, bodies = data['review_title'], data['review_text']
data['review'] = titles + ' ' + bodies
data.head()

Unnamed: 0,class_index,review_title,review_text,review
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...,Great CD My lovely Pat has one of the GREAT vo...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...,Batteries died within a year ... I bought this...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...,"works fine, but Maha Energy is better Check ou..."
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...,Great for the non-audiophile Reviewed quite a ...


In [12]:
# Draw a fixed-seed random subset of rows to work with.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
downsized = data.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [14]:
#Code to clean review text data
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from langdetect import detect

# Fetch the NLTK corpora needed below (stopword list and WordNet).
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Build the shared NLP helpers once; cleanReview reads them as globals.
stop_words = set(stopwords.words("english"))
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Characters treated as word separators: ASCII punctuation plus the
# typographic quotes / ellipsis that were previously stripped in a separate step.
_SEPARATOR_RE = re.compile(f"[{re.escape(string.punctuation)}‘’“”…]")
# Non-greedy match of a [bracketed] span on a single line.
_BRACKETED_RE = re.compile(r'\[.*?\]')
# Any run of word characters that contains at least one digit.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')


def cleanReview(text, correct_spelling=True):
    """Normalize a raw review string into space-separated, stemmed tokens.

    Steps, in order: lowercase, strip HTML, drop ``[bracketed]`` spans, turn
    punctuation into spaces, drop tokens containing digits, remove English
    stopwords, optionally spell-correct with TextBlob, Porter-stem, and
    WordNet-lemmatize.

    Relies on the module-level ``stop_words``, ``stemmer`` and ``lemmatizer``.

    Args:
        text: Raw review text. Any non-``str`` value yields ``""``.
        correct_spelling: When True (the default, matching earlier behavior),
            run TextBlob spell correction. It is slow and may rewrite valid
            but uncommon words; pass False to skip it.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        return ""

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove HTML tags. The separator keeps the words on either side of a
    #    tag (e.g. "great<br>product") from being glued together.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # 3. Remove [bracketed] spans. Must run before punctuation handling,
    #    which would otherwise consume the brackets themselves.
    text = _BRACKETED_RE.sub(" ", text)

    # 4. Replace punctuation and typographic quotes with a space. Deleting them
    #    outright fused neighbouring words ("great.I" -> "greati",
    #    "well-known" -> "wellknown") and mangled contractions before the
    #    stopword filter could see them.
    text = _SEPARATOR_RE.sub(" ", text)

    # 5. Remove tokens that contain digits
    text = _DIGIT_TOKEN_RE.sub("", text)

    # 6. Tokenize and remove stopwords. split() breaks on any whitespace, so
    #    newlines need no separate handling.
    tokens = [word for word in text.split() if word not in stop_words]

    # 7. Spell correction (optional; by far the slowest step)
    if correct_spelling:
        tokens = str(TextBlob(" ".join(tokens)).correct()).split()

    # 8. Stem, then lemmatize each token (same order as before).
    # NOTE(review): lemmatizing an already-stemmed token likely has little
    # effect — confirm whether both steps are wanted.
    return " ".join(lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seanlai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/seanlai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Run the text cleaner over every sampled review and keep the result alongside the raw text.
cleaned_reviews = downsized['review'].apply(cleanReview)
downsized["cleaned_review"] = cleaned_reviews
downsized.head()


Unnamed: 0,class_index,review_title,review_text,review,cleaned_review
23218,1,This is a great book,I must preface this by saying that I am not re...,This is a great book I must preface this by sa...,great book must prefac say religi love book re...
20731,0,Huge Disappointment.,"As a big time, long term Trevanian fan, I was ...","Huge Disappointment. As a big time, long term ...",huge disappoint big time long term trevanian f...
39555,1,Wayne is tight but cant hang with Turk.,This album is hot as it wants to be. However C...,Wayne is tight but cant hang with Turk. This a...,wayn tight can hang turk album hot want howev ...
147506,1,Excellent,I read this book when I was in elementary scho...,Excellent I read this book when I was in eleme...,excel read book elementari school probabl four...
314215,0,Not about Anusara,Although this book is touted on several Anusar...,Not about Anusara Although this book is touted...,anusara although book rout sever anusara web s...


In [17]:
from transformers import pipeline

# Load a pre-trained sentiment analysis pipeline.
# The checkpoint and revision are pinned to exactly what the unpinned call
# resolved to, so results stay reproducible if the library default changes.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
SENTIMENT_REVISION = "714eb0f"
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_REVISION,
)

# Score the raw (uncleaned) review text. truncation=True clips any review that
# exceeds the model's maximum input length instead of failing the whole batch.
results = sentiment_pipeline(downsized['review'].tolist(), truncation=True)

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [18]:
# Encode pipeline labels as integers: 1 for 'POSITIVE', 0 for anything else.
predictions = [int(item['label'] == 'POSITIVE') for item in results]


In [19]:
from sklearn.metrics import accuracy_score

# Fraction of sampled reviews whose predicted label equals class_index.
true_labels = downsized['class_index'].tolist()
accuracy = accuracy_score(true_labels, predictions)
accuracy

0.918