In [2]:
import pandas as pd

data = pd.read_csv("datasets/amazon_reviews.csv")

In [3]:
data.head()

Unnamed: 0,class_index,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [4]:
data['class_index'].value_counts()

class_index
2    200000
1    200000
Name: count, dtype: int64

In [5]:
data.isnull().sum()

class_index      0
review_title    24
review_text      0
dtype: int64

In [6]:
data.fillna('', inplace=True)
data.isnull().sum()

class_index     0
review_title    0
review_text     0
dtype: int64

In [7]:
data['class_index'] = data['class_index'] - 1
data.head()


Unnamed: 0,class_index,review_title,review_text
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [8]:
data['review'] = data['review_title'] + ' ' + data['review_text']
data.head()

Unnamed: 0,class_index,review_title,review_text,review
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...,Great CD My lovely Pat has one of the GREAT vo...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...,Batteries died within a year ... I bought this...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...,"works fine, but Maha Energy is better Check ou..."
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...,Great for the non-audiophile Reviewed quite a ...


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatization
    return text

data['review'] = data['review'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seanlai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/seanlai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from langdetect import detect

# Function to detect and remove non-English text
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False
    
data = data[data['review'].apply(is_english)]

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(data['review'])
y = np.array(data['class_index'])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
from transformers import pipeline

# Load a pre-trained sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

results = sentiment_pipeline(X_test.tolist())

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [14]:
y_pred = [1 if item['label'] == 'POSITIVE' else 0 for item in results]


In [15]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8062050699962164

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.75      0.92      0.83     39648
    Positive       0.89      0.69      0.78     39642

    accuracy                           0.81     79290
   macro avg       0.82      0.81      0.80     79290
weighted avg       0.82      0.81      0.80     79290

