In [None]:
"""
Project: Amazon Review Sentiment Analysis
Goal: Build end-to-end pipeline from data collection to deployment.
Author: Sneha Chowdhury
"""

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(
    "../data/Amazon_Reviews.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

print(df.shape)
df.head()

(21214, 9)


Unnamed: 0,Reviewer Name,Profile Link,Country,Review Count,Review Date,Rating,Review Title,Review Text,Date of Experience
0,Eugene ath,/users/66e8185ff1598352d6b3701a,US,1 review,2024-09-16T13:44:26.000Z,Rated 1 out of 5 stars,A Store That Doesn't Want to Sell Anything,"I registered on the website, tried to order a ...","September 16, 2024"
1,Daniel ohalloran,/users/5d75e460200c1f6a6373648c,GB,9 reviews,2024-09-16T18:26:46.000Z,Rated 1 out of 5 stars,Had multiple orders one turned up andâ¦,Had multiple orders one turned up and driver h...,"September 16, 2024"
2,p fisher,/users/546cfcf1000064000197b88f,GB,90 reviews,2024-09-16T21:47:39.000Z,Rated 1 out of 5 stars,I informed these reprobates,I informed these reprobates that I WOULD NOT B...,"September 16, 2024"
3,Greg Dunn,/users/62c35cdbacc0ea0012ccaffa,AU,5 reviews,2024-09-17T07:15:49.000Z,Rated 1 out of 5 stars,Advertise one price then increase it on website,I have bought from Amazon before and no proble...,"September 17, 2024"
4,Sheila Hannah,/users/5ddbe429478d88251550610e,GB,8 reviews,2024-09-16T18:37:17.000Z,Rated 1 out of 5 stars,If I could give a lower rate I would,If I could give a lower rate I would! I cancel...,"September 16, 2024"


In [3]:
df['Rating'].unique()[:5]

<StringArray>
['Rated 1 out of 5 stars', 'Rated 5 out of 5 stars', 'Rated 2 out of 5 stars',
 'Rated 4 out of 5 stars', 'Rated 3 out of 5 stars']
Length: 5, dtype: str

In [4]:
df['Rating'].isnull().sum()
df['Rating'].dtype


<StringDtype(storage='python', na_value=nan)>

In [5]:
import re

def extract_rating(x):
    match = re.search(r'\d', str(x))
    if match:
        return int(match.group())
    else:
        return None

df['rating_num'] = df['Rating'].apply(extract_rating)
df[['Rating', 'rating_num']].head()

Unnamed: 0,Rating,rating_num
0,Rated 1 out of 5 stars,1.0
1,Rated 1 out of 5 stars,1.0
2,Rated 1 out of 5 stars,1.0
3,Rated 1 out of 5 stars,1.0
4,Rated 1 out of 5 stars,1.0


In [6]:
df['rating_num'].isnull().sum()


np.int64(159)

In [7]:
# Drop neutral 3-star reviews
df = df[df['rating_num'] != 3]

# Create binary label
df['label'] = df['rating_num'].apply(lambda x: 1 if x > 3 else 0)

df['label'].value_counts()


label
0    14509
1     5820
Name: count, dtype: int64

In [8]:
df = df[df['rating_num'] != 3]
df['rating_num'].value_counts()


rating_num
1.0    13123
5.0     4528
4.0     1292
2.0     1227
Name: count, dtype: int64

In [9]:
df['label'] = df['rating_num'].apply(lambda x: 1 if x > 3 else 0)
df['label'].value_counts(normalize=True)


label
0    0.713709
1    0.286291
Name: proportion, dtype: float64

In [10]:
df_model = df[['Review Text', 'label']].copy()
df_model.rename(columns={'Review Text': 'text'}, inplace=True)

df_model.head()


Unnamed: 0,text,label
0,"I registered on the website, tried to order a ...",0
1,Had multiple orders one turned up and driver h...,0
2,I informed these reprobates that I WOULD NOT B...,0
3,I have bought from Amazon before and no proble...,0
4,If I could give a lower rate I would! I cancel...,0


In [11]:
#Baseline model
#train/test split
from sklearn.model_selection import train_test_split

X = df_model['text']
y = df_model['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y   # important for imbalance
)


In [26]:
X = df_model['text'].fillna("")
y = df_model['label']


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=5000,
        ngram_range=(1,2)
    )),
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.93      0.95      2902
           1       0.85      0.93      0.89      1164

    accuracy                           0.93      4066
   macro avg       0.91      0.93      0.92      4066
weighted avg       0.94      0.93      0.93      4066



In [28]:
import joblib

joblib.dump(pipeline, "../models/sentiment_model.pkl")


['../models/sentiment_model.pkl']

In [29]:
def predict_sentiment(text):
    return pipeline.predict([text])[0]


In [30]:
predict_sentiment("This product is amazing and works perfectly!")


np.int64(1)

In [31]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=skf,
    scoring="f1"
)

print("F1 Scores:", scores)
print("Mean F1:", scores.mean())


F1 Scores: [0.89342215 0.88603398 0.89809445 0.87952822 0.88375997]
Mean F1: 0.8881677537379276
