In [3]:
# Import Required Libraries
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
# Load Dataset
# The reviews file is tab-separated, so the delimiter is given explicitly.
df = pd.read_csv("amazonreviews.tsv", delimiter="\t")
# Preview the first rows as this cell's output.
df.head()


Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
# Explore Dataset
# Print the frame summary (index range, columns, non-null counts, dtypes).
df.info()
# Count the missing values in each column; shown as the cell result.
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10000 non-null  object
 1   review  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


label     0
review    0
dtype: int64

In [8]:
# Print the column labels of the loaded frame.
print(df.columns)


Index(['label', 'review'], dtype='object')


In [9]:
# Strip leading/trailing whitespace from every column label so that later
# lookups by name do not depend on stray spaces in the file header, then print
# the labels to confirm the result.
df.columns = df.columns.str.strip()
print(df.columns)


Index(['label', 'review'], dtype='object')


In [10]:
# Keep only required columns
# Select the two columns and drop rows containing missing values in a single
# chained expression. This replaces dropna(inplace=True) on a freshly sliced
# frame: the in-place form mutates a derived object (the pattern behind
# SettingWithCopy issues) and offers no benefit over rebinding the name.
df = df[['review', 'label']].dropna()
df.head()


Unnamed: 0,review,label
0,Stuning even for the non-gamer: This sound tra...,pos
1,The best soundtrack ever to anything.: I'm rea...,pos
2,Amazing!: This soundtrack is my favorite music...,pos
3,Excellent Soundtrack: I truly like this soundt...,pos
4,"Remember, Pull Your Jaw Off The Floor After He...",pos


In [11]:
# Check class distribution
# Count how many rows carry each distinct value of the 'label' column.
df['label'].value_counts()


label
neg    5097
pos    4903
Name: count, dtype: int64

In [12]:
# Text Cleaning 
import re

def clean_text(text):
    """Normalise a raw review string before vectorisation.

    The text is lower-cased, every character that is not an ASCII letter
    (a-z) or whitespace is deleted (not replaced by a space, so "non-gamer"
    becomes "nongamer"), each run of whitespace is collapsed to one space,
    and leading/trailing whitespace is trimmed.

    Args:
        text: The raw review text as a ``str``.

    Returns:
        The cleaned text. An empty string is returned when the input holds
        nothing but removed characters and/or whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove numbers & symbols
    text = re.sub(r'\s+', ' ', text)      # remove extra spaces
    # Deleting symbols or digits at either end of the text leaves a dangling
    # space behind (e.g. "great!!! 10/10" -> "great "), so trim last.
    return text.strip()

# Replace each raw review with its cleaned form, one element at a time,
# then preview the first rows to check the result.
df['review'] = df['review'].map(clean_text)
df.head()


Unnamed: 0,review,label
0,stuning even for the nongamer this sound track...,pos
1,the best soundtrack ever to anything im readin...,pos
2,amazing this soundtrack is my favorite music o...,pos
3,excellent soundtrack i truly like this soundtr...,pos
4,remember pull your jaw off the floor after hea...,pos


In [13]:
# Split data (Train–Test)
from sklearn.model_selection import train_test_split

# Inputs are the review texts; targets are their sentiment labels.
X, y = df['review'], df['label']

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)


In [14]:
# TF-IDF Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser configuration: English stop words are dropped and the vocabulary
# is capped at 5000 features.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectoriser is fitted on the training reviews only.
X_train_tfidf = tfidf.fit_transform(X_train)
# The test reviews are only transformed, reusing the fitted vectoriser.
X_test_tfidf = tfidf.transform(X_test)


In [15]:
# Train Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression

# Build the classifier (iteration limit set to 1000) and fit it on the
# TF-IDF training matrix; fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
# Show the fitted estimator as the cell result.
model


In [16]:
# Predictions
# Predict a label for every row of the TF-IDF test matrix; y_pred is compared
# against y_test in the evaluation step.
y_pred = model.predict(X_test_tfidf)


In [17]:
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pair each heading with its metric, computed from the true and predicted
# test labels, then print them in order: accuracy, classification report,
# confusion matrix.
for heading, value in (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
):
    print(heading, value)


Accuracy: 0.8425

Classification Report:
               precision    recall  f1-score   support

         neg       0.84      0.85      0.85      1019
         pos       0.84      0.83      0.84       981

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000


Confusion Matrix:
 [[869 150]
 [165 816]]


In [18]:
# Test with Custom Review
def predict_sentiment(text):
    """Predict the sentiment label for a single piece of review text.

    The text goes through the same ``clean_text`` step applied to the
    'review' column, is vectorised with the already-fitted ``tfidf``
    vectoriser, and is then classified by the trained ``model``.

    Args:
        text: The raw review text to classify.

    Returns:
        The predicted label for ``text`` (e.g. ``'pos'``).
    """
    # The vectoriser expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([clean_text(text)])
    predictions = model.predict(features)
    # Exactly one document was passed in, so take the only prediction.
    return predictions[0]

# Smoke-test the helper on a hand-written review; the predicted label is
# displayed as the cell result.
predict_sentiment("This product is amazing and worth the money")


'pos'