<a href="https://colab.research.google.com/github/siddharthpurswani/amazon-review-sentiment-analysis/blob/main/Amazon_product_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset

# Load the "imdb" dataset; its "train" and "test" splits are used in the next cell.
# NOTE(review): the Colab badge names this notebook Amazon_product_reviews, but the
# corpus loaded here is IMDB — confirm which dataset is intended.
dataset = load_dataset("imdb")

In [None]:
import pandas as pd

# Export each split with the dataset's own to_pandas() method instead of handing
# the Dataset object to the pd.DataFrame constructor, which has to walk it one
# row at a time. The resulting frames have the same columns.
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()


In [None]:
# Stack both splits into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each half keeps its own 0-based labels and the
# combined frame carries duplicate index values until it is reset.
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a new column.
df = df.reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(50000, 2)

In [None]:
# Class balance: number of rows per label value.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,25000
1,25000


In [None]:
# Column dtypes and non-null counts, to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the tokenizer data, the
# stop-word lists and WordNet (used by the lemmatizer).
# Each nltk.download call returns a success flag. Collecting all of them in a
# list (so every download is attempted) and displaying all(...) surfaces a
# failure of any resource, not only of the last one.
NLTK_RESOURCES = ('punkt_tab', 'stopwords', 'wordnet')
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def clean_text(text):
    """Normalise a raw review into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. fold accented letters to their base letter and lowercase;
      2. replace URLs, e-mail addresses and HTML tags with a space;
      3. delete apostrophes so a contraction stays one token
         ("didn't" -> "didnt");
      4. replace every remaining non-letter with a space;
      5. collapse runs of whitespace.

    Removed spans are replaced by a space rather than by nothing, so the words
    on either side of a tag or punctuation mark are not fused together
    ("brother...after" -> "brother after", not "brotherafter").

    Args:
        text: Any value. Missing values (as judged by ``pd.isna``) yield ``""``;
            everything else is converted with ``str``.

    Returns:
        str: The cleaned text, possibly empty.
    """
    import unicodedata  # local import so this cell does not depend on another

    if pd.isna(text):
        return ""

    # Decompose accented characters and drop the combining marks so that
    # "féminin" becomes "feminin" instead of losing the letter entirely.
    decomposed = unicodedata.normalize('NFKD', str(text))
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()

    # Remove URLs, emails, HTML. The URL pattern requires "http(s)://" or "www."
    # so ordinary words that merely contain "www" (e.g. "awwww") are untouched.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Drop apostrophes (straight and typographic) without inserting a gap.
    text = re.sub(r"['\u2019]", '', text)

    # Anything else that is not a letter or whitespace becomes a separator.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Remove extra whitespace
    return ' '.join(text.split())

import joblib

# Persist clean_text next to the other artefacts written by this notebook.
# NOTE(review): joblib serialises via pickle, and pickle stores a plain function
# as a reference to its module and name rather than its code — so loading this
# file presumably requires clean_text to be defined or importable in the loading
# process. Confirm the consumer does that, or ship the function in a .py module.
joblib.dump(clean_text,"clean_text.joblib")

['clean_text.joblib']

In [None]:
# Run the cleaner over every raw review and store the result as a new column.
df["cleaned_text"] = df["text"].map(clean_text)

In [None]:
# Negation words change the polarity of a sentence, so they are kept out of the
# stop-word set and therefore survive stop-word filtering.
negations = {'not', 'no', 'nor', "didn't", "wasn't", "isn't", "aren't","don't"}
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')).difference(negations)

In [None]:
def preprocess(text):
    """Tokenise cleaned text, drop stop words and very short tokens, then lemmatise.

    Relies on the notebook-level ``stop_words``, ``negations`` and ``lemmatizer``
    objects defined in the preceding cell.

    Args:
        text: Cleaned review text (lowercase words separated by spaces).

    Returns:
        str: The surviving, lemmatised tokens joined by single spaces.
    """
    kept = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        # Tokens of one or two characters are treated as noise — except the
        # negations that were deliberately taken out of stop_words. Without
        # this exemption "no" would be spared by the stop-word filter only to
        # be discarded by the length filter.
        if len(word) <= 2 and word not in negations:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Persist preprocess next to the other artefacts written by this notebook.
# NOTE(review): pickle stores a plain function by reference (module + name), and
# preprocess also reads the notebook globals stop_words and lemmatizer, none of
# which are part of this file — presumably the loading process must define all
# of them itself. Confirm against the consumer.
joblib.dump(preprocess,"preprocess.joblib")

['preprocess.joblib']

In [None]:
# Tokenise, filter and lemmatise every cleaned review; store the result as a new column.
df["processed_text"] = df['cleaned_text'].map(preprocess)

In [None]:
# Eyeball the processed column (the rendered output is truncated by pandas).
df["processed_text"]

Unnamed: 0,processed_text
0,rented curiousyellow video store controversy s...
1,curious yellow risible pretentious steaming pi...
2,avoid making type film future film interesting...
3,film probably inspired godard masculin fminin ...
4,brotherafter hearing ridiculous film umpteen y...
...,...
49995,got around seeing monster man yesterday long w...
49996,got part competition prize watched not really ...
49997,got monster man box set three film mainly want...
49998,five minute started feel naff looking youve go...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings, gathered in one mapping so they can be read at a glance:
#   ngram_range  - use single words and two-word phrases
#   max_features - cap on the vocabulary size
#   min_df       - ignore terms seen in fewer than 2 documents
#   max_df       - ignore terms seen in more than 95% of documents
tfidf_settings = {
    "ngram_range": (1, 2),
    "max_features": 5000,
    "min_df": 2,
    "max_df": 0.95,
}
tfidf = TfidfVectorizer(**tfidf_settings)


In [None]:
# Model input (processed review text) and target (sentiment label).
X_text, y = df['processed_text'], df['label']

In [None]:
# Learn the vocabulary and IDF weights from X_text and build the TF-IDF matrix
# for those same rows.
# NOTE(review): X_text holds every row of df, so this fit sees all rows,
# including any that are later held out for evaluation.
X = tfidf.fit_transform(X_text)

In [None]:
# Save the fitted vectoriser so new text can be transformed with the same vocabulary.
joblib.dump(tfidf, "tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']

In [None]:
from sklearn.model_selection import train_test_split

# Split the processed *text* (not a pre-computed TF-IDF matrix) so the vectoriser
# can be fitted on the training portion alone. Fitting it on all rows lets the
# vocabulary and IDF weights absorb information from the evaluation rows and
# makes the reported test scores optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Fit on the training text only; the test text is transformed, never fitted.
X_train_tfidf = tfidf.fit_transform(X_train_text)

# Re-save the vectoriser: the file on disk must hold the same vocabulary the
# classifier is trained on, which is the one just learned above.
joblib.dump(tfidf, "tfidf_vectorizer.joblib")

X_test_tfidf = tfidf.transform(X_test_text)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Train a logistic-regression classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training chain into
# one expression; the bare name on the last line displays the fitted model.
# NOTE(review): MultinomialNB, LinearSVC and RandomForestClassifier are imported
# but not used in the cells visible here.
lr = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train_tfidf, y_train)
lr


In [None]:
# Save the trained classifier to disk.
joblib.dump(lr, "logistic_regression_model.joblib")

['logistic_regression_model.joblib']

In [None]:
# Predicted labels for the held-out features; compared against y_test below.
y_pred = lr.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Headline scores on the held-out split. Precision, recall and F1 are all
# computed with weighted averaging, so they share one loop.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
weighted_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)
for metric_label, scorer in weighted_metrics:
    print(f"{metric_label}: {scorer(y_test, y_pred, average='weighted'):.4f}")

# Per-class breakdown, then the raw confusion counts.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.8880
Precision: 0.8882
Recall: 0.8880
F1-Score: 0.8880

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7500
           1       0.88      0.90      0.89      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


Confusion Matrix:
[[6583  917]
 [ 763 6737]]


In [None]:
# Smoke test: push one hand-written sentence through the same stages as the
# training data (clean -> preprocess -> vectorise -> classify).
a = "movie was good"

cleaned_text = clean_text(a)
processed_text = preprocess(cleaned_text)
vectorized = tfidf.transform([processed_text])  # transform expects an iterable of documents
pred = lr.predict(vectorized)[0]

# Label 1 is reported as positive; any other label as negative.
if pred == 1:
    sentiment = "positive"
else:
    sentiment = "negative"

In [None]:
# Show the predicted sentiment for the sample sentence.
print(sentiment)

positive
