<a href="https://colab.research.google.com/github/ss0610/NoLimitCpp/blob/main/Sentiment_Analysis_for_Product_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Load dataset

In [4]:
import zipfile

with zipfile.ZipFile('/content/1429_1.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [7]:
df = pd.read_csv('/content/1429_1.csv')
print(df.head())

                     id                                               name  \
0  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
1  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
2  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
3  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
4  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   

        asins   brand                                         categories  \
0  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
1  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
2  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
3  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
4  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   

                                                keys manufacturer  \
0  84

  df = pd.read_csv('/content/1429_1.csv')


In [8]:
df = df[['reviews.text', 'reviews.rating']].dropna()

Assign sentiment labels

In [9]:
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

df['sentiment'] = df['reviews.rating'].apply(label_sentiment)

Text Preprocessing Function

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['reviews.text'].apply(preprocess_text)

Feature Extraction using TF-IDF

In [11]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_text']).toarray()
y = df['sentiment']

Train-Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Train Model and Evaluation

In [14]:
#Train Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)

#Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9332948310713255
F1 Score: 0.9017842608984264

Classification Report:
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00       162
     neutral       0.33      0.01      0.01       300
    positive       0.93      1.00      0.97      6464

    accuracy                           0.93      6926
   macro avg       0.42      0.34      0.33      6926
weighted avg       0.89      0.93      0.90      6926


Confusion Matrix:
 [[   0    2  160]
 [   0    2  298]
 [   0    2 6462]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Misclassified Reviews Analysis

In [15]:
df_test = df.loc[y_test.index]  # Ensure index alignment
df_test['predicted'] = y_pred
misclassified = df_test[df_test['sentiment'] != df_test['predicted']]
print("\nSample Misclassified Reviews:\n", misclassified[['reviews.text', 'sentiment', 'predicted']].head())


Sample Misclassified Reviews:
                                             reviews.text sentiment predicted
32664  This is an excellent device and easy to use. 5...   neutral  positive
4927   I got 6 of them they didn't have google plus m...  negative  positive
3610   I bought it cause I wanted to read more. That ...   neutral  positive
34397  Good streaming box for 4K with Netflix and Ama...   neutral  positive
16065  Only good for kids and games. Battery life is ...   neutral  positive
