Sentiment Analysis Project


In [26]:
# Import Libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
# Load Dataset
df = pd.read_csv("amazon_alexa.tsv", sep="\t")

print("Dataset Loaded:")
print(df.head())

# We will use: verified_reviews → review text
# feedback → 1 = positive, 0 = negative

Dataset Loaded:
   rating       date         variation  \
0       5  31-Jul-18  Charcoal Fabric    
1       5  31-Jul-18  Charcoal Fabric    
2       4  31-Jul-18    Walnut Finish    
3       5  31-Jul-18  Charcoal Fabric    
4       5  31-Jul-18  Charcoal Fabric    

                                    verified_reviews  feedback  
0                                      Love my Echo!         1  
1                                          Loved it!         1  
2  Sometimes while playing a game, you can answer...         1  
3  I have had a lot of fun with this thing. My 4 ...         1  
4                                              Music         1  


In [31]:
# Column names
print(f"Feature names : {df.columns.values}")

Feature names : ['rating' 'date' 'variation' 'verified_reviews' 'feedback']


In [32]:
# Dataset actual size
print(f"Actual dataset size : {df.shape}")

Actual dataset size : (3150, 5)


In [36]:
# Check for null values
df.isnull().sum()

Unnamed: 0,0
rating,0
date,0
variation,0
verified_reviews,0
feedback,0


In [37]:
# We will drop the null record
df.dropna(inplace=True)

In [38]:
# Dataset size after dropping null values
print(f"Dataset size after dropping null values : {df.shape}")

Dataset size after dropping null values : (3149, 5)


In [42]:
# Preprocessing text:

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)     # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)        # remove numbers/punctuation
    text = re.sub(r"\s+", " ", text).strip()       # normalize spaces
    return text

df["clean_review"] = df["verified_reviews"].apply(clean_text)

# Convert labels to readable format
df["sentiment"] = df["feedback"].map({1: "positive", 0: "negative"})

print("\nCleaned Text:")
print(df[["verified_reviews", "clean_review", "sentiment"]].head())


Cleaned Text:
                                    verified_reviews  \
0                                      Love my Echo!   
1                                          Loved it!   
2  Sometimes while playing a game, you can answer...   
3  I have had a lot of fun with this thing. My 4 ...   
4                                              Music   

                                        clean_review sentiment  
0                                       love my echo  positive  
1                                           loved it  positive  
2  sometimes while playing a game you can answer ...  positive  
3  i have had a lot of fun with this thing my yr ...  positive  
4                                              music  positive  


In [43]:
# Train-Test Split

X = df["clean_review"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [44]:
# TF-IDF Vectorization

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [47]:
# Logistic Regression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_tfidf, y_train)

y_pred_lr = logreg.predict(X_test_tfidf)

print("LOGISTIC REGRESSION RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))

LOGISTIC REGRESSION RESULTS
Accuracy: 0.919047619047619

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        51
    positive       0.92      1.00      0.96       579

    accuracy                           0.92       630
   macro avg       0.46      0.50      0.48       630
weighted avg       0.84      0.92      0.88       630



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
# Random Forest

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_tfidf.toarray(), y_train)

y_pred_rf = rf.predict(X_test_tfidf.toarray())

print("RANDOM FOREST RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

RANDOM FOREST RESULTS
Accuracy: 0.9317460317460318

Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.24      0.36        51
    positive       0.94      0.99      0.96       579

    accuracy                           0.93       630
   macro avg       0.84      0.61      0.66       630
weighted avg       0.92      0.93      0.91       630



In [50]:
# Example Predictions

eg = "The device works really well and I love using it!"
cleaned = clean_text(eg)
vec = tfidf.transform([cleaned])

print("Example")
print("Input Text:", eg)
print("Logistic Regression Prediction:", logreg.predict(vec)[0])
print("Random Forest Prediction:", rf.predict(vec.toarray())[0])


Example
Input Text: The device works really well and I love using it!
Logistic Regression Prediction: positive
Random Forest Prediction: positive
