In [10]:
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 🔹 Load and preprocess data
df = pd.read_csv('reviews.csv')
df = df[['Review Title', 'Review', 'Sentiment']]

# Combine title + review
df['Full_Review'] = df['Review Title'].fillna('') + ' ' + df['Review'].fillna('')

# Binary label encoding
df['Sentiment_Label'] = df['Sentiment'].map({'Positive': 1, 'Negative': 0})

# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    stopwords = {'the','a','an','and','or','in','on','at','of','this','is','to','with','for','it','was','we','had'}
    tokens = [t for t in tokens if t not in stopwords]
    return ' '.join(tokens)

df['Clean_Review'] = df['Full_Review'].apply(clean_text)

# TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Clean_Review'])
y = df['Sentiment_Label']

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# 🔍 Train and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = round(acc * 100, 2)
    print(f"🔹 {name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))
    print("-" * 50)

# 📊 Summary table
print("📊 Model Comparison Summary")
for name, acc in results.items():
    print(f"{name}: {acc}%")


🔹 Naive Bayes Accuracy: 0.8738
              precision    recall  f1-score   support

    Negative       1.00      0.22      0.37        49
    Positive       0.87      1.00      0.93       252

    accuracy                           0.87       301
   macro avg       0.93      0.61      0.65       301
weighted avg       0.89      0.87      0.84       301

--------------------------------------------------
🔹 Logistic Regression Accuracy: 0.9203
              precision    recall  f1-score   support

    Negative       1.00      0.51      0.68        49
    Positive       0.91      1.00      0.95       252

    accuracy                           0.92       301
   macro avg       0.96      0.76      0.82       301
weighted avg       0.93      0.92      0.91       301

--------------------------------------------------
🔹 SVM Accuracy: 0.9601
              precision    recall  f1-score   support

    Negative       0.95      0.80      0.87        49
    Positive       0.96      0.99      0.9