In [27]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.5/101.7 MB 2.1 MB/s eta 0:00:49
   ---------------------------------------- 1.0/101.7 MB 2.1 MB/s eta 0:00:48
    --------------------------------------- 1.3/101.7 MB 2.0 MB/s eta 0:00:50
    --------------------------------------- 1.6/101.7 MB 1.9 MB/s eta 0:00:53
    --------------------------------------- 2.1/101.7 MB 2.0 MB/s eta 0:00:51
   - -------------------------------------- 2.9/101.7 MB 2.2 MB/s eta 0:00:46
   - -------------------------------------- 3.4/101.7 MB 2.2 MB/s eta 0:00:44
   - -------------------------------------- 4.2/101.7 MB 2.4 MB/s eta 0:00:42
   - -------------------------------------- 4.7/101.7 MB 2.4 MB/s eta 0:00:40
   -- --

In [51]:
import pandas as pd
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA

In [52]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
df = pd.read_csv("Sentiment_Analysis_Dataset.csv")

In [54]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

In [55]:
df['clean_review'] = df['review'].apply(clean_text)
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42
)

In [57]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
'''METHOD - 1 Model trained differently'''

In [58]:
#MODEL 1 - Logistic Regression

In [59]:
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_vec, y_train)
lr_pred = lr_model.predict(X_test_vec)

In [60]:
print("\nLogistic Regression Accuracy:")
print((accuracy_score(y_test, lr_pred))*100, "%")
print("\nClassification Report:")
print(classification_report(y_test, lr_pred))


Logistic Regression Accuracy:
89.07333333333334 %

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7411
           1       0.88      0.90      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [61]:
#MODEL 2 - Support Vector Machine

In [62]:
svm_model = LinearSVC()
svm_model.fit(X_train_vec, y_train)
svm_pred = svm_model.predict(X_test_vec)

In [63]:
print("\nSVM Accuracy:")
print((accuracy_score(y_test, svm_pred))*100, "%")
print("\nClassification Report:")
print(classification_report(y_test, svm_pred))
print("\nConfusion Matrix (SVM):")
print(confusion_matrix(y_test, svm_pred))


SVM Accuracy:
88.1 %

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      7411
           1       0.88      0.89      0.88      7589

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000


Confusion Matrix (SVM):
[[6475  936]
 [ 849 6740]]


In [64]:
#MODEL 3 - Naive Bayes

In [65]:
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
nb_pred = nb_model.predict(X_test_vec)

In [66]:
print("\nNaive Bayes Accuracy:")
print((accuracy_score(y_test, nb_pred))*100, "%")
print(classification_report(y_test, nb_pred))


Naive Bayes Accuracy:
85.58 %
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      7411
           1       0.86      0.86      0.86      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



In [67]:
#MODEL 4 - Random Forest

In [68]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vec, y_train)
rf_pred = rf_model.predict(X_test_vec)

In [69]:
print("\nRandom Forest Accuracy:")
print((accuracy_score(y_test, rf_pred))*100, "%")
print(classification_report(y_test, rf_pred))


Random Forest Accuracy:
85.04666666666667 %
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      7411
           1       0.86      0.84      0.85      7589

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



In [70]:
#MODEL 5 - XGBoost

In [71]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_vec, y_train)
xgb_pred = xgb_model.predict(X_test_vec)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [72]:
print("\nXGBoost Accuracy:")
print((accuracy_score(y_test, xgb_pred))*100, "%")
print(classification_report(y_test, xgb_pred))


XGBoost Accuracy:
85.9 %
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      7411
           1       0.85      0.88      0.86      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



In [None]:
'''METHOD - 2 Model trained together'''

In [74]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM": LinearSVC(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')}

In [None]:
accuracies = {}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

Logistic Regression Accuracy: 0.8907
SVM Accuracy: 0.8810
Naive Bayes Accuracy: 0.8558
Random Forest Accuracy: 0.8493


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
plt.figure(figsize=(8,5))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))
plt.xticks(rotation=45)
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.tight_layout()
plt.savefig("model_accuracy_comparison.png")
plt.show()

In [None]:
for name, model in models.items():
    pred = model.predict(X_test_vec)
    cm = confusion_matrix(y_test, pred)

    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

In [None]:
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_test_vec.toarray())

# Plot for each model
for name, model in models.items():
    pred = model.predict(X_test_vec)

    plt.figure(figsize=(6,5))
    scatter = plt.scatter(
        X_reduced[:,0],
        X_reduced[:,1],
        c=pred,
        cmap='coolwarm',
        alpha=0.5
    )

    plt.title(f"Cluster Visualization - {name}")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.colorbar(label="Sentiment Prediction")
    plt.tight_layout()
    plt.savefig(f"cluster_{name.replace(' ','_')}.png")
    plt.show()