# 1. IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import time
import contractions
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC


# 2. DATA LOADING

In [2]:
# Announce the stage so this cell's output is easy to find when skimming.
print("\n========== DATA LOADING ==========")

# NOTE(review): absolute, machine-specific location — the notebook will not
# run elsewhere until this points at a local copy of the reviews CSV.
reviews_csv = r"C:\Users\LENOVO\Downloads\reviews.csv"

# The file is decoded as latin-1 explicitly rather than with the pandas default.
df = pd.read_csv(reviews_csv, encoding="latin-1")

n_rows, n_cols = df.shape
print("Dataset Loaded | Shape:", (n_rows, n_cols))



Dataset Loaded | Shape: (50000, 2)


# 3. TEXT PREPROCESSING

In [3]:
print("\n========== TEXT PREPROCESSING ==========")

# The stop-word list and the tokenizer models are separate NLTK data packages.
# On a fresh environment they are absent and the calls that need them raise
# LookupError, so fetch whichever ones are missing. Packages already on disk
# are left alone, which keeps re-runs free of network access.
# Both tokenizer packages are listed because recent NLTK releases look for
# "punkt_tab" while older ones look for "punkt".
for resource_path, package_id in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id, quiet=True)

# Shared by clean(): an English Snowball stemmer and a set of English stop
# words (a set so that the per-token membership test is constant time).
stemmer = SnowballStemmer('english')
stop = set(stopwords.words('english'))

def clean(text):
    """Normalise one review into a space-joined string of stemmed tokens.

    Steps, in order: lower-case, expand contractions, tokenize with NLTK,
    keep purely alphabetic tokens that are not in the ``stop`` set, and
    stem each survivor with ``stemmer``.

    Parameters
    ----------
    text : str or None
        Raw review text. Missing values (``None`` or float NaN, which is what
        pandas produces for empty CSV cells) are treated as an empty review;
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        Stemmed tokens separated by single spaces; ``''`` when no token
        survives the filters or the input is missing.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = contractions.fix(text.lower())

    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop
    )

# Replace the raw review column with its normalised form, one row at a time.
cleaned_reviews = df['Text'].apply(clean)
df['Text'] = cleaned_reviews
print("Preprocessing Completed")


Preprocessing Completed


# 4. TRAIN–TEST SPLIT

In [4]:
print("\n========== TRAIN–TEST SPLIT ==========")

# Seed the shuffle too. train_test_split's random_state fixes the split only
# for a given row order, so an unseeded shuffle beforehand would still give
# different train/test sets on every fresh run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

X = df['Text']
y = df['Sentiment']

# Hold out 30% of the rows for testing, stratified on the label.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42
)




# 5. TF-IDF VECTORIZATION

In [5]:
print("\n========== TF-IDF VECTORIZATION ==========")

# min_df=5: terms found in fewer than 5 training documents are dropped.
vectorizer = TfidfVectorizer(min_df=5)

# The vectorizer is fitted on the training reviews only; the test reviews are
# then transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

feature_names = vectorizer.get_feature_names_out()
print("Vocabulary Size:", len(feature_names))


Vocabulary Size: 20042


# 6. TRAINING UTILITY

In [6]:
# One [name, accuracy %, training seconds, prediction seconds] row per model,
# appended by train_and_evaluate() and read by the summary cell.
results = []

def train_and_evaluate(name, model):
    """Fit ``model``, score it on the test matrix, and record the run.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and appends a row to the module-level ``results`` list.

    Parameters
    ----------
    name : str
        Label used in the printed report and in the ``results`` row.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    estimator
        The same ``model`` object, after ``fit`` has been called on it.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training Model: {name}")
    print(f"{banner}")

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_train

    start_pred = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - start_pred

    # Stored and printed as a percentage.
    acc = accuracy_score(y_test, y_pred) * 100

    print(f"Training Time   : {train_time:.2f} sec")
    print(f"Prediction Time : {pred_time:.2f} sec")
    print(f"Accuracy        : {acc:.2f}%")

    results.append([name, acc, train_time, pred_time])
    return model


# 7. MODELS

In [7]:
# One seed shared by every estimator below that accepts random_state.
common_random_state = 42

# Each estimator is built immediately before it is trained, in the same order
# as the printed reports: Naive Bayes, logistic regression, linear SVM.
naive_bayes = MultinomialNB()
mnb = train_and_evaluate("Multinomial Naive Bayes", naive_bayes)

logreg_cv = LogisticRegressionCV(
    solver="liblinear",
    max_iter=100,
    random_state=common_random_state,
)
lrcv = train_and_evaluate("Logistic Regression (CV)", logreg_cv)

linear_svm = LinearSVC(max_iter=100, random_state=common_random_state)
svc = train_and_evaluate("Linear SVM", linear_svm)


Training Model: Multinomial Naive Bayes
Training Time   : 0.03 sec
Prediction Time : 0.01 sec
Accuracy        : 85.30%

Training Model: Logistic Regression (CV)
Training Time   : 72.83 sec
Prediction Time : 0.01 sec
Accuracy        : 89.45%

Training Model: Linear SVM
Training Time   : 1.30 sec
Prediction Time : 0.00 sec
Accuracy        : 88.57%


# 8. MODEL COMPARISON SUMMARY

In [8]:
print("\n========== MODEL COMPARISON SUMMARY ==========")

# Column order mirrors the row layout appended by train_and_evaluate().
summary_columns = [
    "Model",
    "Accuracy (%)",
    "Training Time (sec)",
    "Prediction Time (sec)",
]
summary = pd.DataFrame(results, columns=summary_columns)

# Highest accuracy first. Only the printed view is sorted; `summary` itself
# keeps the order in which the models were trained.
ranked = summary.sort_values(by="Accuracy (%)", ascending=False)
print(ranked)


                      Model  Accuracy (%)  Training Time (sec)  \
1  Logistic Regression (CV)     89.446667            72.827452   
2                Linear SVM     88.566667             1.300209   
0   Multinomial Naive Bayes     85.300000             0.029717   

   Prediction Time (sec)  
1               0.005919  
2               0.004854  
0               0.010646  


# 9. ROC-AUC

In [9]:
print("\n========== ROC-AUC SCORES ==========")

# Only estimators exposing predict_proba belong in this mapping, because the
# score below is computed from the second probability column.
roc_models = {"Logistic Regression CV": lrcv}

for name in roc_models:
    model = roc_models[name]
    probs = model.predict_proba(X_test)
    second_column = probs[:, 1]
    roc = roc_auc_score(y_test, second_column)
    print(f"{name} ROC-AUC: {roc:.4f}")



Logistic Regression CV ROC-AUC: 0.9591


# 10. SENTIMENT PREDICTION

In [10]:
def predict_sentiment(review, model):
    """Classify one raw review with ``model`` and print a short report.

    The review is passed through ``clean`` and the module-level fitted
    ``vectorizer`` before prediction.

    Parameters
    ----------
    review : str
        Raw review text; it is echoed unchanged in the report.
    model : estimator
        A fitted classifier. If it has ``predict_proba``, the second
        probability column decides the label and the larger probability is
        reported as a percentage. Otherwise ``decision_function`` is used:
        its sign decides the label and its magnitude is reported as a margin.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    review_clean = clean(review)
    vect = vectorizer.transform([review_clean])

    model_name = type(model).__name__

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(vect)[0]
        # Assumes the second column belongs to the positive class — TODO
        # confirm against model.classes_ for the label encoding in use.
        sentiment = "Positive" if probs[1] > 0.5 else "Negative"
        confidence = f"{max(probs) * 100:.2f}%"
    else:
        score = model.decision_function(vect)[0]
        sentiment = "Positive" if score > 0 else "Negative"
        # The margin is signed and unbounded, so it is not a percentage.
        # Report its magnitude and label it, so that a Negative prediction
        # does not show a negative "confidence" and the figure is not read
        # as a probability.
        confidence = f"{abs(score):.2f} (decision margin, not a probability)"

    print("\n===============================")
    print(f"Review    : {review}")
    print(f"Prediction: {sentiment}")
    print(f"Confidence: {confidence}")
    print(f"Predictor : {model_name}")
    print("===============================")


# 11. SAMPLE TEST

In [11]:
sample_review = "The movie was absolutely brilliant with outstanding acting"

# Run the same review through each trained classifier, in the original order.
for classifier in (lrcv, svc, mnb):
    predict_sentiment(sample_review, classifier)


Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 98.51
Predictor : LogisticRegressionCV

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 1.78
Predictor : LinearSVC

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 79.22
Predictor : MultinomialNB


In [12]:
# A review that opens with praise but is negative overall, used to see how
# each classifier handles mixed wording.
confusing_negative_review = """
I really wanted to like this movie because the idea sounded promising.
The cinematography was beautiful and at first glance everything seemed polished.
Unfortunately, the story quickly fell apart and became painfully boring.
The characters were shallow and their motivations made little sense.
By the end, the film felt unnecessarily long and emotionally empty.
Overall, despite looking good on the surface, it was a deeply disappointing experience.
"""

# Same classifier order as the previous sample cell.
for classifier in (lrcv, svc, mnb):
    predict_sentiment(confusing_negative_review, classifier)


Review    : 
I really wanted to like this movie because the idea sounded promising.
The cinematography was beautiful and at first glance everything seemed polished.
Unfortunately, the story quickly fell apart and became painfully boring.
The characters were shallow and their motivations made little sense.
By the end, the film felt unnecessarily long and emotionally empty.
Overall, despite looking good on the surface, it was a deeply disappointing experience.

Prediction: Negative
Confidence: 98.10
Predictor : LogisticRegressionCV

Review    : 
I really wanted to like this movie because the idea sounded promising.
The cinematography was beautiful and at first glance everything seemed polished.
Unfortunately, the story quickly fell apart and became painfully boring.
The characters were shallow and their motivations made little sense.
By the end, the film felt unnecessarily long and emotionally empty.
Overall, despite looking good on the surface, it was a deeply disappointing experience.