# IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import time
import contractions
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# DATA LOADING

In [2]:
print("\n========== DATA LOADING ==========")

df = pd.read_csv(
    r"C:\Users\LENOVO\Downloads\reviews.csv",
    encoding="latin-1"
)

print("Dataset Loaded | Shape:", df.shape)


Dataset Loaded | Shape: (50000, 2)


In [3]:
df

Unnamed: 0,Text,Sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
...,...,...
49995,"Seeing as the vote average was pretty low, and...",1
49996,"The plot had some wretched, unbelievable twist...",1
49997,I am amazed at how this movie(and most others ...,1
49998,A Christmas Together actually came before my t...,1


In [4]:
df['Text'][0]

"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."

In [5]:
df['Text'][1]

"This is an example of why the majority of action films are the same. Generic and boring, there's really nothing worth watching here. A complete waste of the then barely-tapped talents of Ice-T and Ice Cube, who've each proven many times over that they are capable of acting, and acting well. Don't bother with this one, go see New Jack City, Ricochet or watch New York Undercover for Ice-T, or Boyz n the Hood, Higher Learning or Friday for Ice Cube and see the real deal. Ice-T's horribly cliched dialogue alone makes this film grate at the teeth, and I'm still wondering what the heck Bill Paxton was doing in this film? And why the heck does he always play the exact same character? From Aliens onward, every film I've seen with Bill Paxton has him playing the exact same irritating character, and at least in Aliens his character died, which made it somewhat gratifying... Overall, this is second-rate action trash. There are countless better films to see, and if you really want to see this one

# TEXT PREPROCESSING

In [6]:
print("\n========== TEXT PREPROCESSING ==========")

stemmer = SnowballStemmer('english')
stop = set(stopwords.words('english'))

def clean(text):
    text = text.lower()
    text = contractions.fix(text)
    tokens = word_tokenize(text)

    cleaned_tokens = []
    for token in tokens:
        if token.isalpha():
            if token not in stop:
                cleaned_tokens.append(stemmer.stem(token))

    return ' '.join(cleaned_tokens)

df['Text'] = df['Text'].apply(clean)
print("Preprocessing Completed")

# ---------------- TRAIN–TEST SPLIT ------------------------------
print("\n========== TRAIN–TEST SPLIT ==========")

df = df.sample(frac=1).reset_index(drop=True)

X = df['Text']
y = df['Sentiment']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42
)


Preprocessing Completed



# VECTORIZATION

In [7]:
print("\n========== TF-IDF VECTORIZATION ==========")

vectorizer = TfidfVectorizer(min_df=5)

X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

print("Vocabulary Size:", len(vectorizer.get_feature_names_out()))



Vocabulary Size: 20117


# TRAINING UTILITY

In [8]:
results = []

def train_and_evaluate(name, model, dense=False):
    print(f"\n{'='*60}")
    print(f"Training Model: {name}")
    print(f"{'='*60}")

    start_train = time.time()
    model.fit(X_train.toarray() if dense else X_train, y_train)
    train_time = time.time() - start_train

    start_pred = time.time()
    y_pred = model.predict(X_test.toarray() if dense else X_test)
    pred_time = time.time() - start_pred

    acc = accuracy_score(y_test, y_pred) * 100

    print(f"Training Time   : {train_time:.2f} sec")
    print(f"Prediction Time : {pred_time:.2f} sec")
    print(f"Accuracy        : {acc:.2f}%")

    results.append([name, acc, train_time, pred_time])
    return model

# MODELS

In [9]:
common_random_state = 42

gnb = train_and_evaluate(
    "Gaussian Naive Bayes",
    GaussianNB(),
    dense=True
)

mnb = train_and_evaluate(
    "Multinomial Naive Bayes",
    MultinomialNB()
)

lrcv = train_and_evaluate(
    "Logistic Regression (CV)",
    LogisticRegressionCV(
        max_iter=100,
        solver="liblinear",
        random_state=common_random_state
    )
)

svc = train_and_evaluate(
    "Linear SVM",
    LinearSVC(
        max_iter=100,
        random_state=common_random_state
    )
)

rf = train_and_evaluate(
    "Random Forest",
    RandomForestClassifier(
        n_estimators=100,
        random_state=common_random_state
    )
)

xgb = train_and_evaluate(
    "XGBoost (Simple Baseline)",
    XGBClassifier(
        n_estimators=100,
        random_state=common_random_state,
        eval_metric="logloss"
    ),
    dense=True
)

# ---------------- MODEL COMPARISON ------------------------------
print("\n========== MODEL COMPARISON SUMMARY ==========")

summary = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy (%)",
        "Training Time (sec)",
        "Prediction Time (sec)"
    ]
)

print(summary.sort_values(by="Accuracy (%)", ascending=False))

# ---------------- ROC-AUC --------------------------------------
print("\n========== ROC-AUC SCORES ==========")

roc_models = {
    "Logistic Regression CV": lrcv,
    "XGBoost": xgb
}

for name, model in roc_models.items():
    probs = model.predict_proba(
        X_test if name != "XGBoost" else X_test.toarray()
    )
    roc = roc_auc_score(y_test, probs[:, 1])
    print(f"{name} ROC-AUC: {roc:.4f}")


Training Model: Gaussian Naive Bayes
Training Time   : 79.41 sec
Prediction Time : 10.38 sec
Accuracy        : 69.69%

Training Model: Multinomial Naive Bayes
Training Time   : 0.39 sec
Prediction Time : 0.01 sec
Accuracy        : 86.27%

Training Model: Logistic Regression (CV)
Training Time   : 68.69 sec
Prediction Time : 0.01 sec
Accuracy        : 89.65%

Training Model: Linear SVM
Training Time   : 1.20 sec
Prediction Time : 0.01 sec
Accuracy        : 89.18%

Training Model: Random Forest
Training Time   : 167.04 sec
Prediction Time : 1.37 sec
Accuracy        : 85.11%

Training Model: XGBoost (Simple Baseline)
Training Time   : 372.25 sec
Prediction Time : 3.55 sec
Accuracy        : 85.71%

                       Model  Accuracy (%)  Training Time (sec)  \
2   Logistic Regression (CV)     89.653333            68.690917   
3                 Linear SVM     89.180000             1.200566   
1    Multinomial Naive Bayes     86.266667             0.394525   
5  XGBoost (Simple Baseline

# SENTIMENT PREDICTION

In [10]:
def predict_sentiment(review, model):
    review_clean = clean(review)
    vect = vectorizer.transform([review_clean])

    # Convert to dense only for GaussianNB
    if isinstance(model, GaussianNB):
        vect_input = vect.toarray()
    else:
        vect_input = vect

    # Auto-detect model name
    model_name = type(model).__name__

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(vect_input)[0]
        sentiment = "Positive" if probs[1] > 0.5 else "Negative"
        confidence = max(probs) * 100
    else:  # LinearSVC without predict_proba
        score = model.decision_function(vect_input)[0]
        sentiment = "Positive" if score > 0 else "Negative"
        confidence = score

    print("\n===============================")
    print(f"Review    : {review}")
    print(f"Prediction: {sentiment}")
    print(f"Confidence: {confidence:.2f}")
    print(f"Predictor : {model_name}")
    print("===============================")


# SAMPLE TEST

In [11]:
sample_review = "The movie was absolutely brilliant with outstanding acting"

predict_sentiment(sample_review, lrcv)
predict_sentiment(sample_review, svc)
predict_sentiment(sample_review, xgb)
predict_sentiment(sample_review, gnb)
predict_sentiment(sample_review, mnb)
predict_sentiment(sample_review, rf)


Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 98.57
Predictor : LogisticRegressionCV

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 1.77
Predictor : LinearSVC

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 94.99
Predictor : XGBClassifier

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 100.00
Predictor : GaussianNB

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 80.46
Predictor : MultinomialNB

Review    : The movie was absolutely brilliant with outstanding acting
Prediction: Positive
Confidence: 76.00
Predictor : RandomForestClassifier


In [12]:
neg_rev="I was absolutely mesmerized by the cinematography—the entire film felt like a dream, but one of those dreams where you're perpetually searching for something you can't quite name. The two-hour runtime was a masterclass in patience testing, proving that the writers are true masters of the drawn-out slow burn. Every single scene was remarkably consistent in its ability to underwhelm. The twist, when it finally arrived, was so brilliantly foreshadowed by a complete lack of context that I can only conclude the director's goal was ambitious, baffling boredom. I left the theatre feeling lighter, having successfully shed the weight of high expectations. Go see it."

predict_sentiment(neg_rev, lrcv)
predict_sentiment(neg_rev, svc)
predict_sentiment(neg_rev, xgb)
predict_sentiment(neg_rev, gnb)
predict_sentiment(neg_rev, mnb)
predict_sentiment(neg_rev, rf)


Review    : I was absolutely mesmerized by the cinematography—the entire film felt like a dream, but one of those dreams where you're perpetually searching for something you can't quite name. The two-hour runtime was a masterclass in patience testing, proving that the writers are true masters of the drawn-out slow burn. Every single scene was remarkably consistent in its ability to underwhelm. The twist, when it finally arrived, was so brilliantly foreshadowed by a complete lack of context that I can only conclude the director's goal was ambitious, baffling boredom. I left the theatre feeling lighter, having successfully shed the weight of high expectations. Go see it.
Prediction: Negative
Confidence: 64.51
Predictor : LogisticRegressionCV

Review    : I was absolutely mesmerized by the cinematography—the entire film felt like a dream, but one of those dreams where you're perpetually searching for something you can't quite name. The two-hour runtime was a masterclass in patience testi

In [13]:
rev="""First thing first, I am not a Salman hater. I went into this movie expecting another good experience, because that's exactly what Bajrangi Bhaijaan did for me. I was disappointed.

This movie means well and wants us love our neighbors, but when the main man of the movie (this movie is 95% Salman) is unconvincing and at times, annoying, you just cant enjoy the movie. I liked what Salman did in Bajrangi. I didn't like what Salman did in this. I cant make up my mind on if he was trying too hard to be a convincing disabled childlike-man or not trying at all, because most of the times the camera was on Salman, he's making extremely silly and at times annoying distorted faces. I know kids and people with similar disabilities do that too, but what Salman was doing looked like he was making fun of such people, far from convincing us that he was one of them. Then again, making Salman play a disabled childlike man was never a good idea.

This movie can get emotional (trying too hard at times) so you might want to bring tissues along. The plot is pretty simple, Laxman aka Tubelight does little but wholesome things to increase his 'yakeen' which he believes will bring his brother back from war. The cinematography is breathtaking, but the war scenes were disappointing and poorly choreographed. The film's music was thoroughly enjoyable, especially Nach Meri Jaan and Radio.

The supporting cast was pretty good. The late Om Puri showed us his excellence for one last time, acing the role as Banne Chacha. Sohail Khan played his part well, nothing else to it. Zhu Zhu was great whenever she was great on-screen, so was little Matin Rey Tangu. Zeeshan Ayyub was convincingly good in his role. SRK's cameo was (no pun intended) magical."""

predict_sentiment(rev, lrcv)
predict_sentiment(rev, svc)
predict_sentiment(rev, xgb)
predict_sentiment(rev, gnb)
predict_sentiment(rev, mnb)
predict_sentiment(rev, rf)


Review    : First thing first, I am not a Salman hater. I went into this movie expecting another good experience, because that's exactly what Bajrangi Bhaijaan did for me. I was disappointed.

This movie means well and wants us love our neighbors, but when the main man of the movie (this movie is 95% Salman) is unconvincing and at times, annoying, you just cant enjoy the movie. I liked what Salman did in Bajrangi. I didn't like what Salman did in this. I cant make up my mind on if he was trying too hard to be a convincing disabled childlike-man or not trying at all, because most of the times the camera was on Salman, he's making extremely silly and at times annoying distorted faces. I know kids and people with similar disabilities do that too, but what Salman was doing looked like he was making fun of such people, far from convincing us that he was one of them. Then again, making Salman play a disabled childlike man was never a good idea.

This movie can get emotional (trying too hard