In [30]:

# Environment & Imports

import re
import string
import random
import numpy as np
import pandas as pd

import nltk
# Download required NLTK resources (including punkt_tab fix)
nltk.download('punkt')
nltk.download('punkt_tab')  # <-- Fix for new NLTK versions
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.datasets import imdb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
import joblib

# Reproducibility
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Step 1: Load IMDB data
VOCAB_SIZE = 20000  # Keep top 20k words
(encoded_train_X, train_y), (encoded_test_X, test_y) = imdb.load_data(num_words=VOCAB_SIZE)

# Build index -> word mapping
raw_word_index = imdb.get_word_index()
index_to_word = {idx + 3: w for w, idx in raw_word_index.items()}
index_to_word[0] = "<PAD>"
index_to_word[1] = "<START>"
index_to_word[2] = "<UNK>"
index_to_word[3] = "<UNUSED>"

def decode_sequence(seq):
    """Convert list of word indices back into review text."""
    return " ".join(index_to_word.get(i, "?") for i in seq)

# Dataset info
print("Train examples:", len(encoded_train_X))
print("Test examples:", len(encoded_test_X))
print("Sample decoded review:\n", decode_sequence(encoded_train_X[0])[:400])
print("Label:", train_y[0])


Train examples: 25000
Test examples: 25000
Sample decoded review:
 <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it
Label: 1


In [32]:
# Step 2: Clean and lemmatize text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
punct_pattern = re.compile(f"[{re.escape(string.punctuation)}]")

def clean_and_lemmatize(text):
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)  # remove HTML tags
    text = punct_pattern.sub(" ", text)   # remove punctuation
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

# Decode and clean
X_train_text = [clean_and_lemmatize(decode_sequence(s)) for s in encoded_train_X]
X_test_text  = [clean_and_lemmatize(decode_sequence(s)) for s in encoded_test_X]

print("Processed sample:", X_train_text[0][:200], "...")


Processed sample: film brilliant casting location scenery story direction everyone really suited part played could imagine robert amazing actor director father came scottish island loved fact real connection film witty ...


In [33]:
# Step 3: Convert to TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(
    max_features=12000,
    ngram_range=(1,2),
    sublinear_tf=True,
    max_df=0.95,
    min_df=5
)

X_train_vec = tfidf_vectorizer.fit_transform(X_train_text)
X_test_vec  = tfidf_vectorizer.transform(X_test_text)

print("Train features:", X_train_vec.shape)
print("Test  features:", X_test_vec.shape)


Train features: (25000, 12000)
Test  features: (25000, 12000)


In [34]:
# Step 4: Train models and compare
def compute_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred)
    }

models = {
    "LogisticRegression": LogisticRegression(max_iter=400, C=1.0, random_state=RANDOM_STATE, solver="saga"),
    "ComplementNB": ComplementNB(),
    "LinearSVM(SGD)": SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3, random_state=RANDOM_STATE)
}

results = []
for name, clf in models.items():
    clf.fit(X_train_vec, train_y)
    preds = clf.predict(X_test_vec)
    m = compute_metrics(test_y, preds)
    print(f"\n{name} report:\n", classification_report(test_y, preds, digits=4))
    print("Confusion matrix:\n", confusion_matrix(test_y, preds))
    results.append((name, m["accuracy"], m["precision"], m["recall"], m["f1"]))

# Results table
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
print("\nSummary:\n", results_df)



LogisticRegression report:
               precision    recall  f1-score   support

           0     0.8927    0.8858    0.8893     12500
           1     0.8867    0.8935    0.8901     12500

    accuracy                         0.8897     25000
   macro avg     0.8897    0.8897    0.8897     25000
weighted avg     0.8897    0.8897    0.8897     25000

Confusion matrix:
 [[11073  1427]
 [ 1331 11169]]

ComplementNB report:
               precision    recall  f1-score   support

           0     0.8516    0.8698    0.8606     12500
           1     0.8669    0.8484    0.8576     12500

    accuracy                         0.8591     25000
   macro avg     0.8592    0.8591    0.8591     25000
weighted avg     0.8592    0.8591    0.8591     25000

Confusion matrix:
 [[10872  1628]
 [ 1895 10605]]

LinearSVM(SGD) report:
               precision    recall  f1-score   support

           0     0.8916    0.8857    0.8886     12500
           1     0.8864    0.8923    0.8894     12500

    a

In [35]:
# Step 5: CV on Logistic Regression
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
clf_for_cv = LogisticRegression(max_iter=400, C=1.0, solver="saga", random_state=RANDOM_STATE)

cv_results = cross_validate(
    clf_for_cv, X_train_vec, train_y, cv=cv,
    scoring=["accuracy", "precision", "recall", "f1"],
    n_jobs=-1
)
print("CV mean scores:")
print({k: np.mean(cv_results[k]) for k in cv_results if k.startswith("test_")})


CV mean scores:
{'test_accuracy': np.float64(0.8901199999999999), 'test_precision': np.float64(0.8799977847852617), 'test_recall': np.float64(0.9034400000000001), 'test_f1': np.float64(0.8915616199747932)}


In [36]:
# Step 6: Pipeline + parameter tuning
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(sublinear_tf=True)),
    ("clf", LogisticRegression(max_iter=400, solver="saga", random_state=RANDOM_STATE))
])

param_grid = {
    "tfidf__max_features": [5000, 10000],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.5, 1.0, 2.0]
}

grid = GridSearchCV(pipe, param_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=1)
grid.fit(X_train_text, train_y)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

best_pipeline = grid.best_estimator_


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params: {'clf__C': 2.0, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best CV accuracy: 0.8894799132498058


In [37]:
# Step 7: Evaluate tuned pipeline
test_preds = best_pipeline.predict(X_test_text)
print("Classification report:\n", classification_report(test_y, test_preds, digits=4))
print("Confusion matrix:\n", confusion_matrix(test_y, test_preds))
print("Final metrics:", compute_metrics(test_y, test_preds))


Classification report:
               precision    recall  f1-score   support

           0     0.8897    0.8858    0.8877     12500
           1     0.8863    0.8902    0.8883     12500

    accuracy                         0.8880     25000
   macro avg     0.8880    0.8880    0.8880     25000
weighted avg     0.8880    0.8880    0.8880     25000

Confusion matrix:
 [[11072  1428]
 [ 1372 11128]]
Final metrics: {'accuracy': 0.888, 'precision': 0.8862695125836254, 'recall': 0.89024, 'f1': 0.888250319284802}


In [38]:
# Step 8A: sample from test set
sample_indices = [10, 50, 200, 1000, 2500]
for i in sample_indices:
    print("\n--- Test Review", i, "---")
    print("Raw:", decode_sequence(encoded_test_X[i])[:300], "...")
    pred = best_pipeline.predict([X_test_text[i]])[0]
    print("Predicted:", "Positive" if pred == 1 else "Negative")
    print("Actual:   ", "Positive" if test_y[i] == 1 else "Negative")

# Step 8B: custom reviews
custom_reviews = [
    "I loved every minute of this film. The acting was brilliant.",
    "Boring, slow, and unoriginal. I nearly fell asleep.",
    "A solid movie with a few great moments, but overall average.",
    "One of the best films I've seen in years — highly recommended!",
    "Bad script and worse acting. Don't waste your time."
]
processed_custom = [clean_and_lemmatize(r) for r in custom_reviews]
preds_custom = best_pipeline.predict(processed_custom)
for r, p in zip(custom_reviews, preds_custom):
    print("\nReview:", r)
    print("Prediction:", "Positive" if p == 1 else "Negative")



--- Test Review 10 ---
Raw: <START> inspired by hitchcock's strangers on a train concept of two men swapping murders in exchange for getting rid of the two people messing up their lives throw <UNK> from the train is an original and very inventive comedy take on the idea it's a credit to danny devito that he both wrote and star ...
Predicted: Positive
Actual:    Positive

--- Test Review 50 ---
Raw: <START> first of i should point out that i used to love <UNK> the <UNK> as a child and i really enjoyed the movie even though i am in my 20's br br but this movie was so bad i was ashamed to have been a fan in my youth br br ok ok i know this is a movie for kids and isn't aimed at people like me any ...
Predicted: Negative
Actual:    Negative

--- Test Review 200 ---
Raw: <START> don't be fooled this isn't yet another tired example of the girls from outer space pretending to be the french ski team come to earth to collect as much sperm as possible genre though the synopsis may suggest other

In [39]:
# Step 9: Save best pipeline for future use
joblib.dump(best_pipeline, "imdb_best_pipeline.pkl")
print("Saved pipeline as imdb_best_pipeline.pkl")


Saved pipeline as imdb_best_pipeline.pkl
