In [25]:

#  Data Import & Overview


import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Download NLTK resources
nltk.download('stopwords')

# Limit to top 8000 most common words
NUM_WORDS = 8000
(raw_train_X, raw_train_y), (raw_test_X, raw_test_y) = imdb.load_data(num_words=NUM_WORDS)

# Merge train/test so we can shuffle/split ourselves
X_all = np.concatenate((raw_train_X, raw_test_X), axis=0)
y_all = np.concatenate((raw_train_y, raw_test_y), axis=0)

print("Total reviews:", len(X_all))
print("Label distribution:", pd.Series(y_all).value_counts())

# Create reverse word mapping
word_map = imdb.get_word_index()
idx_to_word = {idx + 3: word for word, idx in word_map.items()}
idx_to_word[0] = "<PAD>"
idx_to_word[1] = "<START>"
idx_to_word[2] = "<UNK>"

def decode_review(encoded_review):
    return " ".join([idx_to_word.get(i, "?") for i in encoded_review])

# Example decoded review
print("\nSample Review:\n", decode_review(X_all[0]))
print("Label:", y_all[0])


# Preprocessing Function


stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess(review_text):
    text = review_text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

# Decode & preprocess
decoded_reviews = [decode_review(r) for r in X_all]
processed_reviews = [preprocess(r) for r in decoded_reviews]

# Split into train/test sets
train_X, test_X, train_y, test_y = train_test_split(
    processed_reviews, y_all, test_size=0.5, random_state=42
)


#  Feature Extraction


vectorizer = CountVectorizer(max_features=6000, ngram_range=(1, 2))
train_vectors = vectorizer.fit_transform(train_X)
test_vectors = vectorizer.transform(test_X)

print("Feature matrix shape:", train_vectors.shape)

#  Train & Evaluate Models


models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Bernoulli NB": BernoulliNB(),
    "SVM": LinearSVC()
}

metrics_data = []

for name, model in models.items():
    model.fit(train_vectors, train_y)
    preds = model.predict(test_vectors)
    acc = accuracy_score(test_y, preds)
    prec = precision_score(test_y, preds)
    rec = recall_score(test_y, preds)
    f1 = f1_score(test_y, preds)
    metrics_data.append([name, acc, prec, rec, f1])
    print(f"\n{name} Confusion Matrix:\n", confusion_matrix(test_y, preds))

# Metrics table
metrics_df = pd.DataFrame(metrics_data, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
print("\nModel Comparison:\n", metrics_df)


#  Pipeline Approach


from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("vectorizer", CountVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=300))
])

pipeline.fit(train_X, train_y)
print("\nPipeline Accuracy:", accuracy_score(test_y, pipeline.predict(test_X)))


#  Manual Inference


custom_reviews = [
    "I absolutely loved this movie! The performances were breathtaking.",
    "The movie was too long and extremely boring.",
    "A masterpiece! Definitely one of the best films I've ever seen.",
    "Terrible plot and bad acting, I do not recommend.",
    "An okay film, some parts were great but others were dull."
]

custom_reviews_processed = [preprocess(r) for r in custom_reviews]
predictions = pipeline.predict(custom_reviews_processed)

for review, label in zip(custom_reviews, predictions):
    print(f"\nReview: {review}")
    print("Predicted Sentiment:", "Positive" if label == 1 else "Negative")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total reviews: 50000
Label distribution: 1    25000
0    25000
Name: count, dtype: int64

Sample Review:
 <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Confusion Matrix:
 [[10750  1685]
 [ 1807 10758]]

Bernoulli NB Confusion Matrix:
 [[10525  1910]
 [ 1792 10773]]

SVM Confusion Matrix:
 [[10433  2002]
 [ 2271 10294]]

Model Comparison:
                  Model  Accuracy  Precision    Recall        F1
0  Logistic Regression   0.86032   0.864582  0.856188  0.860365
1         Bernoulli NB   0.85192   0.849405  0.857382  0.853375
2                  SVM   0.82908   0.837183  0.819260  0.828124


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Pipeline Accuracy: 0.86012

Review: I absolutely loved this movie! The performances were breathtaking.
Predicted Sentiment: Positive

Review: The movie was too long and extremely boring.
Predicted Sentiment: Negative

Review: A masterpiece! Definitely one of the best films I've ever seen.
Predicted Sentiment: Positive

Review: Terrible plot and bad acting, I do not recommend.
Predicted Sentiment: Negative

Review: An okay film, some parts were great but others were dull.
Predicted Sentiment: Negative
