<a href="https://colab.research.google.com/github/sachinagnihotri/notebooks/blob/master/E01ML_sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import re
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load and combine both splits (total 50,000 samples)
# The dataset's predefined train/test partition is merged into single x / y
# arrays here; a new stratified 80/20 split is drawn from them later in this
# cell. NOTE: y_train / y_test defined on the next lines are rebound by that
# later train_test_split call.

# Vocabulary cap handed to imdb.load_data (per the Keras docs, only the
# num_words most frequent words keep their own index).
num_words = 30000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# Stack the two partitions along the sample axis.
x = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
print(f"Total samples: {len(x)}")  # 50,000


# Turn the integer-encoded reviews back into whitespace-separated text

word_index = imdb.get_word_index()

# Every vocabulary index is shifted by 3 so that ids 0-3 stay free for the
# special tokens registered right after (this mirrors imdb.load_data's
# index_from offset, which is 3 by default according to the Keras docs).
reverse_word_index = dict((rank + 3, token) for token, rank in word_index.items())
reverse_word_index.update({
    0: "<PAD>",
    1: "<START>",
    2: "<UNK>",
    3: "<UNUSED>",
})

def decode_review(sequence):
    """Map a sequence of token ids to a space-joined string.

    Ids that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    tokens = (reverse_word_index.get(token_id, '?') for token_id in sequence)
    return ' '.join(tokens)

x_text = list(map(decode_review, x))


# Text cleaning / preprocessing

def clean_text(text):
    """Normalise one review to lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. drop angle-bracket tokens such as ``<PAD>`` / ``<START>`` / ``<UNK>``;
      3. drop URL-like tokens that *start* with ``http`` or ``www``;
      4. replace every character that is not a-z or whitespace with a space;
      5. collapse runs of whitespace and trim both ends.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)  # special tokens like <pad>
    # The \b anchors restrict the match to tokens that begin with http/www.
    # Without them the pattern also fires in the middle of ordinary words
    # (e.g. "awww" was truncated to "a").
    text = re.sub(r"\bhttp\S+|\bwww\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)  # punctuation, digits, other symbols
    return re.sub(r"\s+", " ", text).strip()

# Apply clean_text to every decoded review; x_clean keeps the same order as
# x_text, so it still lines up with the labels in y.
x_clean = [clean_text(t) for t in x_text]


# Split 80/20 for training and testing
# stratify=y asks for the same label proportions in both parts and
# random_state=42 makes the split repeatable.
# NOTE: this rebinds y_train / y_test, replacing the arrays that
# imdb.load_data returned earlier in this cell.

x_train_text, x_test_text, y_train, y_test = train_test_split(
    x_clean, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {len(x_train_text)}, Test size: {len(x_test_text)}")


# TF-IDF feature extraction
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.

vectorizer = TfidfVectorizer(
    max_features=30000,      # cap on the number of feature columns
    stop_words='english',    # scikit-learn's built-in English stop-word list
    ngram_range=(1, 2),      # unigrams and bigrams
    sublinear_tf=True        # sublinear term-frequency scaling (see scikit-learn docs)
)
x_train_tfidf = vectorizer.fit_transform(x_train_text)
x_test_tfidf = vectorizer.transform(x_test_text)

# Sanity check: both matrices must have the same number of columns.
print("TF-IDF training shape:", x_train_tfidf.shape)
print("TF-IDF test shape:", x_test_tfidf.shape)

Total samples: 50000
Train size: 40000, Test size: 10000
TF-IDF training shape: (40000, 30000)
TF-IDF test shape: (10000, 30000)


In [None]:
!pip install optuna




In [None]:
import optuna
import time
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Shared accumulator: run_optuna appends one summary dict per tuned model and
# the comparison cell at the end builds a DataFrame from it. Re-running this
# cell empties the list.
results = []

# Helper function to run Optuna study for a given model
def run_optuna(name, objective_fn, n_trials=20, seed=42):
    """Tune one model with Optuna, refit it, and record its accuracy.

    Args:
        name: Label stored under "Model" in the summary row.
        objective_fn: Callable ``(trial, return_model=False, best_params=None)``.
            Called with a trial it must return the score to maximise; called
            with ``return_model=True`` and ``best_params`` it must return an
            estimator already fitted on the training data.
        n_trials: Number of Optuna trials to run.
        seed: Seed for the TPE sampler so repeated runs propose the same
            hyper-parameters. Pass ``None`` for an unseeded search.

    Side effects:
        Prints the best parameters, train/test accuracy and elapsed time, and
        stores a summary row in the module-level ``results`` list. A row that
        already exists for ``name`` is replaced, so re-running a model cell
        does not create duplicate rows in the comparison table.
    """
    print(f"\nOptimizing {name} with Optuna...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments. The timing covers search, refit and prediction.
    start_time = time.perf_counter()

    # TPE is Optuna's default single-objective sampler; it is created
    # explicitly only to be able to seed it.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective_fn, n_trials=n_trials)

    best_params = study.best_params
    # Second call of the objective in "refit" mode: returns a fitted estimator.
    best_model = objective_fn(trial=None, return_model=True, best_params=best_params)

    train_acc = accuracy_score(y_train, best_model.predict(x_train_tfidf))
    test_acc = accuracy_score(y_test, best_model.predict(x_test_tfidf))
    runtime = time.perf_counter() - start_time

    print(f"Best params: {best_params}")
    print(f"Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}, Time: {runtime:.2f}s")

    # Drop a stale row for this model (in place, so other references to the
    # list stay valid) before adding the fresh one.
    results[:] = [row for row in results if row["Model"] != name]
    results.append({
        "Model": name,
        "Best Params": best_params,
        "Train Acc": train_acc,
        "Test Acc": test_acc,
        "Runtime (s)": runtime
    })




In [None]:

def lr_objective(trial, return_model=False, best_params=None):
    """Optuna objective for logistic regression on the TF-IDF features.

    Args:
        trial: Optuna trial used to sample hyper-parameters; only accessed
            when ``best_params`` is None.
        return_model: When True, fit on the full training matrix and return
            the fitted estimator instead of a score.
        best_params: Dict with keys "C" and "solver"; when given, these values
            are used instead of sampling from ``trial``.

    Returns:
        Mean 3-fold cross-validated accuracy on the training data, or the
        fitted ``LogisticRegression`` when ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same range.
        C = trial.suggest_float("C", 0.01, 10, log=True)
        solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    else:
        C = best_params["C"]
        solver = best_params["solver"]

    model = LogisticRegression(C=C, solver=solver, max_iter=5000)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune logistic regression with 30 trials; run_optuna prints the outcome and
# stores the summary row in `results`.
run_optuna("Logistic Regression", lr_objective, n_trials=30)

[I 2025-11-10 16:14:45,682] A new study created in memory with name: no-name-6fa8359b-4ff4-42ca-8780-ae5a35c6cba7



Optimizing Logistic Regression with Optuna...


[I 2025-11-10 16:14:51,396] Trial 0 finished with value: 0.8907500417373941 and parameters: {'C': 0.6242694113380384, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8907500417373941.
[I 2025-11-10 16:14:53,449] Trial 1 finished with value: 0.8891000417353316 and parameters: {'C': 0.4760724663793627, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8907500417373941.
[I 2025-11-10 16:14:54,694] Trial 2 finished with value: 0.8979250748705346 and parameters: {'C': 1.4224610678169458, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8979250748705346.
[I 2025-11-10 16:14:57,929] Trial 3 finished with value: 0.895275062992519 and parameters: {'C': 1.0362876688166727, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8979250748705346.
[I 2025-11-10 16:14:59,399] Trial 4 finished with value: 0.8624750285773795 and parameters: {'C': 0.06387512335187631, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8979250748705346.
[I 2025-11-10 16:15:01,327] Trial 5 finished with value: 0

Best params: {'C': 4.3191189572541475, 'solver': 'lbfgs'}
Train acc: 0.9666, Test acc: 0.9000, Time: 43.92s


In [None]:

def svm_objective(trial, return_model=False, best_params=None):
    """Optuna objective for a linear SVM on the TF-IDF features.

    Args:
        trial: Optuna trial used to sample ``C``; only accessed when
            ``best_params`` is None.
        return_model: When True, fit on the full training matrix and return
            the fitted estimator instead of a score.
        best_params: Dict with key "C"; when given, its value is used instead
            of sampling from ``trial``.

    Returns:
        Mean 3-fold cross-validated accuracy on the training data, or the
        fitted ``LinearSVC`` when ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same range.
        C = trial.suggest_float("C", 0.01, 10, log=True)
    else:
        C = best_params["C"]

    # random_state pins the data shuffling LinearSVC may perform in its dual
    # solver, matching the fixed seed already used for the random forest.
    model = LinearSVC(C=C, max_iter=10000, random_state=42)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune the linear SVM with 30 trials; run_optuna prints the outcome and
# stores the summary row in `results`.
run_optuna("Linear SVM", svm_objective, n_trials=30)

[I 2025-11-10 16:15:29,630] A new study created in memory with name: no-name-e9780b9d-c335-4889-9089-1e62ec98d313



Optimizing Linear SVM with Optuna...


[I 2025-11-10 16:15:30,608] Trial 0 finished with value: 0.8752999985941602 and parameters: {'C': 0.016393648768661655}. Best is trial 0 with value: 0.8752999985941602.
[I 2025-11-10 16:15:31,465] Trial 1 finished with value: 0.871475013589004 and parameters: {'C': 0.012804846165754275}. Best is trial 0 with value: 0.8752999985941602.
[I 2025-11-10 16:15:32,943] Trial 2 finished with value: 0.8961750179947688 and parameters: {'C': 0.6498395842690288}. Best is trial 2 with value: 0.8961750179947688.
[I 2025-11-10 16:15:34,551] Trial 3 finished with value: 0.8993000829970502 and parameters: {'C': 0.14865756615186568}. Best is trial 3 with value: 0.8993000829970502.
[I 2025-11-10 16:15:35,681] Trial 4 finished with value: 0.8886000292350191 and parameters: {'C': 0.041884673521420376}. Best is trial 3 with value: 0.8993000829970502.
[I 2025-11-10 16:15:37,858] Trial 5 finished with value: 0.892174977990769 and parameters: {'C': 1.1613909118929242}. Best is trial 3 with value: 0.89930008299

Best params: {'C': 0.1936187997754335}
Train acc: 0.9570, Test acc: 0.9022, Time: 41.72s


In [None]:
def nb_objective(trial, return_model=False, best_params=None):
    """Optuna objective for multinomial naive Bayes on the TF-IDF features.

    Args:
        trial: Optuna trial used to sample ``alpha``; only accessed when
            ``best_params`` is None.
        return_model: When True, fit on the full training matrix and return
            the fitted estimator instead of a score.
        best_params: Dict with key "alpha"; when given, its value is used
            instead of sampling from ``trial``.

    Returns:
        Mean 3-fold cross-validated accuracy on the training data, or the
        fitted ``MultinomialNB`` when ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same range.
        alpha = trial.suggest_float("alpha", 0.01, 2, log=True)
    else:
        alpha = best_params["alpha"]

    model = MultinomialNB(alpha=alpha)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune multinomial naive Bayes with 30 trials; run_optuna prints the outcome
# and stores the summary row in `results`.
run_optuna("Multinomial NB", nb_objective, n_trials=30)

[I 2025-11-10 16:16:11,367] A new study created in memory with name: no-name-66ed1ff2-3d69-4cd1-af04-1a5b9491f57e



Optimizing Multinomial NB with Optuna...


[I 2025-11-10 16:16:11,661] Trial 0 finished with value: 0.8732749854669571 and parameters: {'alpha': 1.4150001968059296}. Best is trial 0 with value: 0.8732749854669571.
[I 2025-11-10 16:16:11,924] Trial 1 finished with value: 0.8725999967158321 and parameters: {'alpha': 0.013185218216178522}. Best is trial 0 with value: 0.8732749854669571.
[I 2025-11-10 16:16:12,197] Trial 2 finished with value: 0.8735499548430664 and parameters: {'alpha': 1.0007037790165085}. Best is trial 2 with value: 0.8735499548430664.
[I 2025-11-10 16:16:12,460] Trial 3 finished with value: 0.8737999685930351 and parameters: {'alpha': 0.41757631942118384}. Best is trial 3 with value: 0.8737999685930351.
[I 2025-11-10 16:16:12,724] Trial 4 finished with value: 0.8726749929660196 and parameters: {'alpha': 0.05093160781781281}. Best is trial 3 with value: 0.8737999685930351.
[I 2025-11-10 16:16:12,998] Trial 5 finished with value: 0.8734249648426601 and parameters: {'alpha': 1.129796029625658}. Best is trial 3 wit

Best params: {'alpha': 0.6280519951357411}
Train acc: 0.9008, Test acc: 0.8760, Time: 8.53s


In [None]:

def rf_objective(trial, return_model=False, best_params=None):
    """Optuna objective for a random forest on the TF-IDF features.

    With ``best_params`` left as None the four hyper-parameters are sampled
    from ``trial``; otherwise they are read from ``best_params``. The function
    returns the mean 3-fold cross-validated accuracy on the training data, or,
    when ``return_model`` is True, the forest fitted on the full training
    matrix.
    """
    if best_params is not None:
        tuned_keys = ("n_estimators", "max_depth", "min_samples_split", "max_features")
        hyper = {key: best_params[key] for key in tuned_keys}
    else:
        hyper = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 10, 70),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        }

    # NOTE(review): both the forest and cross_val_score request n_jobs=-1;
    # confirm that this nested parallelism does not oversubscribe the machine.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **hyper)

    if not return_model:
        fold_scores = cross_val_score(
            forest, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1
        )
        return fold_scores.mean()

    forest.fit(x_train_tfidf, y_train)
    return forest

# Only 5 trials here, presumably to bound runtime: in the recorded run below
# a single random-forest trial took roughly 40 s to 3 min.
run_optuna("Random Forest", rf_objective, n_trials=5)

[I 2025-11-10 16:27:17,475] A new study created in memory with name: no-name-a5dc910d-492f-45f7-9180-a6c8188284b3



Optimizing Random Forest with Optuna...


[I 2025-11-10 16:30:25,387] Trial 0 finished with value: 0.855949994820067 and parameters: {'n_estimators': 356, 'max_depth': 41, 'min_samples_split': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.855949994820067.
[I 2025-11-10 16:31:30,011] Trial 1 finished with value: 0.8644249648314104 and parameters: {'n_estimators': 562, 'max_depth': 65, 'min_samples_split': 6, 'max_features': 'log2'}. Best is trial 1 with value: 0.8644249648314104.
[I 2025-11-10 16:34:25,911] Trial 2 finished with value: 0.8568250204455201 and parameters: {'n_estimators': 316, 'max_depth': 46, 'min_samples_split': 15, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8644249648314104.
[I 2025-11-10 16:35:07,840] Trial 3 finished with value: 0.8586999398248792 and parameters: {'n_estimators': 943, 'max_depth': 29, 'min_samples_split': 19, 'max_features': 'log2'}. Best is trial 1 with value: 0.8644249648314104.
[I 2025-11-10 16:36:16,336] Trial 4 finished with value: 0.8653499467080197 and paramete

Best params: {'n_estimators': 695, 'max_depth': 69, 'min_samples_split': 19, 'max_features': 'log2'}
Train acc: 0.9746, Test acc: 0.8712, Time: 582.75s


In [None]:

# Build the comparison table, best test accuracy first

df_results = (
    pd.DataFrame(results)
    .sort_values("Test Acc", ascending=False)
)
print("\nModel Comparison:\n")
# Bare last expression so the notebook renders the frame as a rich table.
df_results


Model Comparison:



Unnamed: 0,Model,Best Params,Train Acc,Test Acc,Runtime (s)
1,Linear SVM,{'C': 0.1936187997754335},0.95705,0.9022,41.722884
0,Logistic Regression,"{'C': 4.3191189572541475, 'solver': 'lbfgs'}",0.9666,0.9,43.924511
2,Multinomial NB,{'alpha': 0.6280519951357411},0.90075,0.876,8.526561
3,Random Forest,"{'n_estimators': 695, 'max_depth': 69, 'min_sa...",0.9746,0.8712,582.750996
