<a href="https://colab.research.google.com/github/sachinagnihotri/Natural-Language-Processing/blob/main/E1ML_sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import nltk

# Download required NLTK resources
# Two tokenizer packages ('punkt', 'punkt_tab') and two tagger packages
# ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng') are requested.
# NOTE(review): presumably both spellings are fetched so the notebook works
# across NLTK releases that name these resources differently — confirm which
# ones the installed version actually needs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# The return value of this last call is what the cell displays as its output.
nltk.download('tagsets')  # optional, for reference


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [48]:
import numpy as np
import re
import time
import nltk
import optuna
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category
# is shown.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below; consider narrowing the filter (or removing it) while developing.
warnings.filterwarnings("ignore")


In [49]:
# Load IMDB dataset (num_words is forwarded to the loader) and pool the
# predefined train/test parts into one corpus; it is re-split further down.
num_words = 30000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
x, y = (
    np.concatenate(parts, axis=0)
    for parts in ((x_train, x_test), (y_train, y_test))
)

# Decode integer sequences: build the id -> token lookup table.
word_index = imdb.get_word_index()
# Every vocabulary id is shifted by 3 so that ids 0-3 stay free for the
# placeholder tokens registered below.
reverse_word_index = {rank + 3: token for token, rank in word_index.items()}
reverse_word_index.update({
    0: "<PAD>",
    1: "<START>",
    2: "<UNK>",
    3: "<UNUSED>",
})

def decode_review(sequence):
    """Convert a sequence of token ids into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    tokens = (reverse_word_index.get(token_id, '?') for token_id in sequence)
    return ' '.join(tokens)

# Decode every encoded review back into text.
x_text = list(map(decode_review, x))

# Text cleaning
def clean_text(text):
    """Normalise a raw review string for vectorisation.

    Steps, in order: lowercase; blank out ``<...>`` spans; blank out tokens
    starting with ``http`` or ``www``; blank out every character that is not
    a-z or whitespace; collapse whitespace runs and trim both ends.

    Returns the cleaned string (possibly empty).
    """
    blanked_patterns = (
        r"<.*?>",            # angle-bracket spans such as tags / placeholder tokens
        r"http\S+|www\S+",   # URL-like tokens
        r"[^a-z\s]",         # anything that is not a lowercase letter or whitespace
    )
    cleaned = text.lower()
    for pattern in blanked_patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Apply the cleaning step to every decoded review.
x_clean = list(map(clean_text, x_text))

# Split 80/20, stratified on the labels. Note that y_train / y_test are
# rebound here and from now on refer to this split, not the loader's.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x_clean,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# POS tagging transformer
class POSTagger(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each document to its POS-tag sequence.

    Every input string is tokenised with ``nltk.word_tokenize`` and replaced by
    the space-joined part-of-speech tags of its tokens, so a downstream text
    vectoriser can build n-grams over tags instead of words.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return one space-joined tag string per document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Documents to tag.

        Returns
        -------
        list of str
            Same length and order as ``X``.
        """
        tokenized_docs = [nltk.word_tokenize(doc) for doc in X]
        # Tag all documents in one batch call: nltk.pos_tag builds the tagger
        # on every invocation, whereas pos_tag_sents builds it once and yields
        # the same (word, tag) pairs per document.
        tagged_docs = nltk.pos_tag_sents(tokenized_docs)
        return [" ".join(tag for _, tag in tagged) for tagged in tagged_docs]



In [50]:
# Combine word TF-IDF and POS TF-IDF
word_vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True,
)
pos_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    sublinear_tf=True,
)

# Two parallel branches whose outputs are concatenated column-wise:
#   word_tfidf - TF-IDF over the cleaned review text
#   pos_tfidf  - TF-IDF over the POS-tag sequence of the same text
feature_union = FeatureUnion(transformer_list=[
    ('word_tfidf', word_vectorizer),
    ('pos_tfidf', Pipeline(steps=[
        ('pos', POSTagger()),
        ('tfidf', pos_vectorizer),
    ])),
])

# Fit on the training texts only, then reuse the fitted union for the test texts.
x_train_tfidf = feature_union.fit_transform(x_train_text)
x_test_tfidf = feature_union.transform(x_test_text)

print("TF-IDF + POS training shape:", x_train_tfidf.shape)
print("TF-IDF + POS test shape:", x_test_tfidf.shape)

TF-IDF + POS training shape: (40000, 30905)
TF-IDF + POS test shape: (10000, 30905)


In [51]:
import optuna
import time
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# One summary row (dict) per tuned model; filled by run_optuna.
results = []

# Helper function to run Optuna study for a given model
def run_optuna(name, objective_fn, n_trials=20, seed=42):
    """Tune one model with Optuna, refit it, and record its scores.

    Parameters
    ----------
    name : str
        Label used in the printed messages and in the ``results`` row.
    objective_fn : callable
        Dual-purpose objective. Called as ``objective_fn(trial)`` it must
        return the score to maximise; called as
        ``objective_fn(trial=None, return_model=True, best_params=...)`` it
        must return a model fitted with those parameters.
    n_trials : int, default 20
        Number of Optuna trials.
    seed : int or None, default 42
        Seed for the TPE sampler so that repeated runs propose the same
        hyperparameters. Pass ``None`` for an unseeded search.

    Side effects
    ------------
    Prints progress and stores a row in the module-level ``results`` list.
    The reported runtime covers the search plus the final refit and scoring.
    """
    print(f"\nOptimizing {name} with Optuna...")
    start_time = time.time()

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective_fn, n_trials=n_trials)

    best_params = study.best_params
    # Second calling mode of the objective: refit on the training matrix
    # using the best parameters found by the study.
    best_model = objective_fn(trial=None, return_model=True, best_params=best_params)

    train_acc = accuracy_score(y_train, best_model.predict(x_train_tfidf))
    test_acc = accuracy_score(y_test, best_model.predict(x_test_tfidf))
    runtime = time.time() - start_time

    print(f"Best params: {best_params}")
    print(f"Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}, Time: {runtime:.2f}s")

    # Drop any earlier row for this model so re-running a cell replaces the
    # entry instead of duplicating it in the comparison table.
    results[:] = [row for row in results if row["Model"] != name]
    results.append({
        "Model": name,
        "Best Params": best_params,
        "Train Acc": train_acc,
        "Test Acc": test_acc,
        "Runtime (s)": runtime
    })




In [52]:

def lr_objective(trial, return_model=False, best_params=None):
    """Optuna objective / model factory for logistic regression.

    Parameters
    ----------
    trial : optuna Trial or None
        Used to sample ``C`` (log scale, 0.01-10) and ``solver`` when
        ``best_params`` is None; ignored otherwise.
    return_model : bool, default False
        If True, fit the model on the full training matrix and return it
        instead of a score.
    best_params : dict or None
        When given, must contain ``"C"`` and ``"solver"``; these values are
        used instead of sampling.

    Returns
    -------
    float or LogisticRegression
        Mean 3-fold cross-validated accuracy, or the fitted model when
        ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-uniform range.
        C = trial.suggest_float("C", 0.01, 10, log=True)
        solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    else:
        C = best_params["C"]
        solver = best_params["solver"]

    model = LogisticRegression(C=C, solver=solver, max_iter=5000)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune and evaluate logistic regression (30 trials).
run_optuna(name="Logistic Regression", objective_fn=lr_objective, n_trials=30)

[I 2025-11-10 16:56:59,323] A new study created in memory with name: no-name-81cec304-400a-4b85-8367-0041b414cdc4



Optimizing Logistic Regression with Optuna...


[I 2025-11-10 16:57:01,917] Trial 0 finished with value: 0.7997250328738345 and parameters: {'C': 0.017100277399992744, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.7997250328738345.
[I 2025-11-10 16:57:05,776] Trial 1 finished with value: 0.8784000404719882 and parameters: {'C': 0.33940562784296335, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8784000404719882.
[I 2025-11-10 16:57:06,574] Trial 2 finished with value: 0.7867500572320067 and parameters: {'C': 0.010820202511856801, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8784000404719882.
[I 2025-11-10 16:57:11,765] Trial 3 finished with value: 0.8951000717420815 and parameters: {'C': 1.469035083307254, 'solver': 'liblinear'}. Best is trial 3 with value: 0.8951000717420815.
[I 2025-11-10 16:57:13,481] Trial 4 finished with value: 0.7979250553710221 and parameters: {'C': 0.015873569370860156, 'solver': 'liblinear'}. Best is trial 3 with value: 0.8951000717420815.
[I 2025-11-10 16:57:17,631] Trial 5 finished with va

Best params: {'C': 4.510838215623556, 'solver': 'liblinear'}
Train acc: 0.9753, Test acc: 0.8994, Time: 163.37s


In [53]:

def svm_objective(trial, return_model=False, best_params=None):
    """Optuna objective / model factory for a linear SVM.

    Parameters
    ----------
    trial : optuna Trial or None
        Used to sample ``C`` (log scale, 0.01-10) when ``best_params`` is
        None; ignored otherwise.
    return_model : bool, default False
        If True, fit the model on the full training matrix and return it
        instead of a score.
    best_params : dict or None
        When given, must contain ``"C"``; it is used instead of sampling.

    Returns
    -------
    float or LinearSVC
        Mean 3-fold cross-validated accuracy, or the fitted model when
        ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-uniform range.
        C = trial.suggest_float("C", 0.01, 10, log=True)
    else:
        C = best_params["C"]

    model = LinearSVC(C=C, max_iter=10000)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune and evaluate the linear SVM (30 trials).
run_optuna(name="Linear SVM", objective_fn=svm_objective, n_trials=30)

[I 2025-11-10 16:59:42,702] A new study created in memory with name: no-name-a31e7bbd-b8b4-48e3-b065-55b1d39f3559



Optimizing Linear SVM with Optuna...


[I 2025-11-10 16:59:43,960] Trial 0 finished with value: 0.8929000711143472 and parameters: {'C': 0.07149659901488574}. Best is trial 0 with value: 0.8929000711143472.
[I 2025-11-10 16:59:45,582] Trial 1 finished with value: 0.897650028621347 and parameters: {'C': 0.28446112963190556}. Best is trial 1 with value: 0.897650028621347.
[I 2025-11-10 16:59:47,678] Trial 2 finished with value: 0.8957249861200033 and parameters: {'C': 0.6477197327714506}. Best is trial 1 with value: 0.897650028621347.
[I 2025-11-10 16:59:55,275] Trial 3 finished with value: 0.8828500017285194 and parameters: {'C': 4.11999147957859}. Best is trial 1 with value: 0.897650028621347.
[I 2025-11-10 16:59:59,215] Trial 4 finished with value: 0.8866500029832379 and parameters: {'C': 2.420848738161974}. Best is trial 1 with value: 0.897650028621347.
[I 2025-11-10 17:00:00,402] Trial 5 finished with value: 0.8849500692294567 and parameters: {'C': 0.043869062061147296}. Best is trial 1 with value: 0.897650028621347.
[I 

Best params: {'C': 0.2231589182966078}
Train acc: 0.9623, Test acc: 0.9008, Time: 69.88s


In [54]:
def nb_objective(trial, return_model=False, best_params=None):
    """Optuna objective / model factory for multinomial naive Bayes.

    Parameters
    ----------
    trial : optuna Trial or None
        Used to sample ``alpha`` (log scale, 0.01-2) when ``best_params`` is
        None; ignored otherwise.
    return_model : bool, default False
        If True, fit the model on the full training matrix and return it
        instead of a score.
    best_params : dict or None
        When given, must contain ``"alpha"``; it is used instead of sampling.

    Returns
    -------
    float or MultinomialNB
        Mean 3-fold cross-validated accuracy, or the fitted model when
        ``return_model`` is True.
    """
    if best_params is None:
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-uniform range.
        alpha = trial.suggest_float("alpha", 0.01, 2, log=True)
    else:
        alpha = best_params["alpha"]

    model = MultinomialNB(alpha=alpha)
    if return_model:
        model.fit(x_train_tfidf, y_train)
        return model
    score = cross_val_score(model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1).mean()
    return score

# Tune and evaluate multinomial naive Bayes (30 trials).
run_optuna(name="Multinomial NB", objective_fn=nb_objective, n_trials=30)

[I 2025-11-10 17:00:52,586] A new study created in memory with name: no-name-f4b875e8-f1b7-400f-8f63-79aed8ee6825



Optimizing Multinomial NB with Optuna...


[I 2025-11-10 17:00:53,176] Trial 0 finished with value: 0.8716499748401914 and parameters: {'alpha': 0.06210005963628852}. Best is trial 0 with value: 0.8716499748401914.
[I 2025-11-10 17:00:53,754] Trial 1 finished with value: 0.8715499817148946 and parameters: {'alpha': 0.31272714368157606}. Best is trial 0 with value: 0.8716499748401914.
[I 2025-11-10 17:00:54,324] Trial 2 finished with value: 0.8716749754652072 and parameters: {'alpha': 0.05785470298416864}. Best is trial 2 with value: 0.8716749754652072.
[I 2025-11-10 17:00:54,781] Trial 3 finished with value: 0.8708499829639883 and parameters: {'alpha': 0.7145897430845739}. Best is trial 2 with value: 0.8716749754652072.
[I 2025-11-10 17:00:55,185] Trial 4 finished with value: 0.8711999617149572 and parameters: {'alpha': 0.01133299535167289}. Best is trial 2 with value: 0.8716749754652072.
[I 2025-11-10 17:00:55,591] Trial 5 finished with value: 0.871524966090254 and parameters: {'alpha': 0.02035587294010374}. Best is trial 2 wi

Best params: {'alpha': 0.24085125887286546}
Train acc: 0.9003, Test acc: 0.8751, Time: 13.47s


In [55]:

def rf_objective(trial, return_model=False, best_params=None):
    """Optuna objective / model factory for a random forest.

    With ``best_params=None`` the four hyperparameters (``n_estimators``,
    ``max_depth``, ``min_samples_split``, ``max_features``) are sampled from
    ``trial``; otherwise they are read from ``best_params``.

    Returns the mean 3-fold cross-validated accuracy, or, when
    ``return_model`` is True, the forest fitted on the full training matrix.
    """
    if best_params is None:
        # Dict entries are evaluated top to bottom, so the suggestions are
        # requested from the trial in this order.
        hyperparams = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 10, 70),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        }
    else:
        hyperparams = {
            "n_estimators": best_params["n_estimators"],
            "max_depth": best_params["max_depth"],
            "min_samples_split": best_params["min_samples_split"],
            "max_features": best_params["max_features"],
        }

    model = RandomForestClassifier(random_state=42, n_jobs=-1, **hyperparams)

    if not return_model:
        fold_scores = cross_val_score(
            model, x_train_tfidf, y_train, cv=3, scoring="accuracy", n_jobs=-1
        )
        return fold_scores.mean()

    model.fit(x_train_tfidf, y_train)
    return model

# Tune and evaluate the random forest; only 5 trials are requested here,
# versus 30 for the other models.
run_optuna(name="Random Forest", objective_fn=rf_objective, n_trials=5)

[I 2025-11-10 17:01:06,069] A new study created in memory with name: no-name-ae5e6fc6-46c7-4494-bf2e-43813810f354



Optimizing Random Forest with Optuna...


[I 2025-11-10 17:02:51,498] Trial 0 finished with value: 0.8467249310601296 and parameters: {'n_estimators': 992, 'max_depth': 44, 'min_samples_split': 17, 'max_features': 'log2'}. Best is trial 0 with value: 0.8467249310601296.
[I 2025-11-10 17:04:03,081] Trial 1 finished with value: 0.8427999398050048 and parameters: {'n_estimators': 949, 'max_depth': 31, 'min_samples_split': 18, 'max_features': 'log2'}. Best is trial 0 with value: 0.8467249310601296.
[I 2025-11-10 17:05:06,436] Trial 2 finished with value: 0.8416249254288953 and parameters: {'n_estimators': 500, 'max_depth': 47, 'min_samples_split': 15, 'max_features': 'log2'}. Best is trial 0 with value: 0.8467249310601296.
[I 2025-11-10 17:06:33,151] Trial 3 finished with value: 0.8462249954328954 and parameters: {'n_estimators': 507, 'max_depth': 68, 'min_samples_split': 15, 'max_features': 'log2'}. Best is trial 0 with value: 0.8467249310601296.
[I 2025-11-10 17:07:20,054] Trial 4 finished with value: 0.8404749566766768 and para

Best params: {'n_estimators': 992, 'max_depth': 44, 'min_samples_split': 17, 'max_features': 'log2'}
Train acc: 0.9651, Test acc: 0.8475, Time: 445.32s


In [56]:

# Results with TF-IDF POSTAGS

# One row per tuned model, best test accuracy first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values("Test Acc", ascending=False)
print("\nModel Comparison:\n")
df_results


Model Comparison:



Unnamed: 0,Model,Best Params,Train Acc,Test Acc,Runtime (s)
1,Linear SVM,{'C': 0.2231589182966078},0.962275,0.9008,69.876518
0,Logistic Regression,"{'C': 4.510838215623556, 'solver': 'liblinear'}",0.9753,0.8994,163.367326
2,Multinomial NB,{'alpha': 0.24085125887286546},0.90035,0.8751,13.469675
3,Random Forest,"{'n_estimators': 992, 'max_depth': 44, 'min_sa...",0.965125,0.8475,445.319406
