In [1]:
#!pip install mlflow

In [3]:
import numpy as np
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import nltk
from nltk.corpus import stopwords
from flask import Flask, request, jsonify
import mlflow
import mlflow.sklearn
import re
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so the
# membership checks done during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Load the Keras IMDB reviews as (features, labels) pairs for the train and test splits.
# num_words=10000 caps the vocabulary; per the Keras docs the most frequent words are kept.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

In [9]:
# Word <-> id lookup tables: `word_index` maps word -> integer id as returned by
# Keras; `reverse_word_index` inverts it to id -> word for decoding reviews.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

In [11]:
# Decode reviews
def decode_review(encoded_review):
    """Turn a sequence of integer token ids back into a space-separated string.

    Each id is shifted down by 3 before being looked up in the module-level
    `reverse_word_index`; ids with no entry are rendered as '?'.
    NOTE(review): the offset of 3 presumably matches the reserved ids of the
    Keras IMDB encoding (default `index_from=3`) — confirm against the loader.

    Args:
        encoded_review: iterable of integer token ids.

    Returns:
        The decoded words joined by single spaces ('' for an empty review).
    """
    words = []
    for token_id in encoded_review:
        words.append(reverse_word_index.get(token_id - 3, '?'))
    return ' '.join(words)

In [13]:
# Decode and preprocess
def preprocess(text):
    """Normalise a review for vectorisation.

    Lower-cases the text, replaces every non-letter character with a space,
    splits on whitespace, and drops any token found in the module-level
    `stop_words` collection.

    Args:
        text: raw review text.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    kept = filter(lambda word: word not in stop_words, letters_only.split())
    return ' '.join(kept)

# Decode every integer-encoded review back to text, then clean it, for both splits.
decoded_train = list(map(preprocess, map(decode_review, x_train)))
decoded_test = list(map(preprocess, map(decode_review, x_test)))

In [15]:
# MLflow tracking and hyperparameter tuning.
# Every run started below is recorded under this experiment.
mlflow.set_experiment("Sentiment_Analysis_Movie_Reviews")

# TF-IDF features feeding a multinomial Naive Bayes classifier.
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('nb', MultinomialNB())
])

# Search space; keys use the scikit-learn `<step>__<param>` convention.
params = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'nb__alpha': [0.5, 1.0]
}

with mlflow.start_run():
    # 3-fold cross-validated grid search over `params`, selecting on accuracy.
    grid = GridSearchCV(model_pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
    grid.fit(decoded_train, y_train)

    # Evaluate the selected model on the test split.
    test_preds = grid.predict(decoded_test)
    acc = np.mean(test_preds == y_test)

    # Keep the combined `best_params` entry, and additionally log each
    # hyperparameter under its own key so runs can be filtered and compared
    # per parameter in the MLflow UI.
    mlflow.log_param("best_params", grid.best_params_)
    mlflow.log_params(grid.best_params_)

    # "accuracy" is the test-split accuracy; the cross-validation score that
    # actually drove model selection is logged separately.
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("best_cv_accuracy", grid.best_score_)

    mlflow.sklearn.log_model(
        sk_model=grid.best_estimator_,
        artifact_path="model",  # folder name inside the run's artifacts
        registered_model_name="SentimentClassifier",  # human-readable name in the model registry
    )

    # NOTE: `acc` printed as "Best accuracy" is the test-split accuracy of the
    # selected model, not the cross-validation score.
    print(f"\n✅ Best accuracy: {acc}")
    print(f"✅ Best parameters: {grid.best_params_}")
    print("\n📊 Classification Report:")
    print(classification_report(y_test, test_preds))

2025/07/05 18:37:34 INFO mlflow.tracking.fluent: Experiment with name 'Sentiment_Analysis_Movie_Reviews' does not exist. Creating a new experiment.


Fitting 3 folds for each of 8 candidates, totalling 24 fits





✅ Best accuracy: 0.85752
✅ Best parameters: {'nb__alpha': 1.0, 'tfidf__max_df': 0.9, 'tfidf__ngram_range': (1, 2)}

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86     12500
           1       0.86      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



Successfully registered model 'SentimentClassifier'.
Created version '1' of model 'SentimentClassifier'.


In [17]:
# Persist the best pipeline found by the grid search to a local pickle file.
joblib.dump(value=grid.best_estimator_, filename="best_sentiment_model.pkl")

['best_sentiment_model.pkl']