In [1]:
import mlflow
mlflow.set_tracking_uri("file:../mlruns")

mlflow.set_experiment("Flipkart_Sentiment_Model_Comparison")

print("✅ MLflow initialized")


  from .autonotebook import tqdm as notebook_tqdm


✅ MLflow initialized


  return FileStore(store_uri, store_uri)


In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

print("✅ Libraries imported")


✅ Libraries imported


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv("../data/data.csv")

df.columns = df.columns.str.strip()
print("Dataset shape:", df.shape)
print(df.columns.tolist())


Dataset shape: (8518, 8)
['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes', 'Down Votes', 'Month', 'Review text', 'Ratings']


In [4]:
# Drop Reviewer Name
df.drop(columns=["Reviewer Name"], inplace=True)

# Handle Review Title
df["Review Title"].fillna("", inplace=True)

# Drop rows with missing Review text
df = df.dropna(subset=["Review text"])

# Combine Review Title + Review text
df["full_review"] = (df["Review Title"] + " " + df["Review text"]).str.strip()

# Handle Place of Review
df["Place of Review"].fillna("Unknown", inplace=True)

# Handle votes
df["Up Votes"].fillna(0, inplace=True)
df["Down Votes"].fillna(0, inplace=True)

# Drop Month column
df.drop(columns=["Month"], inplace=True)

# Drop original text columns
df.drop(columns=["Review Title", "Review text"], inplace=True)

print("Columns after preparation:")
print(df.columns.tolist())
print("Dataset size after preparation:", df.shape)


Columns after preparation:
['Place of Review', 'Up Votes', 'Down Votes', 'Ratings', 'full_review']
Dataset size after preparation: (8510, 5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Review Title"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Place of Review"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [5]:
def create_sentiment(rating):
    if rating <= 2:
        return 0
    elif rating >= 4:
        return 1
    else:
        return 2

df["sentiment"] = df["Ratings"].apply(create_sentiment)

df_binary = df[df["sentiment"] != 2].copy()

print("Binary dataset size:", len(df_binary))
print(df_binary["sentiment"].value_counts())


Binary dataset size: 7895
sentiment
1    6823
0    1072
Name: count, dtype: int64


In [6]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"read more", "", text, flags=re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"@\w+|#\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def preprocess_text(text, use_lemmatization=True):
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]

    if use_lemmatization:
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
    else:
        tokens = [stemmer.stem(w) for w in tokens]

    return " ".join(tokens)

print("✅ Preprocessing functions ready")


✅ Preprocessing functions ready


In [7]:
df_binary["cleaned_review"] = df_binary["full_review"].apply(
    lambda x: preprocess_text(x, use_lemmatization=True)
)

df_binary = df_binary[df_binary["cleaned_review"].str.len() > 0]

print("Dataset size after removing empty reviews:", len(df_binary))


Dataset size after removing empty reviews: 7895


In [8]:
X = df_binary["cleaned_review"]
y = df_binary["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 6316
Test size: 1579


In [9]:
def run_experiment(model, vectorizer, model_name, feature_type, params):
    import joblib
    import os

    with mlflow.start_run(run_name=f"{model_name}_{feature_type}"):

        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        mlflow.log_param("model", model_name)
        mlflow.log_param("feature", feature_type)

        for k, v in params.items():
            mlflow.log_param(k, v)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # ✅ LOG ARTIFACT
        os.makedirs("artifacts", exist_ok=True)
        joblib.dump(vectorizer, "artifacts/vectorizer.pkl")
        mlflow.log_artifact("artifacts/vectorizer.pkl")

        print(f"{model_name:<18} {feature_type:<6} "
              f"Accuracy={acc:.6f} | F1={f1:.6f}")


In [10]:
run_experiment(
    LogisticRegression(max_iter=1000, random_state=42),
    CountVectorizer(max_features=5000, ngram_range=(1,2)),
    "Logistic Regression", "BoW",
    {"max_features": 5000}
)


Logistic Regression BoW    Accuracy=0.925269 | F1=0.957736


In [11]:
# 1. Logistic Regression + BoW
run_experiment(
    LogisticRegression(max_iter=1000, random_state=42),
    CountVectorizer(max_features=5000, ngram_range=(1,2)),
    "Logistic Regression", "BoW",
    {"max_features": 5000}
)

# 2. SVM + TF-IDF
run_experiment(
    LinearSVC(C=1.0),
    TfidfVectorizer(max_features=5000),
    "SVM", "TF-IDF",
    {"max_features": 5000, "C": 1.0}
)

# 3. Naive Bayes + BoW
run_experiment(
    MultinomialNB(),
    CountVectorizer(max_features=5000),
    "Naive Bayes", "BoW",
    {"max_features": 5000}
)

# 4. XGBoost + TF-IDF
run_experiment(
    XGBClassifier(n_estimators=100, random_state=42, eval_metric="logloss"),
    TfidfVectorizer(max_features=5000),
    "XGBoost", "TF-IDF",
    {"n_estimators": 100}
)

# 5. Random Forest + TF-IDF
run_experiment(
    RandomForestClassifier(n_estimators=100, random_state=42),
    TfidfVectorizer(max_features=5000),
    "Random Forest", "TF-IDF",
    {"n_estimators": 100}
)

# 6. Logistic Regression + TF-IDF
run_experiment(
    LogisticRegression(max_iter=1000, random_state=42),
    TfidfVectorizer(max_features=5000, ngram_range=(1,2)),
    "Logistic Regression", "TF-IDF",
    {"max_features": 5000}
)

# 7. Random Forest + BoW
run_experiment(
    RandomForestClassifier(n_estimators=100, random_state=42),
    CountVectorizer(max_features=5000),
    "Random Forest", "BoW",
    {"n_estimators": 100}
)

# 8. Naive Bayes + TF-IDF
run_experiment(
    MultinomialNB(),
    TfidfVectorizer(max_features=5000),
    "Naive Bayes", "TF-IDF",
    {"max_features": 5000}
)


Logistic Regression BoW    Accuracy=0.925269 | F1=0.957736
SVM                TF-IDF Accuracy=0.920203 | F1=0.954709
Naive Bayes        BoW    Accuracy=0.915136 | F1=0.952279
XGBoost            TF-IDF Accuracy=0.917669 | F1=0.953505
Random Forest      TF-IDF Accuracy=0.916403 | F1=0.952790
Logistic Regression TF-IDF Accuracy=0.918303 | F1=0.954272
Random Forest      BoW    Accuracy=0.913870 | F1=0.951498
Naive Bayes        TF-IDF Accuracy=0.887904 | F1=0.938902


In [12]:
import mlflow
print("Tracking URI:", mlflow.get_tracking_uri())


Tracking URI: file:../mlruns


In [13]:
import mlflow.sklearn

with mlflow.start_run(run_name="LogReg_BoW_Register"):

    # Vectorizer
    bow = CountVectorizer(max_features=5000, ngram_range=(1,2))
    X_train_bow = bow.fit_transform(X_train)
    X_test_bow = bow.transform(X_test)

    # Train BEST model
    lr_bow = LogisticRegression(max_iter=1000, random_state=42)
    lr_bow.fit(X_train_bow, y_train)

    # Evaluate
    y_pred = lr_bow.predict(X_test_bow)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log metrics
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("feature", "BoW")
    mlflow.log_param("max_features", 5000)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # ✅ REGISTER MODEL
    mlflow.sklearn.log_model(
        sk_model=lr_bow,
        artifact_path="model",
        registered_model_name="Flipkart_Sentiment_Model"
    )



  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
  return FileStore(store_uri)
Registered model 'Flipkart_Sentiment_Model' already exists. Creating a new version of this model...
Created version '3' of model 'Flipkart_Sentiment_Model'.


In [14]:
# STEP 4.1 – Hyperparameter sweep for max_features (ONE model only)

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
import mlflow

max_features_list = [1000, 3000, 5000, 8000]

for mf in max_features_list:
    with mlflow.start_run(run_name=f"LogReg_BoW_maxfeat_{mf}"):

        vectorizer = CountVectorizer(max_features=mf, ngram_range=(1,2))
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train_vec, y_train)

        y_pred = model.predict(X_test_vec)
        f1 = f1_score(y_test, y_pred)

        # Log ONLY what matters for tuning
        mlflow.log_param("model", "LogisticRegression")
        mlflow.log_param("feature", "BoW")
        mlflow.log_param("max_features", mf)
        mlflow.log_metric("f1_score", f1)

        print(f"max_features={mf} → F1={f1:.4f}")


max_features=1000 → F1=0.9580
max_features=3000 → F1=0.9588
max_features=5000 → F1=0.9577
max_features=8000 → F1=0.9593
