# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score 

# Enable autologging
mlflow.sklearn.autolog()

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', module="mlflow.models.model")

# Loading Train, Validation, Test Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('validation.csv')

X_train = train['cleaned_text'].astype(str)
X_test = test['cleaned_text'].astype(str)
X_val = val['cleaned_text'].astype(str)

y_train = (train['label'] == "spam").astype(int)
y_test = (test['label'] == "spam").astype(int)
y_val = (val['label'] == "spam").astype(int)

# Training

Looking at literature we choose the following classifiers and features for our task:

**Classifiers:** Naive Bayes, Decision Tree, Logistic Regression, SVM, Random Forest, XGBoost.

**Features:** CountVectorizer + TF-IDF.

https://indusedu.org/pdfs/IJREISS/IJREISS_4388_87139.pdf?form=MG0AV3

https://radimrehurek.com/data_science_python/

## Creating Classifiers, Params

In [3]:
classifiers = {
    "NaiveBayes": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "DecisionTree": DecisionTreeClassifier()
}

param_grids = {
    "NaiveBayes": {'clf__alpha': [0.1, 0.5, 1.0]},
    "LogisticRegression": {'clf__C': [0.1, 1, 10]},
    "RandomForest": {'clf__n_estimators': [50, 100], 'clf__max_depth': [None, 10]},
    "SVM": {'clf__C': [0.1, 1, 10], 'clf__kernel': ['linear', 'rbf']},
    "DecisionTree": {'clf__max_depth': [None, 10, 20]}
}

# Set MLFlow Experiment

In [4]:
mlflow.set_experiment("Spam vs Ham Classification")

2025/03/18 19:58:04 INFO mlflow.tracking.fluent: Experiment with name 'Spam vs Ham Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/Shankar/Desktop/goat-ipython/goat-vault/goat-vault/01%20-%20Notes/03%20-%20Resources/Codes/CMI/AML/Assignment%202/mlruns/144961064304096514', creation_time=1742308084703, experiment_id='144961064304096514', last_update_time=1742308084703, lifecycle_stage='active', name='Spam vs Ham Classification', tags={}>

# Fit Training Data

In [5]:
for name, clf in classifiers.items():
    print(f"Training {name}...")
    
    # Define pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf)
    ])
    
    # Grid Search
    param_grid = param_grids[name]
    grid_search = GridSearchCV(pipeline, param_grid, scoring='average_precision', cv=StratifiedKFold(), n_jobs=-1)
    
    # Log model to MLflow
    with mlflow.start_run(run_name=name):
        grid_search.fit(X_train, y_train)
        # Best model
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Best params for {name}: {best_params}")

        # Evaluate on the VALIDATION set
        val_aucpr = best_model.score(X_val, y_val)
        mlflow.log_metric("validation_aucpr", val_aucpr)
        print(f"Validation AUCPR for {name}: {val_aucpr}")
        
        print(f"Best AUCPR for {name}: {grid_search.best_score_}")

        # Register the model
        mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="ham-spam-models",
        registered_model_name=name,
        )

Training NaiveBayes...


2025/03/18 19:58:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for NaiveBayes: {'clf__alpha': 0.1}
Validation AUCPR for NaiveBayes: 0.982078853046595
Best AUCPR for NaiveBayes: 0.9680471396357196


Successfully registered model 'NaiveBayes'.
Created version '1' of model 'NaiveBayes'.


Training LogisticRegression...


2025/03/18 19:58:37 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for LogisticRegression: {'clf__C': 10}
Validation AUCPR for LogisticRegression: 0.982078853046595
Best AUCPR for LogisticRegression: 0.9712515499982448


Successfully registered model 'LogisticRegression'.
Created version '1' of model 'LogisticRegression'.


Training RandomForest...


2025/03/18 19:58:52 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for RandomForest: {'clf__max_depth': None, 'clf__n_estimators': 100}
Validation AUCPR for RandomForest: 0.974910394265233
Best AUCPR for RandomForest: 0.9680914360527245


Successfully registered model 'RandomForest'.
Created version '1' of model 'RandomForest'.


Training SVM...


2025/03/18 19:59:20 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


Best params for SVM: {'clf__C': 10, 'clf__kernel': 'rbf'}
Validation AUCPR for SVM: 0.985663082437276
Best AUCPR for SVM: 0.9773425179794465


Successfully registered model 'SVM'.
Created version '1' of model 'SVM'.


Training DecisionTree...


2025/03/18 19:59:33 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for DecisionTree: {'clf__max_depth': 10}
Validation AUCPR for DecisionTree: 0.9587813620071685
Best AUCPR for DecisionTree: 0.7384096342951365


Successfully registered model 'DecisionTree'.
Created version '1' of model 'DecisionTree'.


# Benchmark models on Test data

In [6]:
for model_name in classifiers.keys():
    print(f"\nChecking out model: {model_name}")
    
    # Load the model from the MLflow model registry or a specific run
    model_uri = f"models:/{model_name}/latest"  # Ensure the model is registered in MLflow
    loaded_model = mlflow.sklearn.load_model(model_uri)
    
    # Calculate AUCPR
    aucpr = loaded_model.score(X_test, y_test)
    
    print(f"TEST AUCPR for {model_name}: {aucpr}")


Checking out model: NaiveBayes
TEST AUCPR for NaiveBayes: 0.9856502242152466

Checking out model: LogisticRegression
TEST AUCPR for LogisticRegression: 0.9847533632286996

Checking out model: RandomForest
TEST AUCPR for RandomForest: 0.9775784753363229

Checking out model: SVM
TEST AUCPR for SVM: 0.9856502242152466

Checking out model: DecisionTree
TEST AUCPR for DecisionTree: 0.95695067264574


# Run MLFlow Server

In [7]:
!mlflow server --host 127.0.0.1 --port 8080

^C


In [9]:
!git add .
!git commit -m "Completed training part with MLFlow tracking"



[a2 4df257d] Completed training part with MLFlow tracking
 1301 files changed, 5269 insertions(+), 776 deletions(-)
 create mode 100644 Assignment 2/mlruns/0/meta.yaml
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/meta.yaml
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/mean_fit_time
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/mean_score_time
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/mean_test_score
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/rank_test_score
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/std_fit_time
 create mode 100644 Assignment 2/mlruns/144961064304096514/031f5340e5f04852abbefa0bce43f5f5/metrics/std_score_time
 create mode 100644 Assignment 2/mlruns/