# Imports

In [20]:
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score 

# Enable autologging
mlflow.sklearn.autolog()

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', module="mlflow.models.model")

# Loading Train, Validation, Test Data

In [21]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('validation.csv')

X_train = train['cleaned_text'].astype(str)
X_test = test['cleaned_text'].astype(str)
X_val = val['cleaned_text'].astype(str)

y_train = (train['label'] == "spam").astype(int)
y_test = (test['label'] == "spam").astype(int)
y_val = (val['label'] == "spam").astype(int)

# Training

Looking at literature we choose the following classifiers and features for our task:

**Classifiers:** Naive Bayes, Decision Tree, Logistic Regression, SVM, Random Forest, XGBoost.

**Features:** CountVectorizer + TF-IDF.

https://indusedu.org/pdfs/IJREISS/IJREISS_4388_87139.pdf?form=MG0AV3

https://radimrehurek.com/data_science_python/

## Creating Classifiers, Params

In [22]:
classifiers = {
    "NaiveBayes": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "DecisionTree": DecisionTreeClassifier()
}

param_grids = {
    "NaiveBayes": {'clf__alpha': [0.1, 0.5, 1.0]},
    "LogisticRegression": {'clf__C': [0.1, 1, 10]},
    "RandomForest": {'clf__n_estimators': [50, 100], 'clf__max_depth': [None, 10]},
    "SVM": {'clf__C': [0.1, 1, 10], 'clf__kernel': ['linear', 'rbf']},
    "DecisionTree": {'clf__max_depth': [None, 10, 20]}
}

# Set MLFlow Experiment

In [23]:
mlflow.set_experiment("Spam vs Ham Classification")

2025/03/18 19:38:51 INFO mlflow.tracking.fluent: Experiment with name 'Spam vs Ham Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/Shankar/Desktop/goat-ipython/goat-vault/goat-vault/01%20-%20Notes/03%20-%20Resources/Codes/CMI/AML/Assignment%202/mlruns/469622913993143644', creation_time=1742306931775, experiment_id='469622913993143644', last_update_time=1742306931775, lifecycle_stage='active', name='Spam vs Ham Classification', tags={}>

# Fit Training Data

In [24]:
# Scoring metric AUCPR
def auc_prc_score(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

auc_prc = make_scorer(auc_prc_score, greater_is_better=True, needs_proba=True);

In [14]:
for name, clf in classifiers.items():
    print(f"Training {name}...")
    
    # Define pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf)
    ])
    
    # Grid Search
    param_grid = param_grids[name]
    grid_search = GridSearchCV(pipeline, param_grid, scoring='average_precision', cv=StratifiedKFold(), n_jobs=-1)
    
    # Log model to MLflow
    with mlflow.start_run(run_name=name):
        grid_search.fit(X_train, y_train)
        # Best model
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Best params for {name}: {best_params}")

        # Evaluate on the VALIDATION set
        val_aucpr = best_model.score(X_val, y_val)
        mlflow.log_metric("validation_aucpr", val_aucpr)
        print(f"Validation AUCPR for {name}: {val_aucpr}")
        
        print(f"Best AUCPR for {name}: {grid_search.best_score_}")

Training NaiveBayes...


2025/03/18 19:33:57 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for NaiveBayes: {'clf__alpha': 0.1}
Validation AUCPR for NaiveBayes: 0.982078853046595
Best AUCPR for NaiveBayes: nan
Training LogisticRegression...


2025/03/18 19:34:07 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for LogisticRegression: {'clf__C': 0.1}
Validation AUCPR for LogisticRegression: 0.8566308243727598
Best AUCPR for LogisticRegression: nan
Training RandomForest...


2025/03/18 19:34:18 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for RandomForest: {'clf__max_depth': None, 'clf__n_estimators': 50}
Validation AUCPR for RandomForest: 0.9767025089605734
Best AUCPR for RandomForest: nan
Training SVM...


2025/03/18 19:34:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


Best params for SVM: {'clf__C': 0.1, 'clf__kernel': 'linear'}
Validation AUCPR for SVM: 0.9408602150537635
Best AUCPR for SVM: nan
Training DecisionTree...


2025/03/18 19:34:48 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for DecisionTree: {'clf__max_depth': None}
Validation AUCPR for DecisionTree: 0.9551971326164874
Best AUCPR for DecisionTree: nan


# Test the benchmark models on TEST data

In [18]:
for name in classifiers.keys():
    # Load the best model from MLflow
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/model"
    loaded_model = mlflow.sklearn.load_model(model_uri)
    
    # Predict on the TEST set
    test_proba = loaded_model.predict_proba(X_test)[:, 1]
    test_aucpr = average_precision_score(y_test, test_proba)
    print(f"{name} AUCPR on test set: {test_aucpr}")

AttributeError: 'NoneType' object has no attribute 'info'

# Run MLFlow Server

In [19]:
!mlflow server --host 127.0.0.1 --port 8080

^C
