In [14]:
import warnings

In [15]:
warnings.filterwarnings('ignore')

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

## Loading the datasets

In [16]:
# Define a function to load and split the data
def load_split_data(train_path, validation_path, test_path,
                    text_col='text', target_col='spam'):
    """Load the train/validation/test CSV files and split each into features and target.

    Parameters
    ----------
    train_path, validation_path, test_path : str or path-like
        Locations of the three CSV files; each is passed unchanged to
        ``pd.read_csv``.
    text_col : str, default 'text'
        Name of the column used as the model input.
    target_col : str, default 'spam'
        Name of the column used as the label.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where every
        element is the corresponding column (a ``pandas.Series``) of the
        respective file.

    Raises
    ------
    KeyError
        If ``text_col`` or ``target_col`` is absent from one of the files.
        The message names the offending file and the missing column(s).
    """
    columns = (text_col, target_col)
    pieces = []
    for path in (train_path, validation_path, test_path):
        frame = pd.read_csv(path)
        # Fail with a message that says *which* file is malformed instead of
        # the bare column name a plain frame[...] lookup would report.
        missing = [name for name in columns if name not in frame.columns]
        if missing:
            raise KeyError(f"{path}: missing required column(s) {missing}")
        pieces.extend(frame[name] for name in columns)
    return tuple(pieces)

In [17]:
# Locations of the three dataset splits, relative to the notebook's working
# directory (edit these to match where your CSV files live).
train_path, validation_path, test_path = (
    '../data/train.csv',
    '../data/validation.csv',
    '../data/test.csv',
)


In [18]:
# Read the three CSV splits and unpack them into feature/label pairs.
X_train, y_train, X_val, y_val, X_test, y_test = load_split_data(
    train_path=train_path,
    validation_path=validation_path,
    test_path=test_path,
)

## Model training, registration, and versioning with MLflow

In [31]:
def train_and_log_model(model_name, X_train, y_train, X_val, y_val, params=None, tags=None):
    """Train a TF-IDF + classifier pipeline, log it to MLflow and register a new version.

    Parameters
    ----------
    model_name : str
        One of ``'logistic_regression'``, ``'naive_bayes'`` or ``'lightgbm'``.
        Also used as the MLflow run name and the registered-model name.
    X_train, y_train
        Training inputs and labels passed to ``Pipeline.fit``.
    X_val, y_val
        Validation inputs and labels used to compute the logged metrics.
    params : dict, optional
        Extra keyword arguments forwarded to the classifier constructor and
        logged as run parameters. The caller's dict is not modified.
    tags : dict, optional
        Tags attached to the newly created model version. A ``'Created by'``
        tag is always added (overriding one supplied by the caller). The
        caller's dict is not modified.

    Returns
    -------
    None
        Results are reported through MLflow and ``print``.

    Raises
    ------
    ValueError
        If ``model_name`` is not recognised. This is checked before an MLflow
        run is opened, so no run is created for an invalid name.
    mlflow.exceptions.MlflowException
        If creating the registered model fails for any reason other than the
        model already existing.
    """
    # Work on private copies: mutable defaults and caller-owned dicts must not
    # be changed by adding the 'Created by' tag below.
    params = dict(params or {})
    tags = dict(tags or {})
    tags['Created by'] = 'shreyan'  # Replace with your identifier

    # Classifier constructors are wrapped in lambdas so only the requested one
    # is instantiated (and only it receives **params).
    classifier_factories = {
        'logistic_regression': lambda: LogisticRegression(random_state=42, **params),
        'naive_bayes': lambda: MultinomialNB(**params),
        'lightgbm': lambda: lgb.LGBMClassifier(random_state=42, force_row_wise=True, **params),
    }
    if model_name not in classifier_factories:
        raise ValueError("Model name not recognized.")

    model = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', classifier_factories[model_name]()),
    ])
    artifact_path = f"model_{model_name}"

    with mlflow.start_run(run_name=model_name) as run:
        # Train the model
        model.fit(X_train, y_train)

        # Evaluate on the validation split; column 1 of predict_proba is used
        # as the score for the average-precision (AUCPR) metric.
        y_pred_val = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred_val)
        aucpr = average_precision_score(y_val, model.predict_proba(X_val)[:, 1])

        # Log parameters, metrics, and the fitted pipeline (exactly once)
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("aucpr", aucpr)
        mlflow.sklearn.log_model(model, artifact_path)

        run_id = run.info.run_id

        # Create the registered model on first use; tolerate only the
        # "already exists" case so other registry failures are not
        # misreported as a pre-existing model.
        client = MlflowClient()
        try:
            client.create_registered_model(model_name)
        except mlflow.exceptions.MlflowException as exc:
            if getattr(exc, "error_code", None) != "RESOURCE_ALREADY_EXISTS":
                raise
            print(f"Model {model_name} already exists in the registry.")

        # Create a new version of the model in the registry
        model_uri = f"runs:/{run_id}/{artifact_path}"
        model_version_info = client.create_model_version(model_name, model_uri, run_id)

        # Add tags to the model version
        for tag_key, tag_value in tags.items():
            client.set_model_version_tag(
                model_name,
                model_version_info.version,
                tag_key,
                tag_value
            )

        print(f"Model {model_name}, version {model_version_info.version} registered in the MLflow Model Registry with tags {tags}.")
        print(f"Model: {model_name}, Accuracy: {accuracy}, AUCPR: {aucpr}")


## Setting up the experiment

In [33]:
# Make "Email Spam Classification" the active MLflow experiment; every run
# started below is recorded under it.
mlflow.set_experiment(experiment_name="Email Spam Classification")


<Experiment: artifact_location='file:///c:/CMI/Applied%20ML/applied-ML/ASS_2/src/mlruns/235275653375214283', creation_time=1708268303769, experiment_id='235275653375214283', last_update_time=1708268303769, lifecycle_stage='active', name='Email Spam Classification', tags={}>

In [34]:
# Tags attached to every model version registered by the training loop.
tags = dict(
    [
        ("Review", "Passed"),
        ("Ready for Deployment", "Yes"),
    ]
)

In [35]:
# Fit, log and register each of the three candidate models, attaching the
# review/deployment tags to the model version created by each call.
model_names = [
    'logistic_regression',
    'naive_bayes',
    'lightgbm',
]
for model_name in model_names:
    train_and_log_model(
        model_name,
        X_train, y_train,
        X_val, y_val,
        tags=tags,
    )

Model logistic_regression already exists in the registry.
Model logistic_regression, version 3 registered in the MLflow Model Registry with tags {'Review': 'Passed', 'Ready for Deployment': 'Yes', 'Created by': 'shreyan'}.
Model: logistic_regression, Accuracy: 0.9764397905759162, AUCPR: 0.9950933631282883
Model naive_bayes already exists in the registry.
Model naive_bayes, version 3 registered in the MLflow Model Registry with tags {'Review': 'Passed', 'Ready for Deployment': 'Yes', 'Created by': 'shreyan'}.
Model: naive_bayes, Accuracy: 0.8926701570680629, AUCPR: 0.9903953190439188
[LightGBM] [Info] Number of positive: 830, number of negative: 2606
[LightGBM] [Info] Total Bins 74550
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 2502
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241560 -> initscore=-1.144146
[LightGBM] [Info] Start training from score -1.144146
Model lightgbm already exists in the registry.
Model lightgbm, version 3 regist

In [28]:
# Fit, log and register each candidate model without passing any tags.
# NOTE(review): this repeats the tagged training loop earlier in the notebook;
# every execution registers one more version of each model, so the version
# number loaded later may need to be adjusted after re-running this cell.
model_names = [
    'logistic_regression',
    'naive_bayes',
    'lightgbm',
]
for model_name in model_names:
    train_and_log_model(
        model_name,
        X_train, y_train,
        X_val, y_val,
    )

Model logistic_regression already exists in the registry.
Model logistic_regression, version 2 registered in the MLflow Model Registry.
Model: logistic_regression, Accuracy: 0.9764397905759162, AUCPR: 0.9950933631282883
Model naive_bayes already exists in the registry.
Model naive_bayes, version 2 registered in the MLflow Model Registry.
Model: naive_bayes, Accuracy: 0.8926701570680629, AUCPR: 0.9903953190439188
[LightGBM] [Info] Number of positive: 830, number of negative: 2606
[LightGBM] [Info] Total Bins 74550
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 2502
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241560 -> initscore=-1.144146
[LightGBM] [Info] Start training from score -1.144146
Model lightgbm already exists in the registry.
Model lightgbm, version 2 registered in the MLflow Model Registry.
Model: lightgbm, Accuracy: 0.9808027923211169, AUCPR: 0.9924414635350459


## Launch the MLflow UI with the command below

In [29]:
# Shell escape that starts the MLflow UI from the notebook's working directory.
# The recorded output (^C) shows the command was stopped by interrupting it.
! mlflow ui

^C


## Access the UI by navigating to http://localhost:5000 in your web browser to review the performance metrics, parameters, and details of each run.

--------------------------------------------------------------------------------------------------------------------------

## Using the best version of the LightGBM model to get "Accuracy" and "AUCPR" on the test data

In [37]:
# Registry coordinates of the model to evaluate: the registered model name and
# the version number to load (replace both with the values you want).
# NOTE(review): the version is hard-coded; confirm it against the registry
# after re-running the training cells.
model_name, model_version = "lightgbm", "3"


In [42]:
# Build the registry URI for the chosen name/version and load the stored
# scikit-learn pipeline from it.
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri=model_uri)

In [43]:
# Class predictions of the registry-loaded model for the test-split inputs.
y_pred = model.predict(
    X_test
)

In [44]:
# Score the test split: accuracy from the class predictions, AUCPR (average
# precision) from column 1 of the predicted probabilities.
test_scores = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
aucpr = average_precision_score(y_test, test_scores)

In [45]:
# Report both test-set metrics, one per line.
print(
    f"Accuracy on test dataset: {accuracy}",
    f"AUCPR on test dataset: {aucpr}",
    sep="\n",
)

Accuracy on test dataset: 0.987783595113438
AUCPR on test dataset: 0.996066457593158


-----------------------------------------------------------------------------------------------------------------------------