In [1]:
import os

os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, accuracy_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import mlflow
from urllib.parse import urlparse
import numpy as np
from sklearn.svm import SVC



In [5]:
# Loading the datasets
train = pd.read_csv("train.csv")
train_X, train_y = train.text, train.spam

test = pd.read_csv("test.csv")
test_X, test_y = test.text, test.spam

val = pd.read_csv("validation.csv")
val_X, val_y = val.text, val.spam

# Defining models
MODELS = {
    "Logistic Regression": LogisticRegression(random_state = 1207, max_iter=2000),
    "Random Forest": RandomForestClassifier(random_state = 1207),
    "Support Vector Machine": SVC(random_state = 1207, probability=True),  # Enable probability estimates
}


def eval_metrics(actual_y, pred_y):
    precision, recall, _ = precision_recall_curve(actual_y, pred_y)
    return auc(recall, precision)

# Define the function to build, track and register all three models using MLFlow
def perform_training_and_benchmarking(model, model_name, train_X, train_y, val_X, val_y, test_X, test_y):
    """Trains and benchmarks model"""
    print(f"\nTraining and Evaluating {model_name}:\n")
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000)),
        ('clf', model),
    ])

    pipeline.fit(train_X, train_y)

    train_score = pipeline.score(train_X, train_y)
    validation_score = pipeline.score(val_X, val_y)

    print(f'Training Accuracy: {train_score}')
    print(f'Validation Accuracy: {validation_score}')

    predict = pd.concat([pd.DataFrame(pipeline.predict(test_X)), pd.DataFrame(test_y.values)], axis=1)
    predict.columns = ["Predicted values", "Actual values"]
    print(predict.head(10))

    y_pred = pipeline.predict(test_X)
    aucpr = eval_metrics(test_y, y_pred)
    acc = accuracy_score(test_y, y_pred)
    conf = confusion_matrix(test_y, y_pred)

    with mlflow.start_run(run_name=f"{model_name}"):
        mlflow.log_param("tfidf__max_features", 2000)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("AUCPR",aucpr)
        mlflow.log_dict(np.array(conf).tolist(), "confusion_matrix.json")
        mlflow.sklearn.log_model(pipeline, "model")

        print("\nModel (tfidf__max_features={:f}):".format(2000))
        print(f"Accuracy: {acc}")
        print(f"AUCPR: {aucpr} ")
        print(f"Confusion Matrix:\n {conf} \n \n")

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="sklearn-model",
            registered_model_name=f"{model_name}-classification-model"
        )
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(pipeline, "model", registered_model_name=f"{model_name}")
        else:
            mlflow.sklearn.log_model(pipeline, "model")

models = sorted(MODELS.items())

for model_name, model in models:
    perform_training_and_benchmarking(model, model_name, train_X, train_y, val_X, val_y, test_X, test_y)




Training and Evaluating Logistic Regression:

Training Accuracy: 0.9936406995230525
Validation Accuracy: 0.9872727272727273
   Predicted values  Actual values
0                 0              0
1                 0              0
2                 1              1
3                 1              1
4                 0              0
5                 1              1
6                 0              0
7                 1              1
8                 0              0
9                 0              0





Model (tfidf__max_features=2000.000000):
Accuracy: 0.9800362976406534
AUCPR: 0.9721434688377908 
Confusion Matrix:
 [[395   2]
 [  9 145]] 
 



Registered model 'Logistic Regression-classification-model' already exists. Creating a new version of this model...
2024/02/21 22:25:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression-classification-model, version 2
Created version '2' of model 'Logistic Regression-classification-model'.
Registered model 'Logistic Regression' already exists. Creating a new version of this model...
2024/02/21 22:25:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression, version 2
Created version '2' of model 'Logistic Regression'.



Training and Evaluating Random Forest:

Training Accuracy: 0.9997728821258233
Validation Accuracy: 0.9872727272727273
   Predicted values  Actual values
0                 0              0
1                 0              0
2                 1              1
3                 1              1
4                 1              0
5                 1              1
6                 0              0
7                 1              1
8                 0              0
9                 0              0

Model (tfidf__max_features=2000.000000):
Accuracy: 0.9782214156079855
AUCPR: 0.9671774459660122 
Confusion Matrix:
 [[392   5]
 [  7 147]] 
 



Registered model 'Random Forest-classification-model' already exists. Creating a new version of this model...
2024/02/21 22:26:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest-classification-model, version 2
Created version '2' of model 'Random Forest-classification-model'.
Registered model 'Random Forest' already exists. Creating a new version of this model...
2024/02/21 22:26:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest, version 2
Created version '2' of model 'Random Forest'.



Training and Evaluating Support Vector Machine:

Training Accuracy: 0.9995457642516467
Validation Accuracy: 0.9963636363636363
   Predicted values  Actual values
0                 0              0
1                 0              0
2                 1              1
3                 1              1
4                 0              0
5                 1              1
6                 0              0
7                 1              1
8                 0              0
9                 0              0

Model (tfidf__max_features=2000.000000):
Accuracy: 0.9836660617059891
AUCPR: 0.9769134049910917 
Confusion Matrix:
 [[395   2]
 [  7 147]] 
 



Registered model 'Support Vector Machine-classification-model' already exists. Creating a new version of this model...
2024/02/21 22:27:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine-classification-model, version 2
Created version '2' of model 'Support Vector Machine-classification-model'.
Registered model 'Support Vector Machine' already exists. Creating a new version of this model...
2024/02/21 22:27:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine, version 2
Created version '2' of model 'Support Vector Machine'.
