# Importing libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import  accuracy_score
from sklearn.pipeline import Pipeline
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
from sklearn.preprocessing import LabelBinarizer

import mlflow
import mlflow.sklearn


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Select the MLflow experiment that the runs started later in this notebook are recorded under.
mlflow.set_experiment(experiment_name="SMSSpamDetectionModels")

2025/03/04 21:42:12 INFO mlflow.tracking.fluent: Experiment with name 'SMSSpamDetectionModels' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/j25sr/OneDrive/Desktop/Sem%204/AML/Applied-Machine-Learning/Assignment_2/mlruns/313760295266013132', creation_time=1741104732203, experiment_id='313760295266013132', last_update_time=1741104732203, lifecycle_stage='active', name='SMSSpamDetectionModels', tags={}>

# Function to load data

In [3]:
def load_data(file_path):
    """Load the train/validation/test message and label splits from CSV files.

    Six files are read: ``msg_train.csv``, ``msg_val.csv``, ``msg_test.csv``,
    ``label_train.csv``, ``label_val.csv`` and ``label_test.csv``.

    Parameters
    ----------
    file_path : str
        Location of the files. It is first used as a plain prefix
        (``file_path + name``, the historical behaviour, e.g. ``"data/"``).
        If that file does not exist, ``file_path`` is treated as a directory
        and joined with the file name, so ``"data"`` works as well.

    Returns
    -------
    tuple
        ``(msg_train, msg_val, msg_test, label_train, label_val, label_test)``.
        A single-column file is returned as a pandas Series, even when it
        holds only one row.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a file is found under neither
        interpretation of ``file_path``.
    """
    import os  # local import so the function does not depend on another cell

    def _read_split(file_name):
        # Try plain concatenation first so existing callers keep working.
        candidate = file_path + file_name
        if not os.path.exists(candidate):
            # Fall back to treating file_path as a directory.
            candidate = os.path.join(file_path, file_name)
        # Squeeze only the column axis: a bare squeeze() would turn a
        # one-row, one-column file into a scalar instead of a Series.
        return pd.read_csv(candidate).squeeze("columns")

    msg_train = _read_split('msg_train.csv')
    msg_val = _read_split('msg_val.csv')
    msg_test = _read_split('msg_test.csv')
    label_train = _read_split('label_train.csv')
    label_val = _read_split('label_val.csv')
    label_test = _read_split('label_test.csv')
    return msg_train, msg_val, msg_test, label_train, label_val, label_test

In [4]:
# NOTE(review): machine-specific absolute path with no trailing separator —
# confirm how load_data combines it with the CSV file names, and consider
# moving it to a configurable data directory.
path = "C:/Users/j25sr/OneDrive/Desktop/AML 1/split_dataset/split_dataset"
(
    msg_train, msg_val, msg_test,
    label_train, label_val, label_test,
) = load_data(path)

# MLflow

## Logistic Regression

In [8]:
# TF-IDF features feeding a logistic-regression classifier.
# NOTE(review): the identity tokenizer expects every message to already be a
# sequence of tokens. If the loaded CSV column holds plain strings, iterating a
# string yields single characters — confirm against the preprocessing step.
pipeline_logistic = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            lowercase=False,
            preprocessor=None,
            tokenizer=lambda tokens: tokens,
        ),
    ),
    ('classifier', LogisticRegression(solver='liblinear')),
])

In [9]:
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
import mlflow
import mlflow.sklearn

def param_search_logistic(msg_train, label_train, msg_val, label_val, pipeline_logistic):
    """Grid-search the logistic-regression pipeline and log the best fit to MLflow.

    Every combination of ``tfidf__use_idf``, ``classifier__C`` and
    ``classifier__penalty`` is fitted on the training split and scored on the
    validation split. The combination with the highest F1-score for the
    ``'spam'`` class is logged (parameters, metrics, fitted model) in an MLflow
    run named "Best_Logistic_Regression" and registered as
    "LogisticRegressionModel".

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels.
    msg_val, label_val
        Validation messages and labels. ``label_val`` must support the
        element-wise comparison ``label_val == 'spam'`` (e.g. a pandas Series).
    pipeline_logistic
        Pipeline with steps named ``'tfidf'`` and ``'classifier'``. It is
        re-configured and re-fitted in place for every combination, so after
        the call it is left configured with the last combination tried.

    Returns
    -------
    tuple
        ``(best_params, best_metrics)``. ``best_params`` is the winning
        ``(use_idf, C, penalty)`` tuple, or ``None`` when no combination
        reached an F1-score above 0. ``best_metrics`` maps metric names to the
        winner's validation scores (empty dict when there is no winner).
    """
    import copy  # local import: used to snapshot the best fitted pipeline

    param_grid = {
        'tfidf__use_idf': [True, False],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2']
    }

    best_score = 0
    best_params = None
    best_model = None
    best_metrics = {}

    for params in product(*param_grid.values()):
        use_idf, c_value, penalty = params
        try:
            # 'l1' is only paired with liblinear here; 'l2' runs use lbfgs.
            solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

            pipeline_logistic.set_params(
                tfidf__use_idf=use_idf,
                classifier__C=c_value,
                classifier__penalty=penalty,
                classifier__solver=solver
            )

            pipeline_logistic.fit(msg_train, label_train)
            val_predictions = pipeline_logistic.predict(msg_val)

            # Pick the probability column that belongs to 'spam' instead of
            # assuming it is always column 1.
            fitted_classes = list(pipeline_logistic.named_steps['classifier'].classes_)
            spam_column = fitted_classes.index('spam')
            val_probabilities = pipeline_logistic.predict_proba(msg_val)[:, spam_column]

            # Compute evaluation metrics
            val_accuracy = accuracy_score(label_val, val_predictions)
            val_precision = precision_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_recall = recall_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_f1 = f1_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_aucpr = average_precision_score((label_val == 'spam').astype(int), val_probabilities)

            # Track best model
            if val_f1 > best_score:
                best_score = val_f1
                best_params = params
                # Snapshot the fitted pipeline: the same pipeline object is
                # re-fitted on the next iteration, so keeping a bare reference
                # would end up logging the LAST combination, not the best one.
                best_model = copy.deepcopy(pipeline_logistic)
                best_metrics = {
                    "Accuracy": val_accuracy,
                    "Precision": val_precision,
                    "Recall": val_recall,
                    "F1-score": val_f1,
                    "AUCPR": val_aucpr
                }

        except ValueError as e:
            print(f"Skipping params {params} due to error: {e}")
            continue

    # Log only the best model. Compare against None explicitly rather than
    # relying on the truthiness of the pipeline object.
    if best_model is not None:
        with mlflow.start_run(run_name="Best_Logistic_Regression"):
            mlflow.log_params({"use_idf": best_params[0], "C": best_params[1], "penalty": best_params[2]})
            mlflow.log_metrics(best_metrics)

            # Log best model and get ModelInfo object
            model_info = mlflow.sklearn.log_model(best_model, "logistic_regression_model")

            # Extract model URI and register it
            model_uri = model_info.model_uri
            model_version = mlflow.register_model(model_uri, "LogisticRegressionModel")

            print(f"Model registered as 'LogisticRegressionModel' with version: {model_version.version}")

    return best_params, best_metrics

# Execute the logistic-regression grid search on the train/validation splits.
best_params_logistic, best_metrics_logistic = param_search_logistic(
    msg_train,
    label_train,
    msg_val,
    label_val,
    pipeline_logistic,
)

# Report the winning parameter tuple, then each validation metric as a percentage.
print("\nBest Parameters (Logistic Regression):", best_params_logistic)
print("Final Metrics (Logistic Regression):")
for metric, value in best_metrics_logistic.items():
    print(f"{metric}: {value * 100:.2f}%")

Registered model 'LogisticRegressionModel' already exists. Creating a new version of this model...
Created version '2' of model 'LogisticRegressionModel'.


Model registered as 'LogisticRegressionModel' with version: 2

Best Parameters (Logistic Regression): (True, 10, 'l1')
Final Metrics (Logistic Regression):
Accuracy: 98.48%
Precision: 97.14%
Recall: 91.28%
F1-score: 94.12%
AUCPR: 98.58%


## SVM Model

In [10]:
# TF-IDF features feeding a support-vector classifier with default settings.
# NOTE(review): the identity tokenizer expects every message to already be a
# sequence of tokens. If the loaded CSV column holds plain strings, iterating a
# string yields single characters — confirm against the preprocessing step.
pipeline_svm = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            lowercase=False,
            preprocessor=None,
            tokenizer=lambda tokens: tokens,
        ),
    ),
    ('classifier', SVC()),
])

In [11]:
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
import mlflow
import mlflow.sklearn

def param_search_svm(msg_train, label_train, msg_val, label_val, pipeline_svm):
    """Grid-search the SVM pipeline and log the best fit to MLflow.

    Every combination of ``tfidf__use_idf``, ``classifier__C`` and
    ``classifier__kernel`` is fitted on the training split and scored on the
    validation split. The combination with the highest F1-score for the
    ``'spam'`` class is logged (parameters, metrics, fitted model) in an MLflow
    run named "Best_SVM" and registered as "SVMModel".

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels.
    msg_val, label_val
        Validation messages and labels. ``label_val`` must support the
        element-wise comparison ``label_val == 'spam'`` (e.g. a pandas Series).
    pipeline_svm
        Pipeline with steps named ``'tfidf'`` and ``'classifier'``. It is
        re-configured and re-fitted in place for every combination, so after
        the call it is left configured with the last combination tried.

    Returns
    -------
    tuple
        ``(best_params, best_metrics)``. ``best_params`` is the winning
        ``(use_idf, C, kernel)`` tuple, or ``None`` when no combination
        reached an F1-score above 0. ``best_metrics`` maps metric names to the
        winner's validation scores (empty dict when there is no winner).
    """
    import copy  # local import: used to snapshot the best fitted pipeline

    param_grid = {
        'tfidf__use_idf': [True, False],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    }

    best_score = 0
    best_params = None
    best_model = None
    best_metrics = {}

    for params in product(*param_grid.values()):
        use_idf, c_value, kernel = params
        try:
            # Set parameters for the pipeline
            pipeline_svm.set_params(
                tfidf__use_idf=use_idf,
                classifier__C=c_value,
                classifier__kernel=kernel
            )

            pipeline_svm.fit(msg_train, label_train)
            val_predictions = pipeline_svm.predict(msg_val)
            # These are signed decision scores, not probabilities; they are
            # used only to rank validation samples for the AUCPR metric.
            # NOTE(review): for a two-class SVC positive scores point at
            # classes_[1]; this is 'spam' for 'ham'/'spam' labels — confirm
            # if the label set ever changes.
            val_scores = pipeline_svm.decision_function(msg_val)

            # Compute evaluation metrics
            val_accuracy = accuracy_score(label_val, val_predictions)
            val_precision = precision_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_recall = recall_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_f1 = f1_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_aucpr = average_precision_score((label_val == 'spam').astype(int), val_scores)

            # Track best model
            if val_f1 > best_score:
                best_score = val_f1
                best_params = params
                # Snapshot the fitted pipeline: the same pipeline object is
                # re-fitted on the next iteration, so keeping a bare reference
                # would end up logging the LAST combination, not the best one.
                best_model = copy.deepcopy(pipeline_svm)
                best_metrics = {
                    "Accuracy": val_accuracy,
                    "Precision": val_precision,
                    "Recall": val_recall,
                    "F1-score": val_f1,
                    "AUCPR": val_aucpr
                }

        except ValueError as e:
            print(f"Skipping params {params} due to error: {e}")
            continue

    # Log only the best model. Compare against None explicitly rather than
    # relying on the truthiness of the pipeline object.
    if best_model is not None:
        with mlflow.start_run(run_name="Best_SVM"):
            mlflow.log_params({"use_idf": best_params[0], "C": best_params[1], "kernel": best_params[2]})
            mlflow.log_metrics(best_metrics)

            # Log best model and get ModelInfo object
            model_info = mlflow.sklearn.log_model(best_model, "svm_model")

            # Extract model URI and register it
            model_uri = model_info.model_uri
            model_version = mlflow.register_model(model_uri, "SVMModel")

            print(f"Model registered as 'SVMModel' with version: {model_version.version}")

    return best_params, best_metrics

# Execute the SVM grid search on the train/validation splits.
best_params_svm, best_metrics_svm = param_search_svm(
    msg_train,
    label_train,
    msg_val,
    label_val,
    pipeline_svm,
)

# Report the winning parameter tuple, then each validation metric as a percentage.
print("\nBest Parameters (SVM):", best_params_svm)
print("Final Metrics (SVM):")
for metric, value in best_metrics_svm.items():
    print(f"{metric}: {value * 100:.2f}%")

Registered model 'SVMModel' already exists. Creating a new version of this model...


Model registered as 'SVMModel' with version: 2

Best Parameters (SVM): (True, 10, 'rbf')
Final Metrics (SVM):
Accuracy: 98.57%
Precision: 98.54%
Recall: 90.60%
F1-score: 94.41%
AUCPR: 98.39%


Created version '2' of model 'SVMModel'.


## Naive Bayes

In [6]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
# NOTE(review): the identity tokenizer expects every message to already be a
# sequence of tokens. If the loaded CSV column holds plain strings, iterating a
# string yields single characters — confirm against the preprocessing step.
pipeline_NB = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            lowercase=False,
            preprocessor=None,
            tokenizer=lambda tokens: tokens,
        ),
    ),
    ('classifier', MultinomialNB()),
])

In [7]:
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
import mlflow
import mlflow.sklearn

def param_search_nb(msg_train, label_train, msg_val, label_val, pipeline_NB):
    """Grid-search the Naive Bayes pipeline and log the best fit to MLflow.

    Every combination of ``tfidf__use_idf`` and ``classifier__alpha`` is
    fitted on the training split and scored on the validation split. The
    combination with the highest F1-score for the ``'spam'`` class is logged
    (parameters, metrics, fitted model) in an MLflow run named
    "Best_Naive_Bayes" and registered as "NaiveBayesModel".

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels.
    msg_val, label_val
        Validation messages and labels. ``label_val`` must support the
        element-wise comparison ``label_val == 'spam'`` (e.g. a pandas Series).
    pipeline_NB
        Pipeline with steps named ``'tfidf'`` and ``'classifier'``. It is
        re-configured and re-fitted in place for every combination, so after
        the call it is left configured with the last combination tried.

    Returns
    -------
    tuple
        ``(best_params, best_metrics)``. ``best_params`` is the winning
        ``(use_idf, alpha)`` tuple, or ``None`` when no combination reached an
        F1-score above 0. ``best_metrics`` maps metric names to the winner's
        validation scores (empty dict when there is no winner).
    """
    import copy  # local import: used to snapshot the best fitted pipeline

    param_grid = {
        'tfidf__use_idf': [True, False],
        'classifier__alpha': [0.1, 0.5, 1.0, 2.0]
    }

    best_score = 0
    best_params = None
    best_model = None
    best_metrics = {}

    for params in product(*param_grid.values()):
        use_idf, alpha = params
        try:
            # Set parameters for the pipeline
            pipeline_NB.set_params(
                tfidf__use_idf=use_idf,
                classifier__alpha=alpha
            )

            pipeline_NB.fit(msg_train, label_train)
            val_predictions = pipeline_NB.predict(msg_val)

            # Pick the probability column that belongs to 'spam' instead of
            # assuming it is always column 1.
            fitted_classes = list(pipeline_NB.named_steps['classifier'].classes_)
            spam_column = fitted_classes.index('spam')
            val_probabilities = pipeline_NB.predict_proba(msg_val)[:, spam_column]

            # Compute evaluation metrics
            val_accuracy = accuracy_score(label_val, val_predictions)
            val_precision = precision_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_recall = recall_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_f1 = f1_score(label_val, val_predictions, pos_label='spam', zero_division=0)
            val_aucpr = average_precision_score((label_val == 'spam').astype(int), val_probabilities)

            # Track best model
            if val_f1 > best_score:
                best_score = val_f1
                best_params = params
                # Snapshot the fitted pipeline: the same pipeline object is
                # re-fitted on the next iteration, so keeping a bare reference
                # would end up logging the LAST combination, not the best one.
                best_model = copy.deepcopy(pipeline_NB)
                best_metrics = {
                    "Accuracy": val_accuracy,
                    "Precision": val_precision,
                    "Recall": val_recall,
                    "F1-score": val_f1,
                    "AUCPR": val_aucpr
                }

        except ValueError as e:
            print(f"Skipping params {params} due to error: {e}")
            continue

    # Log only the best model. Compare against None explicitly rather than
    # relying on the truthiness of the pipeline object.
    if best_model is not None:
        with mlflow.start_run(run_name="Best_Naive_Bayes"):
            mlflow.log_params({"use_idf": best_params[0], "alpha": best_params[1]})
            mlflow.log_metrics(best_metrics)

            # Log best model and get ModelInfo object
            model_info = mlflow.sklearn.log_model(best_model, "naive_bayes_model")

            # Extract model URI and register it
            model_uri = model_info.model_uri
            model_version = mlflow.register_model(model_uri, "NaiveBayesModel")

            print(f"Model registered as 'NaiveBayesModel' with version: {model_version.version}")

    return best_params, best_metrics

# Execute the Naive Bayes grid search on the train/validation splits.
best_params_nb, best_metrics_nb = param_search_nb(
    msg_train,
    label_train,
    msg_val,
    label_val,
    pipeline_NB,
)

# Report the winning parameter tuple, then each validation metric as a percentage.
print("\nBest Parameters (NB):", best_params_nb)
print("Final Metrics (NB):")
for metric, value in best_metrics_nb.items():
    print(f"{metric}: {value * 100:.2f}%")


Registered model 'NaiveBayesModel' already exists. Creating a new version of this model...


Model registered as 'NaiveBayesModel' with version: 10

Best Parameters (NB): (True, 0.1)
Final Metrics (NB):
Accuracy: 94.26%
Precision: 98.85%
Recall: 57.72%
F1-score: 72.88%
AUCPR: 94.83%


Created version '10' of model 'NaiveBayesModel'.
