# Applied Machine Learning - Assignment 2
#### Submitted by 
- Anusha R
- MDS202212
- anushar@cmi.ac.in

In [1]:
# importing necessary libraries

import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import average_precision_score



from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import mlflow
import mlflow.sklearn
import tempfile

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
current_directory = os.getcwd()

In [4]:
# fit a model on train data

def train_model(model, vectorizer, train_df):
    X_train = vectorizer.fit_transform(train_df['text'])
    y_train = train_df['spam']

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model) ])

    pipeline.fit(train_df['text'], y_train)

    return pipeline

In [5]:
# score a model on given data

def score_model(model, X_data, y_data):
    y_pred = model.predict(X_data)
    accuracy = accuracy_score(y_data, y_pred)
    return accuracy

In [6]:
# evaluate the model predictions

def evaluate_model(model, X_data, y_data):
    y_pred = model.predict(X_data)
    report = classification_report(y_data, y_pred)
    return report

In [7]:
# fine-tune using training data

def fine_tune_model(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

In [8]:
def auc_value(y_data, y_pred):
    precision, recall, _ = precision_recall_curve(y_data, y_pred)
    return auc(recall, precision)

In [9]:
# Function to calculate AUCPR
def calculate_aucpr(model, X, y):
    y_proba = model.predict_proba(X)[:, 1]
    precision, recall, _ = precision_recall_curve(y, y_proba)
    aucpr = auc(recall, precision)
    return aucpr

## Loading the Preprocessed Dataset

In [10]:
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

## MLFlow

In [11]:
mlflow.set_experiment("BenchmarkModelsExperiment")

<Experiment: artifact_location='file:///D:/CMI%20DS/Sem%204/AML/Assignment2/mlruns/831138452859572019', creation_time=1708511803100, experiment_id='831138452859572019', last_update_time=1708511803100, lifecycle_stage='active', name='BenchmarkModelsExperiment', tags={}>

In [12]:
# Start MLflow experiment for Naive Bayes model
with mlflow.start_run(run_name="NaiveBayes"):
    # Train Naive Bayes model
    nb_model = MultinomialNB()
    nb_pipeline = train_model(nb_model, TfidfVectorizer(), train_data)
    
    # Fine-tune Naive Bayes model
    nb_param_grid = {'model__alpha': [0.1, 0.5, 1.0]}
    best_nb_model = fine_tune_model(nb_pipeline, nb_param_grid, train_data['text'], train_data['spam'])
    
    # Score model on train data
    nb_train_accuracy = score_model(best_nb_model, train_data['text'], train_data['spam'])
    mlflow.log_metric("train_accuracy", nb_train_accuracy)
    
    # Calculate and log AUCPR for Naive Bayes model
    nb_train_aucpr = calculate_aucpr(best_nb_model, train_data['text'], train_data['spam'])
    mlflow.log_metric("train_aucpr", nb_train_aucpr)
    
    # Evaluate model on train data
    nb_train_report = evaluate_model(best_nb_model, train_data['text'], train_data['spam'])
    
    # Save evaluation report to a temporary file
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp_file:
        tmp_file.write(nb_train_report)
        tmp_file_path = tmp_file.name
    
    # Log evaluation report file as text artifact
    mlflow.log_artifact(tmp_file_path, "train_evaluation_report.txt")
    
    # Log trained model artifact
    mlflow.sklearn.log_model(best_nb_model, "model")

In [15]:
# Start MLflow experiment for logistic regression model
with mlflow.start_run(run_name="LogisticRegression"):
    # Train logistic regression model
    lr_model = LogisticRegression()
    lr_pipeline = train_model(lr_model, TfidfVectorizer(), train_data)

    # Fine-tune logistic regression model
    lr_param_grid = {'model__C': [0.1, 1.0, 10.0]}
    best_lr_model = fine_tune_model(lr_pipeline, lr_param_grid, train_data['text'], train_data['spam'])
    
    # Score model on train data
    lr_train_accuracy = score_model(best_lr_model, train_data['text'], train_data['spam'])
    mlflow.log_metric("train_accuracy", lr_train_accuracy)
    
    # Evaluate model on train data
    lr_train_predictions = best_lr_model.predict_proba(train_data['text'])[:, 1]
    lr_aucpr = average_precision_score(train_data['spam'], lr_train_predictions)
    mlflow.log_metric("train_aucpr", lr_aucpr)
    
    # Log trained model artifact
    mlflow.sklearn.log_model(best_lr_model, "model")

In [17]:
# Start MLflow experiment for random forest model
with mlflow.start_run(run_name="RandomForest"):
    # Train random forest model
    rf_model = RandomForestClassifier()
    rf_pipeline = train_model(rf_model, TfidfVectorizer(), train_data)

    # Fine-tune random forest model
    rf_param_grid = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }
    best_rf_model = fine_tune_model(rf_pipeline, rf_param_grid, train_data['text'], train_data['spam'])
    
    # Score model on train data
    rf_train_accuracy = score_model(best_rf_model, train_data['text'], train_data['spam'])
    mlflow.log_metric("train_accuracy", rf_train_accuracy)
    
    # Evaluate model on train data
    rf_train_predictions = best_rf_model.predict_proba(train_data['text'])[:, 1]
    rf_aucpr = average_precision_score(train_data['spam'], rf_train_predictions)
    mlflow.log_metric("train_aucpr", rf_aucpr)
    
    # Log trained model artifact
    mlflow.sklearn.log_model(best_rf_model, "model")

In [29]:
# Retrieve AUCPR values from MLflow
aucpr_values = {}
runs = mlflow.search_runs()
for _, run in runs.iterrows():
    run_id = run['run_id']
    run_data = mlflow.get_run(run_id).data
    if 'train_aucpr' in run_data.metrics.keys():
        aucpr_values[run_id] = {
            'model_name': run_data.tags.get('model_name', 'Unknown Model'),
            'aucpr': run_data.metrics['train_aucpr']
        }

# Choose the model with the highest AUCPR
best_run_id = max(aucpr_values, key=lambda x: aucpr_values[x]['aucpr'])
best_model_aucpr = aucpr_values[best_run_id]['aucpr']

# Display all results
for run_id, values in aucpr_values.items():
    print(f"Model: {values['model_name']}, Run ID: {run_id}, AUCPR: {values['aucpr']}")

best_model_name = aucpr_values[best_run_id]['model_name']
print(f"The best model based on AUCPR is: {best_model_name} with AUCPR: {best_model_aucpr}")


Model: Unknown Model, Run ID: 3019d8e680e34b10a6e3049e93bee6ef, AUCPR: 1.0
Model: Unknown Model, Run ID: 64750997992e403fb8b7b90fa9867f9c, AUCPR: 0.9999999999999998
Model: Unknown Model, Run ID: b91a611cce8343fbbe37ed2abb2ac358, AUCPR: 1.0
Model: Unknown Model, Run ID: a0c3a6d5a4e04cbbaddeec7b41c3f00b, AUCPR: 1.0
Model: Unknown Model, Run ID: 09336f42ec6144c989d2c56514f454af, AUCPR: 0.9999924325344409
The best model based on AUCPR is: Unknown Model with AUCPR: 1.0


## End of Assignment 2