# Assignment 2 : SMS Spam Classification
# Rishika Tibrewal, MDS202135

## Importing Libraries

In [1]:
# ! pip install mlflow

In [2]:
# ! pip install jinja2==3.0.3

In [3]:
# pip install -U sentence-transformers

In [4]:
import mlflow
import joblib
import logging
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from markupsafe import escape
from sklearn.utils import resample
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_recall_curve,auc
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


## Loading training, validation, test data

In [5]:
train=pd.read_csv("Data/Training Data.csv")
val=pd.read_csv("Data/Validation Data.csv")
test=pd.read_csv("Data/Test Data.csv")

In [6]:
train.head()

Unnamed: 0,Text,Label
0,Hey gorgeous man. My work mobile number is. Ha...,0
1,IM GONNAMISSU SO MUCH!!I WOULD SAY IL SEND U A...,0
2,I thk ü gotta go home by urself. Cos i'll b go...,0
3,OK i'm waliking ard now... Do u wan me 2 buy a...,0
4,Come to medical college at 7pm ......forward i...,0


In [7]:
val.head()

Unnamed: 0,Text,Label
0,Just finished eating. Got u a plate. NOT lefto...,0
1,"Sorry, got a late start, we're on the way",0
2,"If you don't, your prize will go to another cu...",1
3,I can't make it tonight,0
4,I don't want you to leave. But i'm barely doin...,0


In [8]:
test.head()

Unnamed: 0,Text,Label
0,Squeeeeeze!! This is christmas hug.. If u lik ...,0
1,And also I've sorta blown him off a couple tim...,0
2,Mmm thats better now i got a roast down me! i...,0
3,Mm have some kanji dont eat anything heavy ok,0
4,So there's a ring that comes with the guys cos...,0


In [9]:
y_train,X_train=train["Label"],train["Text"]
y_val,X_val=val["Label"],val["Text"]
y_test,X_test=test["Label"],test["Text"]

## Converting string to vectors, to give as input to the models

In [10]:
# replacing NAN entries by empty string
X_train = X_train.replace(np.nan, '', regex=True)
X_val = X_val.replace(np.nan, '', regex=True)
X_test = X_test.replace(np.nan, '', regex=True)

In [11]:
train_emb=model.encode(X_train)
val_emb=model.encode(X_val)
test_emb=model.encode(X_test)

In [12]:
train_emb.shape,val_emb.shape,test_emb.shape

((4025, 384), (711, 384), (836, 384))

## Training Models

In [13]:
def eval_metrics(actual, pred):
    precision, recall, thresholds = precision_recall_curve(actual, pred)
    auc_precision_recall = auc(recall, precision)
    return (auc_precision_recall)

In [14]:
mlflow.sklearn.autolog()

n_estimators = 300
max_depth = 6
clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth,random_state=101)
clf.fit(train_emb, y_train)

y_pred = clf.predict(test_emb)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_1=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name=f"n_estimators : {n_estimators}, max_depth : {max_depth}"):
  
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.log_dict(np.array(conf_1).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(clf, "model")

    print("\nRandom Classifier Model (no_of_estimator={:f}, max_depth={:f}):".format(n_estimators, max_depth))
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr} ")
    print(f"Confusion Matrix: {conf_1} \n \n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="random-forest-classification-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="Randomclassifier")
    else:
      mlflow.sklearn.log_model(clf, "model")


2023/03/10 18:39:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2c7b3fc1cfba4a3a881d2519f47a9b21', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Random Classifier Model (no_of_estimator=300.000000, max_depth=6.000000):
Accuracy: 0.9497607655502392
AUCPR: 0.8288579349818898 
Confusion Matrix: [[729   0]
 [ 42  65]] 
 



Successfully registered model 'random-forest-classification-model'.
2023/03/10 18:39:55 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-classification-model, version 1
Created version '1' of model 'random-forest-classification-model'.


In [15]:
mlflow.tracking.MlflowClient().get_model_version("random-forest-classification-model","1")

<ModelVersion: creation_timestamp=1678453795430, current_stage='None', description=None, last_updated_timestamp=1678453795430, name='random-forest-classification-model', run_id='7e21c73090f145feb43900cf5d288734', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/Applied-Machine-Learning/Assignment%202/mlruns/0/7e21c73090f145feb43900cf5d288734/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [16]:
# ! mlflow ui

### Model 2: Logistic Regression

In [17]:
mlflow.sklearn.autolog()

clf = LogisticRegression()
clf.fit(train_emb, y_train)

y_pred = clf.predict(test_emb)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_2=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Logistic Regression"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    mlflow.log_dict(np.array(conf_2).tolist(), "confusion_matrix.json")

    print("\nLogistic Regression")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix: {conf_2} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="logistic-regression-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="logistic-regression-model")
    else:
      mlflow.sklearn.log_model(clf, "model")


2023/03/10 18:40:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bf22b3cebf8344f48301a3c4095334cd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Logistic Regression
Accuracy: 0.9808612440191388
AUCPR: 0.9324263292044895
Confusion Matrix: [[727   2]
 [ 14  93]] 




Successfully registered model 'logistic-regression-model'.
2023/03/10 18:40:19 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: logistic-regression-model, version 1
Created version '1' of model 'logistic-regression-model'.


In [18]:
print(mlflow.tracking.MlflowClient().get_model_version("logistic-regression-model", '1'))

<ModelVersion: creation_timestamp=1678453819100, current_stage='None', description=None, last_updated_timestamp=1678453819100, name='logistic-regression-model', run_id='56ecd8e640dd437fb0cc083a71caf924', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/Applied-Machine-Learning/Assignment%202/mlruns/0/56ecd8e640dd437fb0cc083a71caf924/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


### Model 3: Multilayer Perceptron

In [19]:
mlflow.sklearn.autolog()

clf = MLPClassifier(random_state=101)
clf.fit(train_emb, y_train)

y_pred = clf.predict(test_emb)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_3=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Multi-layer Perceptron"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    
    print("\nMultilayer Perceptron")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix {conf_3} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multilayer-perceptron-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="multilayer-perceptron-model")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/03/10 18:40:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c5174b4bca304cf194f6355118fc9ec7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multilayer Perceptron
Accuracy: 0.9880382775119617
AUCPR: 0.9569132157008652
Confusion Matrix [[726   3]
 [  7 100]] 




Successfully registered model 'multilayer-perceptron-model'.
2023/03/10 18:40:39 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multilayer-perceptron-model, version 1
Created version '1' of model 'multilayer-perceptron-model'.


In [20]:
print(mlflow.tracking.MlflowClient().get_model_version("multilayer-perceptron-model", '1'))

<ModelVersion: creation_timestamp=1678453839790, current_stage='None', description=None, last_updated_timestamp=1678453839790, name='multilayer-perceptron-model', run_id='33e82757377d438c844f5c78594abd17', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/Applied-Machine-Learning/Assignment%202/mlruns/0/33e82757377d438c844f5c78594abd17/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


In [21]:
filename = "mlpmodel.joblib"
joblib.dump(clf, filename)

['mlpmodel.joblib']

Since MLP classifier performs the best among the three models in terms of accuracy and AUCPR score, we dump the model to use it further.