# Assignment 2 : SMS Spam Classification
# Rishika Tibrewal, MDS202135

## Importing Libraries

In [None]:
# ! pip install mlflow

In [None]:
# ! pip install jinja2==3.0.3

In [1]:
%matplotlib inline
import mlflow
import logging
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from markupsafe import escape
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_recall_curve,auc
from sklearn.ensemble import RandomForestClassifier

## Loading training, validation, test data

In [2]:
train=pd.read_csv("Data/Training Data.csv")
val=pd.read_csv("Data/Validation Data.csv")
test=pd.read_csv("Data/Test Data.csv")

In [3]:
train.head()

Unnamed: 0,Text,Label
0,Hey gorgeous man. My work mobile number is. Ha...,0
1,IM GONNAMISSU SO MUCH!!I WOULD SAY IL SEND U A...,0
2,I thk ü gotta go home by urself. Cos i'll b go...,0
3,OK i'm waliking ard now... Do u wan me 2 buy a...,0
4,Come to medical college at 7pm ......forward i...,0


In [4]:
val.head()

Unnamed: 0,Text,Label
0,Just finished eating. Got u a plate. NOT lefto...,0
1,"Sorry, got a late start, we're on the way",0
2,"If you don't, your prize will go to another cu...",1
3,I can't make it tonight,0
4,I don't want you to leave. But i'm barely doin...,0


In [5]:
test.head()

Unnamed: 0,Text,Label
0,Squeeeeeze!! This is christmas hug.. If u lik ...,0
1,And also I've sorta blown him off a couple tim...,0
2,Mmm thats better now i got a roast down me! i...,0
3,Mm have some kanji dont eat anything heavy ok,0
4,So there's a ring that comes with the guys cos...,0


In [6]:
y_train,X_train=train["Label"],train["Text"]
y_val,X_val=val["Label"],val["Text"]
y_test,X_test=test["Label"],test["Text"]

## Converting string to vectors, to give as input to the models

In [7]:
# replacing NAN entries by empty string
X_train = X_train.replace(np.nan, '', regex=True)
X_val = X_val.replace(np.nan, '', regex=True)
X_test = X_test.replace(np.nan, '', regex=True)

In [8]:
count = CountVectorizer().fit(X_train)
X_train = count.transform(X_train)
X_val = count.transform(X_val)
X_test = count.transform(X_test)

In [9]:
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(X_train)
tfidf_val = tfidf_transformer.fit_transform(X_val)
tfidf_test = tfidf_transformer.fit_transform(X_test)

In [10]:
tfidf_train.shape,tfidf_val.shape,tfidf_test.shape

((4025, 7286), (711, 7286), (836, 7286))

## Training Models

In [11]:
def eval_metrics(actual, pred):
    precision, recall, thresholds = precision_recall_curve(actual, pred)
    auc_precision_recall = auc(recall, precision)
    return (auc_precision_recall)

In [12]:
mlflow.sklearn.autolog()

n_estimators = 200
max_depth = 5
clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth,random_state=101)
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_1=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name=f"n_estimators : {n_estimators}, max_depth : {max_depth}"):
  
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.log_dict(np.array(conf_1).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(clf, "model")

    print("\nRandom Classifier Model (no_of_estimator={:f}, max_depth={:f}):".format(n_estimators, max_depth))
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr} ")
    print(f"Confusion Matrix: {conf_1} \n \n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="random-forest-classification-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="Randomclassifier")
    else:
      mlflow.sklearn.log_model(clf, "model")


2023/02/26 16:28:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2dde5d31e9e742349d5cae9e9aceb656', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Random Classifier Model (no_of_estimator=200.000000, max_depth=5.000000):
Accuracy: 0.8720095693779905
AUCPR: 0.5639952153110048 
Confusion Matrix: [[729   0]
 [107   0]] 
 



Successfully registered model 'random-forest-classification-model'.
2023/02/26 16:29:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-classification-model, version 1
Created version '1' of model 'random-forest-classification-model'.


In [13]:
mlflow.tracking.MlflowClient().get_model_version("random-forest-classification-model","1")

<ModelVersion: creation_timestamp=1677409154130, current_stage='None', description=None, last_updated_timestamp=1677409154130, name='random-forest-classification-model', run_id='cc2d239d41a0441592169baf307f5d34', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/AML/Applied-Machine-Learning/Assignment%202/mlruns/0/cc2d239d41a0441592169baf307f5d34/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [14]:
# ! mlflow ui

### Model 2: Multinomial Naive Bayes

In [15]:
mlflow.sklearn.autolog()

clf = MultinomialNB()
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_2=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Multinomial Naive Bayes"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    mlflow.log_dict(np.array(conf_2).tolist(), "confusion_matrix.json")

    print("\nMultinomial Naive Bayes")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix: {conf_2} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multinomial-nb-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="multinomial-nb-model")
    else:
      mlflow.sklearn.log_model(clf, "model")


2023/02/26 16:29:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '28a3eb85cd1c46969108edfe804a2365', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multinomial Naive Bayes
Accuracy: 0.9617224880382775
AUCPR: 0.8696060457004874
Confusion Matrix: [[729   0]
 [ 32  75]] 




Successfully registered model 'multinomial-nb-model'.
2023/02/26 16:30:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multinomial-nb-model, version 1
Created version '1' of model 'multinomial-nb-model'.


In [16]:
print(mlflow.tracking.MlflowClient().get_model_version("multinomial-nb-model", '1'))

<ModelVersion: creation_timestamp=1677409214424, current_stage='None', description=None, last_updated_timestamp=1677409214424, name='multinomial-nb-model', run_id='b7fed7e2214445e4abd2b61575801177', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/AML/Applied-Machine-Learning/Assignment%202/mlruns/0/b7fed7e2214445e4abd2b61575801177/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


### Model 3: MLP Classifier

In [17]:
mlflow.sklearn.autolog()

clf = MLPClassifier(random_state=101,learning_rate='adaptive')
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_3=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Multilayer Perceptron"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    
    print("\nMultilayer Perceptron")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix {conf_3} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multilayer-perceptron-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="multilayer-perceptron-model")
    else:
      mlflow.sklearn.log_model(clf, "model")


2023/02/26 16:30:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f0a22e2f42e14db68f6865bcfd6a50b2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multilayer Perceptron
Accuracy: 0.9904306220095693
AUCPR: 0.9674015114251219
Confusion Matrix [[729   0]
 [  8  99]] 




Successfully registered model 'multilayer-perceptron-model'.
2023/02/26 16:31:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multilayer-perceptron-model, version 1
Created version '1' of model 'multilayer-perceptron-model'.


In [18]:
print(mlflow.tracking.MlflowClient().get_model_version("multilayer-perceptron-model", '1'))

<ModelVersion: creation_timestamp=1677409309928, current_stage='None', description=None, last_updated_timestamp=1677409309928, name='multilayer-perceptron-model', run_id='33f7f5d350354ebbb3825e475e905b05', run_link=None, source='file:///c:/Users/Rishika%20Tibrewal/OneDrive/Desktop/AML/Applied-Machine-Learning/Assignment%202/mlruns/0/33f7f5d350354ebbb3825e475e905b05/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


The MLP Classifier performs the best among the three used models.