# SPAM CLASSIFIER 

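Before you start, download the `spam.csv` dataset from https://www.kaggle.com/uciml/sms-spam-collection-dataset and place it in the same directory as this notebook (the code reads it via the relative path `spam.csv`).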

In [1]:
# default_exp train

## Input parameters for mlflow project 

In [None]:
#export 
import argparse
parser= argparse.ArgumentParser()

parser.add_argument('--max_features', type=int)

args = parser.parse_args()
input_params = args.__dict__

In [3]:
#hide
# Interactive default for max_features; this cell is marked #hide rather than
# #export, so it is presumably left out of the generated training script.
input_params = dict(max_features=3000)

In [4]:
#export
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multiclass import OneVsRestClassifier

## Prepare data 

In [8]:
#export
# Load the SMS Spam Collection dataset. Download spam.csv from:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
# The file is decoded as latin-1.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Column 'v1' is the target (the classification report shows the classes
# 'ham' and 'spam'); column 'v2' is the input passed to the text vectorizer.
# Note: pop() removes both columns from df.
y = df.pop('v1').to_numpy()
X = df.pop('v2').to_numpy()

# Fixed seed so the 70/30 train/test split, and therefore the accuracy logged
# to MLflow, is reproducible from run to run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

## Train and log the model to MLflow

In [None]:
#export

import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

#conda_env=mlflow.pyfunc.get_default_conda_env()

with mlflow.start_run():

    # Text-classification pipeline: TF-IDF features (English stop words
    # removed, vocabulary capped at max_features) feeding a linear-kernel SVC
    # wrapped in a one-vs-rest classifier.
    svc_tfidf = Pipeline(steps=[
        (
            "tfidf_vectorizer",
            TfidfVectorizer(
                stop_words="english",
                max_features=input_params['max_features'],
            ),
        ),
        ("linear svc", OneVsRestClassifier(SVC(kernel='linear'))),
    ])
    model = svc_tfidf

    # Fit on the training split, then score on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ac_score = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))

    # Record the hyper-parameter and the resulting metric on the active run.
    mlflow.log_param("max_features", input_params['max_features'])
    mlflow.log_metric("accuracy_score", ac_score)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The model is registered under "SMSSpamModel" only when the tracking URI
    # scheme is not "file" (presumably because a plain file store offers no
    # model registry -- confirm against the MLflow docs). Otherwise it is just
    # logged as a run artifact.
    registry_kwargs = (
        {}
        if tracking_url_type_store == "file"
        else {"registered_model_name": "SMSSpamModel"}
    )
    mlflow.sklearn.log_model(model, "model", **registry_kwargs)

## Export train code 

The code above is exported to a Python file using the nbdev library (the `export`, `hide`, and `default_exp` keywords are required for this).

In [2]:
#hide
# Import only the function that is used instead of a wildcard import, so the
# notebook namespace is not filled with every public name of nbdev.export.
from nbdev.export import notebook2script

# Convert the notebook's #export cells into the Python module named by the
# `default_exp` directive.
notebook2script()

Converted spam.ipynb.


## Train from the command line using MLflow

In [3]:
# Set the tracking URI that the MLflow CLI/client reads from the environment.
# Keep comments on their own lines: text after a magic becomes part of its argument.
%env MLFLOW_TRACKING_URI=http://mlflow:5000
# Run the MLflow project in the current directory under the "spamclassifier"
# experiment, without creating a conda environment, passing max_features=3000
# as a project parameter (forwarded to train.py as --max_features 3000).
!mlflow run . --no-conda --experiment-name="spamclassifier" -P max_features=3000

env: MLFLOW_TRACKING_URI=http://mlflow:5000
INFO: 'spamclassifier' does not exist. Creating a new experiment
2021/04/05 21:50:05 INFO mlflow.projects.utils: === Created directory /tmp/tmpqdkfoy6m for downloading remote URIs passed to arguments of type 'path' ===
2021/04/05 21:50:05 INFO mlflow.projects.backend.local: === Running command 'python3 ./spamclassifier/train.py --max_features 3000' in run with ID '64a89b0a6b7346498316bfae4c298535' === 
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1451
        spam       0.96      0.93      0.94       221

    accuracy                           0.99      1672
   macro avg       0.97      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672

Registered model 'SMSSpamModel' already exists. Creating a new version of this model...
2021/04/05 21:50:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.       

## Load from mlflow repository and test 

In [30]:
import mlflow.sklearn

# Other URI forms that were tried for locating the trained model:
#   "runs:/96771d893a5e46159d9f3b49bf9013e2/sk_models"
#   "/mlflow/mlruns/2/64a89b0a6b7346498316bfae4c298535/artifacts/model"
#   "models:/SMSSpamModel/Staging"
# The one in use addresses version 2 of the registered model "SMSSpamModel".
model_uri = "models:/SMSSpamModel/2"
sk_model = mlflow.sklearn.load_model(model_uri)

In [32]:
# Classify one held-out message; predict() takes a batch, so the single text
# is wrapped in a list and the first (only) prediction is displayed.
sample_message = X_test[17]
res = sk_model.predict([sample_message])
res[0]

'spam'

In [None]:
# Preview the first 50 held-out messages.
X_test[:50]