In [1]:
import mlflow
from mlflow.tracking import MlflowClient

# Show which tracking store this kernel is using before querying it.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# Client created with no arguments, so it uses the tracking URI printed above.
client = MlflowClient()

# Use the search_experiments function to get a list of available experiments
experiments = client.search_experiments()
# Number of experiments returned by the search.
print(len(experiments))

# Display the list of available experiments (one line per experiment: name and ID)
for experiment in experiments:
    print(f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}")

tracking URI: 'file:///Users/mohammedzaidsyed/Desktop/Spam/emailspam/mlruns'
3
Experiment Name: SVM, Experiment ID: 407553083182888926
Experiment Name: Random Forest, Experiment ID: 266853881450216497
Experiment Name: Naive Bayes, Experiment ID: 831114447560693852


In [2]:
import pandas as pd
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Parameters:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, using ``read_csv``'s default options.
    """
    frame = pd.read_csv(path)
    return frame

# Load the spam/ham dataset; the path is relative to the notebook's working directory.
df = load_data('data/spam_ham_dataset.csv')
# Preview the first rows as the cell output.
df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5171, 4)

In [7]:
# Import libraries
# !pip install plotly
# !pip install nbformat==4.2.0


import seaborn as sns
import matplotlib.pyplot as plt
import re
import plotly.express as px

# `label` is a two-valued categorical column (ham / spam), so there are no
# quartiles for a box plot to summarise; plot the per-class counts instead to
# make the class balance visible.
fig = px.histogram(df, x='label', color_discrete_sequence=px.colors.sequential.Agsunset,
                   width=600, height=500)
# Title describes what is actually plotted (this dataset has no price column).
fig.update_layout(title_text='Distribution of Email Labels (ham vs. spam)')
fig.show()

In [6]:
# Count distinct values per column. In the recorded output `text` has fewer
# distinct values (4993) than the frame has rows (5171), i.e. some messages repeat.
df.nunique()

Unnamed: 0    5171
label            2
text          4993
label_num        2
dtype: int64

In [8]:
# Columns treated as categorical features.
# NOTE(review): the helpers visible in this notebook (encode_cols, extract_x_y)
# fall back to their own hard-coded ["text"] default rather than reading this
# constant - confirm whether it is still needed or should be passed to them.
CATEGORICAL_COLS = ["text"]

In [23]:
from typing import List
from sklearn.feature_extraction import DictVectorizer

def encode_cols(df: pd.DataFrame, categorical_cols: List[str] = None) -> pd.DataFrame:
    """Return a copy of ``df`` whose categorical columns are lowercase strings.

    Parameters:
        df: Input frame. It is left untouched; the result is a new frame.
        categorical_cols: Columns to normalise. Defaults to ``["text"]``.

    Returns:
        A new DataFrame in which every listed column has been cast to ``str``
        and lowercased; all other columns are carried over unchanged.
    """
    if categorical_cols is None:
        categorical_cols = ["text"]

    # Work on a copy: assigning into the caller's frame would silently change
    # data other cells still hold, and on frames that are slices of another
    # frame (e.g. train_test_split output) it triggers SettingWithCopyWarning.
    encoded = df.copy()
    encoded[categorical_cols] = encoded[categorical_cols].apply(
        lambda col: col.astype(str).str.lower()
    )
    return encoded


def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: "DictVectorizer" = None,
    with_target: bool = True,
) -> tuple:
    """Vectorize the feature columns of ``df`` and optionally extract the target.

    Parameters:
        df: Frame holding the feature columns and, when ``with_target`` is
            true, a ``"label"`` column.
        categorical_cols: Feature columns to vectorize. Defaults to ``["text"]``.
        dv: Vectorizer to use. When ``None`` (training only) a new
            ``DictVectorizer`` is created and fitted on ``df``.
        with_target: Whether to read the ``"label"`` column as the target.

    Returns:
        A 3-tuple ``(x, y, dv)``: the output of ``dv.transform`` on the row
        dicts, the label values (``None`` when ``with_target`` is false), and
        the vectorizer that was used.

    Raises:
        ValueError: If ``with_target`` is false and no fitted ``dv`` is given;
            there is nothing to transform with, and fitting on inference data
            would produce features that do not match the trained model.
    """
    if categorical_cols is None:
        categorical_cols = ["text"]
    dicts = df[[*categorical_cols]].to_dict(orient="records")

    if dv is None:
        if not with_target:
            raise ValueError(
                "extract_x_y requires a fitted `dv` when with_target=False"
            )
        # Training path: learn the feature mapping from this frame.
        dv = DictVectorizer()
        dv.fit(dicts)

    y = df["label"].values if with_target else None
    x = dv.transform(dicts)
    return x, y, dv

# save the preprocessor into saved_pkl folder
import pickle
def save_picked(path: str, file):
    """Pickle ``file`` to ``path``.

    Parameters:
        path: Destination file; opened in binary write mode, so an existing
            file at that location is overwritten.
        file: The object to serialize.
    """
    with open(path, "wb") as sink:
        pickle.Pickler(sink).dump(file)


In [12]:
from sklearn.model_selection import train_test_split
# Dry run of the preprocessing helpers: load -> split -> persist -> encode -> vectorize.
# The raw dataset is only read here, never written back: re-saving the file it
# was just loaded from adds nothing and risks damaging the source data if the
# cell is interrupted mid-write.
df = load_data('data/spam_ham_dataset.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# index=False keeps the shuffled row index out of the files; without it every
# reload via read_csv picks up an extra "Unnamed: 0" column.
train_df.to_csv("data/train-set.csv", index=False)
test_df.to_csv("data/test-set.csv", index=False)
train_df = encode_cols(train_df)
test_df = encode_cols(test_df)
# Fit the vectorizer on the training split only, then reuse it for the test split.
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

In [15]:
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from typing import List

# Set the experiment name
mlflow.set_experiment("spam_email_classification_experiment")

# Check if there's an active run, and end it if necessary
if mlflow.active_run():
    mlflow.end_run()

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Tag the run with the experiment it belongs to (the tag key says
    # "experiment_id", so it must carry the experiment's ID, not the run's).
    mlflow.set_tag("experiment_id", run.info.experiment_id)

    # Load the train/test splits written by the preprocessing cell
    train_df = load_data("data/train-set.csv")
    test_df = load_data("data/test-set.csv")

    # Raw message texts as plain Python lists for the vectorizer
    train_text = train_df['text'].tolist()
    test_text = test_df['text'].tolist()

    # Vectorize the text data: fit TF-IDF on the training split only, then
    # apply the same fitted vocabulary to the test split.
    # NOTE(review): only `model` is logged below; this fitted `vectorizer` is
    # not stored with the run, so the logged model cannot score raw text on
    # its own - confirm how the serving side rebuilds these features.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    y_train = train_df['label']
    y_test = test_df['label']

    # Train a classification model
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Evaluate on the training split
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    mlflow.log_metric("train_accuracy", train_accuracy)
    print("Train Accuracy:", train_accuracy)
    print(classification_report(y_train, y_train_pred))

    # Evaluate on the held-out split
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("test_accuracy", test_accuracy)
    print("Test Accuracy:", test_accuracy)
    print(classification_report(y_test, y_test_pred))

    # Log the model as an artifact of this run
    mlflow.sklearn.log_model(model, "model")

    # Register the model in MLflow Model Registry
    model_name = "spam_email_classifier_v1"
    model_description = "Naive Bayes Classifier for Spam Email Detection"
    mlflow.register_model(f"runs:/{run_id}/model", model_name)

    # Attach the human-readable description to the registered model so it
    # shows up in the registry instead of being an unused local.
    mlflow.tracking.MlflowClient().update_registered_model(
        name=model_name, description=model_description
    )


Train Accuracy: 0.9197292069632496
              precision    recall  f1-score   support

         ham       0.90      1.00      0.95      2930
        spam       1.00      0.72      0.84      1206

    accuracy                           0.92      4136
   macro avg       0.95      0.86      0.89      4136
weighted avg       0.93      0.92      0.92      4136

Test Accuracy: 0.8772946859903382
              precision    recall  f1-score   support

         ham       0.85      1.00      0.92       742
        spam       1.00      0.57      0.72       293

    accuracy                           0.88      1035
   macro avg       0.93      0.78      0.82      1035
weighted avg       0.90      0.88      0.87      1035




Setuptools is replacing distutils.

Successfully registered model 'spam_email_classifier_v1'.
Created version '1' of model 'spam_email_classifier_v1'.


In [19]:
from mlflow.tracking import MlflowClient

# Initialize MLflow tracking client
client = MlflowClient()

# Set the correct model type and experiment path
# NOTE(review): despite its name, `mlflow_experiment_path` holds the
# registered-model name (it is passed as `name=` below), not an experiment path.
# `model_type` is not used anywhere in this cell.
model_type = "multinomial_nb"
mlflow_experiment_path = 'spam_email_classifier_v1'

# Specify the version of the model to be transitioned
production_version = 1

# Transition the specified model version to the "Production" stage
# NOTE: the recorded output of this cell shows MLflow reporting
# transition_model_version_stage as deprecated since 2.9.0, with registry
# stages slated for removal in a future major release. A later cell loads the
# model through the "production" stage, so both must be migrated together.
client.transition_model_version_stage(name=mlflow_experiment_path, version=production_version, stage="Production")



``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/2.9.2/model-registry.html#migrating-from-stages



<ModelVersion: aliases=[], creation_timestamp=1706714433496, current_stage='Production', description=None, last_updated_timestamp=1706714945705, name='spam_email_classifier_v1', run_id='9b67c7a096754c4899ca86b5cec240f5', run_link=None, source='file:///Users/mohammedzaidsyed/Desktop/Spam/emailspam/mlruns/570628366949585892/9b67c7a096754c4899ca86b5cec240f5/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [20]:
# Launch the MLflow UI on port 5002, listening on all network interfaces (0.0.0.0).
# NOTE: the server runs in the foreground, so this cell keeps the kernel busy
# until it is interrupted (the recorded output ends with ^C).
!mlflow ui --host 0.0.0.0 --port 5002

[2024-01-31 16:29:28 +0100] [49637] [INFO] Starting gunicorn 21.2.0
[2024-01-31 16:29:28 +0100] [49637] [INFO] Listening at: http://0.0.0.0:5002 (49637)
[2024-01-31 16:29:28 +0100] [49637] [INFO] Using worker: sync
[2024-01-31 16:29:28 +0100] [49638] [INFO] Booting worker with pid: 49638
[2024-01-31 16:29:28 +0100] [49639] [INFO] Booting worker with pid: 49639
[2024-01-31 16:29:28 +0100] [49640] [INFO] Booting worker with pid: 49640
[2024-01-31 16:29:28 +0100] [49641] [INFO] Booting worker with pid: 49641
^C
[2024-01-31 16:30:02 +0100] [49637] [INFO] Handling signal: int
[2024-01-31 16:30:02 +0100] [49639] [INFO] Worker exiting (pid: 49639)
[2024-01-31 16:30:02 +0100] [49640] [INFO] Worker exiting (pid: 49640)
[2024-01-31 16:30:02 +0100] [49638] [INFO] Worker exiting (pid: 49638)
[2024-01-31 16:30:02 +0100] [49641] [INFO] Worker exiting (pid: 49641)


In [25]:
import pickle

def save_pickle(file, path):
    """
    Serialize an object to disk with pickle.

    The argument order is (object, destination) - the reverse of
    ``save_picked(path, file)`` defined earlier in this notebook.

    Parameters:
        file: Any - The object to serialize.
        path: str - Destination file; opened in binary write mode, so an
            existing file at that location is overwritten.
    """
    with open(path, "wb") as out:
        pickler = pickle.Pickler(out)
        pickler.dump(file)

# Export the trained artifacts for the web service.
# Paths are relative to the project root - the same working directory the
# notebook already assumes for its data/ files - rather than one user's
# absolute home-directory path.
save_pickle(model, "web_service/model_v/model.pkl")
save_pickle(dv, "web_service/dv_v/dv.pkl")
# `model` was fitted on TF-IDF features, so the fitted TfidfVectorizer has to
# ship with it; the DictVectorizer in `dv` cannot reproduce those features.
# NOTE(review): the web service presumably loads dv.pkl today - confirm and
# point it at this file for inference.
save_pickle(vectorizer, "web_service/dv_v/tfidf_vectorizer.pkl")

In [26]:
# from config import PATH_TO_MODEL, PATH_TO_PREPROCESSOR
# Load the model version currently in the registry's "production" stage.
# `mlflow_experiment_path` holds the registered-model name set in an earlier cell.
model_uri = f"models:/{mlflow_experiment_path}/production"
model = mlflow.sklearn.load_model(model_uri)
# Write it where the web service expects it; the path is relative to the
# project root (the notebook's working directory) so it is not tied to one
# user's machine.
save_picked("web_service/model_v/model.pkl", model)

def load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only call this on files you trust.
    """
    with open(path, "rb") as src:
        return pickle.load(src)

# Read the exported artifacts back to check they round-trip. Paths are relative
# to the project root (the notebook's working directory) instead of one user's
# absolute home-directory path.
dv = load_pickle("web_service/dv_v/dv.pkl")
model = load_pickle("web_service/model_v/model.pkl")