In [None]:
%pip install pandas scikit-learn mlflow

^C
Note: you may need to restart the kernel to use updated packages.


Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting mlflow
  Using cached mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Using cached mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Using cached flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached 

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import pickle


In [7]:
# Load the e-mail corpus, rename its columns to (text, label) and cast the
# label column to int, all in one chained expression so that re-running the
# cell always rebuilds `df` from the file.
df = (
    pd.read_csv("emails.csv")
    .set_axis(['text', 'label'], axis=1)
    .astype({'label': int})
)

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training fold only and reuse the fitted
# vectorizer to transform the validation fold.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# MLflow experiment
mlflow.set_experiment("spam_classification_nb")

alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]

# Metric name (as logged to MLflow) -> scoring function, kept in logging order.
metric_fns = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}

# One MLflow run per alpha: train, score on the validation fold, then log the
# parameter, the four metrics, the classifier and the vectorizer.
for alpha in alpha_values:
    with mlflow.start_run():
        model = MultinomialNB(alpha=alpha)
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_val_tfidf)

        scores = {
            metric_name: score_fn(y_val, y_pred)
            for metric_name, score_fn in metric_fns.items()
        }

        mlflow.log_param("alpha", alpha)
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf, "vectorizer")

        acc, f1 = scores["accuracy"], scores["f1_score"]
        print(f"[alpha={alpha}] accuracy={acc:.4f}, f1={f1:.4f}")




[alpha=0.001] accuracy=0.9799, f1=0.9590




[alpha=0.01] accuracy=0.9825, f1=0.9647




[alpha=0.1] accuracy=0.9799, f1=0.9591




[alpha=0.5] accuracy=0.9415, f1=0.8694




[alpha=1.0] accuracy=0.8927, f1=0.7309




[alpha=5.0] accuracy=0.8045, f1=0.3708




[alpha=10.0] accuracy=0.7862, f1=0.2687


In [18]:
# Refit Multinomial Naive Bayes on the TF-IDF training matrix with the selected
# alpha, then pickle both the classifier and the fitted vectorizer so that
# inference can restore the pair from disk.
best_alpha = 0.01
final_model = MultinomialNB(alpha=best_alpha).fit(X_train_tfidf, y_train)

for artifact, target_path in (
    (final_model, r"best_mnnaivebayesmodel.pkl"),
    (tfidf, r"tfidf_vectorizer.pkl"),
):
    with open(target_path, "wb") as sink:
        pickle.dump(artifact, sink)


In [20]:
# Inference: restore the pickled classifier and vectorizer, then classify a
# single e-mail.


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute code stored in the file, so this should only be
    pointed at artifacts written by a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


model = _load_pickle("best_mnnaivebayesmodel.pkl")
vectorizer = _load_pickle("tfidf_vectorizer.pkl")

email_text = "Subject: loose your fat in 9 days  are you  overweight ?  loose 9 pounds every 11  days !  do you want to loose weight fast  the natural way ?  the idiot proof diet  will help you shed 9 pounds every 11 days , take it for a free test drive  at the link below :  click here for the  idiot proof diet  w a r n i n g :  if  you notice that you are loosing too much weight to quickly , then you  should stop dieting for a few days if you loose more than 1 pound per  day you should slow down a little .  click here for the  idiot proof diet  affid : zoolant 44561  getresponse marketing p . 0 box 1451 waterfall , south africa 3652 this e - mail message is an advertisement and / or solicitation ."

# The vectorizer is given a one-element list, so the prediction array holds a
# single entry; take it with [0].
X_input = vectorizer.transform([email_text])
prediction = model.predict(X_input)[0]

# A predicted class of 1 is reported as spam, anything else as ham.
if prediction == 1:
    label = "spam"
else:
    label = "ham"
print(f"Prediction: {label}")

Prediction: spam
