In [1]:
# --------------------------------
# 0) Imports
# --------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB  # <--- Added Naive Bayes

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

RANDOM_STATE = 42  # reproducibility

# --------------------------------
# 1) Load the dataset
# --------------------------------
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
df = pd.read_csv(r"C:\Users\hp\Desktop\ml learning\ds-ml-bootcamp\submissions\suuelmi057\assigment-5\mail_l7_dataset.csv")

# Basic cleaning: replace NaNs with empty strings
df = df.where(pd.notnull(df), "")

# Encode labels: spam -> 0, ham -> 1.
# The labels are normalised once and mapped into a brand-new column instead of
# writing integers into the existing text column in place, so the encoding
# does not depend on that column accepting mixed str/int values.
label_codes = {"spam": 0, "ham": 1}
normalized_labels = df["Category"].astype(str).str.lower().str.strip()
encoded_labels = normalized_labels.map(label_codes)

# Fail here, listing the offending values, rather than at a later integer cast
# with a generic conversion error.
unknown_mask = encoded_labels.isna()
if unknown_mask.any():
    unknown_values = sorted(normalized_labels[unknown_mask].unique())
    raise ValueError(f"Unexpected Category labels: {unknown_values}")

df["Category"] = encoded_labels.astype(int)

print(df.head())

# --------------------------------
# 2) Split features (X) and target (y)
# --------------------------------
# The message text is the only input feature; the encoded category is the target.
X, y = df["Message"].astype(str), df["Category"].astype(int)

# --------------------------------
# 3) Train/test split (80% train, 20% test)
# --------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print("=== SPLIT SIZES ===")
print("Train:", len(X_train), " | Test:", len(X_test))

# --------------------------------
# 4) Text → TF-IDF features
# --------------------------------
# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)
X_train_features, X_test_features = (
    tfidf.fit_transform(X_train),
    tfidf.transform(X_test),
)

# --------------------------------
# 5) Train Models
# --------------------------------
# Every model is fitted on the same sparse TF-IDF training matrix and scored
# on the same sparse TF-IDF test matrix.

# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
# Predict on the sparse matrix directly: the forest was fitted on sparse input
# and accepts it at predict time, so a dense .toarray() copy only costs memory.
rf_pred = rf.predict(X_test_features)

# Naive Bayes (MultinomialNB)
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)

# --------------------------------
# 6) Helper functions: metrics + confusion matrix print
# --------------------------------
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        name: Model name shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class for precision,
            recall and F1. Defaults to 0 (spam).
    """
    # Build the "positive = ..." note from the argument so the printed text
    # stays truthful when a caller passes a pos_label other than 0.
    positive_names = {0: "spam=0", 1: "ham=1"}
    positive_note = positive_names.get(pos_label, str(pos_label))

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  (positive = {positive_note})")
    print(f"  Recall   : {rec:.3f}  (positive = {positive_note})")
    print(f"  F1-Score : {f1:.3f}  (positive = {positive_note})")

def print_confmat(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for one model.

    Rows are the actual classes and columns the predicted classes, both
    ordered ham (1) first and spam (0) second.

    Args:
        name: Model name shown in the header line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    label_order = [1, 0]
    row_names = ["Actual: Ham (1)", "Actual: Spam (0)"]
    col_names = ["Pred: Ham (1)", "Pred: Spam (0)"]

    counts = confusion_matrix(y_true, y_pred, labels=label_order)
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"\n{name} – Confusion Matrix:\n{table}")

# --------------------------------
# 7) Show results for all three models
# --------------------------------
# Report metrics and the confusion matrix for each model, in training order.
model_predictions = {
    "Logistic Regression": lr_pred,
    "Random Forest": rf_pred,
    "Naive Bayes": nb_pred,
}
for model_name, predictions in model_predictions.items():
    print_clf_metrics(model_name, y_test, predictions, pos_label=0)
    print_confmat(model_name, y_test, predictions)

# --------------------------------
# 8) Assignment Sanity Checks
# --------------------------------
# Hand-written probe messages that each trained model classifies in the loop below.
check_messages = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]

def lab2str(v):
    """Return the display string for an encoded label: 0 is spam, anything else ham."""
    if v == 0:
        return "Spam (0)"
    return "Ham (1)"

print("\n=== ASSIGNMENT SANITY CHECKS ===")
# Vectorise all probe messages in one call and let each model score the whole
# batch, instead of transforming and predicting one message at a time.
check_features = tfidf.transform(check_messages)
lr_checks = lr.predict(check_features)
# The forest takes the sparse matrix as-is, so no dense .toarray() copy is made.
rf_checks = rf.predict(check_features)
nb_checks = nb.predict(check_features)

for msg, res_lr, res_rf, res_nb in zip(check_messages, lr_checks, rf_checks, nb_checks):
    print(f"\nMessage: {msg}")
    print(f"LR Pred: {lab2str(res_lr)} | RF Pred: {lab2str(res_rf)} | NB Pred: {lab2str(res_nb)}")

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...
=== SPLIT SIZES ===
Train: 4457  | Test: 1115

Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.758  (positive = spam=0)
  F1-Score : 0.863  (positive = spam=0)

Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Random Forest – Confusion Matrix:
                  Pred