In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Define random state
# Seed passed to train_test_split here and to the seeded models further down.
RANDOM_STATE = 42

# 2) Load the dataset
df = pd.read_csv("mail_l7_dataset.csv")

# Handle missing values
# Missing message bodies become empty strings so every row is a str.
df['Message'] = df['Message'].fillna("")

# Encode labels
# Series.map() yields NaN for any value that is not a key of the mapping, so an
# unexpected label (different casing, stray whitespace, missing cell) would only
# surface later as an unrelated error while fitting. Fail here instead and name
# the offending values.
raw_labels = df['Category']
df['Category'] = raw_labels.map({'spam': 0, 'ham': 1})
unmapped = df['Category'].isna()
if unmapped.any():
    bad_values = sorted(raw_labels[unmapped].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'spam' or 'ham'): {bad_values}"
    )

X = df['Message']
y = df['Category']

# 3) Train/test split
# 80/20 split, reproducible through RANDOM_STATE.
# NOTE(review): the split is not stratified; consider stratify=y if the
# spam/ham ratio must be identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# 4) TF-IDF
# The vocabulary and IDF weights are learned from the training split only; the
# test split is transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

# 5) Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

# 6) Random Forest
# The sparse TF-IDF matrices are passed as-is: RandomForestClassifier accepts
# sparse input for fit() and predict(), so materialising a dense
# (documents x vocabulary) array with .toarray() only costs memory.
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features)

# 7) Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)
# 8) Evaluation Functions
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one classifier.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        pos_label: Label treated as the positive class for precision, recall
            and F1. Defaults to 0.
    """
    # Every score is computed before anything is printed.
    rows = [
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred, pos_label=pos_label)),
        ("Recall", recall_score(y_true, y_pred, pos_label=pos_label)),
        ("F1-Score", f1_score(y_true, y_pred, pos_label=pos_label)),
    ]

    print(f"\n{name} Performance:")
    for metric, score in rows:
        # Pad the metric name to 9 characters so the colons line up.
        print(f"  {metric:<9}: {score:.3f}")
def print_confmat(name, y_true, y_pred):
    """Print a labelled confusion matrix for one classifier.

    Rows and columns are both ordered ham (1) first, then spam (0); rows are
    labelled as actual classes and columns as predicted classes.

    Args:
        name: Label shown in the header line.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
    """
    class_order = [1, 0]
    row_names = ["Actual: Ham (1)", "Actual: Spam (0)"]
    col_names = ["Pred: Ham (1)", "Pred: Spam (0)"]

    counts = confusion_matrix(y_true, y_pred, labels=class_order)
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"\n{name} – Confusion Matrix:\n{table}")
# 9) Show results
# Each model is reported against the same held-out labels: metrics first, then
# its confusion matrix.
for model_name, model_pred in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("Naive Bayes", nb_pred),
):
    print_clf_metrics(model_name, y_test, model_pred)
    print_confmat(model_name, y_test, model_pred)

# 10) Single-message predictions
examples = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]


# Defined once, outside the loop: it does not depend on the message being scored.
def lab2str(v):
    """Return the display string for an encoded label: 0 is spam, anything else ham."""
    return "Spam (0)" if v == 0 else "Ham (1)"


print("\n=== CUSTOM MESSAGE CHECK ===")
for text in examples:
    # Vectorize the single message with the already-fitted TF-IDF vocabulary.
    tfidf_text = tfidf.transform([text])
    lr_p = int(lr.predict(tfidf_text)[0])
    # The forest's predict() accepts the sparse row directly; no dense copy needed.
    rf_p = int(rf.predict(tfidf_text)[0])
    nb_p = int(nb.predict(tfidf_text)[0])
    print("\nText:", text)
    print("LR :", lab2str(lr_p))
    print("RF :", lab2str(rf_p))
    print("NB :", lab2str(nb_p))


Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000
  Recall   : 0.758
  F1-Score : 0.863

Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000
  Recall   : 0.872
  F1-Score : 0.932

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130

Naive Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000
  Recall   : 0.826
  F1-Score : 0.904

Naive Bayes – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             26             123

=== CUSTOM MESSAGE CHECK ===

Text: Free entry in 2 a weekly competition!
LR : Ham (1)
RF : Ham (1)
NB : Spam (0)

Text: I will meet you at th