In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 1. Load Dataset
# Every later step needs `df`, so a missing file must stop the run here.
# Printing and carrying on would only resurface as a confusing NameError
# on the first line that touches `df`.
try:
    df = pd.read_csv("mail_l7_dataset.csv")
except FileNotFoundError:
    print("❌ Error: 'mail_l7_dataset.csv' not found. Check the folder!")
    raise
else:
    print("✅ File loaded successfully!")

# 2. Preprocess
# Replace missing cells with "" so the string operations below never see NaN.
df = df.where(pd.notnull(df), "")

# Normalise the label text and encode it as numbers (spam=0, ham=1).
# astype(str) keeps the .str accessor from raising if the column is not text.
labels = df['Category'].astype(str).str.lower().str.strip().map({'spam': 0, 'ham': 1})

# Anything that is neither "spam" nor "ham" (including blank labels) comes out
# of the mapping as NaN. Drop those rows rather than filling them with 0:
# filling would silently label them as spam and put wrong targets into both
# the training and the test data.
unknown = labels.isna()
if unknown.any():
    print(f"⚠️ Dropping {int(unknown.sum())} row(s) with an unrecognised Category.")
df = df.loc[~unknown].copy()
df['Category'] = labels[~unknown].astype(int)
print("✅ Labels encoded successfully!")

# 3. Split (80/20)
# The fixed seed keeps the same rows in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"], df["Category"], test_size=0.2, random_state=42
)

# 4. TF-IDF
# The vectorizer is fitted on the training messages only and then reused
# unchanged for the test messages, so test text does not influence the features.
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# 5. Train 3 Models
lr = LogisticRegression().fit(X_train_vec, y_train)
# The forest is randomised; seed it like the split above so the reported
# metrics are reproducible instead of changing from run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_vec, y_train)
nb = MultinomialNB().fit(X_train_vec, y_train)

# 6. SANITY CHECKS (the 3 specific messages from the assignment)
# Each message is vectorized with the already-fitted TF-IDF and classified by
# all three models, so their answers can be compared side by side.
print("\n=== SANITY CHECK RESULTS ===")
test_msgs = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
]

for msg in test_msgs:
    vec = tfidf.transform([msg])
    # Models return the encoded label: 0 is Spam, anything else is Ham.
    p_lr, p_rf, p_nb = (
        "Ham" if clf.predict(vec)[0] != 0 else "Spam"
        for clf in (lr, rf, nb)
    )
    print(f"\nMessage: {msg}")
    print(f"  Predictions -> LR: {p_lr}, RF: {p_rf}, NB: {p_nb}")

# 7. Performance Evaluation
def show_metrics(name, model):
    """Print test-set metrics for one fitted classifier.

    Predicts on the module-level ``X_test_vec`` and compares the result with
    ``y_test``. Precision, recall and F1 are computed with ``pos_label=0``,
    i.e. for the class this script encodes as spam.

    Args:
        name: Heading printed above the metrics.
        model: Fitted estimator exposing ``predict``.
    """
    y_pred = model.predict(X_test_vec)

    print(f"\n{name} Performance Metrics:")
    print(f"  Accuracy  : {accuracy_score(y_test, y_pred):.3f}")

    # The three spam-class scores share one call signature, so drive them from
    # a table; the prefixes keep the printed columns aligned.
    spam_metrics = (
        ("  Precision : ", precision_score),
        ("  Recall    : ", recall_score),
        ("  F1-Score  : ", f1_score),
    )
    for prefix, scorer in spam_metrics:
        value = scorer(y_test, y_pred, pos_label=0)
        print(f"{prefix}{value:.3f}")

    matrix = confusion_matrix(y_test, y_pred)
    print(f"  Confusion Matrix (Rows=Actual, Cols=Pred):\n{matrix}")

# Print the test-set report for each of the three fitted models.
for title, fitted in (
    ("Logistic Regression", lr),
    ("Random Forest", rf),
    ("Naive Bayes", nb),
):
    show_metrics(title, fitted)

✅ File loaded successfully!
✅ Labels encoded successfully!

=== SANITY CHECK RESULTS ===

Message: Free entry in 2 a weekly competition!
  Predictions -> LR: Ham, RF: Ham, NB: Spam

Message: I will meet you at the cafe tomorrow
  Predictions -> LR: Ham, RF: Ham, NB: Ham

Message: Congratulations, you won a free ticket
  Predictions -> LR: Ham, RF: Ham, NB: Ham

Logistic Regression Performance Metrics:
  Accuracy  : 0.968
  Precision : 1.000
  Recall    : 0.758
  F1-Score  : 0.863
  Confusion Matrix (Rows=Actual, Cols=Pred):
[[113  36]
 [  0 966]]

Random Forest Performance Metrics:
  Accuracy  : 0.979
  Precision : 1.000
  Recall    : 0.846
  F1-Score  : 0.916
  Confusion Matrix (Rows=Actual, Cols=Pred):
[[126  23]
 [  0 966]]

Naive Bayes Performance Metrics:
  Accuracy  : 0.977
  Precision : 1.000
  Recall    : 0.826
  F1-Score  : 0.904
  Confusion Matrix (Rows=Actual, Cols=Pred):
[[123  26]
 [  0 966]]
