In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve
)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier




In [None]:
# Location of the raw spam dataset (CSV).
# The notebook used to work only on the author's machine because the path was
# hard-coded. Set the SPAM_DATA_CSV environment variable to point at the file on
# any other machine; the original location remains the default, so existing
# setups keep working unchanged.
DATA_PATH = os.environ.get(
    "SPAM_DATA_CSV",
    r"D:\Codebasics\NLP_Projects\spam_fraud_message\data\raw\spam_data.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Partition the frame by label once; label_binary is 1 for spam and 0 for ham.
spam_df = df[df['label_binary'] == 1]
ham_df = df[df['label_binary'] == 0]

print(f"Total Dataset {len(df)}")
print(f"Total Spam {len(spam_df)}")
print(f"Total Ham {len(ham_df)}")

# Spam-to-ham ratio expressed as a percentage (spam rows per 100 ham rows).
total_class_imbalance = (len(spam_df) / len(ham_df)) * 100

total_class_imbalance



In [None]:
# Raw message text is the single input feature; label_binary is the target.
X, y = df['message'], df['label_binary']

In [None]:
# Hold out 20% of the messages for testing.
# stratify=y keeps the spam/ham proportion the same in both parts (IMPORTANT
# for an imbalanced dataset); random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Naive Bayes

In [None]:
# ── VECTORIZATION ────────────────────────────────────────────────────────────
# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto them, so nothing leaks from test.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

In [None]:
# ── TRAIN MODEL ──────────────────────────────────────────────────────────────
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# ── PREDICTIONS ──────────────────────────────────────────────────────────────
# Column 1 of predict_proba is taken as the spam (label 1) probability; it feeds
# the threshold-free metrics (ROC-AUC / ROC curve) further down.
y_pred_proba = model_nb.predict_proba(X_test_tfidf)[:, 1]
# Hard 0/1 decisions, used by accuracy / precision / recall / F1.
y_pred       = model_nb.predict(X_test_tfidf)


In [None]:


# ── CORE METRICS ─────────────────────────────────────────────────────────────
# All scores are computed on the held-out test split, with spam (label 1) as the
# positive class.

# ACCURACY: share of ALL messages (ham + spam) that were labelled correctly.
# ⚠ Misleading on imbalanced data: a model that ignores spam can still score high.
accuracy = accuracy_score(y_test, y_pred)

# PRECISION: of the messages the model CALLED spam, the share that really were.
# High precision = few false alarms (ham flagged as spam).
precision = precision_score(y_test, y_pred)

# RECALL (sensitivity): of the messages that ARE spam, the share the model caught.
# High recall = little spam slipping through.
recall = recall_score(y_test, y_pred)

# F1 SCORE: harmonic mean of precision and recall, from 0 (worst) to 1 (best).
# Useful when both false alarms and missed spam matter.
f1 = f1_score(y_test, y_pred)

# ROC-AUC: threshold-independent ranking quality computed from the spam
# probabilities. It equals the probability that a randomly chosen spam message
# is scored higher than a randomly chosen ham message
# (0.5 = random guessing, 1.0 = perfect separation).
roc_auc = roc_auc_score(y_test, y_pred_proba)

for metric_label, metric_value in [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC-AUC:", roc_auc),
]:
    print(f"{metric_label:<10} {metric_value:.4f}")

In [None]:



# ── CONFUSION MATRIX ─────────────────────────────────────────────────────────
# Layout returned by confusion_matrix (rows = actual, columns = predicted):
#
#                 Predicted Ham      Predicted Spam
#   Actual Ham    TN (correct)       FP (false alarm)
#   Actual Spam   FN (missed)        TP (caught spam)
#
# TN: ham correctly kept as ham        FP: ham wrongly flagged as spam (annoying)
# FN: spam missed, reaches the inbox   TP: spam correctly caught
cm = confusion_matrix(y_test, y_pred)
(TN, FP), (FN, TP) = cm

confusion_lines = [
    "\nConfusion Matrix:",
    f"  True Negatives  (ham   → ham)   : {TN}",
    f"  False Positives (ham   → spam)  : {FP}  ← legit emails wrongly blocked",
    f"  False Negatives (spam  → ham)   : {FN}  ← spam that slipped through",
    f"  True Positives  (spam  → spam)  : {TP}",
]
print("\n".join(confusion_lines))

# ── FULL CLASSIFICATION REPORT ───────────────────────────────────────────────
# Per-class precision / recall / F1; 'support' is the number of true samples
# of each class in the test split.
nb_report = classification_report(y_test, y_pred, target_names=["Ham", "Spam"])
print("\nClassification Report:")
print(nb_report)


In [None]:

# ── VISUALIZATIONS ───────────────────────────────────────────────────────────
# Two panels side by side: confusion-matrix heatmap (left) and ROC curve (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_cm, ax_roc = axes
class_names = ["Ham", "Spam"]

# Plot 1 — Confusion Matrix Heatmap
sns.heatmap(
    cm,
    annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set(
    xlabel="Predicted Label",
    ylabel="Actual Label",
    title="Confusion Matrix\n(bigger diagonal = better)",
)

# Plot 2 — ROC Curve
# fpr: share of ham flagged as spam at each threshold
# tpr: share of spam caught at each threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC Curve (AUC = {roc_auc:.4f})")
ax_roc.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label="Random Guess (AUC = 0.5)")
ax_roc.set(
    xlabel="False Positive Rate (ham flagged as spam)",
    ylabel="True Positive Rate (spam caught)",
    title="ROC Curve\n(closer to top-left = better)",
)
ax_roc.legend(loc="lower right")

plt.tight_layout()
plt.savefig("evaluation_plots.png", dpi=150)
plt.show()

# ── SUMMARY ──────────────────────────────────────────────────────────────────
# One line per metric: padded name, value to four decimals, short interpretation.
summary_rows = [
    ("Accuracy",  accuracy,  "overall correct classifications"),
    ("Precision", precision, "how trustworthy 'spam' predictions are"),
    ("Recall",    recall,    "how much actual spam is caught"),
    ("F1",        f1,        "balance of precision & recall"),
    ("ROC-AUC",   roc_auc,   "overall discrimination ability"),
]
print("\n── METRIC SUMMARY ──────────────────────────────────────────")
for row_label, row_value, row_note in summary_rows:
    print(f"  {row_label:<9} {row_value:.4f}  → {row_note}")
print("────────────────────────────────────────────────────────────")

In [None]:
# Ham-to-spam ratio of the TRAINING labels; passed to XGBoost as scale_pos_weight.
n_ham_train = int((y_train == 0).sum())
n_spam_train = int((y_train == 1).sum())
scale_pos_weight = n_ham_train / n_spam_train


In [None]:


# ── TRAIN XGBoost ─────────────────────────────────────────────────────────────
# scale_pos_weight = count(ham) / count(spam) on the training split. It
# up-weights the spam class to compensate for the imbalance
# (e.g. 90% ham / 10% spam → scale_pos_weight = 9).
# eval_metric="logloss" selects log loss as XGBoost's evaluation metric.
xgb_params = {
    "scale_pos_weight": scale_pos_weight,
    "eval_metric": "logloss",
}
model_xgb = XGBClassifier(**xgb_params)
model_xgb.fit(X_train_tfidf, y_train)

# ── PREDICTIONS ───────────────────────────────────────────────────────────────
y_pred_proba_xgb = model_xgb.predict_proba(X_test_tfidf)[:, 1]  # spam probability
y_pred_xgb       = model_xgb.predict(X_test_tfidf)              # hard 0/1 labels

# ── METRICS ───────────────────────────────────────────────────────────────────
accuracy_xgb  = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb    = recall_score(y_test, y_pred_xgb)
f1_xgb        = f1_score(y_test, y_pred_xgb)
roc_auc_xgb   = roc_auc_score(y_test, y_pred_proba_xgb)

# Rows = actual class, columns = predicted class.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
(TN_x, FP_x), (FN_x, TP_x) = cm_xgb

print("── XGBoost Evaluation ───────────────────────────────────────")
for row_label, row_value, row_note in [
    ("Accuracy",  accuracy_xgb,  "overall correct classifications"),
    ("Precision", precision_xgb, "how trustworthy 'spam' predictions are"),
    ("Recall",    recall_xgb,    "how much actual spam is caught"),
    ("F1",        f1_xgb,        "balance of precision & recall"),
    ("ROC-AUC",   roc_auc_xgb,   "overall spam vs ham separation"),
]:
    print(f"  {row_label:<9} {row_value:.4f}  → {row_note}")
print(f"\n  True Negatives  (ham  → ham)  : {TN_x}")
print(f"  False Positives (ham  → spam) : {FP_x}  ← legit emails wrongly blocked")
print(f"  False Negatives (spam → ham)  : {FN_x}  ← spam that slipped through")
print(f"  True Positives  (spam → spam) : {TP_x}")
print("─────────────────────────────────────────────────────────────")

xgb_report = classification_report(y_test, y_pred_xgb, target_names=["Ham", "Spam"])
print("\nClassification Report (XGBoost):")
print(xgb_report)

# ── MODEL COMPARISON TABLE ────────────────────────────────────────────────────
# The Naive Bayes predictions are recomputed here from the fitted model_nb
# rather than read from variables of the earlier evaluation cells, so this table
# only requires that model_nb has been trained.
y_pred_nb       = model_nb.predict(X_test_tfidf)
y_pred_proba_nb = model_nb.predict_proba(X_test_tfidf)[:, 1]

comparison = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"],
    "Naive Bayes": [
        accuracy_score(y_test, y_pred_nb),
        precision_score(y_test, y_pred_nb),
        recall_score(y_test, y_pred_nb),
        f1_score(y_test, y_pred_nb),
        roc_auc_score(y_test, y_pred_proba_nb)
    ],
    "XGBoost": [
        accuracy_xgb, precision_xgb,
        recall_xgb, f1_xgb, roc_auc_xgb
    ]
})


def _pick_winner(row):
    """Name the model with the higher score for one metric row, or report a tie."""
    if row["XGBoost"] > row["Naive Bayes"]:
        return "XGBoost ✓"
    if row["XGBoost"] < row["Naive Bayes"]:
        return "Naive Bayes ✓"
    # Equal scores were previously credited to Naive Bayes; report them honestly.
    return "Tie"


comparison["Winner"] = comparison.apply(_pick_winner, axis=1)
print("\nModel Comparison:")
print(comparison.to_string(index=False))

# ── VISUALIZATIONS ────────────────────────────────────────────────────────────
# Three panels: XGBoost confusion matrix, ROC overlay (XGBoost vs Naive Bayes),
# and a grouped bar chart of every metric in the `comparison` table.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("XGBoost Evaluation & Model Comparison", fontsize=14, fontweight="bold")
ax_cm, ax_roc, ax_bars = axes
class_names = ["Ham", "Spam"]

# ── Plot 1: XGBoost Confusion Matrix ─────────────────────────────────────────
sns.heatmap(
    cm_xgb,
    annot=True, fmt='d', cmap='Oranges',
    xticklabels=class_names, yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set(
    xlabel="Predicted Label",
    ylabel="Actual Label",
    title="XGBoost — Confusion Matrix\n(bigger diagonal = better)",
)

# ── Plot 2: ROC Curve Comparison ──────────────────────────────────────────────
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
ax_roc.plot(
    fpr_xgb, tpr_xgb, color='darkorange', lw=2,
    label=f"XGBoost   (AUC = {roc_auc_xgb:.4f})",
)

# Naive Bayes curve overlaid for comparison
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_proba_nb)
ax_roc.plot(
    fpr_nb, tpr_nb, color='steelblue', lw=2, linestyle='--',
    label=f"Naive Bayes (AUC = {roc_auc_score(y_test, y_pred_proba_nb):.4f})",
)

# Diagonal = performance of a random classifier
ax_roc.plot([0, 1], [0, 1], color='grey', lw=1, linestyle=':', label="Random Guess")
ax_roc.set(
    xlabel="False Positive Rate (ham flagged as spam)",
    ylabel="True Positive Rate (spam caught)",
    title="ROC Curve — NB vs XGBoost\n(closer to top-left = better)",
)
ax_roc.legend(loc="lower right")

# ── Plot 3: Metric Bar Chart Comparison ───────────────────────────────────────
metrics     = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]
nb_scores   = comparison["Naive Bayes"].values
xgb_scores  = comparison["XGBoost"].values
x           = np.arange(len(metrics))
width       = 0.35

# Each model's bar is shifted half a bar-width to either side of the tick.
bars_nb  = ax_bars.bar(x - width/2, nb_scores,  width, label="Naive Bayes", color="steelblue",  alpha=0.85)
bars_xgb = ax_bars.bar(x + width/2, xgb_scores, width, label="XGBoost",    color="darkorange", alpha=0.85)

# Write the two-decimal score just above every bar.
for bar in [*bars_nb, *bars_xgb]:
    bar_height = bar.get_height()
    ax_bars.text(
        bar.get_x() + bar.get_width()/2, bar_height + 0.005,
        f"{bar_height:.2f}", ha='center', va='bottom', fontsize=8,
    )

ax_bars.set_xticks(x)
ax_bars.set_xticklabels(metrics, rotation=15)
ax_bars.set_ylim(0, 1.12)   # headroom above 1.0 for the value labels
ax_bars.set_ylabel("Score")
ax_bars.set_title("Metric Comparison\nNaive Bayes vs XGBoost")
ax_bars.legend()

plt.tight_layout()
plt.savefig("xgb_vs_nb_evaluation.png", dpi=150)
plt.show()


In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with class weights balanced against the class frequencies.
# NOTE(review): the next cell builds and fits model_svm again with the same
# settings, so this cell looks redundant — confirm before removing it.
model_svm = LinearSVC(class_weight="balanced")
model_svm.fit(X=X_train_tfidf, y=y_train)


In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV


# ── TRAIN SVM ─────────────────────────────────────────────────────────────────
# LinearSVC: linear-kernel SVM, fast and effective on high-dimensional TF-IDF.
# class_weight="balanced" weights classes inversely to their frequency, playing
# the role scale_pos_weight plays for XGBoost.
model_svm = LinearSVC(class_weight="balanced")
model_svm.fit(X_train_tfidf, y_train)

# ── PROBABILITY CALIBRATION ───────────────────────────────────────────────────
# ⚠ LinearSVC exposes decision scores only, not probabilities.
# CalibratedClassifierCV wraps it and, with method="sigmoid", fits Platt scaling
# on the decision scores using 5-fold cross-validation. The probabilities are
# needed for ROC-AUC, threshold tuning and confidence scores.
model_svm_cal = CalibratedClassifierCV(model_svm, method="sigmoid", cv=5)
model_svm_cal.fit(X_train_tfidf, y_train)

# ── PREDICTIONS ───────────────────────────────────────────────────────────────
# NOTE(review): hard labels come from model_svm while probabilities come from
# model_svm_cal. These are separately fitted objects, so ROC-AUC and the
# label-based metrics below may not describe exactly the same classifier —
# confirm this mix is intended.
y_pred_proba_svm = model_svm_cal.predict_proba(X_test_tfidf)[:, 1]  # spam probability
y_pred_svm       = model_svm.predict(X_test_tfidf)                  # hard labels (0 or 1)

# ── METRICS ───────────────────────────────────────────────────────────────────
accuracy_svm  = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm    = recall_score(y_test, y_pred_svm)
f1_svm        = f1_score(y_test, y_pred_svm)
roc_auc_svm   = roc_auc_score(y_test, y_pred_proba_svm)

# Rows = actual class, columns = predicted class.
cm_svm = confusion_matrix(y_test, y_pred_svm)
(TN_s, FP_s), (FN_s, TP_s) = cm_svm

print("── LinearSVC Evaluation ─────────────────────────────────────")
for row_label, row_value, row_note in [
    ("Accuracy",  accuracy_svm,  "overall correct classifications"),
    ("Precision", precision_svm, "how trustworthy 'spam' predictions are"),
    ("Recall",    recall_svm,    "how much actual spam is caught"),
    ("F1",        f1_svm,        "balance of precision & recall"),
    ("ROC-AUC",   roc_auc_svm,   "overall spam vs ham separation"),
]:
    print(f"  {row_label:<9} {row_value:.4f}  → {row_note}")
print(f"\n  True Negatives  (ham  → ham)  : {TN_s}")
print(f"  False Positives (ham  → spam) : {FP_s}  ← legit emails wrongly blocked")
print(f"  False Negatives (spam → ham)  : {FN_s}  ← spam that slipped through")
print(f"  True Positives  (spam → spam) : {TP_s}")
print("─────────────────────────────────────────────────────────────")

svm_report = classification_report(y_test, y_pred_svm, target_names=["Ham", "Spam"])
print("\nClassification Report (LinearSVC):")
print(svm_report)

# ── 3-MODEL COMPARISON TABLE ──────────────────────────────────────────────────
# Fresh test-set predictions for the two earlier models, so this section only
# needs the fitted estimators rather than variables from earlier cells.
y_pred_nb        = model_nb.predict(X_test_tfidf)
y_pred_proba_nb  = model_nb.predict_proba(X_test_tfidf)[:, 1]
y_pred_xgb       = model_xgb.predict(X_test_tfidf)
y_pred_proba_xgb = model_xgb.predict_proba(X_test_tfidf)[:, 1]

metric_names = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]


def _score_column(hard_labels, spam_scores):
    """Return the five test-set scores for one model, ordered like metric_names."""
    return [
        accuracy_score(y_test, hard_labels),
        precision_score(y_test, hard_labels),
        recall_score(y_test, hard_labels),
        f1_score(y_test, hard_labels),
        roc_auc_score(y_test, spam_scores),
    ]


# One column per model. The LinearSVC values were already computed above.
scores = {
    "Naive Bayes": _score_column(y_pred_nb, y_pred_proba_nb),
    "XGBoost": _score_column(y_pred_xgb, y_pred_proba_xgb),
    "LinearSVC": [accuracy_svm, precision_svm, recall_svm, f1_svm, roc_auc_svm],
}

comparison = pd.DataFrame(scores, index=metric_names)

# Mark the best model per metric with ✓
def highlight_winner(row):
    """Format one metric row as 4-decimal strings, tagging the top score with ' ✓'.

    Args:
        row: pandas Series mapping model name to its score for a single metric.

    Returns:
        List of strings in the row's own order; the entry whose label equals
        row.idxmax() carries the ' ✓' suffix.
    """
    top_model = row.idxmax()
    formatted = []
    for model_name, score in row.items():
        cell = f"{score:.4f}"
        if model_name == top_model:
            cell += " ✓"
        formatted.append(cell)
    return formatted

# Render every metric row as text, then restore the model names as column labels
# (result_type="expand" returns integer-labelled columns).
comparison_display = comparison.apply(
    highlight_winner, axis=1, result_type="expand"
).set_axis(comparison.columns, axis=1)

print(
    "\n── 3-Model Comparison ───────────────────────────────────────",
    comparison_display.to_string(),
    "─────────────────────────────────────────────────────────────",
    sep="\n",
)

# ── VISUALIZATIONS ────────────────────────────────────────────────────────────
# Three panels: LinearSVC confusion matrix, ROC curves of all three models, and
# a grouped bar chart of every metric in the `comparison` table.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("LinearSVC Evaluation & 3-Model Comparison", fontsize=14, fontweight="bold")
ax_cm, ax_roc, ax_bars = axes
class_names = ["Ham", "Spam"]

# ── Plot 1: SVM Confusion Matrix ──────────────────────────────────────────────
sns.heatmap(
    cm_svm,
    annot=True, fmt='d', cmap='Greens',
    xticklabels=class_names, yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set(
    xlabel="Predicted Label",
    ylabel="Actual Label",
    title="LinearSVC — Confusion Matrix\n(bigger diagonal = better)",
)

# ── Plot 2: ROC Curve — All 3 Models ─────────────────────────────────────────
fpr_nb,  tpr_nb,  _ = roc_curve(y_test, y_pred_proba_nb)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_proba_svm)

# Draw order (LinearSVC, XGBoost, Naive Bayes) also fixes the legend order.
ax_roc.plot(
    fpr_svm, tpr_svm, color='seagreen', lw=2,
    label=f"LinearSVC   (AUC = {roc_auc_svm:.4f})",
)
ax_roc.plot(
    fpr_xgb, tpr_xgb, color='darkorange', lw=2, linestyle='--',
    label=f"XGBoost     (AUC = {roc_auc_score(y_test, y_pred_proba_xgb):.4f})",
)
ax_roc.plot(
    fpr_nb, tpr_nb, color='steelblue', lw=2, linestyle=':',
    label=f"Naive Bayes (AUC = {roc_auc_score(y_test, y_pred_proba_nb):.4f})",
)
# Diagonal = performance of a random classifier
ax_roc.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--', label="Random Guess")
ax_roc.set(
    xlabel="False Positive Rate (ham flagged as spam)",
    ylabel="True Positive Rate (spam caught)",
    title="ROC Curve — All 3 Models\n(closer to top-left = better)",
)
ax_roc.legend(loc="lower right", fontsize=8)

# ── Plot 3: Grouped Bar Chart — All 3 Models ─────────────────────────────────
metrics    = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]
nb_scores  = comparison["Naive Bayes"].values
xgb_scores = comparison["XGBoost"].values
svm_scores = comparison["LinearSVC"].values
x          = np.arange(len(metrics))
width      = 0.25

# Three bars per metric: one bar-width left of the tick, on it, and right of it.
bars_nb  = ax_bars.bar(x - width, nb_scores,  width, label="Naive Bayes", color="steelblue",  alpha=0.85)
bars_xgb = ax_bars.bar(x,         xgb_scores, width, label="XGBoost",    color="darkorange", alpha=0.85)
bars_svm = ax_bars.bar(x + width, svm_scores, width, label="LinearSVC",  color="seagreen",   alpha=0.85)

# Write the two-decimal score just above every bar.
for bars in (bars_nb, bars_xgb, bars_svm):
    for bar in bars:
        bar_height = bar.get_height()
        ax_bars.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height + 0.004,
            f"{bar_height:.2f}",
            ha='center', va='bottom', fontsize=7,
        )

ax_bars.set_xticks(x)
ax_bars.set_xticklabels(metrics, rotation=15)
ax_bars.set_ylim(0, 1.15)   # headroom above 1.0 for the value labels
ax_bars.set_ylabel("Score")
ax_bars.set_title("All Metrics — 3 Model Comparison")
ax_bars.legend(fontsize=8)

plt.tight_layout()
plt.savefig("svm_3model_evaluation.png", dpi=150)
plt.show()

In [None]:
# 70 / 15 / 15 stratified split into train / validation / test.
# NOTE(review): this rebinds X_train, X_test, y_train and y_test, which earlier
# cells created with an 80/20 split. Cells above that are re-run after this one
# will silently use this split, and models evaluated below are scored on a
# different test set than the TF-IDF models above — confirm this is intended.
SPLIT_SEED = 42

# Step 1: keep 70% for training, park the remaining 30% in a temporary pool.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)

# Step 2: halve the pool into validation and test, stratified on the pool labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SPLIT_SEED
)

In [63]:
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# ══════════════════════════════════════════════════════════════════════════════
# STEP 0 — CHECK WHAT HARDWARE YOU HAVE
# ══════════════════════════════════════════════════════════════════════════════
# Report which accelerator PyTorch can see, then pick the device used below.
has_cuda = torch.cuda.is_available()
print(f"CUDA available : {has_cuda}")
if has_cuda:
    print(f"Device         : {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; shown here in GB with one decimal.
    print(f"GPU Memory     : {round(torch.cuda.get_device_properties(0).total_memory/1e9, 1)} GB")
else:
    print("Device         : CPU ← this is your problem")
    print("")   # the memory line is left blank when there is no GPU

# Without CUDA there are three ways forward (see the options at the bottom of the file).
device = torch.device("cuda" if has_cuda else "cpu")


# ══════════════════════════════════════════════════════════════════════════════
# SPEED FIX 1 — USE A LIGHTER MODEL (biggest speedup if no GPU)
# ══════════════════════════════════════════════════════════════════════════════
# bert-base-uncased    → 110M params, SLOW on CPU (~2.5hrs as you saw)
# distilbert-base      →  66M params, ~40% smaller and ~60% faster than BERT-base, ~97% of BERT accuracy ✓
# bert-tiny            →   4M params, 10x faster, good for quick experiments

# Checkpoint to fine-tune. Only this name needs to change to try another model,
# e.g. "bert-base-uncased" when a GPU is available.
MODEL_NAME = "distilbert-base-uncased"

# AutoTokenizer picks the tokenizer class that matches MODEL_NAME. A
# BertTokenizer would not work with the DistilBERT checkpoint selected above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
# use_fast=True → uses Rust-based tokenizer, ~10x faster than pure Python


# ══════════════════════════════════════════════════════════════════════════════
# STEP 1 — TOKENIZATION
# ══════════════════════════════════════════════════════════════════════════════
def tokenize(texts):
    """Encode an iterable of messages with the module-level tokenizer.

    Args:
        texts: iterable of strings; it is materialised into a list first.

    Returns:
        Whatever the tokenizer returns for the batch when asked for PyTorch
        tensors (return_tensors="pt"), padded and truncated.
    """
    encode_options = dict(
        padding=True,
        truncation=True,
        # SPEED FIX 2: cap of 64 tokens (previously 128); shorter sequences are
        # cheaper to process and the messages are short.
        max_length=64,
        return_tensors="pt",
    )
    return tokenizer(list(texts), **encode_options)

# Encode the three splits in train, validation, test order.
train_encodings, val_encodings, test_encodings = [
    tokenize(split_texts) for split_texts in (X_train, X_val, X_test)
]


# ══════════════════════════════════════════════════════════════════════════════
# STEP 2 — DATASET
# ══════════════════════════════════════════════════════════════════════════════
class SpamDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping of field name to an indexable batch (one entry per
            sample); iterated with .items().
        labels: sequence of labels, copied into a NumPy array so that samples
            are addressed by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels    = np.array(labels)

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at idx, then attach the label as a
        # long tensor under the "labels" key.
        sample = {}
        for field, batch in self.encodings.items():
            sample[field] = batch[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels for the Trainer.
train_dataset, val_dataset, test_dataset = (
    SpamDataset(train_encodings, y_train),
    SpamDataset(val_encodings, y_val),
    SpamDataset(test_encodings, y_test),
)


# ══════════════════════════════════════════════════════════════════════════════
# STEP 3 — MODEL
# ══════════════════════════════════════════════════════════════════════════════
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a 2-class classification head (ham vs spam).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# SPEED FIX 3: place the model on the selected device (GPU when available).
model = model.to(device)

# Ham-to-spam ratio of the training labels; used below as the weight of class 1.
n_ham_bert = len(y_train[y_train == 0])
n_spam_bert = len(y_train[y_train == 1])
pos_weight = n_ham_bert / n_spam_bert


class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with class weights [1.0, pos_weight]."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy for one batch.

        The "labels" entry is removed from `inputs` before the forward pass.
        Returns `(loss, outputs)` when `return_outputs` is true, else the loss.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        # Build the weight vector directly on the device that holds the logits.
        class_weights = torch.tensor(
            [1.0, pos_weight], dtype=torch.float, device=outputs.logits.device
        )
        loss = torch.nn.functional.cross_entropy(
            outputs.logits, targets, weight=class_weights
        )
        if return_outputs:
            return loss, outputs
        return loss


# ══════════════════════════════════════════════════════════════════════════════
# STEP 4 — TRAINING ARGUMENTS (speed-tuned)
# ══════════════════════════════════════════════════════════════════════════════
training_args = TrainingArguments(
    # ── Run layout ───────────────────────────────────────────────────────────
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",                     # no wandb / tensorboard reporting

    # ── Optimisation ─────────────────────────────────────────────────────────
    num_train_epochs=3,
    learning_rate=2e-5,
    # SPEED FIX 4: a larger batch means fewer steps per epoch.
    # 32 suits a GPU with ≥8GB VRAM; drop to 16 on CPU or a small GPU.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,        # evaluation keeps no gradients
    # SPEED FIX 8: effective batch = per_device_train_batch_size ×
    # gradient_accumulation_steps. Raise to 2 or 4 on GPU out-of-memory errors.
    gradient_accumulation_steps=1,
    # SPEED FIX 6: float16 mixed precision, enabled only when a GPU is present.
    fp16=torch.cuda.is_available(),

    # ── Evaluation / checkpoints ─────────────────────────────────────────────
    # SPEED FIX 5: evaluate and save every 200 steps rather than every epoch.
    # NOTE(review): the training log captured in this notebook shows a single
    # evaluation row (step 200), so the "best model" had only one candidate.
    # Consider a smaller eval_steps/save_steps — confirm against the actual
    # number of steps per epoch.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # ── Data loading ─────────────────────────────────────────────────────────
    # SPEED FIX 7 / ⚠ Windows crash fix: keep at 0 on Windows, where worker
    # processes fail with "DataLoader worker exited unexpectedly". 0 loads data
    # in the main process. Use > 0 only on Linux/Mac or behind an
    # if __name__ == "__main__" guard (was 2).
    dataloader_num_workers=0,
)


# ══════════════════════════════════════════════════════════════════════════════
# STEP 5 — METRICS
# ══════════════════════════════════════════════════════════════════════════════
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metrics reported during evaluation.

    Args:
        eval_pred: pair unpacked as (logits, labels); the logits are indexed
            along axis 1, one column per class.

    Returns:
        Dict with keys "accuracy", "precision", "recall", "f1" and "roc_auc".
        Precision, recall and F1 use average="binary"; ROC-AUC uses the softmax
        probability of class 1.
    """
    logits, labels = eval_pred
    hard_labels = np.argmax(logits, axis=1)
    spam_scores = softmax(logits, axis=1)[:, 1]

    prec, rec, f1_value, _support = precision_recall_fscore_support(
        labels, hard_labels, average="binary"
    )

    return {
        "accuracy": accuracy_score(labels, hard_labels),
        "precision": prec,
        "recall": rec,
        "f1": f1_value,
        "roc_auc": roc_auc_score(labels, spam_scores),
    }


# ══════════════════════════════════════════════════════════════════════════════
# STEP 6 — TRAIN & EVALUATE
# ══════════════════════════════════════════════════════════════════════════════
# Fine-tune with the class-weighted loss, validating on the validation split.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = WeightedTrainer(**trainer_kwargs)

trainer.train()

# Final check on the test split.
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("\n── Test Set Results ─────────────────────────────────────────")
for metric, value in test_results.items():
    # Metric name left-aligned in 30 columns, value to four decimals.
    print(f"  {metric:<30} {value:.4f}")
print("─────────────────────────────────────────────────────────────")


CUDA available : False
Device         : CPU ← this is your problem



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
200,0.0438,0.082761,0.991627,0.981651,0.955357,0.968326,0.995782





── Test Set Results ─────────────────────────────────────────
  eval_loss                      0.0813
  eval_accuracy                  0.9916
  eval_precision                 0.9730
  eval_recall                    0.9643
  eval_f1                        0.9686
  eval_roc_auc                   0.9983
  eval_runtime                   42.3698
  eval_samples_per_second        19.7310
  eval_steps_per_second          0.3300
  epoch                          3.0000
─────────────────────────────────────────────────────────────


'\nOPTION 1 — Google Colab (free GPU)\n  • Go to colab.research.google.com\n  • Runtime → Change runtime type → T4 GPU\n  • Paste this code → runs in ~10-15 mins instead of 2.5hrs\n\nOPTION 2 — Kaggle Notebooks (free GPU, 30hr/week)\n  • kaggle.com → New Notebook → Settings → Accelerator → GPU T4 x2\n\nOPTION 3 — Skip BERT, use the lighter models you already have\n  • Your LinearSVC / XGBoost with TF-IDF already runs in seconds\n  • For spam detection specifically, they often match BERT performance\n  • Only worth using BERT if those models are underperforming\n'

In [64]:
# Persist the fine-tuned model and its tokenizer in the same directory.
SAVE_PATH = "./final_spam_model"

trainer.save_model(output_dir=SAVE_PATH)               # model weights + config
tokenizer.save_pretrained(save_directory=SAVE_PATH)    # tokenizer files

('./final_spam_model\\tokenizer_config.json',
 './final_spam_model\\special_tokens_map.json',
 './final_spam_model\\vocab.txt',
 './final_spam_model\\added_tokens.json',
 './final_spam_model\\tokenizer.json')