# 🛡️ Phishing Email Detection - DistilBERT Fine-tuning

**Project:** Hybrid AI Defense - Closing the Detection Gap Against AI-Generated Phishing  
**Author:** Ramkumar  
**Model:** DistilBERT (HuggingFace Transformers)  
**Framework:** PyTorch

---

## 1. Setup & Installation

In [None]:
# Install required libraries
!pip install transformers datasets torch scikit-learn pandas numpy accelerate -q

In [None]:
# Report the PyTorch build and whether a CUDA device is visible
import torch

print(f"PyTorch version: {torch.__version__}")
gpu_ready = torch.cuda.is_available()
print(f"CUDA available: {gpu_ready}")
if gpu_ready:
    # Name of the device at index 0
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Upload Data

Upload your processed CSV files from `data/processed/`:
- `train.csv`
- `validation.csv`
- `test.csv`

In [None]:
# Upload files (for Google Colab)
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this cell elsewhere.
# `files` is used again by the download cell in section 9.
from google.colab import files
# The return value is kept in `uploaded`, but the loading cell below reads the
# CSVs by file name rather than through this variable.
uploaded = files.upload()

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Load data
def _load_split(path):
    """Read one split CSV and return only rows usable for training/evaluation.

    Empty CSV fields come back from ``pd.read_csv`` as NaN. A NaN ``text``
    cannot be tokenized, and a NaN ``label`` turns the whole column into
    floats, so such rows are dropped and the labels are cast back to int.
    The index is reset so row positions stay contiguous after dropping.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame (all original columns are kept).

    Raises:
        ValueError: If the file lacks a ``text`` or ``label`` column.
    """
    frame = pd.read_csv(path)

    missing = {'text', 'label'} - set(frame.columns)
    if missing:
        raise ValueError(f"{path} is missing required column(s): {sorted(missing)}")

    cleaned = frame.dropna(subset=['text', 'label']).reset_index(drop=True)
    dropped = len(frame) - len(cleaned)
    if dropped:
        print(f"Warning: dropped {dropped} row(s) with missing text/label from {path}")

    cleaned['label'] = cleaned['label'].astype(int)
    return cleaned


train_df = _load_split('train.csv')
val_df = _load_split('validation.csv')
test_df = _load_split('test.csv')

print(f"Train: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")
print(f"\nLabel distribution (train):")
print(train_df['label'].value_counts())

## 3. Prepare Dataset for Transformers

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classifier loaded later
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap each pandas split as a HuggingFace Dataset, keeping only text + label
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[['text', 'label']])
    for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names
dataset = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

print(dataset)

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook's tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 512 tokens.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,  # DistilBERT max length
    }
    batch_text = examples['text']
    return tokenizer(batch_text, **encode_options)

# Tokenize every split in batches, then shape the columns for Trainer:
# the raw text is no longer needed and `label` must be called `labels`.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
    .rename_column('label', 'labels')
)

# Set format for PyTorch
tokenized_dataset.set_format('torch')

print("Tokenization complete!")
print(tokenized_dataset)

## 4. Load DistilBERT Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained checkpoint with a two-class head.
# Class 0 is LEGITIMATE and class 1 is PHISHING; both mapping directions
# are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id={"LEGITIMATE": 0, "PHISHING": 1},
    id2label={0: "LEGITIMATE", 1: "PHISHING"},
    num_labels=2,  # Binary: 0=Legit, 1=Phishing
)

print(
    f"Model loaded: {MODEL_NAME}",
    f"Parameters: {model.num_parameters():,}",
    sep="\n",
)

## 5. Training Configuration

In [None]:
from transformers import TrainingArguments, Trainer

# Define metrics function
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from evaluation output.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Predicted classes are the
            argmax of the logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average='binary'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 makes the "no positive predictions / no positive labels"
    # case an explicit 0.0 instead of relying on warnings being silenced.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Output locations and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    report_to='none',  # Disable wandb
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same per-epoch cadence, and reload the
    # checkpoint with the best 'f1' once training ends
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Reproducibility
    seed=42,
)

print("Training configuration set!")

In [None]:
# Initialize Trainer
# Training uses the tokenized 'train' split; the 'validation' split is the
# evaluation set and is scored with compute_metrics. The 'test' split is
# deliberately not passed here -- it is only used in section 7.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

print("Trainer initialized!")

## 6. Train the Model 🚀

In [None]:
# Start training
# Runs the fine-tuning loop on the `trainer` object; the "=" rules printed
# before and after only frame whatever trainer.train() itself outputs.
print("Starting training...")
print("="*50)

trainer.train()

print("="*50)
print("Training complete!")

## 7. Evaluate on Test Set

In [None]:
# Score the tokenized test split with the trainer
print("Evaluating on test set...")
results = trainer.evaluate(tokenized_dataset['test'])

# Banner, then one line per metric (read from the 'eval_'-prefixed keys)
print("\n" + "="*50, "TEST RESULTS", "="*50, sep="\n")
print(
    f"Accuracy:  {results['eval_accuracy']:.4f}",
    f"Precision: {results['eval_precision']:.4f}",
    f"Recall:    {results['eval_recall']:.4f}",
    f"F1 Score:  {results['eval_f1']:.4f}",
    sep="\n",
)

In [None]:
# Detailed predictions and confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions
predictions = trainer.predict(tokenized_dataset['test'])
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

# Confusion Matrix.
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted)
# even if one class never occurs in the labels or the predictions, so the
# four-way unpacking below cannot fail on a smaller matrix.
cm = confusion_matrix(labels, preds, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Legitimate', 'Phishing'],
            yticklabels=['Legitimate', 'Phishing'])
plt.title('Confusion Matrix - Test Set')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Row-major order of a 2x2 matrix: [0][0], [0][1], [1][0], [1][1]
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives (Legit→Legit): {tn}")
print(f"False Positives (Legit→Phishing): {fp}")
print(f"False Negatives (Phishing→Legit): {fn}")
print(f"True Positives (Phishing→Phishing): {tp}")

## 8. Analyze Human vs LLM Detection

In [None]:
# Analyze by source (human vs LLM)
# The breakdown needs a 'source' column in the test data; fail early with a
# readable message instead of an opaque KeyError inside the loops.
if 'source' not in test_df.columns:
    raise KeyError("test data has no 'source' column; expected values 'human' / 'llm'")

# Attach the model's predictions row-by-row and flag the correct ones
test_df_with_preds = test_df.copy()
test_df_with_preds['predicted'] = preds
test_df_with_preds['correct'] = test_df_with_preds['label'] == test_df_with_preds['predicted']

print("="*50)
print("ACCURACY BY SOURCE")
print("="*50)

for source in ['human', 'llm']:
    source_df = test_df_with_preds[test_df_with_preds['source'] == source]
    if len(source_df) > 0:
        accuracy = source_df['correct'].mean()
        print(f"{source.upper()}-generated emails: {accuracy:.4f} ({len(source_df)} samples)")
    else:
        # The mean of an empty selection is NaN; report the gap explicitly
        print(f"{source.upper()}-generated emails: n/a (0 samples)")

print("\n" + "="*50)
print("ACCURACY BY SOURCE x TYPE")
print("="*50)

label_names = {0: 'Legit', 1: 'Phishing'}
for source in ['human', 'llm']:
    for label in [0, 1]:
        subset = test_df_with_preds[(test_df_with_preds['source'] == source) &
                                     (test_df_with_preds['label'] == label)]
        # Combinations with no rows are skipped
        if len(subset) > 0:
            accuracy = subset['correct'].mean()
            label_name = label_names[label]
            print(f"{source.upper()} {label_name}: {accuracy:.4f} ({len(subset)} samples)")

## 9. Save the Model

In [None]:
# Save model and tokenizer
# Both are written into the same directory, so the model files and the
# tokenizer files travel together when the folder is zipped below.
MODEL_SAVE_PATH = './phishing_detector_model'

trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

print(f"Model saved to: {MODEL_SAVE_PATH}")

In [None]:
# Download model (for Colab)
# Zips the directory written by the save cell, then passes the archive to the
# `files` helper imported from google.colab in the upload cell.
!zip -r phishing_detector_model.zip phishing_detector_model/
files.download('phishing_detector_model.zip')

## 10. Test with Sample Email

In [None]:
# Quick test function
def predict_email(text):
    """Classify a single email with the fine-tuned model and print the verdict.

    Args:
        text: Email text to classify. Input longer than 512 tokens is truncated.

    Returns:
        tuple: ``(prediction, confidence)`` where ``prediction`` is the
        predicted class index (1 = phishing, otherwise legitimate) and
        ``confidence`` is the softmax probability of that class as a
        percentage.
    """
    # The model may still be in training mode (e.g. when this is called
    # straight after trainer.train()); switch to eval mode so dropout does
    # not make the prediction random.
    model.eval()

    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()

    verdict = 'PHISHING ⚠️' if prediction == 1 else 'LEGITIMATE ✅'
    # A single text gives a batch of one, so row 0 holds its probabilities
    confidence = probs[0][prediction].item() * 100

    print(f"Prediction: {verdict}")
    print(f"Confidence: {confidence:.2f}%")
    return prediction, confidence

# Test examples
# Two hand-written samples: one worded like a phishing attempt, one like an
# ordinary internal email. predict_email prints its own verdict and
# confidence for each; the returned tuples are not stored.
print("="*50)
print("TEST 1: Phishing-like email")
print("="*50)
predict_email("URGENT: Your account has been compromised! Click here immediately to verify your identity or your account will be suspended. [URL]")

print("\n" + "="*50)
print("TEST 2: Legitimate-like email")
print("="*50)
predict_email("Hi team, just a reminder that our weekly meeting is scheduled for tomorrow at 3 PM. Please review the attached agenda beforehand.")

---
## ✅ Training Complete!

**Next Steps:**
1. Download the saved model (`phishing_detector_model.zip`)
2. Extract to `model/saved_models/` in your project
3. Proceed to Week 2: Backend & URL Analysis