In [None]:
"""
ASU Cyber360 Hackathon - Phishing Email Detector
=================================================
Hybrid Architecture: Fine-tuned Transformer + Engineered Features
Target: Maximum accuracy on hidden evaluation dataset
"""

import pandas as pd
import numpy as np
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Pin the global random number generators so repeated runs draw the same values.
SEED = 42
np.random.seed(SEED)     # NumPy global RNG (used by the synthetic email generator)
torch.manual_seed(SEED)  # PyTorch default generator

# ============================================================================
# PART 1: ENGINEERED FEATURES EXTRACTION
# ============================================================================

class FeatureExtractor:
    """Extract hand-crafted features that catch phishing patterns.

    Each email is turned into a dict of 10 numeric features (see
    ``FEATURE_NAMES`` for the names and their order).

    NOTE: keyword features use plain substring containment on the lower-cased
    text, so e.g. 'now' also matches inside 'know'. Each keyword contributes at
    most 1 to its count, i.e. the counts are "how many distinct list entries
    appear", not "how many occurrences".
    """

    # Names and column order of the features produced by extract_features().
    FEATURE_NAMES = (
        'urgency_count', 'threat_count', 'action_count', 'url_count', 'has_url',
        'obfuscated_url', 'typosquatting', 'length', 'exclamation_marks',
        'capital_ratio',
    )

    # Compiled once for the whole class instead of on every call.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    _OBFUSCATED_URL_RE = re.compile(r'h[x*]{2}p[s]?://')

    def __init__(self):
        # Phishing indicators
        self.urgency_words = ['urgent', 'immediately', 'asap', 'expire', 'expiring',
                              'expires', 'deadline', 'hurry', 'rush', 'quick', 'now']
        self.threat_words = ['suspend', 'suspended', 'block', 'blocked', 'deactivate',
                            'deactivated', 'locked', 'freeze', 'frozen', 'terminate']
        self.action_words = ['click', 'verify', 'confirm', 'update', 'validate',
                            'authenticate', 'secure', 'restore', 'unlock']
        self.typosquatting = ['micros0ft', 'g00gle', 'paypa1', 'amaz0n', 'app1e']

    @staticmethod
    def _coerce_text(text):
        """Return *text* as a ``str``; missing values (None/NaN/NA) become ''."""
        if isinstance(text, str):
            return text
        # Empty CSV cells arrive as float NaN (or None); treat them as empty emails
        # instead of crashing on ``.lower()``.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        return str(text)

    def extract_features(self, text):
        """Extract all 10 features from email text.

        Args:
            text: The email body. Non-string values are converted with ``str``;
                missing values (None, NaN) are treated as an empty email.

        Returns:
            dict mapping each name in ``FEATURE_NAMES`` to a number.
        """
        text = self._coerce_text(text)
        text_lower = text.lower()

        features = {}

        # 1. Urgency count
        features['urgency_count'] = sum(word in text_lower for word in self.urgency_words)

        # 2. Threat count
        features['threat_count'] = sum(word in text_lower for word in self.threat_words)

        # 3. Action count
        features['action_count'] = sum(word in text_lower for word in self.action_words)

        # 4. URL count (matched on the original text, so the scheme must be lower-case)
        features['url_count'] = len(self._URL_RE.findall(text))

        # 5. Has URL (binary)
        features['has_url'] = 1 if features['url_count'] > 0 else 0

        # 6. Obfuscated URL (hxxp://, h**p://)
        features['obfuscated_url'] = 1 if self._OBFUSCATED_URL_RE.search(text_lower) else 0

        # 7. Typosquatting
        features['typosquatting'] = 1 if any(typo in text_lower for typo in self.typosquatting) else 0

        # 8. Length
        features['length'] = len(text)

        # 9. Exclamation marks
        features['exclamation_marks'] = text.count('!')

        # 10. Capital ratio (0 for an empty email to avoid dividing by zero)
        capitals = sum(1 for c in text if c.isupper())
        features['capital_ratio'] = capitals / len(text) if len(text) > 0 else 0

        return features

    def extract_batch(self, texts):
        """Extract features for a batch of texts.

        Args:
            texts: Iterable of email bodies (see ``extract_features``).

        Returns:
            pandas.DataFrame with one row per text and the columns of
            ``FEATURE_NAMES`` in that order, even when *texts* is empty.
        """
        features_list = [self.extract_features(text) for text in texts]
        # Passing the columns explicitly keeps the 10-column layout for empty input.
        return pd.DataFrame(features_list, columns=list(self.FEATURE_NAMES))


# ============================================================================
# PART 2: SYNTHETIC DATA GENERATION
# ============================================================================

class SyntheticDataGenerator:
    """Build template-based synthetic phishing and safe emails.

    Emails are produced by picking a template at random and filling its
    placeholders from small vocabularies. All randomness comes from NumPy's
    global RNG (``np.random``), so seeding it makes the output repeatable.
    """

    def __init__(self):
        # --- Phishing templates ------------------------------------------------
        self.phishing_templates = [
            "URGENT: Your {service} account will be suspended in {time}! Click {url} to verify immediately.",
            "Action Required: Unusual activity detected on your {service} account. Verify here: {url}",
            "Security Alert: Your {service} password expires {time}. Update now: {url}",
            "{service} Notice: Your account has been locked due to suspicious activity. Restore access: {url}",
            "Final Warning: Your {service} subscription payment failed. Update payment info: {url}",
            "ATTENTION: You have {number} unread messages. View them here: {url}",
            "Your {service} account requires immediate verification. Click here: {url} or account will be deactivated.",
            "Congratulations! You've won ${prize}. Claim your prize at: {url}",
            "IT Department: System maintenance requires password reset. Update here: {url}",
            "Re: Invoice #{number} - Payment confirmation needed. Download: {url}",
            # Subtler wording that reads closer to a legitimate notice
            "Dear customer, we noticed a login from {location}. If this wasn't you, please verify: {url}",
            "Your {service} security settings need review. We recommend updating them at {url}",
            "Package delivery attempted. Reschedule at: {url}",
            "Tax refund of ${prize} is pending. Confirm your details: {url}",
            "Account verification needed for {service}. Complete within {time}: {url}",
            "Hi, we're updating our privacy policy. Review and accept at {url}",
            "Unusual payment activity on your account. Review transactions: {url}",
            "{service} support: We detected an issue with your account. Please check {url}",
            "Your document is ready. Download here: {url}",
            "Action needed: Complete your {service} profile setup at {url}",
        ]

        # --- Safe templates ----------------------------------------------------
        self.safe_templates = [
            "Hi {name}, thanks for reaching out. I'll review your proposal and get back to you by {day}.",
            "Meeting scheduled for {day} at {time} in {location}. Agenda attached.",
            "Project update: We've completed the {phase} phase and are moving forward with testing.",
            "Welcome to {service}! We're excited to have you. Here's what you can do to get started.",
            "Your order #{number} has shipped and will arrive by {day}. Track your package in your account.",
            "Reminder: Your appointment with {person} is scheduled for {day} at {time}.",
            "Team, great work on the presentation today. Let's regroup {day} to discuss next steps.",
            "Monthly newsletter: Check out our latest features and upcoming events.",
            "{name}, I reviewed the documents you sent. Everything looks good. Let's move forward.",
            "Course announcement: {course} materials are now available. See you in class {day}.",
            # Transactional / follow-up style messages
            "Thanks for your payment of ${prize}. Your receipt is attached.",
            "Your {service} subscription has been renewed successfully.",
            "Here's the information you requested about {topic}. Let me know if you have questions.",
            "Looking forward to our meeting on {day}. Please review the attached materials beforehand.",
            "Your {service} account was successfully updated. No further action needed.",
            "Weekly summary: {number} new messages, {number} tasks completed.",
            "Reminder: Your {service} free trial ends {day}. Upgrade anytime in your account settings.",
            "Hi {name}, following up on our conversation. Here are the next steps we discussed.",
            "Your feedback on {topic} has been received. We appreciate your input.",
            "System update completed successfully. All services are running normally.",
        ]

        # --- Placeholder vocabularies -----------------------------------------
        self.services = [
            'PayPal', 'Amazon', 'Microsoft', 'Google', 'Apple', 'Netflix',
            'Bank of America', 'Chase', 'Wells Fargo', 'IRS', 'Dropbox', 'LinkedIn',
        ]
        self.times = ['24 hours', '48 hours', '3 days', 'today', 'within 1 hour', '72 hours']
        self.urls = [
            'hxxp://bit.ly/a3f2x', 'https://secure-verify.net', 'http://account-check.com',
            'hxxps://urgent-verify.co', 'https://bit.ly/secure23', 'http://verify-account-now.com',
        ]
        self.names = ['John', 'Sarah', 'Mike', 'Lisa', 'Tom', 'Emma', 'Alex', 'Maria']
        self.days = ['Monday', 'Tuesday', 'next week', 'tomorrow', 'Friday', 'this week']
        self.locations = ['New York', 'London', 'Tokyo', 'unusual location', 'unknown device']
        self.topics = ['the project', 'your inquiry', 'the report', 'our discussion']

    def generate_phishing_email(self):
        """Return one phishing email built from a random phishing template."""
        pick = np.random.choice
        chosen = pick(self.phishing_templates)
        # The dict literal is evaluated top to bottom, which fixes the order of
        # the random draws (and therefore the output for a given seed).
        fillers = {
            'service': pick(self.services),
            'time': pick(self.times),
            'url': pick(self.urls),
            'number': np.random.randint(1000, 9999),
            'prize': np.random.randint(100, 10000),
            'location': pick(self.locations),
        }
        return chosen.format(**fillers)

    def generate_safe_email(self):
        """Return one safe email built from a random safe template."""
        pick = np.random.choice
        chosen = pick(self.safe_templates)
        # Draw order matters for reproducibility; constants are filled in as-is.
        fillers = {
            'name': pick(self.names),
            'day': pick(self.days),
            'time': f"{np.random.randint(9, 17)}:00",
            'location': 'Conference Room B',
            'phase': 'development',
            'service': 'our service',
            'number': np.random.randint(1000, 9999),
            'person': 'Dr. Smith',
            'course': 'CS 101',
            'prize': np.random.randint(10, 500),
            'topic': pick(self.topics),
        }
        return chosen.format(**fillers)

    def generate_dataset(self, n_samples=3000):
        """Return a shuffled, balanced DataFrame of synthetic emails.

        Half of ``n_samples`` (rounded down) are phishing emails and the rest
        are safe emails. Columns are 'Email Text' and 'Email Type'; rows are
        shuffled with the module-level ``SEED``.
        """
        phishing_total = n_samples // 2
        safe_total = n_samples - phishing_total

        # All phishing emails are generated before any safe email.
        texts = []
        for _ in range(phishing_total):
            texts.append(self.generate_phishing_email())
        for _ in range(safe_total):
            texts.append(self.generate_safe_email())

        kinds = ['Phishing Email'] * phishing_total + ['Safe Email'] * safe_total
        frame = pd.DataFrame({'Email Text': texts, 'Email Type': kinds})

        shuffled = frame.sample(frac=1, random_state=SEED)
        return shuffled.reset_index(drop=True)


# ============================================================================
# PART 3: HYBRID MODEL ARCHITECTURE
# ============================================================================

class PhishingDataset(Dataset):
    """Torch dataset pairing tokenized email text with engineered features.

    Each item is a dict with the keys 'input_ids', 'attention_mask',
    'features' and 'label'.
    """

    def __init__(self, texts, features, labels, tokenizer, max_length=256):
        # The three sequences are indexed with the same position in __getitem__.
        self.texts, self.features, self.labels = texts, features, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every text is padded/truncated to exactly `max_length` tokens.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        sample = {}
        for key in ('input_ids', 'attention_mask'):
            # Flatten the tokenizer's tensor for this single text to 1-D.
            sample[key] = encoded[key].flatten()
        sample['features'] = torch.FloatTensor(self.features[idx])
        # One-element tensor: the label is wrapped in a list before conversion.
        sample['label'] = torch.LongTensor([self.labels[idx]])
        return sample


class HybridPhishingDetector(nn.Module):
    """
    Two-branch classifier producing 2 logits per email.

    - Text branch: a pretrained transformer (DistilBERT by default); the hidden
      state at sequence position 0 is used as the text embedding.
    - Feature branch: batch-normalised engineered features projected to 64 units.
    - Head: both embeddings are concatenated and passed through dense layers.
    """

    def __init__(self, model_name='distilbert-base-uncased', n_features=10, dropout=0.5):
        super().__init__()

        # Text branch
        self.transformer = AutoModel.from_pretrained(model_name)
        text_width = self.transformer.config.hidden_size

        # Feature branch (its dropout rate is fixed at 0.4, independent of `dropout`)
        feature_width = 64
        self.feature_bn = nn.BatchNorm1d(n_features)
        self.feature_fc = nn.Sequential(
            nn.Linear(n_features, feature_width),
            nn.ReLU(),
            nn.Dropout(0.4),
        )

        # Classification head over the concatenated embeddings
        head_layers = [
            nn.Linear(text_width + feature_width, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, features):
        # Text embedding: hidden state of the first token ([CLS]) of each sequence.
        encoder_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = encoder_out.last_hidden_state[:, 0, :]

        # Feature embedding: normalise, project, then apply ReLU once more.
        normalised = self.feature_bn(features)
        feature_embedding = torch.relu(self.feature_fc(normalised))

        # Join both branches along the feature axis and classify.
        joint = torch.cat([text_embedding, feature_embedding], dim=1)
        return self.classifier(joint)


# ============================================================================
# PART 4: TRAINING PIPELINE
# ============================================================================

class PhishingDetectorTrainer:
    """Training and evaluation loop for the hybrid phishing model.

    Owns the AdamW optimizer and the cross-entropy loss; the learning-rate
    scheduler is supplied by the caller and stepped once per batch.
    """

    def __init__(self, model, device, learning_rate=2e-5):
        """Move *model* to *device* and set up optimizer and loss.

        Args:
            model: Module called as ``model(input_ids, attention_mask, features)``
                and returning class logits.
            device: Device the model and every batch are moved to.
            learning_rate: Learning rate passed to AdamW.
        """
        self.model = model.to(device)
        self.device = device
        self.optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
        self.criterion = nn.CrossEntropyLoss()
        # Not used by any method of this class; kept so code that reads
        # `trainer.scaler` keeps working.
        self.scaler = StandardScaler()

    def _batch_to_device(self, batch):
        """Move one batch dict to the device and return its four tensors.

        Labels are flattened with ``reshape(-1)`` rather than ``squeeze()``:
        ``squeeze()`` turns a batch holding a single sample into a 0-d tensor,
        which breaks both the loss computation and the conversion of labels to
        a list.
        """
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        features = batch['features'].to(self.device)
        labels = batch['label'].reshape(-1).to(self.device)
        return input_ids, attention_mask, features, labels

    def train_epoch(self, dataloader, scheduler):
        """Run one training pass over *dataloader*.

        Args:
            dataloader: Iterable of batch dicts with the keys 'input_ids',
                'attention_mask', 'features' and 'label'.
            scheduler: Learning-rate scheduler; stepped after every optimizer step.

        Returns:
            (avg_loss, accuracy): mean loss per batch and accuracy of the
            predictions made during the pass.
        """
        self.model.train()
        total_loss = 0
        predictions = []
        true_labels = []

        for batch in dataloader:
            self.optimizer.zero_grad()

            input_ids, attention_mask, features, labels = self._batch_to_device(batch)

            logits = self.model(input_ids, attention_mask, features)
            loss = self.criterion(logits, labels)

            loss.backward()
            # Cap the gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(dataloader)
        accuracy = accuracy_score(true_labels, predictions)

        return avg_loss, accuracy

    def evaluate(self, dataloader):
        """Score the model on *dataloader* without updating any weights.

        Returns:
            (accuracy, precision, recall, f1, predictions, true_labels), where
            precision/recall/f1 use ``average='binary'`` and the last two items
            are lists covering every sample in iteration order.
        """
        self.model.eval()
        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask, features, labels = self._batch_to_device(batch)

                logits = self.model(input_ids, attention_mask, features)
                predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='binary'
        )

        return accuracy, precision, recall, f1, predictions, true_labels


# ============================================================================
# PART 5: MAIN EXECUTION
# ============================================================================

def main():
    """Complete pipeline: Data → Features → Training → Evaluation.

    Reads 'Phishing_validation_emails.csv' (training data) and
    'se_phishing_test_set.csv' (held-out test set) from the working directory,
    augments the training data with synthetic emails, trains the hybrid model,
    writes the best checkpoint (by validation accuracy) to
    'best_phishing_detector.pth' and prints test-set metrics.
    """

    print("=" * 80)
    print("ASU CYBER360 HACKATHON - PHISHING DETECTION SYSTEM")
    print("=" * 80)

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\n✓ Using device: {device}")

    # -------------------------------------------------------------------------
    # STEP 1: Load and Prepare Data
    # -------------------------------------------------------------------------
    print("\n[STEP 1] Loading datasets...")

    # Load real phishing data (2000 samples)
    df_real = pd.read_csv('Phishing_validation_emails.csv')
    print(f"✓ Loaded {len(df_real)} real samples")

    # Generate synthetic data (3000 samples)
    print("✓ Generating synthetic data...")
    generator = SyntheticDataGenerator()
    df_synthetic = generator.generate_dataset(n_samples=3000)
    print(f"✓ Generated {len(df_synthetic)} synthetic samples")

    # Combine datasets
    df_train_full = pd.concat([df_real, df_synthetic], ignore_index=True)
    df_train_full = df_train_full.sample(frac=1, random_state=SEED).reset_index(drop=True)
    # Empty CSV cells load as NaN; turn them into empty strings so feature
    # extraction and tokenization always receive text.
    df_train_full['Email Text'] = df_train_full['Email Text'].fillna('').astype(str)
    print(f"✓ Total training data: {len(df_train_full)} samples")

    # Load test set (NEVER use for training!)
    df_test = pd.read_csv('se_phishing_test_set.csv')
    df_test.columns = ['Email Text', 'Email Type']  # Standardize column names
    df_test['Email Text'] = df_test['Email Text'].fillna('').astype(str)
    print(f"✓ Test set: {len(df_test)} samples (held out)")

    # -------------------------------------------------------------------------
    # STEP 2: Extract Features
    # -------------------------------------------------------------------------
    print("\n[STEP 2] Extracting engineered features...")

    feature_extractor = FeatureExtractor()

    # Extract features for training data
    train_features = feature_extractor.extract_batch(df_train_full['Email Text'].values)
    print(f"✓ Extracted {train_features.shape[1]} features for training data")

    # Extract features for test data
    test_features = feature_extractor.extract_batch(df_test['Email Text'].values)
    print(f"✓ Extracted {test_features.shape[1]} features for test data")

    # -------------------------------------------------------------------------
    # STEP 3: Prepare Labels and Split
    # -------------------------------------------------------------------------
    print("\n[STEP 3] Preparing labels and validation split...")

    # Convert labels to binary (0 = Safe, 1 = Phishing)
    label_map = {'Safe Email': 0, 'Phishing Email': 1, 'safe': 0, 'phishing': 1, 0: 0, 1: 1}

    # Map training labels (unmapped values become NaN and are handled below)
    df_train_full['label'] = df_train_full['Email Type'].map(label_map)

    # Map test labels (handle various formats)
    test_label_map = {
        'safe email': 0, 'phishing email': 1,
        'safe': 0, 'phishing': 1,
        'benign': 0, 'malicious': 1,
        'legitimate': 0, 'spam': 1,
        '0': 0, '1': 1,  # numeric labels stored as text
    }
    if pd.api.types.is_numeric_dtype(df_test['Email Type']):
        # Already numeric; cast to int after the NaN check below.
        df_test['label'] = df_test['Email Type']
    else:
        # String labels - normalize and map. `.map` turns unknown labels into
        # NaN so they go through the same NaN handling as training labels
        # instead of failing in the integer cast.
        df_test['Email Type'] = df_test['Email Type'].str.strip().str.lower()
        df_test['label'] = df_test['Email Type'].map(test_label_map)

    # Verify no NaN values in labels. Assign the result back: an in-place
    # fillna on a selected column does not update the frame under pandas
    # Copy-on-Write.
    if df_train_full['label'].isna().any():
        print("⚠ Warning: NaN values in training labels, filling with 0")
        df_train_full['label'] = df_train_full['label'].fillna(0)

    if df_test['label'].isna().any():
        print("⚠ Warning: NaN values in test labels, filling with 0")
        df_test['label'] = df_test['label'].fillna(0)

    # Ensure labels are integers
    df_train_full['label'] = df_train_full['label'].astype(int)
    df_test['label'] = df_test['label'].astype(int)

    # Split training data into train/validation (80/20)
    # NOTE(review): the validation split contains synthetic emails drawn from
    # the same templates as the training split, so validation accuracy is
    # likely optimistic - confirm against the held-out test set.
    train_texts, val_texts, train_feat, val_feat, train_labels, val_labels = train_test_split(
        df_train_full['Email Text'].values,
        train_features.values,
        df_train_full['label'].values,
        test_size=0.2,
        random_state=SEED,
        stratify=df_train_full['label']
    )

    print(f"✓ Training samples: {len(train_texts)}")
    print(f"✓ Validation samples: {len(val_texts)}")
    print(f"✓ Test samples: {len(df_test)} (held out)")

    # Scale features (statistics come from the training split only)
    scaler = StandardScaler()
    train_feat_scaled = scaler.fit_transform(train_feat)
    val_feat_scaled = scaler.transform(val_feat)
    test_feat_scaled = scaler.transform(test_features.values)

    # -------------------------------------------------------------------------
    # STEP 4: Initialize Model and Tokenizer
    # -------------------------------------------------------------------------
    print("\n[STEP 4] Initializing hybrid model...")

    MODEL_NAME = 'distilbert-base-uncased'  # Fast and accurate
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = HybridPhishingDetector(model_name=MODEL_NAME, n_features=10, dropout=0.5)

    print(f"✓ Model: {MODEL_NAME}")
    print(f"✓ Total parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"✓ Dropout: 0.5 (reduced overfitting)")

    # -------------------------------------------------------------------------
    # STEP 5: Create DataLoaders
    # -------------------------------------------------------------------------
    print("\n[STEP 5] Creating data loaders...")

    BATCH_SIZE = 16

    train_dataset = PhishingDataset(train_texts, train_feat_scaled, train_labels, tokenizer)
    val_dataset = PhishingDataset(val_texts, val_feat_scaled, val_labels, tokenizer)
    test_dataset = PhishingDataset(df_test['Email Text'].values, test_feat_scaled,
                                   df_test['label'].values, tokenizer)

    # The model contains BatchNorm1d layers, which raise in training mode on a
    # batch holding a single sample; drop the trailing batch only in that case.
    drop_single_tail = len(train_dataset) % BATCH_SIZE == 1
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              drop_last=drop_single_tail)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    print(f"✓ Batch size: {BATCH_SIZE}")
    print(f"✓ Training batches: {len(train_loader)}")

    # -------------------------------------------------------------------------
    # STEP 6: Training
    # -------------------------------------------------------------------------
    print("\n[STEP 6] Training model...")

    EPOCHS = 4
    trainer = PhishingDetectorTrainer(model, device, learning_rate=2e-5)

    # Linear schedule over all optimizer steps, warming up for the first 10%.
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        trainer.optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps
    )

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a stale file from an earlier run could be loaded.
    best_val_accuracy = -1.0

    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch + 1}/{EPOCHS}")
        print("-" * 60)

        # Train
        train_loss, train_acc = trainer.train_epoch(train_loader, scheduler)
        print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")

        # Validate
        val_acc, val_prec, val_rec, val_f1, _, _ = trainer.evaluate(val_loader)
        print(f"Val Accuracy: {val_acc:.4f} | Precision: {val_prec:.4f} | Recall: {val_rec:.4f} | F1: {val_f1:.4f}")

        # Save best model (ties keep the earlier checkpoint)
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            torch.save(model.state_dict(), 'best_phishing_detector.pth')
            print("✓ Best model saved!")

    # -------------------------------------------------------------------------
    # STEP 7: Final Evaluation on Test Set
    # -------------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("[FINAL EVALUATION] Testing on held-out test set")
    print("=" * 80)

    # Load best model; map_location places the weights on the active device
    # regardless of where the checkpoint was written.
    model.load_state_dict(torch.load('best_phishing_detector.pth', map_location=device))

    test_acc, test_prec, test_rec, test_f1, test_preds, test_true = trainer.evaluate(test_loader)

    print(f"\n🎯 FINAL TEST RESULTS:")
    print(f"   Accuracy:  {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"   Precision: {test_prec:.4f}")
    print(f"   Recall:    {test_rec:.4f}")
    print(f"   F1-Score:  {test_f1:.4f}")

    print("\n📊 Detailed Classification Report:")
    print(classification_report(test_true, test_preds,
                               target_names=['Safe Email', 'Phishing Email']))

    print("\n" + "=" * 80)
    print("✓ Training complete! Model ready for competition submission.")
    print("=" * 80)

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

ASU CYBER360 HACKATHON - PHISHING DETECTION SYSTEM

✓ Using device: cuda

[STEP 1] Loading datasets...
✓ Loaded 2000 real samples
✓ Generating synthetic data...
✓ Generated 3000 synthetic samples
✓ Total training data: 5000 samples
✓ Test set: 150 samples (held out)

[STEP 2] Extracting engineered features...
✓ Extracted 10 features for training data
✓ Extracted 10 features for test data

[STEP 3] Preparing labels and validation split...
✓ Training samples: 4000
✓ Validation samples: 1000
✓ Test samples: 150 (held out)

[STEP 4] Initializing hybrid model...
✓ Model: distilbert-base-uncased
✓ Total parameters: 66,610,518
✓ Dropout: 0.5 (reduced overfitting)

[STEP 5] Creating data loaders...
✓ Batch size: 16
✓ Training batches: 250

[STEP 6] Training model...

Epoch 1/4
------------------------------------------------------------
Train Loss: 0.4035 | Train Accuracy: 0.8303
Val Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
✓ Best model saved!

Epoch 2/4
-------------