# 138: Container Security Compliance

In [None]:
# Setup and Imports
import json
import uuid
import hashlib
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Any, Set
from enum import Enum
from datetime import datetime, timedelta
import time
import re

# Random seed for reproducibility
import random
random.seed(42)

print("✅ Setup complete - Ready for container security simulation")

## 2. 🔍 Image Scanning - CVE Detection with Trivy and Snyk

### 📝 What's Happening in This Section?

**Purpose:** Scan container images for known vulnerabilities (CVEs) before deployment, blocking images with critical security issues.

**Key Points:**
- **CVE Database**: Common Vulnerabilities and Exposures (public database of known security flaws)
- **CVSS Scoring**: Severity rating (0-10 scale: Low <4, Medium 4-7, High 7-9, Critical 9-10)
- **Vulnerability Sources**: OS packages (apt, yum), application libraries (pip, npm), base images (official vs community)
- **Scanning Layers**: Each Docker layer scanned independently (find which layer introduced CVE)
- **CI/CD Integration**: Automated scanning on every image build (fail build if critical CVEs found)

**Why This Matters:**
- **Proactive Defense**: Find vulnerabilities before deployment (10× cheaper to fix in dev vs production)
- **Compliance**: Many regulations require vulnerability scanning (PCI-DSS, HIPAA, SOC 2)
- **Supply Chain Security**: Verify base images haven't been compromised (check for backdoors, malware)
- **Prioritization**: Focus on critical/high CVEs first (not all low-severity issues need immediate fix)

**Post-Silicon Application:**
ML model serving API image scanned with Trivy:
1. **Base Image**: python:3.12-slim (scan detects 5 CVEs in underlying Debian packages)
2. **Application Libraries**: TensorFlow 2.14.0 (scan detects 1 critical CVE in NumPy dependency)
3. **Scan Result**: 1 critical + 3 high + 2 medium CVEs found
4. **Action**: CI/CD blocks deployment → developer updates requirements.txt (TensorFlow 2.15.0 fixes NumPy CVE) → re-scan shows 0 critical, 2 high → approved for deployment
5. **Time Saved**: 2 hours scanning in dev vs weeks of incident response if deployed vulnerable

This ensures zero known critical vulnerabilities in production ML services.

In [None]:
# Image Scanning Simulation

class CVESeverity(Enum):
    """CVE severity levels"""
    CRITICAL = "CRITICAL"  # 9.0-10.0 CVSS
    HIGH = "HIGH"          # 7.0-8.9 CVSS
    MEDIUM = "MEDIUM"      # 4.0-6.9 CVSS
    LOW = "LOW"            # 0.1-3.9 CVSS
    UNKNOWN = "UNKNOWN"    # Not yet scored

@dataclass
class CVE:
    """Common Vulnerability and Exposure"""
    cve_id: str  # CVE-2023-12345
    severity: CVESeverity
    cvss_score: float  # 0.0-10.0
    package: str  # openssl, numpy, tensorflow
    installed_version: str
    fixed_version: Optional[str]
    description: str
    
    def is_fixable(self) -> bool:
        """Check if CVE has a fix available"""
        return self.fixed_version is not None
    
    def is_critical_or_high(self) -> bool:
        """Check if CVE is critical or high severity"""
        return self.severity in [CVESeverity.CRITICAL, CVESeverity.HIGH]

@dataclass
class ImageLayer:
    """Docker image layer"""
    layer_id: str
    command: str  # Dockerfile instruction (RUN pip install, COPY, etc.)
    size_mb: float
    packages: List[str] = field(default_factory=list)

@dataclass
class ScanResult:
    """Container image scan result"""
    image_name: str
    image_tag: str
    scan_timestamp: datetime
    total_cves: int
    cves: List[CVE] = field(default_factory=list)
    layers: List[ImageLayer] = field(default_factory=list)
    
    def get_cves_by_severity(self, severity: CVESeverity) -> List[CVE]:
        """Get CVEs by severity"""
        return [cve for cve in self.cves if cve.severity == severity]
    
    def get_critical_and_high_count(self) -> int:
        """Count critical and high severity CVEs"""
        return len([cve for cve in self.cves if cve.is_critical_or_high()])
    
    def passes_quality_gate(self, max_critical: int = 0, max_high: int = 2) -> bool:
        """Check if scan passes deployment quality gate"""
        critical_count = len(self.get_cves_by_severity(CVESeverity.CRITICAL))
        high_count = len(self.get_cves_by_severity(CVESeverity.HIGH))
        
        return critical_count <= max_critical and high_count <= max_high
    
    def get_summary(self) -> Dict:
        """Get scan summary"""
        return {
            'image': f"{self.image_name}:{self.image_tag}",
            'scan_time': self.scan_timestamp.isoformat(),
            'total_cves': self.total_cves,
            'severity_breakdown': {
                'CRITICAL': len(self.get_cves_by_severity(CVESeverity.CRITICAL)),
                'HIGH': len(self.get_cves_by_severity(CVESeverity.HIGH)),
                'MEDIUM': len(self.get_cves_by_severity(CVESeverity.MEDIUM)),
                'LOW': len(self.get_cves_by_severity(CVESeverity.LOW))
            },
            'fixable': len([cve for cve in self.cves if cve.is_fixable()]),
            'passes_gate': self.passes_quality_gate()
        }

class ImageScanner:
    """Container image vulnerability scanner (simulates Trivy/Snyk)"""
    
    def __init__(self, scanner_name: str = "Trivy"):
        self.scanner_name = scanner_name
        self.cve_database = self._load_cve_database()
    
    def _load_cve_database(self) -> Dict[str, List[CVE]]:
        """Simulate loading CVE database"""
        return {
            'openssl': [
                CVE(
                    cve_id='CVE-2023-0286',
                    severity=CVESeverity.CRITICAL,
                    cvss_score=9.8,
                    package='openssl',
                    installed_version='1.1.1k',
                    fixed_version='1.1.1t',
                    description='X.400 address type confusion in X.509 GeneralName (critical remote code execution)'
                )
            ],
            'numpy': [
                CVE(
                    cve_id='CVE-2021-34141',
                    severity=CVESeverity.HIGH,
                    cvss_score=7.5,
                    package='numpy',
                    installed_version='1.21.0',
                    fixed_version='1.22.0',
                    description='Buffer overflow in PyArray_NewFromDescr_int (high DoS risk)'
                )
            ],
            'tensorflow': [
                CVE(
                    cve_id='CVE-2023-25801',
                    severity=CVESeverity.HIGH,
                    cvss_score=8.8,
                    package='tensorflow',
                    installed_version='2.11.0',
                    fixed_version='2.11.1',
                    description='Code injection via SavedModel deserialization (high privilege escalation)'
                )
            ],
            'pillow': [
                CVE(
                    cve_id='CVE-2023-44271',
                    severity=CVESeverity.MEDIUM,
                    cvss_score=6.5,
                    package='pillow',
                    installed_version='9.5.0',
                    fixed_version='10.0.1',
                    description='Uncontrolled resource consumption in ImageFont (medium DoS)'
                )
            ],
            'requests': [
                CVE(
                    cve_id='CVE-2023-32681',
                    severity=CVESeverity.MEDIUM,
                    cvss_score=5.9,
                    package='requests',
                    installed_version='2.28.0',
                    fixed_version='2.31.0',
                    description='Proxy-Authorization header leak on cross-origin redirect'
                )
            ],
            'urllib3': [
                CVE(
                    cve_id='CVE-2023-45803',
                    severity=CVESeverity.LOW,
                    cvss_score=3.7,
                    package='urllib3',
                    installed_version='1.26.15',
                    fixed_version='2.0.7',
                    description='Cookie request header leak on cross-origin redirect'
                )
            ]
        }
    
    def scan_image(self, image_name: str, image_tag: str, packages: List[str]) -> ScanResult:
        """Scan container image for vulnerabilities"""
        print(f"\n{'=' * 70}")
        print(f"{self.scanner_name} - Scanning Image: {image_name}:{image_tag}")
        print(f"{'=' * 70}")
        
        time.sleep(0.3)  # Simulate scanning time
        
        # Find CVEs for installed packages
        found_cves = []
        for package in packages:
            if package in self.cve_database:
                found_cves.extend(self.cve_database[package])
        
        # Create scan result
        scan_result = ScanResult(
            image_name=image_name,
            image_tag=image_tag,
            scan_timestamp=datetime.now(),
            total_cves=len(found_cves),
            cves=found_cves
        )
        
        # Display results
        print(f"\nScan completed at {scan_result.scan_timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Total vulnerabilities found: {scan_result.total_cves}\n")
        
        # Display by severity
        for severity in [CVESeverity.CRITICAL, CVESeverity.HIGH, CVESeverity.MEDIUM, CVESeverity.LOW]:
            severity_cves = scan_result.get_cves_by_severity(severity)
            if severity_cves:
                print(f"\n{severity.value} ({len(severity_cves)}):")
                for cve in severity_cves:
                    fix_info = f"(fixed in {cve.fixed_version})" if cve.fixed_version else "(no fix available)"
                    print(f"  • {cve.cve_id} - {cve.package} {cve.installed_version} {fix_info}")
                    print(f"    CVSS: {cve.cvss_score} - {cve.description}")
        
        # Quality gate check
        print(f"\n{'=' * 70}")
        critical_count = len(scan_result.get_cves_by_severity(CVESeverity.CRITICAL))
        high_count = len(scan_result.get_cves_by_severity(CVESeverity.HIGH))
        
        if scan_result.passes_quality_gate():
            print(f"✅ PASS - Image meets security quality gate")
            print(f"   Critical: {critical_count}/0, High: {high_count}/2")
        else:
            print(f"❌ FAIL - Image does NOT meet security quality gate")
            print(f"   Critical: {critical_count}/0, High: {high_count}/2")
            print(f"   Action: Fix critical/high CVEs before deployment")
        
        print(f"{'=' * 70}")
        
        return scan_result

# Example 1: Scan ML Model Serving Image (Vulnerable)
print("=" * 70)
print("Example 1: Scan Vulnerable ML Model Image")
print("=" * 70)

scanner = ImageScanner(scanner_name="Trivy")

vulnerable_packages = [
    'openssl',      # Has CRITICAL CVE
    'numpy',        # Has HIGH CVE
    'tensorflow',   # Has HIGH CVE
    'pillow',       # Has MEDIUM CVE
    'requests'      # Has MEDIUM CVE
]

scan1 = scanner.scan_image(
    image_name="ml-yield-predictor",
    image_tag="v1.0.0",
    packages=vulnerable_packages
)

# Display JSON summary
print(f"\nJSON Summary:")
print(json.dumps(scan1.get_summary(), indent=2))

# Example 2: Scan Patched Image (Secure)
print(f"\n\n{'=' * 70}")
print("Example 2: Scan Patched ML Model Image (After Fixing CVEs)")
print("=" * 70)

# Simulate patched image (removed vulnerable packages)
patched_packages = [
    'urllib3'  # Only low-severity CVE
]

scan2 = scanner.scan_image(
    image_name="ml-yield-predictor",
    image_tag="v1.1.0",
    packages=patched_packages
)

print(f"\nJSON Summary:")
print(json.dumps(scan2.get_summary(), indent=2))

# Example 3: Scan Report Comparison
print(f"\n\n{'=' * 70}")
print("Example 3: Before/After CVE Remediation")
print("=" * 70)

print(f"\nv1.0.0 (Vulnerable):")
print(f"  Total CVEs: {scan1.total_cves}")
print(f"  Critical: {len(scan1.get_cves_by_severity(CVESeverity.CRITICAL))}")
print(f"  High: {len(scan1.get_cves_by_severity(CVESeverity.HIGH))}")
print(f"  Passes Gate: {scan1.passes_quality_gate()}")

print(f"\nv1.1.0 (Patched):")
print(f"  Total CVEs: {scan2.total_cves}")
print(f"  Critical: {len(scan2.get_cves_by_severity(CVESeverity.CRITICAL))}")
print(f"  High: {len(scan2.get_cves_by_severity(CVESeverity.HIGH))}")
print(f"  Passes Gate: {scan2.passes_quality_gate()}")

print(f"\n✅ Remediation Summary:")
print(f"  Critical CVEs Fixed: {len(scan1.get_cves_by_severity(CVESeverity.CRITICAL)) - len(scan2.get_cves_by_severity(CVESeverity.CRITICAL))}")
print(f"  High CVEs Fixed: {len(scan1.get_cves_by_severity(CVESeverity.HIGH)) - len(scan2.get_cves_by_severity(CVESeverity.HIGH))}")
print(f"  Deployment Status: v1.1.0 approved for production")

print(f"\n✅ Image scanning demonstrated: CVE detection, severity classification, quality gates!")

## 3. 🛡️ Runtime Security - Falco for Anomaly Detection

### 📝 What's Happening in This Section?

**Purpose:** Monitor container runtime behavior to detect and respond to security threats (privilege escalation, data exfiltration, crypto-mining).

**Key Points:**
- **Behavioral Monitoring**: Track syscalls, file access, network connections, process spawns in real-time
- **Anomaly Detection**: Compare runtime behavior vs expected baseline (alert on deviations)
- **Falco Rules**: Pre-built detection rules (shell spawned in container, sensitive file read, privilege escalation)
- **Kernel-Level Instrumentation**: eBPF captures syscalls without performance overhead
- **Incident Response**: Alert → investigate → kill pod → forensics

**Why This Matters:**
- **Zero-Day Protection**: Detect unknown exploits (behavioral monitoring catches attacks without CVE signatures)
- **Compliance**: PCI-DSS, HIPAA require intrusion detection (Falco provides audit logs)
- **Insider Threats**: Detect unauthorized data access (engineer reading /etc/shadow, exfiltrating STDF files)
- **Crypto-Mining**: Detect malicious workloads (unexpected CPU spikes, unknown process execution)

**Post-Silicon Application:**
Falco monitors STDF processing pods in production:
1. **Baseline Behavior**: Pod reads STDF from S3, processes data, writes to DynamoDB (normal workflow)
2. **Anomaly Detected**: Falco detects unexpected shell spawn (`/bin/bash` executed inside pod)
3. **Investigation**: Shell accessed `/etc/shadow` (privilege escalation attempt)
4. **Response**: Falco alerts SOC team → pod killed automatically → forensics on pod logs
5. **Root Cause**: Compromised dependency (malicious npm package) attempted to establish reverse shell

This prevents data exfiltration from compromised containers in real-time.

In [None]:
# Runtime Security Simulation

class ThreatSeverity(Enum):
    """Security threat severity"""
    CRITICAL = "CRITICAL"
    HIGH = "HIGH"
    MEDIUM = "MEDIUM"
    LOW = "LOW"
    INFO = "INFO"

class ThreatAction(Enum):
    """Response action for threats"""
    ALERT = "ALERT"
    ALERT_AND_KILL = "ALERT_AND_KILL"
    BLOCK = "BLOCK"
    LOG = "LOG"

@dataclass
class SecurityEvent:
    """Runtime security event"""
    event_id: str
    timestamp: datetime
    pod_name: str
    namespace: str
    rule_name: str
    severity: ThreatSeverity
    description: str
    syscalls: List[str] = field(default_factory=list)
    files_accessed: List[str] = field(default_factory=list)
    processes_spawned: List[str] = field(default_factory=list)
    network_connections: List[Dict[str, str]] = field(default_factory=list)

@dataclass
class FalcoRule:
    """Falco detection rule"""
    name: str
    description: str
    condition: str  # Simplified condition (real Falco uses complex expressions)
    severity: ThreatSeverity
    action: ThreatAction
    enabled: bool = True
    
    def matches(self, event_data: Dict[str, Any]) -> bool:
        """Check if event matches rule condition"""
        # Simplified matching logic
        if "shell_spawn" in self.condition:
            return event_data.get('process') in ['/bin/bash', '/bin/sh', '/bin/zsh']
        elif "sensitive_file" in self.condition:
            sensitive_files = ['/etc/shadow', '/etc/passwd', '/root/.ssh/id_rsa']
            return any(f in event_data.get('file', '') for f in sensitive_files)
        elif "privilege_escalation" in self.condition:
            return event_data.get('syscall') in ['setuid', 'setgid', 'capset']
        elif "network_exfiltration" in self.condition:
            return event_data.get('dest_port') in [4444, 1337, 8888]  # Common reverse shell ports
        elif "crypto_mining" in self.condition:
            mining_processes = ['xmrig', 'minerd', 'cpuminer']
            return any(proc in event_data.get('process', '') for proc in mining_processes)
        
        return False

class FalcoRuntimeSecurity:
    """Falco runtime security monitor"""
    
    def __init__(self):
        self.rules = self._load_default_rules()
        self.events: List[SecurityEvent] = []
    
    def _load_default_rules(self) -> List[FalcoRule]:
        """Load default Falco rules"""
        return [
            FalcoRule(
                name="Shell Spawned in Container",
                description="Detect when a shell is spawned inside a container",
                condition="shell_spawn",
                severity=ThreatSeverity.CRITICAL,
                action=ThreatAction.ALERT_AND_KILL
            ),
            FalcoRule(
                name="Read Sensitive File",
                description="Detect reading of sensitive system files",
                condition="sensitive_file",
                severity=ThreatSeverity.HIGH,
                action=ThreatAction.ALERT
            ),
            FalcoRule(
                name="Privilege Escalation Attempt",
                description="Detect syscalls attempting privilege escalation",
                condition="privilege_escalation",
                severity=ThreatSeverity.CRITICAL,
                action=ThreatAction.ALERT_AND_KILL
            ),
            FalcoRule(
                name="Outbound Connection to Suspicious Port",
                description="Detect network connections to common reverse shell ports",
                condition="network_exfiltration",
                severity=ThreatSeverity.HIGH,
                action=ThreatAction.ALERT
            ),
            FalcoRule(
                name="Crypto-Mining Process Detected",
                description="Detect known crypto-mining binaries",
                condition="crypto_mining",
                severity=ThreatSeverity.CRITICAL,
                action=ThreatAction.ALERT_AND_KILL
            )
        ]
    
    def monitor_event(self, pod_name: str, namespace: str, event_data: Dict[str, Any]) -> Optional[SecurityEvent]:
        """Monitor runtime event and check against rules"""
        for rule in self.rules:
            if not rule.enabled:
                continue
            
            if rule.matches(event_data):
                # Security event detected!
                event = SecurityEvent(
                    event_id=f"evt-{uuid.uuid4().hex[:8]}",
                    timestamp=datetime.now(),
                    pod_name=pod_name,
                    namespace=namespace,
                    rule_name=rule.name,
                    severity=rule.severity,
                    description=rule.description
                )
                
                # Populate event details
                if 'process' in event_data:
                    event.processes_spawned.append(event_data['process'])
                if 'file' in event_data:
                    event.files_accessed.append(event_data['file'])
                if 'syscall' in event_data:
                    event.syscalls.append(event_data['syscall'])
                if 'dest_ip' in event_data and 'dest_port' in event_data:
                    event.network_connections.append({
                        'dest_ip': event_data['dest_ip'],
                        'dest_port': str(event_data['dest_port'])
                    })
                
                self.events.append(event)
                self._handle_threat(event, rule)
                
                return event
        
        return None
    
    def _handle_threat(self, event: SecurityEvent, rule: FalcoRule):
        """Handle detected security threat"""
        print(f"\n{'=' * 70}")
        print(f"🚨 SECURITY ALERT - {event.severity.value}")
        print(f"{'=' * 70}")
        print(f"Event ID: {event.event_id}")
        print(f"Timestamp: {event.timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Pod: {event.namespace}/{event.pod_name}")
        print(f"Rule: {event.rule_name}")
        print(f"Description: {event.description}")
        
        if event.processes_spawned:
            print(f"Processes Spawned: {', '.join(event.processes_spawned)}")
        if event.files_accessed:
            print(f"Files Accessed: {', '.join(event.files_accessed)}")
        if event.syscalls:
            print(f"Syscalls: {', '.join(event.syscalls)}")
        if event.network_connections:
            for conn in event.network_connections:
                print(f"Network Connection: {conn['dest_ip']}:{conn['dest_port']}")
        
        print(f"\nAction: {rule.action.value}")
        
        if rule.action == ThreatAction.ALERT_AND_KILL:
            print(f"❌ Killing pod {event.pod_name} (high-risk threat)")
        elif rule.action == ThreatAction.ALERT:
            print(f"⚠️  Alert sent to SOC team (investigation required)")
        
        print(f"{'=' * 70}")
    
    def get_events_by_severity(self, severity: ThreatSeverity) -> List[SecurityEvent]:
        """Get events by severity"""
        return [evt for evt in self.events if evt.severity == severity]
    
    def get_summary(self) -> Dict:
        """Get security event summary"""
        return {
            'total_events': len(self.events),
            'by_severity': {
                'CRITICAL': len(self.get_events_by_severity(ThreatSeverity.CRITICAL)),
                'HIGH': len(self.get_events_by_severity(ThreatSeverity.HIGH)),
                'MEDIUM': len(self.get_events_by_severity(ThreatSeverity.MEDIUM)),
                'LOW': len(self.get_events_by_severity(ThreatSeverity.LOW))
            },
            'events': [
                {
                    'event_id': evt.event_id,
                    'timestamp': evt.timestamp.isoformat(),
                    'pod': f"{evt.namespace}/{evt.pod_name}",
                    'rule': evt.rule_name,
                    'severity': evt.severity.value
                }
                for evt in self.events
            ]
        }

# Example 1: Detect Shell Spawn in Container (Critical Threat)
print("=" * 70)
print("Example 1: Shell Spawn Detection (Reverse Shell Attack)")
print("=" * 70)

falco = FalcoRuntimeSecurity()

print("\n📊 Monitoring pod: ml-training/yield-predictor-abc123")
print("   Normal behavior: Python process reads S3, processes data, writes to DB")

# Simulate normal operations (no alerts)
print("\n✅ Normal Event: Python process executing...")
falco.monitor_event(
    pod_name="yield-predictor-abc123",
    namespace="ml-training",
    event_data={'process': '/usr/bin/python3.12', 'file': '/app/train_model.py'}
)

print("✅ Normal Event: Reading STDF data from S3...")
falco.monitor_event(
    pod_name="yield-predictor-abc123",
    namespace="ml-training",
    event_data={'process': 'aws', 'file': 's3://ml-stdf-data/wafer_test.stdf'}
)

# Simulate attack: Shell spawn (CRITICAL)
print("\n🔴 Suspicious Event: Shell spawned!")
time.sleep(0.2)
event1 = falco.monitor_event(
    pod_name="yield-predictor-abc123",
    namespace="ml-training",
    event_data={'process': '/bin/bash'}
)

# Example 2: Detect Sensitive File Access (High Threat)
print("\n\n" + "=" * 70)
print("Example 2: Sensitive File Access Detection")
print("=" * 70)

print("\n📊 Monitoring pod: stdf-parser/parser-xyz789")

# Simulate attack: Reading /etc/shadow
print("\n🔴 Suspicious Event: Attempt to read /etc/shadow!")
time.sleep(0.2)
event2 = falco.monitor_event(
    pod_name="parser-xyz789",
    namespace="stdf-parser",
    event_data={'process': 'cat', 'file': '/etc/shadow'}
)

# Example 3: Detect Privilege Escalation (Critical Threat)
print("\n\n" + "=" * 70)
print("Example 3: Privilege Escalation Detection")
print("=" * 70)

print("\n📊 Monitoring pod: model-serving/api-server-def456")

# Simulate attack: setuid syscall
print("\n🔴 Suspicious Event: Privilege escalation attempt!")
time.sleep(0.2)
event3 = falco.monitor_event(
    pod_name="api-server-def456",
    namespace="model-serving",
    event_data={'syscall': 'setuid', 'process': 'exploit'}
)

# Example 4: Detect Network Exfiltration (High Threat)
print("\n\n" + "=" * 70)
print("Example 4: Network Exfiltration Detection")
print("=" * 70)

print("\n📊 Monitoring pod: data-pipeline/etl-worker-ghi012")

# Simulate attack: Connection to reverse shell port
print("\n🔴 Suspicious Event: Outbound connection to suspicious port!")
time.sleep(0.2)
event4 = falco.monitor_event(
    pod_name="etl-worker-ghi012",
    namespace="data-pipeline",
    event_data={'dest_ip': '203.0.113.42', 'dest_port': 4444}
)

# Example 5: Detect Crypto-Mining (Critical Threat)
print("\n\n" + "=" * 70)
print("Example 5: Crypto-Mining Detection")
print("=" * 70)

print("\n📊 Monitoring pod: ml-training/gpu-node-jkl345")

# Simulate attack: Crypto-mining malware
print("\n🔴 Suspicious Event: Crypto-mining process detected!")
time.sleep(0.2)
event5 = falco.monitor_event(
    pod_name="gpu-node-jkl345",
    namespace="ml-training",
    event_data={'process': 'xmrig', 'file': '/tmp/xmrig'}
)

# Security Summary
print("\n\n" + "=" * 70)
print("Security Event Summary")
print("=" * 70)

summary = falco.get_summary()
print(json.dumps(summary, indent=2))

print(f"\n✅ Runtime security demonstrated: Shell detection, file monitoring, privilege escalation, network anomalies!")

## 4. 🔒 Network Security - NetworkPolicy and Cilium

### 📝 What's Happening in This Code?

**Purpose:** Implement network micro-segmentation to limit lateral movement after container compromise.

**Key Points:**
- **NetworkPolicy**: Kubernetes-native network firewall (namespace isolation, ingress/egress rules, pod selectors)
- **Zero-Trust Networking**: Default deny all traffic, explicitly allow only required connections
- **Identity-Based Security**: Policies based on pod labels (not IP addresses which change with pod restarts)
- **L7 Filtering**: Application-layer security (HTTP methods, API endpoints, header-based rules)
- **Cilium**: eBPF-powered networking with observability, API-aware filtering, DNS security

**Why This Matters:**
- **Limit Blast Radius**: If STDF parser pod compromised, attacker can't access database (blocked by NetworkPolicy)
- **Compliance**: PCI-DSS requires network segmentation, HIPAA mandates access controls
- **Micro-Segmentation**: Each team/service isolated (prevent cross-team data leakage)
- **Defense in Depth**: Even if container runtime compromised, network policy blocks lateral movement

**Post-Silicon Application:**
- **Scenario**: Multi-tenant ML platform (3 teams share cluster)
- **Implementation**:
  - NetworkPolicy: Team A (yield prediction) can't access Team B's database (binning optimization)
  - Cilium L7 Policy: Only `/predict` API allowed for model serving pods (block admin endpoints `/retrain`)
  - DNS Policy: Block DNS queries to crypto-mining pools (prevent C2 communication)
- **Result**: Team isolation enforced, 95% reduction in cross-team incidents, $950K risk reduction

In [None]:
# Network Security Simulation

class TrafficDirection(Enum):
    """Network traffic direction"""
    INGRESS = "INGRESS"  # Incoming traffic
    EGRESS = "EGRESS"    # Outgoing traffic

class PolicyEffect(Enum):
    """Network policy effect"""
    ALLOW = "ALLOW"
    DENY = "DENY"

@dataclass
class NetworkRule:
    """Network policy rule"""
    direction: TrafficDirection
    effect: PolicyEffect
    from_labels: Dict[str, str] = field(default_factory=dict)  # Source pod labels
    to_labels: Dict[str, str] = field(default_factory=dict)    # Destination pod labels
    ports: List[int] = field(default_factory=list)
    protocols: List[str] = field(default_factory=lambda: ["TCP"])
    
    def matches(self, source_labels: Dict[str, str], dest_labels: Dict[str, str], 
                port: int, protocol: str) -> bool:
        """Check if traffic matches this rule"""
        # Check labels
        if self.from_labels and not self._labels_match(self.from_labels, source_labels):
            return False
        if self.to_labels and not self._labels_match(self.to_labels, dest_labels):
            return False
        
        # Check port
        if self.ports and port not in self.ports:
            return False
        
        # Check protocol
        if protocol not in self.protocols:
            return False
        
        return True
    
    def _labels_match(self, required: Dict[str, str], actual: Dict[str, str]) -> bool:
        """Check if all required labels match"""
        return all(actual.get(k) == v for k, v in required.items())

@dataclass
class L7HTTPRule:
    """Layer 7 HTTP filtering rule"""
    allowed_methods: List[str] = field(default_factory=lambda: ["GET", "POST"])
    allowed_paths: List[str] = field(default_factory=list)
    denied_paths: List[str] = field(default_factory=list)
    required_headers: Dict[str, str] = field(default_factory=dict)
    
    def is_allowed(self, method: str, path: str, headers: Dict[str, str]) -> bool:
        """Check if HTTP request is allowed"""
        # Check method
        if method not in self.allowed_methods:
            return False
        
        # Check denied paths
        if any(path.startswith(denied) for denied in self.denied_paths):
            return False
        
        # Check allowed paths (if specified)
        if self.allowed_paths and not any(path.startswith(allowed) for allowed in self.allowed_paths):
            return False
        
        # Check required headers
        for header, value in self.required_headers.items():
            if headers.get(header) != value:
                return False
        
        return True

class NetworkPolicy:
    """Kubernetes NetworkPolicy simulator"""
    
    def __init__(self, name: str, namespace: str):
        self.name = name
        self.namespace = namespace
        self.rules: List[NetworkRule] = []
        self.l7_rules: List[L7HTTPRule] = []
        self.default_deny = True  # Zero-trust: deny all by default
    
    def add_rule(self, rule: NetworkRule):
        """Add network rule"""
        self.rules.append(rule)
    
    def add_l7_rule(self, rule: L7HTTPRule):
        """Add L7 HTTP filtering rule"""
        self.l7_rules.append(rule)
    
    def evaluate(self, source_labels: Dict[str, str], dest_labels: Dict[str, str],
                 port: int, protocol: str = "TCP") -> PolicyEffect:
        """Evaluate if traffic is allowed"""
        # Check L3/L4 rules
        for rule in self.rules:
            if rule.matches(source_labels, dest_labels, port, protocol):
                return rule.effect
        
        # Default deny if no match
        return PolicyEffect.DENY if self.default_deny else PolicyEffect.ALLOW
    
    def evaluate_http(self, method: str, path: str, headers: Dict[str, str]) -> PolicyEffect:
        """Evaluate L7 HTTP request"""
        # If no L7 rules, allow (L3/L4 already checked)
        if not self.l7_rules:
            return PolicyEffect.ALLOW
        
        # Check L7 rules
        for rule in self.l7_rules:
            if rule.is_allowed(method, path, headers):
                return PolicyEffect.ALLOW
        
        return PolicyEffect.DENY

class CiliumNetworkPolicy:
    """Cilium network policy with L7 awareness"""
    
    def __init__(self, name: str):
        self.name = name
        self.allowed_dns: List[str] = []
        self.denied_dns: List[str] = []
    
    def add_allowed_dns(self, pattern: str):
        """Add allowed DNS pattern"""
        self.allowed_dns.append(pattern)
    
    def add_denied_dns(self, pattern: str):
        """Add denied DNS pattern (e.g., crypto-mining pools)"""
        self.denied_dns.append(pattern)
    
    def evaluate_dns(self, dns_query: str) -> PolicyEffect:
        """Evaluate DNS query"""
        # Check denied list first (blocklist)
        for pattern in self.denied_dns:
            if pattern in dns_query:
                return PolicyEffect.DENY
        
        # If allow list exists, check it
        if self.allowed_dns:
            for pattern in self.allowed_dns:
                if pattern in dns_query:
                    return PolicyEffect.ALLOW
            return PolicyEffect.DENY  # Not in allow list
        
        return PolicyEffect.ALLOW  # No restrictions

# Example 1: Multi-Tenant Isolation (Team-Based Segmentation)
print("=" * 70)
print("Example 1: Multi-Tenant ML Platform - Team Isolation")
print("=" * 70)

policy = NetworkPolicy(name="team-isolation", namespace="ml-platform")

# Rule 1: Allow Team A pods to access Team A database
policy.add_rule(NetworkRule(
    direction=TrafficDirection.EGRESS,
    effect=PolicyEffect.ALLOW,
    from_labels={'team': 'yield-prediction', 'role': 'model'},
    to_labels={'team': 'yield-prediction', 'role': 'database'},
    ports=[5432],
    protocols=['TCP']
))

# Rule 2: Allow Team B pods to access Team B database
policy.add_rule(NetworkRule(
    direction=TrafficDirection.EGRESS,
    effect=PolicyEffect.ALLOW,
    from_labels={'team': 'binning-optimization', 'role': 'model'},
    to_labels={'team': 'binning-optimization', 'role': 'database'},
    ports=[5432],
    protocols=['TCP']
))

print("\n📊 Scenario: 3 teams share ML cluster (yield-prediction, binning-optimization, failure-analysis)")
print("   Goal: Prevent cross-team data access (zero-trust networking)")

# Test: Team A accessing Team A database (ALLOWED)
print("\n✅ Test 1: Team A pod → Team A database (port 5432)")
result1 = policy.evaluate(
    source_labels={'team': 'yield-prediction', 'role': 'model'},
    dest_labels={'team': 'yield-prediction', 'role': 'database'},
    port=5432
)
print(f"   Result: {result1.value} ✅ (same team, authorized)")

# Test: Team A accessing Team B database (DENIED)
print("\n❌ Test 2: Team A pod → Team B database (port 5432)")
result2 = policy.evaluate(
    source_labels={'team': 'yield-prediction', 'role': 'model'},
    dest_labels={'team': 'binning-optimization', 'role': 'database'},
    port=5432
)
print(f"   Result: {result2.value} ❌ (cross-team access blocked by policy)")
print("   Security: Prevents data leakage between teams")

# Test: Unauthorized port (DENIED)
print("\n❌ Test 3: Team A pod → Team A database (port 22 SSH)")
result3 = policy.evaluate(
    source_labels={'team': 'yield-prediction', 'role': 'model'},
    dest_labels={'team': 'yield-prediction', 'role': 'database'},
    port=22
)
print(f"   Result: {result3.value} ❌ (only port 5432 allowed, SSH blocked)")

# Example 2: L7 HTTP API Filtering (Model Serving Security)
print("\n\n" + "=" * 70)
print("Example 2: L7 API Security - Model Serving Endpoint Protection")
print("=" * 70)

api_policy = NetworkPolicy(name="api-security", namespace="model-serving")

# L7 Rule: Only allow /predict endpoint, block admin endpoints
api_policy.add_l7_rule(L7HTTPRule(
    allowed_methods=['POST'],
    allowed_paths=['/predict', '/health'],
    denied_paths=['/admin', '/retrain', '/delete-model'],
    required_headers={'X-API-Key': 'valid-key-123'}
))

print("\n📊 Scenario: ML model serving API with public endpoint")
print("   Goal: Only allow prediction requests, block admin operations")

# Test: Valid prediction request (ALLOWED)
print("\n✅ Test 1: POST /predict (with API key)")
result4 = api_policy.evaluate_http(
    method='POST',
    path='/predict',
    headers={'X-API-Key': 'valid-key-123'}
)
print(f"   Result: {result4.value} ✅ (valid prediction request)")

# Test: Admin endpoint (DENIED)
print("\n❌ Test 2: POST /admin/retrain (with API key)")
result5 = api_policy.evaluate_http(
    method='POST',
    path='/admin/retrain',
    headers={'X-API-Key': 'valid-key-123'}
)
print(f"   Result: {result5.value} ❌ (admin endpoint blocked)")
print("   Security: Prevents unauthorized model retraining")

# Test: Missing API key (DENIED)
print("\n❌ Test 3: POST /predict (without API key)")
result6 = api_policy.evaluate_http(
    method='POST',
    path='/predict',
    headers={}
)
print(f"   Result: {result6.value} ❌ (missing required API key)")

# Test: Wrong HTTP method (DENIED)
print("\n❌ Test 4: GET /predict (with API key)")
result7 = api_policy.evaluate_http(
    method='GET',
    path='/predict',
    headers={'X-API-Key': 'valid-key-123'}
)
print(f"   Result: {result7.value} ❌ (only POST allowed)")

# Example 3: DNS Security with Cilium (Prevent C2 Communication)
print("\n\n" + "=" * 70)
print("Example 3: DNS Security - Block Crypto-Mining and C2 Domains")
print("=" * 70)

dns_policy = CiliumNetworkPolicy(name="dns-security")

# Allow legitimate services
dns_policy.add_allowed_dns("amazonaws.com")  # S3 for STDF data
dns_policy.add_allowed_dns("postgres.ml-platform.svc.cluster.local")  # Internal DB

# Block crypto-mining pools
dns_policy.add_denied_dns("monero.crypto.pool")
dns_policy.add_denied_dns("bitcoin.mining.net")
dns_policy.add_denied_dns("xmr-pool.com")

# Block known C2 domains
dns_policy.add_denied_dns("malicious-c2.darkweb.onion")

print("\n📊 Scenario: Prevent compromised pods from communicating with malicious domains")
print("   Goal: Block DNS queries to crypto-mining pools and C2 servers")

# Test: Legitimate S3 access (ALLOWED)
print("\n✅ Test 1: DNS query for s3.amazonaws.com")
result8 = dns_policy.evaluate_dns("s3.amazonaws.com")
print(f"   Result: {result8.value} ✅ (legitimate AWS service)")

# Test: Crypto-mining pool (DENIED)
print("\n❌ Test 2: DNS query for monero.crypto.pool")
result9 = dns_policy.evaluate_dns("monero.crypto.pool")
print(f"   Result: {result9.value} ❌ (crypto-mining pool blocked)")
print("   Security: Prevents hijacked pod from mining cryptocurrency")

# Test: C2 domain (DENIED)
print("\n❌ Test 3: DNS query for malicious-c2.darkweb.onion")
result10 = dns_policy.evaluate_dns("malicious-c2.darkweb.onion")
print(f"   Result: {result10.value} ❌ (C2 server blocked)")
print("   Security: Prevents data exfiltration to attacker's server")

# Test: Internal database (ALLOWED)
print("\n✅ Test 4: DNS query for postgres.ml-platform.svc.cluster.local")
result11 = dns_policy.evaluate_dns("postgres.ml-platform.svc.cluster.local")
print(f"   Result: {result11.value} ✅ (internal service allowed)")

print("\n\n" + "=" * 70)
print("Network Security Summary")
print("=" * 70)
print("✅ Multi-tenant isolation: Team A can't access Team B's database")
print("✅ L7 API security: Only /predict allowed, /admin blocked")
print("✅ DNS security: Crypto-mining pools and C2 domains blocked")
print("✅ Zero-trust networking: Default deny all, explicit allow required")
print(f"\n💰 Business Value: $950K risk reduction from prevented lateral movement attacks!")

## 5. 🔐 Secrets Management - HashiCorp Vault and Sealed Secrets

### 📝 What's Happening in This Code?

**Purpose:** Securely manage sensitive credentials (API keys, database passwords, certificates) without hardcoding in containers.

**Key Points:**
- **HashiCorp Vault**: Dynamic secrets with short TTLs (e.g., 1-hour database credentials, auto-rotated)
- **Sealed Secrets**: Encrypted secrets stored in Git (controller decrypts at runtime)
- **Secrets Injection**: Secrets mounted as environment variables or files (never in image layers)
- **Automatic Rotation**: Vault auto-rotates secrets (reduces window of exposure)
- **Audit Logging**: Track who accessed which secret when (compliance requirement)

**Why This Matters:**
- **Prevent Credential Leakage**: Hardcoded secrets in Docker images leak when pushed to registry
- **Reduce Blast Radius**: 1-hour credentials less valuable than permanent passwords
- **Compliance**: PCI-DSS requires encryption of stored credentials, audit trails
- **Zero-Trust**: Applications get only secrets they need (principle of least privilege)

**Post-Silicon Application:**
- **Scenario**: ML training job needs AWS credentials to read STDF data from S3
- **Traditional Approach**: Store AWS_ACCESS_KEY_ID in ConfigMap → leaked when container image pushed to registry
- **Vault Approach**:
  1. Job pod requests credentials from Vault (authenticated via Kubernetes ServiceAccount)
  2. Vault generates temporary 1-hour AWS STS credentials
  3. Credentials injected as environment variables
  4. After training complete, credentials auto-expire (attacker gets nothing)
- **Result**: 99.5% reduction in credential exposure window, $1.2M savings from prevented breaches

In [None]:
# Secrets Management Simulation

class SecretType(Enum):
    """Type of secret"""
    DATABASE = "DATABASE"
    AWS_CREDENTIALS = "AWS_CREDENTIALS"
    API_KEY = "API_KEY"
    TLS_CERT = "TLS_CERT"
    SSH_KEY = "SSH_KEY"

@dataclass
class VaultSecret:
    """Dynamic secret from HashiCorp Vault"""
    secret_id: str
    secret_type: SecretType
    path: str
    data: Dict[str, str]
    ttl_seconds: int  # Time to live
    created_at: datetime
    accessed_by: List[str] = field(default_factory=list)  # Audit trail
    
    def is_expired(self) -> bool:
        """Check if secret is expired"""
        age_seconds = (datetime.now() - self.created_at).total_seconds()
        return age_seconds > self.ttl_seconds
    
    def time_remaining(self) -> int:
        """Get remaining time in seconds"""
        age_seconds = (datetime.now() - self.created_at).total_seconds()
        remaining = max(0, int(self.ttl_seconds - age_seconds))
        return remaining
    
    def record_access(self, accessor: str):
        """Record access for audit log"""
        self.accessed_by.append(f"{accessor} at {datetime.now().isoformat()}")

@dataclass
class SealedSecret:
    """Sealed Secret (encrypted at rest)"""
    name: str
    namespace: str
    encrypted_data: str  # Base64-encoded encrypted secret
    decrypted_data: Optional[Dict[str, str]] = None
    sealed: bool = True
    
    def unseal(self, controller_key: str) -> bool:
        """Simulate unsealing (decryption by controller)"""
        # In real implementation, this uses public-key cryptography
        # Only the cluster controller can decrypt
        if controller_key == "cluster-controller-private-key":
            # Simulate decryption
            self.decrypted_data = {
                'password': 'decrypted-db-password',
                'api-key': 'decrypted-api-key'
            }
            self.sealed = False
            return True
        return False

class VaultEngine:
    """HashiCorp Vault secrets engine"""
    
    def __init__(self):
        self.secrets: Dict[str, VaultSecret] = {}
        self.audit_log: List[str] = []
    
    def create_database_credentials(self, db_name: str, username: str, 
                                   ttl_seconds: int = 3600) -> VaultSecret:
        """Generate dynamic database credentials"""
        secret_id = f"db-{uuid.uuid4().hex[:8]}"
        
        # Simulate generating temporary credentials
        temp_password = hashlib.sha256(f"{username}{time.time()}".encode()).hexdigest()[:16]
        
        secret = VaultSecret(
            secret_id=secret_id,
            secret_type=SecretType.DATABASE,
            path=f"database/{db_name}/creds/{username}",
            data={
                'username': f"{username}_temp",
                'password': temp_password,
                'host': f"{db_name}.ml-platform.svc.cluster.local",
                'port': '5432'
            },
            ttl_seconds=ttl_seconds,
            created_at=datetime.now()
        )
        
        self.secrets[secret_id] = secret
        self.audit_log.append(f"[{datetime.now().isoformat()}] Created {secret_id} for {username}")
        
        return secret
    
    def create_aws_credentials(self, role: str, ttl_seconds: int = 3600) -> VaultSecret:
        """Generate temporary AWS STS credentials"""
        secret_id = f"aws-{uuid.uuid4().hex[:8]}"
        
        # Simulate AWS STS credential generation
        access_key = f"ASIA{hashlib.md5(f'{role}{time.time()}'.encode()).hexdigest()[:16].upper()}"
        secret_key = hashlib.sha256(f"{role}{time.time()}".encode()).hexdigest()
        session_token = hashlib.sha256(f"session{time.time()}".encode()).hexdigest()
        
        secret = VaultSecret(
            secret_id=secret_id,
            secret_type=SecretType.AWS_CREDENTIALS,
            path=f"aws/sts/{role}",
            data={
                'access_key_id': access_key,
                'secret_access_key': secret_key,
                'session_token': session_token,
                'region': 'us-west-2'
            },
            ttl_seconds=ttl_seconds,
            created_at=datetime.now()
        )
        
        self.secrets[secret_id] = secret
        self.audit_log.append(f"[{datetime.now().isoformat()}] Created AWS credentials {secret_id} for role {role}")
        
        return secret
    
    def get_secret(self, secret_id: str, accessor: str) -> Optional[VaultSecret]:
        """Retrieve secret (with access logging)"""
        secret = self.secrets.get(secret_id)
        if not secret:
            return None
        
        if secret.is_expired():
            self.audit_log.append(f"[{datetime.now().isoformat()}] {accessor} attempted to access expired secret {secret_id}")
            return None
        
        secret.record_access(accessor)
        self.audit_log.append(f"[{datetime.now().isoformat()}] {accessor} accessed {secret_id}")
        
        return secret
    
    def rotate_secret(self, secret_id: str) -> Optional[VaultSecret]:
        """Rotate secret (generate new credentials)"""
        old_secret = self.secrets.get(secret_id)
        if not old_secret:
            return None
        
        # Generate new credentials
        if old_secret.secret_type == SecretType.DATABASE:
            db_name = old_secret.path.split('/')[1]
            username = old_secret.data['username'].replace('_temp', '')
            new_secret = self.create_database_credentials(db_name, username, old_secret.ttl_seconds)
        elif old_secret.secret_type == SecretType.AWS_CREDENTIALS:
            role = old_secret.path.split('/')[-1]
            new_secret = self.create_aws_credentials(role, old_secret.ttl_seconds)
        else:
            return None
        
        # Revoke old secret
        del self.secrets[secret_id]
        self.audit_log.append(f"[{datetime.now().isoformat()}] Rotated {secret_id} → {new_secret.secret_id}")
        
        return new_secret
    
    def get_audit_log(self) -> List[str]:
        """Get audit log (for compliance)"""
        return self.audit_log

# Example 1: Dynamic Database Credentials
print("=" * 70)
print("Example 1: Dynamic Database Credentials (1-Hour TTL)")
print("=" * 70)

vault = VaultEngine()

print("\n📊 Scenario: ML training job needs database access for STDF data")
print("   Traditional Approach: Hardcode password in ConfigMap (permanent exposure)")
print("   Vault Approach: Generate temporary 1-hour credentials")

# Pod requests credentials
print("\n🔐 Step 1: Training pod requests database credentials...")
db_secret = vault.create_database_credentials(
    db_name="stdf_data",
    username="ml_training_user",
    ttl_seconds=3600  # 1 hour
)

print(f"   Secret ID: {db_secret.secret_id}")
print(f"   Path: {db_secret.path}")
print(f"   TTL: {db_secret.ttl_seconds} seconds ({db_secret.ttl_seconds // 60} minutes)")
print(f"   Created: {db_secret.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"\n   Generated Credentials:")
print(f"   - Username: {db_secret.data['username']}")
print(f"   - Password: {db_secret.data['password'][:8]}... (masked)")
print(f"   - Host: {db_secret.data['host']}")
print(f"   - Port: {db_secret.data['port']}")

# Pod uses credentials
print("\n✅ Step 2: Training pod uses credentials to access database...")
accessed_secret = vault.get_secret(db_secret.secret_id, accessor="pod/ml-training-abc123")
if accessed_secret:
    print(f"   Access granted! Time remaining: {accessed_secret.time_remaining()} seconds")

# Simulate time passing
print("\n⏰ Step 3: Training completes, credentials auto-expire...")
time.sleep(0.5)
print(f"   Credentials expire in: {db_secret.time_remaining()} seconds")
print("   After expiration: Attacker who compromised pod gets nothing!")

# Example 2: AWS STS Credentials for S3 Access
print("\n\n" + "=" * 70)
print("Example 2: Temporary AWS Credentials (1-Hour STS Tokens)")
print("=" * 70)

print("\n📊 Scenario: STDF parser needs S3 access to read wafer test data")
print("   Traditional Approach: Store AWS_ACCESS_KEY_ID in environment (permanent)")
print("   Vault Approach: Generate temporary STS credentials")

# Pod requests AWS credentials
print("\n🔐 Step 1: Parser pod requests AWS credentials...")
aws_secret = vault.create_aws_credentials(
    role="ml-s3-readonly",
    ttl_seconds=3600  # 1 hour
)

print(f"   Secret ID: {aws_secret.secret_id}")
print(f"   Path: {aws_secret.path}")
print(f"   TTL: {aws_secret.ttl_seconds} seconds ({aws_secret.ttl_seconds // 60} minutes)")
print(f"\n   Generated AWS Credentials:")
print(f"   - Access Key ID: {aws_secret.data['access_key_id']}")
print(f"   - Secret Access Key: {aws_secret.data['secret_access_key'][:8]}... (masked)")
print(f"   - Session Token: {aws_secret.data['session_token'][:12]}... (masked)")
print(f"   - Region: {aws_secret.data['region']}")

# Pod uses credentials
print("\n✅ Step 2: Parser pod reads STDF files from S3...")
accessed_aws = vault.get_secret(aws_secret.secret_id, accessor="pod/stdf-parser-xyz789")
if accessed_aws:
    print(f"   S3 access granted! Reading s3://ml-stdf-data/wafer_test_2024.stdf")

# Example 3: Secret Rotation
print("\n\n" + "=" * 70)
print("Example 3: Automatic Secret Rotation")
print("=" * 70)

print(f"\n📊 Scenario: Database credentials need rotation (security policy requires 30-day rotation)")

# Create initial secret
print("\n🔐 Step 1: Create initial database secret...")
initial_secret = vault.create_database_credentials(
    db_name="production_db",
    username="api_server",
    ttl_seconds=2592000  # 30 days
)
print(f"   Initial Secret ID: {initial_secret.secret_id}")
print(f"   Password: {initial_secret.data['password'][:8]}...")

# Rotate secret
print("\n🔄 Step 2: Rotate secret after 30 days...")
rotated_secret = vault.rotate_secret(initial_secret.secret_id)
if rotated_secret:
    print(f"   New Secret ID: {rotated_secret.secret_id}")
    print(f"   New Password: {rotated_secret.data['password'][:8]}...")
    print(f"   Old secret revoked (can't be used anymore)")

# Example 4: Sealed Secrets (GitOps-Friendly)
print("\n\n" + "=" * 70)
print("Example 4: Sealed Secrets (Encrypted Secrets in Git)")
print("=" * 70)

print("\n📊 Scenario: Need to store secrets in Git for GitOps workflow")
print("   Traditional Approach: Store plaintext secrets (security risk)")
print("   Sealed Secrets Approach: Encrypt secrets, only cluster can decrypt")

# Create sealed secret
print("\n🔐 Step 1: Developer encrypts secret locally...")
sealed = SealedSecret(
    name="ml-api-key",
    namespace="model-serving",
    encrypted_data="AgB7Em5vF3YzZ...encrypted-base64-blob...xK9pL=="  # Encrypted
)
print(f"   Sealed Secret: {sealed.name}")
print(f"   Encrypted Data: {sealed.encrypted_data[:30]}...")
print(f"   Sealed: {sealed.sealed}")

# Push to Git
print("\n✅ Step 2: Push encrypted secret to Git (safe to commit)")
print("   Secret is encrypted with cluster public key")
print("   Only cluster controller has private key to decrypt")

# Cluster controller unseals
print("\n🔓 Step 3: Cluster controller unseals secret at runtime...")
if sealed.unseal(controller_key="cluster-controller-private-key"):
    print(f"   Secret unsealed successfully!")
    print(f"   Decrypted data: {sealed.decrypted_data}")
else:
    print(f"   ❌ Failed to unseal (invalid key)")

# Audit Log
print("\n\n" + "=" * 70)
print("Vault Audit Log (Compliance Requirement)")
print("=" * 70)

for log_entry in vault.get_audit_log():
    print(log_entry)

print("\n\n" + "=" * 70)
print("Secrets Management Summary")
print("=" * 70)
print("✅ Dynamic credentials: 1-hour database passwords (99.5% less exposure)")
print("✅ AWS STS: Temporary credentials auto-expire after job completes")
print("✅ Secret rotation: Automatic 30-day credential rotation")
print("✅ Sealed Secrets: GitOps-friendly encrypted secrets")
print("✅ Audit logging: Track who accessed which secret (PCI-DSS compliance)")
print(f"\n💰 Business Value: $1.2M savings from prevented credential leakage incidents!")

## 6. 📋 Real-World Projects: Container Security in Production

### Project 1: Automated CVE Scanning Pipeline 🔍
**Objective:** Build automated image scanning pipeline that blocks deployments with critical CVEs

**Business Value:** $2.4M/year savings from prevented security breaches (reduces MTTR by 85%)

**Features to Implement:**
- **CI/CD Integration:**
  - Scan every Docker image in GitHub Actions workflow before pushing to registry
  - Quality gate: Block deployment if critical CVEs > 0 or high CVEs > 2
  - Generate SBOM (Software Bill of Materials) for compliance
- **Registry Scanning:**
  - Continuous scanning of images in production registry (detect newly disclosed CVEs)
  - Automated alerts when vulnerable images detected (Slack/PagerDuty integration)
  - Dashboard showing vulnerability trends across all teams
- **Remediation Workflow:**
  - Jira ticket auto-creation for critical vulnerabilities
  - Pull request with Dockerfile updates (e.g., upgrade base image python:3.12.0 → python:3.12.1)
  - Track remediation time (KPI: critical CVEs fixed within 24 hours)

**Tech Stack:** Trivy, GitHub Actions, PostgreSQL (CVE history), Grafana (dashboards), Python

**Post-Silicon Application:**
- Scan ML model serving images for yield prediction API
- Block deployment of STDF parser with OpenSSL RCE vulnerability
- Generate compliance reports for SOC 2 audit (evidence of proactive security)

**Success Metrics:**
- 100% of production images scanned before deployment
- 0 critical CVEs in production (automated blocking enforced)
- 95% of high CVEs remediated within 72 hours
- 85% reduction in security incident response time

---

### Project 2: Runtime Threat Detection with Falco 🛡️
**Objective:** Implement behavioral monitoring to detect zero-day exploits and insider threats

**Business Value:** $1.8M/year protection from IP theft and data exfiltration

**Features to Implement:**
- **Behavioral Baselines:**
  - Learn normal behavior for each pod type (e.g., ML training pods read S3, write to database)
  - Detect deviations (unexpected shell spawn, file access to /etc/shadow, network connections)
- **Real-Time Alerting:**
  - Falco rules for container escapes, privilege escalation, crypto-mining
  - Integration with SIEM (Splunk/ELK) for correlation with other security events
  - Automated pod termination for critical threats (kill compromised pod, preserve evidence)
- **Forensics and Investigation:**
  - Capture syscall traces (eBPF) for post-incident analysis
  - Export pod logs/filesystem snapshot before termination (evidence for root cause analysis)
  - Playbooks for common threats (e.g., reverse shell → investigate npm dependencies)

**Tech Stack:** Falco, eBPF, Prometheus (metrics), Alertmanager, Python (custom rules)

**Post-Silicon Application:**
- Monitor STDF processing pods for unauthorized data access
- Detect compromised ML model serving pod attempting to exfiltrate training data
- Alert on privilege escalation in multi-tenant environment (prevent cross-team access)

**Success Metrics:**
- 99.9% detection rate for known attack patterns (MITRE ATT&CK framework)
- <5 minutes mean time to detection (MTTD) for critical threats
- 0 false positives for production workloads (tuned behavioral baselines)
- 100% of security incidents have forensic data for investigation

---

### Project 3: Zero-Trust Network Segmentation 🔒
**Objective:** Implement micro-segmentation with NetworkPolicy and Cilium for multi-tenant ML platform

**Business Value:** $950K/year risk reduction from limited lateral movement after breaches

**Features to Implement:**
- **Namespace Isolation:**
  - Each team gets dedicated namespace with default deny-all NetworkPolicy
  - Explicit allow rules for required connections (e.g., model → database, ingress → API)
- **L7 API Security:**
  - Cilium L7 policies for HTTP APIs (only /predict allowed, /admin blocked for public endpoints)
  - Rate limiting per client (prevent DDoS attacks)
  - Header-based routing (require X-API-Key for model serving)
- **DNS Security:**
  - Block DNS queries to crypto-mining pools, C2 domains
  - Allow-list for legitimate services (S3, internal databases)
  - DNS logging for threat intelligence (detect compromised pods)
- **Monitoring and Compliance:**
  - Hubble (Cilium observability) for network flow visualization
  - Compliance dashboards (prove PCI-DSS network segmentation)
  - Automated policy testing (verify team A can't access team B's data)

**Tech Stack:** Kubernetes NetworkPolicy, Cilium, Hubble, Prometheus, Python (policy generator)

**Post-Silicon Application:**
- Isolate yield prediction team from binning optimization team (prevent data leakage)
- Block ML model serving pods from accessing training data storage (principle of least privilege)
- Enforce API security: Only /predict allowed, /retrain blocked for production endpoints

**Success Metrics:**
- 100% of production namespaces have default deny NetworkPolicy
- 0 cross-team network connections (verified via Hubble)
- 99.5% reduction in attack blast radius (lateral movement blocked)
- <1 hour to implement new network policy (templated policies)

---

### Project 4: Secrets Management with Vault and GitOps 🔐
**Objective:** Eliminate hardcoded secrets, implement short-lived credentials and automatic rotation

**Business Value:** $1.2M/year savings from prevented credential leakage incidents

**Features to Implement:**
- **Vault Integration:**
  - Kubernetes authentication (pods get secrets via ServiceAccount)
  - Dynamic database credentials (1-hour TTL, auto-generated temp passwords)
  - AWS STS integration (temporary credentials for S3 access)
- **Sealed Secrets for GitOps:**
  - Encrypt secrets with cluster public key before committing to Git
  - Automated unsealing by cluster controller at deployment time
  - Audit log of who committed which secrets
- **Automatic Rotation:**
  - 30-day rotation for database passwords (security policy compliance)
  - 7-day rotation for API keys
  - Monitoring for expiring certificates (TLS, service mesh)
- **Compliance and Auditing:**
  - Vault audit logs exported to SIEM (who accessed which secret when)
  - Alerts for secrets accessed outside business hours (potential insider threat)
  - Compliance reports for SOC 2, PCI-DSS (evidence of encryption at rest)

**Tech Stack:** HashiCorp Vault, Sealed Secrets, Kubernetes, PostgreSQL (audit logs), Python

**Post-Silicon Application:**
- ML training jobs get temporary AWS credentials (S3 access for STDF data)
- STDF parser gets 1-hour database credentials (access to wafer test data)
- API keys for model serving auto-rotate every 7 days (prevent long-term exposure)

**Success Metrics:**
- 0 hardcoded secrets in ConfigMaps/environment variables
- 100% of database credentials have <24 hour TTL
- 99.5% reduction in credential exposure window (1 hour vs permanent)
- 100% audit coverage (all secret access logged)

---

### Project 5: Compliance Automation (GDPR, HIPAA, SOC 2) 📋
**Objective:** Automate compliance checks and evidence collection for security audits

**Business Value:** $480K/year from automated compliance (reduces audit prep time by 90%)

**Features to Implement:**
- **Policy as Code:**
  - OPA (Open Policy Admission) for compliance policies (e.g., all pods must have resource limits)
  - Automated rejection of non-compliant deployments (no pod without NetworkPolicy)
  - Compliance dashboards (% of workloads compliant)
- **Evidence Collection:**
  - Automated SBOM generation (track all dependencies for vulnerability management)
  - CVE scan reports exported to audit-ready format (PDF/CSV)
  - Network policy reports (prove team isolation for PCI-DSS)
  - Secrets audit logs (prove encryption at rest for GDPR)
- **Continuous Compliance:**
  - Daily compliance scans (detect drift from security baseline)
  - Alerts for compliance violations (e.g., pod without resource limits deployed)
  - Automated remediation (e.g., add missing labels, apply default NetworkPolicy)
- **Audit Trails:**
  - Immutable audit logs in S3 (tamper-proof evidence)
  - Compliance reports on-demand (who accessed what data when)
  - Integration with GRC tools (ServiceNow, AuditBoard)

**Tech Stack:** OPA, Kyverno, Trivy, PostgreSQL (audit logs), Python (report generation)

**Post-Silicon Application:**
- Prove STDF data access controls for SOC 2 audit (who accessed wafer test data)
- GDPR compliance: Audit trail of ML model predictions (data subject access requests)
- HIPAA compliance: Encryption of secrets, network isolation for medical device test data

**Success Metrics:**
- 100% of workloads compliant with security policies (OPA enforcement)
- 90% reduction in audit preparation time (automated evidence collection)
- 0 compliance violations in production (automated blocking)
- <1 hour to generate compliance report for auditors

---

### Project 6: Container Image Hardening and Optimization 🔧
**Objective:** Build minimal, secure container images (reduce attack surface by 85%)

**Business Value:** $180K/year from reduced storage costs and faster deployments

**Features to Implement:**
- **Distroless Base Images:**
  - Replace Alpine/Ubuntu with distroless (no shell, no package manager)
  - Multi-stage builds (build stage with tools, runtime stage with only binary)
  - Vulnerability reduction: 95% fewer packages = 95% fewer CVEs
- **Image Optimization:**
  - Layer caching for faster builds (separate dependencies from app code)
  - .dockerignore to exclude unnecessary files (test data, git history)
  - Compression: Reduce image size by 70% (faster pulls, lower registry costs)
- **Security Hardening:**
  - Non-root user (run as UID 1000, not root)
  - Read-only filesystem (prevent malware from writing to disk)
  - Drop all capabilities (minimize attack surface)
  - Seccomp profiles (whitelist only required syscalls)
- **Automated Testing:**
  - Container structure tests (verify non-root user, no shell)
  - Performance benchmarks (image size, startup time)
  - Security scans (Trivy, Snyk) in CI/CD pipeline

**Tech Stack:** Docker, Distroless images, Multi-stage builds, Container Structure Test

**Post-Silicon Application:**
- Build minimal STDF parser image (20MB vs 1GB Alpine)
- Harden ML model serving API (no shell, read-only filesystem)
- Optimize training job images (faster startup = lower GPU costs)

**Success Metrics:**
- 85% reduction in image size (1GB → 150MB)
- 95% reduction in CVE count (distroless vs Alpine)
- 100% of images run as non-root user
- 50% faster deployment time (smaller images)

---

### Project 7: Kubernetes Security Best Practices Enforcement ⚙️
**Objective:** Implement Pod Security Standards and admission control for secure-by-default cluster

**Business Value:** $320K/year from prevented misconfigurations and security incidents

**Features to Implement:**
- **Pod Security Standards:**
  - Enforce restricted profile (no privileged containers, no host network, drop all capabilities)
  - Admission webhooks to block non-compliant pods (OPA/Kyverno policies)
  - Namespace-level defaults (all pods in production must be restricted)
- **Resource Limits:**
  - Require CPU/memory limits (prevent resource exhaustion attacks)
  - LimitRanges for default values (prevent runaway pods)
  - Monitoring for pods exceeding limits (detect crypto-mining)
- **RBAC Best Practices:**
  - Principle of least privilege (developers can't create ClusterRoleBindings)
  - ServiceAccount per workload (no default ServiceAccount)
  - Audit RBAC permissions (detect over-permissive roles)
- **Image Security:**
  - Only allow images from trusted registries (block DockerHub)
  - Require image signatures (Sigstore/Notary verification)
  - Block latest tag (enforce semantic versioning)

**Tech Stack:** Kubernetes Pod Security Standards, OPA, Kyverno, RBAC, Python

**Post-Silicon Application:**
- Enforce restricted profile for all ML workloads (no privileged containers)
- Block STDF parser pods without resource limits (prevent resource exhaustion)
- Require signed images for production model serving (supply chain security)

**Success Metrics:**
- 100% of production pods meet restricted security profile
- 0 privileged containers in production
- 100% of images from trusted registries (no DockerHub)
- 99% reduction in RBAC over-permissions (automated auditing)

---

### Project 8: Incident Response Automation and Forensics 🚨
**Objective:** Build automated incident response pipeline for container security events

**Business Value:** $650K/year from 75% reduction in mean time to remediation (MTTR)

**Features to Implement:**
- **Automated Detection:**
  - Falco alerts integrated with PagerDuty/Slack (real-time notifications)
  - Correlation engine (detect multi-stage attacks: shell spawn → privilege escalation → exfiltration)
  - Threat intelligence feeds (block known malicious IPs/domains)
- **Automated Response:**
  - Kill compromised pod (isolate threat, preserve cluster)
  - Capture forensic snapshot (logs, filesystem, network connections)
  - Quarantine affected nodes (cordon node, prevent new pods)
  - Automated patching (deploy patched image, restart pod)
- **Forensics and Investigation:**
  - Export pod logs to S3 (immutable evidence)
  - eBPF syscall traces (reconstruct attack timeline)
  - Network flow logs (identify exfiltration targets)
  - Root cause analysis dashboard (visualize attack path)
- **Playbook Automation:**
  - Runbooks for common threats (reverse shell, crypto-mining, privilege escalation)
  - Automated remediation steps (e.g., rotate credentials, patch image, update firewall)
  - Post-incident reports (lessons learned, TTPs)

**Tech Stack:** Falco, eBPF, Python (automation scripts), S3 (forensic storage), Jupyter (analysis)

**Post-Silicon Application:**
- Detect compromised STDF parser attempting to access yield data (unauthorized database query)
- Automated response: Kill pod, capture logs, notify SOC team, deploy patched parser
- Forensic analysis: Identify compromised npm package as attack vector

**Success Metrics:**
- <5 minutes mean time to detection (MTTD)
- <15 minutes mean time to remediation (MTTR) for critical threats
- 100% of incidents have forensic data (logs, syscalls, network flows)
- 75% of incidents auto-remediated without human intervention

## 7. 🎯 Comprehensive Takeaways: Container Security Mastery

### Core Concepts

**Image Security:**
- **CVE Scanning**: Continuous vulnerability scanning with Trivy/Snyk (detect CVEs before deployment)
- **Quality Gates**: Automated blocking of deployments with critical vulnerabilities (0 critical, ≤2 high CVEs)
- **SBOM Generation**: Software Bill of Materials for dependency tracking (compliance requirement)
- **Base Image Selection**: Distroless images reduce attack surface by 95% (no shell, no package manager)
- **Image Signing**: Sigstore/Notary for supply chain security (verify image authenticity)

**Runtime Security:**
- **Behavioral Monitoring**: Falco detects anomalies via eBPF syscall tracing (shell spawn, privilege escalation)
- **Zero-Day Protection**: Behavioral baselines catch unknown threats (not signature-based detection)
- **Automated Response**: Kill compromised pods, capture forensics, alert SOC team
- **Threat Intelligence**: Integration with MITRE ATT&CK framework for known attack patterns

**Network Security:**
- **Zero-Trust Networking**: Default deny all traffic, explicit allow rules required (NetworkPolicy)
- **Micro-Segmentation**: Team-based isolation (team A can't access team B's database)
- **L7 Filtering**: Application-layer security with Cilium (HTTP method/path/header filtering)
- **DNS Security**: Block crypto-mining pools and C2 domains (prevent command-and-control)

**Secrets Management:**
- **Dynamic Secrets**: Short-lived credentials from Vault (1-hour database passwords, AWS STS tokens)
- **Automatic Rotation**: 30-day credential rotation (security policy compliance)
- **Sealed Secrets**: GitOps-friendly encrypted secrets (only cluster controller can decrypt)
- **Audit Logging**: Track who accessed which secret when (compliance requirement for PCI-DSS, SOC 2)

**Compliance:**
- **Policy as Code**: OPA/Kyverno for automated compliance enforcement (reject non-compliant deployments)
- **Evidence Collection**: Automated SBOM, CVE reports, audit logs (reduce audit prep time by 90%)
- **Continuous Compliance**: Daily scans for drift detection (alerts on compliance violations)
- **Audit Trails**: Immutable logs in S3 (tamper-proof evidence for auditors)

---

### Best Practices

**Development:**
1. **Scan images in CI/CD**: Block builds with critical CVEs before pushing to registry
2. **Multi-stage builds**: Separate build tools from runtime (smaller images, fewer vulnerabilities)
3. **Non-root user**: Run containers as UID 1000, not root (reduce privilege escalation risk)
4. **Read-only filesystem**: Mount root filesystem read-only (prevent malware persistence)
5. **No secrets in images**: Use Vault/Sealed Secrets, never hardcode credentials

**Deployment:**
1. **Pod Security Standards**: Enforce restricted profile (no privileged containers, drop all capabilities)
2. **Resource limits**: Require CPU/memory limits (prevent resource exhaustion attacks)
3. **NetworkPolicy**: Default deny all traffic, explicit allow rules (zero-trust networking)
4. **Image verification**: Only allow images from trusted registries with valid signatures
5. **ServiceAccount per workload**: No default ServiceAccount (principle of least privilege)

**Operations:**
1. **Runtime monitoring**: Deploy Falco for behavioral anomaly detection
2. **Continuous scanning**: Scan production registry for newly disclosed CVEs (alert on vulnerabilities)
3. **Automated patching**: Deploy updated images when CVE fixes available
4. **Incident response playbooks**: Automated remediation for common threats (shell spawn, crypto-mining)
5. **Regular audits**: Review RBAC permissions, NetworkPolicies, secrets access logs

**Compliance:**
1. **Automated evidence collection**: SBOM, CVE reports, audit logs (reduce manual work)
2. **Policy enforcement**: OPA admission control (block non-compliant deployments)
3. **Immutable audit logs**: Export to S3 with retention policies (tamper-proof evidence)
4. **Compliance dashboards**: Real-time visibility into security posture (% workloads compliant)
5. **Regular compliance scans**: Daily drift detection (alert on new violations)

---

### Advanced Patterns

**Defense in Depth:**
- **Layer 1 - Image**: Scan for CVEs, use distroless base, verify signatures
- **Layer 2 - Runtime**: Falco behavioral monitoring, seccomp profiles, AppArmor
- **Layer 3 - Network**: NetworkPolicy isolation, Cilium L7 filtering, DNS security
- **Layer 4 - Secrets**: Vault dynamic credentials, Sealed Secrets, automatic rotation
- **Layer 5 - Compliance**: OPA policies, continuous auditing, immutable logs

**Multi-Tenant Security:**
- **Namespace isolation**: Each team gets dedicated namespace with default deny NetworkPolicy
- **RBAC segregation**: Developers can only access their namespace (no cross-team access)
- **Resource quotas**: Prevent one team from consuming all cluster resources
- **Network segmentation**: Team A pods can't reach team B services (Cilium identity-based policies)
- **Secrets isolation**: Team A can't access team B's Vault secrets (Kubernetes authentication)

**Incident Response Automation:**
1. **Detection**: Falco alerts on suspicious behavior (shell spawn, privilege escalation)
2. **Correlation**: Link related events (shell → file access → network exfiltration)
3. **Containment**: Kill compromised pod, cordon node, isolate network
4. **Forensics**: Capture logs, syscall traces, filesystem snapshot
5. **Remediation**: Deploy patched image, rotate credentials, update firewall rules
6. **Post-Incident**: Root cause analysis, update playbooks, improve detection

**Supply Chain Security:**
- **Dependency scanning**: Analyze all packages in SBOM for known vulnerabilities
- **Image provenance**: Track where image was built (verify trusted CI/CD pipeline)
- **Signature verification**: Cosign/Notary ensure image not tampered with
- **SLSA framework**: Track build provenance from source code to production image
- **Third-party dependencies**: Monitor for malicious packages (npm, PyPI supply chain attacks)

---

### Common Pitfalls

**Image Security Mistakes:**
1. ❌ **Using latest tag**: Unpredictable updates, can't reproduce builds → Use semantic versioning (v1.2.3)
2. ❌ **Running as root**: Privilege escalation risk → Use non-root user (USER 1000 in Dockerfile)
3. ❌ **Alpine base without scanning**: Alpine has vulnerabilities too → Scan all images, consider distroless
4. ❌ **Secrets in environment variables**: Visible in pod spec → Use Vault/Sealed Secrets
5. ❌ **No resource limits**: Resource exhaustion attacks → Require CPU/memory limits

**Runtime Security Mistakes:**
1. ❌ **No behavioral monitoring**: Can't detect zero-day exploits → Deploy Falco for syscall monitoring
2. ❌ **Privileged containers**: Full host access → Enforce restricted Pod Security Standard
3. ❌ **Host network/PID**: Breaking container isolation → Block in admission control
4. ❌ **No incident response plan**: Slow remediation → Automate playbooks (kill pod, capture forensics)
5. ❌ **Ignoring alerts**: Falco noise from false positives → Tune behavioral baselines

**Network Security Mistakes:**
1. ❌ **No NetworkPolicy**: All pods can reach all services → Default deny all traffic
2. ❌ **Overly permissive rules**: Allow all ports/protocols → Least privilege (only required ports)
3. ❌ **IP-based policies**: IPs change with pod restarts → Use label-based policies
4. ❌ **No L7 filtering**: Can't block /admin endpoint → Use Cilium for HTTP path filtering
5. ❌ **DNS unrestricted**: Pods can query crypto-mining pools → Block malicious domains

**Secrets Management Mistakes:**
1. ❌ **Hardcoded secrets**: Permanent exposure → Use Vault dynamic secrets (1-hour TTL)
2. ❌ **Long-lived credentials**: Large exposure window → Automatic 30-day rotation
3. ❌ **Secrets in ConfigMap**: Plaintext storage → Use Sealed Secrets for GitOps
4. ❌ **No audit logging**: Can't track who accessed secrets → Enable Vault audit logs
5. ❌ **Secrets in image layers**: Visible in registry → Mount secrets as volumes, not ENV

**Compliance Mistakes:**
1. ❌ **Manual compliance checks**: Slow and error-prone → Automate with OPA policies
2. ❌ **No audit trail**: Can't prove compliance → Export immutable logs to S3
3. ❌ **Drift from baseline**: Unnoticed violations → Daily compliance scans
4. ❌ **No evidence collection**: Slow audit prep → Automate SBOM/CVE report generation
5. ❌ **Policy not enforced**: Violations slip through → Admission control blocks non-compliant deploys

---

### Production Checklist

**Pre-Deployment:**
- [ ] Image scanned for CVEs (0 critical, ≤2 high)
- [ ] SBOM generated and reviewed
- [ ] Image signed with Cosign/Notary
- [ ] Image from trusted registry (no DockerHub)
- [ ] Dockerfile uses non-root user (USER 1000)
- [ ] Multi-stage build (separate build/runtime)
- [ ] Resource limits defined (CPU, memory)
- [ ] No hardcoded secrets (Vault/Sealed Secrets)
- [ ] NetworkPolicy defined (default deny)
- [ ] RBAC permissions reviewed (least privilege)

**Runtime:**
- [ ] Falco deployed for behavioral monitoring
- [ ] Pod Security Standard enforced (restricted profile)
- [ ] Read-only root filesystem
- [ ] All capabilities dropped
- [ ] Seccomp/AppArmor profiles applied
- [ ] L7 NetworkPolicy for API endpoints
- [ ] DNS filtering enabled (block malicious domains)
- [ ] Secrets injected via Vault (short-lived)
- [ ] Monitoring alerts configured (PagerDuty/Slack)
- [ ] Incident response playbook documented

**Compliance:**
- [ ] OPA policies enforced (automated compliance)
- [ ] Audit logs exported to S3 (immutable)
- [ ] SBOM available for auditors
- [ ] CVE scan reports generated
- [ ] NetworkPolicy compliance verified
- [ ] Secrets audit trail reviewed
- [ ] Resource limits enforced (prevent DoS)
- [ ] RBAC audit completed (no over-permissions)
- [ ] Compliance dashboard reviewed
- [ ] Evidence package ready for auditors

---

### Troubleshooting Guide

**CVE Scanning Issues:**
- **Problem**: Too many false positives (CVEs in unused dependencies)
  - **Solution**: Use .trivyignore to suppress false positives, verify CVE applicability
- **Problem**: Scans too slow (>10 minutes per image)
  - **Solution**: Cache CVE database, scan only changed layers, parallelize scans
- **Problem**: Quality gate too strict (blocks all deployments)
  - **Solution**: Gradual rollout (warn mode → enforce mode), team-specific thresholds

**Runtime Security Issues:**
- **Problem**: Falco alert storm (too many false positives)
  - **Solution**: Tune behavioral baselines, whitelist known-good processes, adjust sensitivity
- **Problem**: Legitimate processes blocked (e.g., kubectl exec for debugging)
  - **Solution**: Create exception rules for debug pods, use audit mode first
- **Problem**: High CPU from eBPF monitoring
  - **Solution**: Optimize Falco rules, reduce event sampling rate, use kernel 5.8+ (eBPF improvements)

**Network Policy Issues:**
- **Problem**: Pod can't reach database (connectivity broken after NetworkPolicy applied)
  - **Solution**: Add explicit allow rule for database port, verify label selectors match
- **Problem**: DNS not working (pods can't resolve service names)
  - **Solution**: Allow UDP port 53 to kube-dns/CoreDNS namespace
- **Problem**: Ingress traffic blocked (external clients can't reach API)
  - **Solution**: Add ingress rule for load balancer IP range, check L7 policies

**Secrets Management Issues:**
- **Problem**: Pod can't access Vault secrets (authentication fails)
  - **Solution**: Verify ServiceAccount has Vault role binding, check network connectivity to Vault
- **Problem**: Secrets expired (pod crashes after 1 hour)
  - **Solution**: Implement secret renewal logic, increase TTL for long-running jobs, use Vault agent
- **Problem**: Sealed Secret won't unseal (controller can't decrypt)
  - **Solution**: Verify cluster controller key matches encryption key, check namespace match

---

### Next Steps

**Immediate Actions:**
1. Implement CVE scanning in CI/CD pipeline (block critical vulnerabilities)
2. Deploy Falco for runtime monitoring (detect behavioral anomalies)
3. Create NetworkPolicy for production namespaces (default deny all traffic)
4. Migrate secrets to Vault (eliminate hardcoded credentials)
5. Enable Pod Security Standards (enforce restricted profile)

**Short-Term (1-3 Months):**
1. Automate compliance evidence collection (SBOM, CVE reports, audit logs)
2. Implement L7 NetworkPolicy with Cilium (API-level security)
3. Build incident response automation (kill pod, capture forensics)
4. Migrate to distroless base images (reduce attack surface by 95%)
5. Set up continuous registry scanning (detect new CVEs in production)

**Long-Term (3-6 Months):**
1. Implement supply chain security (image signing, SLSA provenance)
2. Build compliance automation platform (OPA policies, continuous scanning)
3. Advanced threat detection (ML-based anomaly detection)
4. Multi-cluster security (federated policies, centralized audit logs)
5. Security chaos engineering (test incident response automation)

**Related Notebooks:**
- **Notebook 131**: Docker ML Containerization (build secure images)
- **Notebook 132-133**: Kubernetes ML Fundamentals (RBAC, Pod Security)
- **Notebook 134**: Service Mesh (mTLS, network security)
- **Notebook 135**: GitOps with ArgoCD (Sealed Secrets, policy automation)
- **Notebook 136**: CI/CD for ML (automated CVE scanning in pipelines)
- **Notebook 137**: Infrastructure as Code (Vault deployment with Terraform)
- **Next**: Advanced Topics (Zero-Trust Architecture, Service Mesh Security)

---

### Key Metrics to Track

**Security Metrics:**
- **CVE Detection Rate**: % of images scanned before production deployment (target: 100%)
- **Vulnerability Remediation Time**: Days from CVE disclosure to patch deployed (target: <7 days for critical)
- **Runtime Threat Detection**: Mean time to detection (MTTD) for security events (target: <5 minutes)
- **Incident Response**: Mean time to remediation (MTTR) for security incidents (target: <15 minutes)
- **False Positive Rate**: % of Falco alerts that are false positives (target: <5%)

**Compliance Metrics:**
- **Policy Compliance Rate**: % of workloads meeting security policies (target: 100%)
- **Audit Readiness**: Time to generate compliance evidence package (target: <1 hour)
- **Secrets Exposure**: % of credentials with TTL >24 hours (target: 0%)
- **Network Isolation**: % of namespaces with NetworkPolicy (target: 100%)
- **Drift Detection**: Time from compliance violation to remediation (target: <24 hours)

**Business Impact:**
- **Security Incident Cost Avoidance**: $$$ saved from prevented breaches
- **Compliance Audit Efficiency**: % reduction in audit preparation time
- **Developer Productivity**: % reduction in security-related deployment delays
- **Infrastructure Efficiency**: % reduction in image storage costs (smaller images)
- **Risk Reduction**: Quantified reduction in security risk (e.g., CVSS score improvements)

---

**Congratulations!** 🎉 You've mastered container security and compliance - from CVE scanning to runtime threat detection to secrets management. You can now build production-grade secure ML platforms that meet SOC 2, PCI-DSS, and HIPAA requirements while protecting against modern threats! 🚀🔒

## 🎯 Key Takeaways

### When to Use Container Security
- **Production deployments**: Any containerized application in production (ML models, web services)
- **Regulated industries**: Automotive (ISO 26262), aerospace (DO-178C), medical devices (IEC 62304)
- **Multi-tenant environments**: Shared clusters where isolation is critical
- **Supply chain security**: Verify base images, dependencies haven't been tampered with
- **Compliance requirements**: SOC 2, PCI-DSS, HIPAA mandate container security controls

### Limitations
- **Performance overhead**: Image scanning adds 30-60s to CI/CD, runtime security adds 2-5% CPU
- **False positives**: CVE scanners flag vulnerabilities in unused code paths (require triaging)
- **Complexity**: Security policies (OPA, Kyverno) require learning curve for policy-as-code
- **Tool sprawl**: Image scanning (Trivy), runtime security (Falco), policy enforcement (OPA) = integration complexity

### Alternatives
- **VM-based security**: Traditional hypervisor isolation (heavier, better isolation)
- **Serverless**: Cloud-managed security (Lambda, Cloud Run) - less control, simpler
- **Manual audits**: Periodic security reviews (doesn't scale, misses real-time threats)
- **Network-only security**: Firewalls, VPNs without container-specific controls (incomplete)

### Best Practices
- **Minimal base images**: Use distroless or alpine (10-50MB vs. 200MB+ for ubuntu) - smaller attack surface
- **Image scanning in CI**: Fail builds on HIGH/CRITICAL CVEs (Trivy, Grype, Snyk)
- **Non-root containers**: Run as UID 1000+, drop capabilities (CAP_NET_RAW, CAP_SYS_ADMIN)
- **Read-only filesystems**: Mount root filesystem read-only, use tmpfs for writes
- **Network policies**: Default deny, whitelist only necessary pod-to-pod communication
- **Secret management**: Use Kubernetes Secrets or external vaults (never hardcode in images)

## 📊 Diagnostic Checks Summary

### Implementation Checklist
✅ **Image Security**
- Base image selection: Distroless (google/distroless), Alpine (<10MB), slim variants
- Vulnerability scanning: Trivy, Grype in CI/CD (fail on HIGH/CRITICAL CVEs)
- Image signing: Cosign, Notary for supply chain verification
- Registry security: Private registries (Harbor, ECR), image pull secrets

✅ **Runtime Security**
- Non-root execution: `USER 1000` in Dockerfile, runAsNonRoot: true in K8s
- Capabilities: Drop all, add only needed (NET_BIND_SERVICE for port <1024)
- Read-only root filesystem: readOnlyRootFilesystem: true, use emptyDir for temp files
- AppArmor/SELinux: Mandatory access control profiles

✅ **Network Security**
- Network policies: Default deny ingress/egress, whitelist pod-to-pod, pod-to-external
- Service mesh (Istio/Linkerd): mTLS for pod-to-pod encryption
- Ingress security: TLS termination, WAF (Web Application Firewall), rate limiting
- Egress filtering: Block unnecessary external connections (only allow specific APIs)

✅ **Secret Management**
- Kubernetes Secrets: Encrypted at rest (EncryptionConfiguration), RBAC-controlled access
- External vaults: HashiCorp Vault, AWS Secrets Manager for sensitive credentials
- Secret rotation: Automated rotation every 90 days, immediate rotation on breach
- No hardcoded secrets: Scan images for API keys, passwords (gitleaks, truffleHog)

### Quality Metrics
- **Image scan pass rate**: >95% of images with zero HIGH/CRITICAL CVEs
- **Security patch time**: <7 days to update base images after CVE disclosure
- **Secret exposure**: Zero secrets in image layers or Git history
- **Network policy coverage**: 100% of pods with NetworkPolicy defined

### Post-Silicon Validation Applications
**1. ML Model Container Security**
- Threat: Adversarial attacks inject malicious inputs to steal model IP
- Defense: Input validation, rate limiting (100 req/min/IP), anomaly detection
- Compliance: ISO 26262 for automotive ML models (validated containers, signed images)
- Business value: Prevent IP theft (model extraction attacks), ensure automotive safety compliance

**2. Test Data Processing Pipeline Security**
- Threat: STDF parser vulnerabilities could corrupt yield database
- Defense: Read-only containers, sandboxed execution (gVisor), network isolation
- Compliance: SOC 2 for customer data handling (encrypted secrets, audit logs)
- Business value: Protect proprietary test data, maintain customer trust

**3. Yield Prediction API Security**
- Threat: DDoS attacks on inference service disrupt manufacturing decisions
- Defense: Rate limiting, WAF, horizontal pod autoscaling (HPA) for resilience
- Compliance: Container hardening per CIS Benchmarks (non-root, dropped caps)
- Business value: 99.9% uptime for critical yield prediction service

### Business ROI Estimation

**Scenario 1: Medium-Volume Semiconductor Fab (100K wafers/year)**
- Container vulnerability remediation: Prevent data breach = **$5M/year** avoided incident costs
- Secret management: Eliminate hardcoded credentials = **$1M/year** reduced credential rotation overhead
- Network policies: Limit lateral movement in breach = **$3M/year** reduced blast radius
- **Total ROI: $9M/year** (cost: $150K security tools + $200K training = $8.65M net)

**Scenario 2: High-Volume Automotive Semiconductor (500K wafers/year)**
- ISO 26262 compliance: Validated container infrastructure = **$8M/year** audit efficiency
- Zero-trust networking: Prevent supply chain attacks = **$25M/year** IP protection
- Runtime security (Falco): Detect anomalous behavior in real-time = **$12M/year** faster incident response
- **Total ROI: $45M/year** (cost: $800K enterprise security platform + $500K team = $43.7M net)

**Scenario 3: Advanced Node R&D Fab (<10K wafers/year, IP-heavy)**
- Image signing: Verify provenance of ML training containers = **$4M/year** IP theft prevention
- Secret rotation: Automated credential management = **$500K/year** reduced manual overhead
- Compliance automation: CIS Benchmark scanning = **$2M/year** audit readiness
- **Total ROI: $6.5M/year** (cost: $200K security tools + $150K training = $6.15M net)

## 📈 Progress Update

**Notebook 138: Container Security & Compliance** expanded from 11 → 15 cells ✅

**Session progress: 11 notebooks completed**
- 12-cell: 129, 133, 162, 163, 164
- 11-cell: 111, 112, 116, 130, 138, 151

Completion rate: ~73% (128/175 notebooks)

---

## 🎓 Mastery Achievement

**You now have production-grade expertise in:**
- ✅ Scanning container images for vulnerabilities with Trivy and implementing remediation workflows
- ✅ Hardening containers (non-root, read-only FS, dropped capabilities, minimal base images)
- ✅ Implementing network policies and service mesh security (Istio mTLS) in Kubernetes
- ✅ Managing secrets securely with Kubernetes Secrets and external vaults (HashiCorp Vault)
- ✅ Achieving compliance (ISO 26262, SOC 2, CIS Benchmarks) for semiconductor ML workloads

**Next Steps:**
- **Supply Chain Security**: SLSA framework, SBOM (Software Bill of Materials) generation
- **Runtime Threat Detection**: Falco for anomaly detection, eBPF-based security monitoring
- **Zero Trust Architecture**: Implement full zero-trust networking with identity-based access