# Lab 04: LLM-Powered Log Analysis

Use Large Language Models to analyze security logs and extract insights.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab04_llm_log_analysis.ipynb)

## Learning Objectives
- Log parsing and normalization
- Using LLMs for log interpretation
- Pattern extraction and summarization
- Anomaly explanation generation

In [None]:
# Install dependencies (uncomment for Colab)
# !pip install anthropic pandas matplotlib

In [None]:
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Optional

# Set your API key
# os.environ['ANTHROPIC_API_KEY'] = 'your-api-key-here'

## 1. Log Data Structures

In [None]:
@dataclass
class LogEntry:
    """Parsed log entry.

    One structured record per log line, as produced by ``LogParser``.
    All fields are plain strings; nothing is converted to richer types.
    """
    # Timestamp text exactly as captured from the log line (not a datetime).
    timestamp: str
    # Token identifying where the event came from (e.g. AUTH, FIREWALL).
    source: str
    # Event category; for standard-format lines the parser reuses the
    # source token here.
    event_type: str
    # Severity label chosen by keyword matching on the message
    # ('critical', 'high', 'medium', 'low', or the fallback 'info').
    severity: str
    # Free-text remainder of the line after the source token.
    message: str
    # The log line this entry was parsed from.
    raw: str

# Sample security logs: one event per line, joined into a single text blob
# (no leading or trailing newline).
SAMPLE_LOGS = "\n".join((
    "2024-01-15 09:15:23 AUTH Failed login attempt for user admin from 192.168.1.100",
    "2024-01-15 09:15:24 AUTH Failed login attempt for user admin from 192.168.1.100",
    "2024-01-15 09:15:25 AUTH Failed login attempt for user admin from 192.168.1.100",
    "2024-01-15 09:15:26 AUTH Account locked: admin after 3 failed attempts",
    "2024-01-15 09:17:00 FIREWALL Blocked connection from 10.0.0.50 to 203.0.113.5:4444",
    "2024-01-15 09:17:01 FIREWALL Blocked connection from 10.0.0.50 to 203.0.113.5:4444",
    "2024-01-15 09:18:00 PROCESS Suspicious process spawned: powershell.exe -enc SGVsbG8gV29ybGQ=",
    "2024-01-15 09:18:05 NETWORK Unusual DNS query: malware-c2.evil.com",
    "2024-01-15 09:19:00 FILE New executable created: C:\\Users\\Admin\\Downloads\\update.exe",
    "2024-01-15 09:20:00 PROCESS Process injection detected: explorer.exe -> svchost.exe",
))

## 2. Log Parser

In [None]:
class LogParser:
    """Parse raw log text into ``LogEntry`` records.

    Two line layouts are recognised, tried in this order:

    * ``standard`` -- ``YYYY-MM-DD HH:MM:SS SOURCE message``
    * ``syslog``   -- ``Mon DD HH:MM:SS HOST TAG: message``

    Lines that match neither layout are skipped.
    """

    PATTERNS = {
        'standard': r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (.+)',
        'syslog': r'(\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2}) (\S+) (\S+): (.+)'
    }

    # Checked in insertion order, so the most severe matching level wins.
    SEVERITY_KEYWORDS = {
        'critical': ['injection', 'ransomware', 'exfiltration'],
        'high': ['failed', 'blocked', 'suspicious', 'malware'],
        'medium': ['unusual', 'warning', 'locked'],
        'low': ['info', 'success', 'allowed']
    }

    def parse(self, raw_logs: str) -> List[LogEntry]:
        """Parse a block of newline-separated log lines.

        Args:
            raw_logs: Log text, one event per line.

        Returns:
            One ``LogEntry`` per recognised line, in input order. Blank and
            unrecognised lines are omitted.
        """
        entries = []
        for line in raw_logs.split('\n'):
            # Strip per line so CRLF input does not leave a trailing '\r' in
            # the message, and so indented lines are still recognised.
            line = line.strip()
            if not line:
                continue
            entry = self._parse_line(line)
            if entry is not None:
                entries.append(entry)
        return entries

    def _parse_line(self, line: str) -> Optional[LogEntry]:
        """Parse a single log line.

        Args:
            line: One log line without its line terminator.

        Returns:
            The parsed ``LogEntry``, or ``None`` when the line matches none of
            the known layouts.
        """
        match = re.match(self.PATTERNS['standard'], line)
        if match:
            timestamp, source, message = match.groups()
            # The standard layout has no separate event field.
            event_type = source
        else:
            match = re.match(self.PATTERNS['syslog'], line)
            if not match:
                return None
            # Syslog lines carry host and tag separately: the host becomes
            # the source and the tag becomes the event type.
            timestamp, source, event_type, message = match.groups()

        return LogEntry(
            timestamp=timestamp,
            source=source,
            event_type=event_type,
            severity=self._classify_severity(message),
            message=message,
            raw=line
        )

    def _classify_severity(self, message: str) -> str:
        """Return the first severity level with a keyword in ``message``.

        Matching is a case-insensitive substring test. ``'info'`` is returned
        when no keyword matches.
        """
        message_lower = message.lower()
        for severity, keywords in self.SEVERITY_KEYWORDS.items():
            if any(kw in message_lower for kw in keywords):
                return severity
        return 'info'

# Run the parser over the sample data and preview the first three results
parser = LogParser()
entries = parser.parse(SAMPLE_LOGS)

total_parsed = len(entries)
print(f"Parsed {total_parsed} log entries")
for entry in entries[:3]:
    severity_tag = entry.severity.upper()
    preview = entry.message[:50]
    print(f"  [{severity_tag}] {entry.source}: {preview}...")

## 3. LLM Log Analyzer

In [None]:
class LLMLogAnalyzer:
    """Use LLM to analyze security logs.

    If the Anthropic client cannot be imported or constructed, the analyzer
    falls back to a canned mock report so the lab still runs.
    """

    def __init__(self, model: str = "claude-sonnet-4-20250514", max_tokens: int = 1024) -> None:
        """Create the analyzer and try to set up the Anthropic client.

        Args:
            model: Model identifier passed to the Messages API.
            max_tokens: Upper bound on tokens generated per analysis.
        """
        self.model = model
        self.max_tokens = max_tokens
        try:
            from anthropic import Anthropic
            self.client = Anthropic()
            self.available = True
        except Exception:
            # Covers a missing package (ImportError) and client construction
            # failures, without swallowing KeyboardInterrupt or SystemExit
            # the way a bare ``except:`` would.
            self.client = None
            self.available = False
            print("Note: Anthropic client not available. Using mock responses.")

    def analyze_logs(self, entries: List[LogEntry]) -> Dict:
        """Analyze log entries and generate insights.

        Args:
            entries: Parsed log entries; their ``raw`` lines are sent to the
                model.

        Returns:
            A dict with a single ``"analysis"`` key holding the report text.
            This is the model's reply when the client is available, and the
            mock report otherwise.

        Raises:
            Errors raised by the API call are not caught here.
        """
        logs_text = "\n".join(e.raw for e in entries)

        prompt = f"""Analyze these security logs and provide:
1. Summary of events
2. Potential security incidents detected
3. Attack timeline if applicable
4. Recommended actions

LOGS:
{logs_text}

Provide a structured analysis."""

        if not self.available:
            return self._mock_analysis(entries)

        response = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )
        return {"analysis": response.content[0].text}

    def _mock_analysis(self, entries: List[LogEntry]) -> Dict:
        """Mock analysis for demo purposes.

        The report is a fixed text written for the lab's sample logs;
        ``entries`` is accepted for signature parity but not inspected.
        """
        return {
            "analysis": """
## Security Log Analysis

### Summary
The logs show a potential attack sequence spanning approximately 5 minutes.

### Incidents Detected
1. **Brute Force Attack** (09:15:23 - 09:15:26)
   - 3 failed login attempts for 'admin' from 192.168.1.100
   - Account was locked after threshold reached

2. **C2 Communication Attempt** (09:17:00 - 09:18:05)
   - Blocked outbound connection to 203.0.113.5:4444
   - DNS query to known malware domain: malware-c2.evil.com

3. **Malware Execution** (09:18:00 - 09:20:00)
   - Encoded PowerShell execution
   - New executable dropped
   - Process injection detected

### Recommended Actions
1. Isolate affected hosts (10.0.0.50, workstation with admin user)
2. Block C2 IP 203.0.113.5 at perimeter
3. Analyze dropped executable update.exe
4. Reset admin credentials after investigation
5. Check for lateral movement
"""
        }

# Run the analyzer over the parsed entries and display its report
analyzer = LLMLogAnalyzer()
result = analyzer.analyze_logs(entries)
report_text = result['analysis']
print(report_text)

## 4. Pattern Detection

In [None]:
class PatternDetector:
    """Detect common attack patterns in logs.

    Each pattern in ``ATTACK_PATTERNS`` may define:

    * ``keywords`` -- case-insensitive substrings to look for in a message
    * ``ports`` -- destination ports; a ``host:port`` pair in the message
      that uses one of them also counts as a match
    * ``threshold`` -- minimum number of matches before the pattern is
      reported (defaults to 1)
    * ``window_seconds`` -- when set, ``threshold`` matches must fall within
      this many seconds of each other
    """

    ATTACK_PATTERNS = {
        'brute_force': {
            'keywords': ['failed login', 'authentication failed'],
            'threshold': 3,
            'window_seconds': 60
        },
        'c2_communication': {
            'keywords': ['blocked connection', 'suspicious dns', 'malware'],
            'ports': [4444, 8888, 31337, 6667]
        },
        'lateral_movement': {
            'keywords': ['psexec', 'wmi', 'remote', 'injection']
        },
        'data_exfiltration': {
            'keywords': ['large transfer', 'upload', 'exfil']
        }
    }

    # Timestamp layout used when evaluating ``window_seconds``.
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Matches ``host:port`` where the host contains at least one dot, so
    # clock times such as ``09:15:23`` are not mistaken for ports.
    _HOST_PORT_RE = re.compile(r'[a-z0-9-]+(?:\.[a-z0-9-]+)+:(\d{1,5})(?!\d)')

    def detect(self, entries: List[LogEntry]) -> List[Dict]:
        """Run every configured pattern against ``entries``.

        Returns:
            One dict per pattern that fired, with keys ``pattern`` (name),
            ``count`` (number of matching entries) and ``entries`` (at most
            the first five matching entries).
        """
        detections = []

        for pattern_name, config in self.ATTACK_PATTERNS.items():
            matches = self._find_matches(entries, config)
            if matches:
                detections.append({
                    'pattern': pattern_name,
                    'count': len(matches),
                    'entries': matches[:5]  # First 5 matches
                })

        return detections

    def _find_matches(self, entries: List[LogEntry], config: Dict) -> List[LogEntry]:
        """Return the entries matching ``config``.

        An entry matches when its message contains a keyword or mentions one
        of the configured ports. The result is empty when the pattern's
        ``threshold`` / ``window_seconds`` requirement is not met.
        """
        keywords = config.get('keywords', [])
        ports = set(config.get('ports', []))

        matches = []
        for entry in entries:
            message_lower = entry.message.lower()
            keyword_hit = any(kw in message_lower for kw in keywords)
            if keyword_hit or self._mentions_port(message_lower, ports):
                matches.append(entry)

        threshold = config.get('threshold', 1)
        if not matches or len(matches) < threshold:
            return []

        window_seconds = config.get('window_seconds')
        if window_seconds is not None and not self._has_burst(matches, threshold, window_seconds):
            return []
        return matches

    def _mentions_port(self, message_lower: str, ports: set) -> bool:
        """Return True if a ``host:port`` pair in the message uses one of ``ports``."""
        if not ports:
            return False
        return any(int(port) in ports for port in self._HOST_PORT_RE.findall(message_lower))

    def _has_burst(self, matches: List[LogEntry], threshold: int, window_seconds: float) -> bool:
        """Return True if ``threshold`` matches occur within ``window_seconds``.

        If any timestamp does not follow ``TIMESTAMP_FORMAT`` the window
        cannot be evaluated, so only the count threshold applies and True is
        returned.
        """
        try:
            times = sorted(
                datetime.strptime(entry.timestamp, self.TIMESTAMP_FORMAT)
                for entry in matches
            )
        except (TypeError, ValueError):
            return True

        # Compare each match with the one (threshold - 1) positions later: if
        # that pair fits in the window, so do all the matches in between.
        span = max(threshold - 1, 0)
        return any(
            (later - earlier).total_seconds() <= window_seconds
            for earlier, later in zip(times, times[span:])
        )

# Run the rule-based detector and print a short report per pattern that fired
detector = PatternDetector()
patterns = detector.detect(entries)

print("\nDetected Attack Patterns:")
print("=" * 40)
for p in patterns:
    heading = p['pattern'].upper()
    event_count = p['count']
    print(f"\n{heading} ({event_count} events)")
    for entry in p['entries']:
        snippet = entry.message[:60]
        print(f"  - [{entry.timestamp}] {snippet}...")

## 5. Log Statistics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert to DataFrame for analysis. Columns are named explicitly so the
# frame keeps its schema even when there are no entries.
df = pd.DataFrame(
    [
        {
            'timestamp': e.timestamp,
            'source': e.source,
            'severity': e.severity,
            'message': e.message,
        }
        for e in entries
    ],
    columns=['timestamp', 'source', 'severity', 'message'],
)

if df.empty:
    # Nothing to count; plotting an empty frame would only raise.
    print("No log entries to visualize.")
else:
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Events by source
    df['source'].value_counts().plot(kind='bar', ax=axes[0], color='steelblue')
    axes[0].set_title('Events by Source')
    axes[0].set_ylabel('Count')

    # Events by severity, shown from most to least severe rather than by
    # frequency. Labels outside severity_order are kept and placed last.
    severity_order = ['critical', 'high', 'medium', 'low', 'info']
    severity_colors = {'critical': 'red', 'high': 'orange', 'medium': 'yellow', 'low': 'green', 'info': 'blue'}
    severity_counts = df['severity'].value_counts()
    ordered_levels = [s for s in severity_order if s in severity_counts.index]
    ordered_levels += [s for s in severity_counts.index if s not in severity_order]
    severity_counts = severity_counts.reindex(ordered_levels)
    colors = [severity_colors.get(s, 'gray') for s in severity_counts.index]
    severity_counts.plot(kind='bar', ax=axes[1], color=colors)
    axes[1].set_title('Events by Severity')
    axes[1].set_ylabel('Count')

    plt.tight_layout()
    plt.show()

## Summary

In this lab, we built an LLM-powered log analysis system:

1. **Log Parsing** - Extracted structured data from raw logs
2. **LLM Analysis** - Used Claude to generate security insights
3. **Pattern Detection** - Identified common attack patterns
4. **Visualization** - Plotted event counts by source and by severity

### Key Takeaways:
- LLMs excel at summarizing and correlating log events
- Combine rule-based detection with LLM analysis
- Structured prompts yield better results
- Pre-filter logs to reduce token usage

### Next Steps:
1. Add real-time log streaming
2. Build query interface for log search
3. Create automated alerting pipeline