# Lab 16: Threat Actor Profiling

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab16_threat_actor_profiling.ipynb)

Profile threat actors and attribute incidents to them using machine learning and LLMs.

## Learning Objectives
- TTP extraction and encoding
- Campaign clustering for attribution
- Malware similarity analysis
- LLM-powered profile generation

In [None]:
!pip install pandas scikit-learn anthropic -q

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from typing import List, Dict
import json

## 1. TTP Extraction

In [None]:
# Sample incidents: each record carries an ID, the observed ATT&CK technique
# IDs, and the malware family seen in that incident.
INCIDENTS = [
    {
        "id": "INC001",
        "ttps": ["T1566.001", "T1059.001", "T1003.001", "T1071.001"],
        "malware": "Emotet",
    },
    {
        "id": "INC002",
        "ttps": ["T1566.001", "T1059.001", "T1003.001", "T1071.001"],
        "malware": "TrickBot",
    },
    {
        "id": "INC003",
        "ttps": ["T1190", "T1505.003", "T1059.003", "T1021.002"],
        "malware": "WebShell",
    },
    {
        "id": "INC004",
        "ttps": ["T1566.002", "T1204.002", "T1059.001", "T1003.001"],
        "malware": "QakBot",
    },
    {
        "id": "INC005",
        "ttps": ["T1190", "T1505.003", "T1059.003", "T1486"],
        "malware": "Ransomware",
    },
]

# Human-readable labels for every MITRE ATT&CK technique ID used above
TTP_DESCRIPTIONS = {
    "T1566.001": "Phishing: Spearphishing Attachment",
    "T1566.002": "Phishing: Spearphishing Link",
    "T1059.001": "Command and Scripting: PowerShell",
    "T1059.003": "Command and Scripting: Windows Command Shell",
    "T1003.001": "Credential Dumping: LSASS Memory",
    "T1071.001": "Application Layer Protocol: Web Protocols",
    "T1190": "Exploit Public-Facing Application",
    "T1505.003": "Server Software Component: Web Shell",
    "T1021.002": "Remote Services: SMB/Windows Admin Shares",
    "T1204.002": "User Execution: Malicious File",
    "T1486": "Data Encrypted for Impact",
}

# One summary line per incident: ID, malware family, number of TTPs
print("Sample Incidents:")
for incident in INCIDENTS:
    ttp_count = len(incident["ttps"])
    print(f"  {incident['id']}: {incident['malware']} - {ttp_count} TTPs")

## 2. TTP-Based Clustering

In [None]:
# Encode each incident's TTP list as a binary (multi-hot) feature vector:
# one column per distinct technique ID, 1 where the incident used it.
mlb = MultiLabelBinarizer()
ttp_matrix = mlb.fit_transform([inc['ttps'] for inc in INCIDENTS])

print(f"TTP Feature Matrix Shape: {ttp_matrix.shape}")
print(f"TTPs encoded: {list(mlb.classes_)}")

# Cluster incidents into two groups.
# n_init is set explicitly: scikit-learn changed its default (10 -> 'auto' in
# 1.4) and emits a FutureWarning in 1.2/1.3 when it is left unset. Pinning it
# keeps the number of restarts consistent across versions and silences the warning.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(ttp_matrix)

# Cluster labels are arbitrary integers; only co-membership is meaningful.
print("\nClustering Results:")
for inc, cluster_id in zip(INCIDENTS, clusters):
    print(f"  {inc['id']} ({inc['malware']}): Cluster {cluster_id}")

## 3. Actor Profile Database

In [None]:
# Reference profiles for known threat actors, keyed by actor name.
# Each profile lists the actor's typical ATT&CK techniques, associated
# malware families, motivation, and region of origin.
ACTOR_PROFILES = {
    "TA505": dict(
        common_ttps=["T1566.001", "T1059.001", "T1003.001", "T1071.001"],
        malware=["Emotet", "TrickBot", "Dridex"],
        motivation="Financial",
        origin="Eastern Europe",
    ),
    "APT28": dict(
        common_ttps=["T1566.002", "T1190", "T1505.003", "T1003.001"],
        malware=["X-Agent", "Zebrocy"],
        motivation="Espionage",
        origin="Russia",
    ),
}

def calculate_actor_similarity(incident_ttps: List[str], actor_ttps: List[str]) -> float:
    """Calculate Jaccard similarity between incident and actor TTPs.

    Both inputs are treated as sets, so duplicate technique IDs and ordering
    do not affect the score.

    Args:
        incident_ttps: Technique IDs observed in the incident.
        actor_ttps: Technique IDs associated with the actor profile.

    Returns:
        |intersection| / |union| as a float in [0.0, 1.0]. Returns 0.0 when
        both inputs are empty (the union is empty and the ratio is undefined).
    """
    incident_set, actor_set = set(incident_ttps), set(actor_ttps)
    union_size = len(incident_set | actor_set)
    if union_size == 0:
        # Nothing to compare; return a float so the return type is consistent.
        return 0.0
    return len(incident_set & actor_set) / union_size

# Score the first two incidents against every known actor profile
print("Actor Attribution Scores:")
for incident in INCIDENTS[:2]:
    print(f"\n{incident['id']} ({incident['malware']}):")
    observed_ttps = incident['ttps']
    for actor_name in ACTOR_PROFILES:
        similarity = calculate_actor_similarity(
            observed_ttps, ACTOR_PROFILES[actor_name]['common_ttps']
        )
        print(f"  {actor_name}: {similarity:.2%}")

## 4. LLM-Powered Profile Generation

In [None]:
from anthropic import Anthropic

def generate_actor_profile(
    incidents: List[Dict],
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 1500,
) -> str:
    """Use LLM to generate threat actor profile.

    Sends the incidents, together with the TTP_DESCRIPTIONS reference table,
    to the Anthropic Messages API and returns the model's free-text profile.

    Args:
        incidents: Incident records to analyze; must be JSON-serializable.
        model: Anthropic model identifier to query.
        max_tokens: Upper bound on the length of the generated response.

    Returns:
        The concatenated text of the model's response (an empty string if the
        response contains no text blocks).
    """
    # Constructed with no arguments; the SDK is expected to read the API key
    # from its environment (ANTHROPIC_API_KEY).
    client = Anthropic()

    # Assemble the prompt from unindented pieces so the embedded JSON is not
    # mixed with source-code indentation.
    prompt = (
        "Analyze these related security incidents and create a threat actor profile.\n"
        "\n"
        "INCIDENTS:\n"
        f"{json.dumps(incidents, indent=2)}\n"
        "\n"
        "TTP REFERENCE:\n"
        f"{json.dumps(TTP_DESCRIPTIONS, indent=2)}\n"
        "\n"
        "Create a profile including:\n"
        "1. Likely actor type (nation-state, criminal, hacktivist)\n"
        "2. Probable motivation\n"
        "3. Common attack patterns\n"
        "4. Recommended defenses\n"
        "5. Similar known actors (if any)\n"
    )

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )

    # Join every text block rather than indexing content[0], which would
    # raise IndexError on an empty response and drop any later text blocks.
    return "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )

# Uncomment to run with API key
# profile = generate_actor_profile(INCIDENTS[:2])
# print(profile)

## 5. Diamond Model Analysis

In [None]:
def create_diamond_model(incident: Dict, infrastructure: List[str], victim_info: Dict) -> Dict:
    """Create Diamond Model representation of an intrusion.

    The result has the four Diamond Model vertices as top-level keys:
    "adversary", "capability", "infrastructure", and "victim". Fields that
    cannot be derived from the inputs are left as None, an empty list, or
    "unknown" for later enrichment.

    The returned structure holds copies of the input containers (shallow for
    ``victim_info``), so editing the model does not modify the caller's
    incident record, infrastructure list, or victim dict.

    Args:
        incident: Incident record; must contain "ttps", may contain "malware".
        infrastructure: Indicators stored under "c2_servers".
        victim_info: Victim attributes stored under "victim".

    Returns:
        A nested dict describing the intrusion.

    Raises:
        KeyError: If ``incident`` has no "ttps" entry.
    """
    ttps = incident['ttps']
    return {
        "adversary": {
            "known_actor": None,  # To be determined
            "motivation": None,
            # Independent copy: must not alias "techniques" or the input list.
            "capability_ttps": list(ttps)
        },
        "capability": {
            "malware": incident.get('malware'),
            "tools": [],
            "techniques": list(ttps)
        },
        "infrastructure": {
            "c2_servers": list(infrastructure),
            "domains": [],
            "type": "unknown"
        },
        "victim": dict(victim_info)
    }

# Build the Diamond Model for the first sample incident and display it as JSON
diamond = create_diamond_model(
    INCIDENTS[0],
    victim_info={"sector": "Finance", "region": "North America"},
    infrastructure=["45.33.32.156", "evil-c2.com"],
)

print("Diamond Model:", json.dumps(diamond, indent=2), sep="\n")

## Key Takeaways

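1. **TTP Encoding**: Convert ATT&CK technique IDs into binary feature vectors for ML
2. **Clustering**: Group incidents with similar TTPs to support attribution
3. **Similarity Matching**: Score incidents against known actor profiles (Jaccard similarity)
4. **Diamond Model**: Structure intrusion data around adversary, capability, infrastructure, and victim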

## Next Steps
- **Lab 17**: Adversarial Machine Learning
- **Lab 18**: Fine-Tuning for Security