# Lab 07: Advanced AI YARA Rule Generator

Use AI to generate robust YARA rules from malware samples, behavioral patterns, and threat intelligence.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab07_yara_generator.ipynb)

## Learning Objectives
- YARA rule syntax, conditions, and modules (PE, Math, Hash)
- Multi-family malware signature extraction
- Obfuscation-resistant pattern detection
- String encoding detection (Base64, XOR, ROT13, custom)
- PE structure analysis for robust signatures
- LLM-assisted rule generation and optimization
- Rule validation and false positive testing
- MITRE ATT&CK integration

## Malware Families Covered

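Rules can be generated for modern threat families such as the following (the signature database defined below covers a representative subset):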
- **Ransomware**: LockBit, BlackCat/ALPHV, Conti, Royal, Play
- **Loaders**: Bumblebee, IcedID, QakBot, GuLoader, SocGholish
- **Stealers**: RedLine, Raccoon, Vidar, LummaC2
- **RATs**: Remcos, AsyncRAT, njRAT, Quasar, DarkComet
- **APT Tools**: Cobalt Strike, Sliver, Havoc, Brute Ratel

In [None]:
# !pip install anthropic yara-python

In [None]:
import re
from typing import List, Dict
from dataclasses import dataclass

## 1. YARA Rule Structure

In [None]:
# Comprehensive malware signature database for YARA rule generation
#
# Layout: one entry per malware family, keyed by a lowercase identifier.
# Every entry carries:
#   "family"             - display name of the family
#   "category"           - one of "ransomware", "loader", "stealer", "rat", "apt_tool"
#   "strings"            - indicator strings grouped by kind ("ascii", "wide", "hex",
#                          "base64"); NOTE that some families also store non-list
#                          values here ("xor_key", "encrypted_strings",
#                          "watermark_offset", "go_build"), so consumers must not
#                          assume every value under "strings" is a list of strings
#   "pe_characteristics" - PE-level traits (imports, sections, flags, ...); the set of
#                          keys differs per family
#   "mitre"              - list of MITRE ATT&CK technique IDs
# Individual families add extra keys (e.g. "mutex_patterns", "config_extraction",
# "c2_patterns", "beacon_config") that are not present on every entry.
#
# NOTE(review): literals written with "\\\\" evaluate to TWO backslash characters at
# runtime - presumably pre-escaped for embedding into YARA/regex text; confirm against
# whatever code consumes these values.

MALWARE_SIGNATURES = {
    # Ransomware families
    "lockbit": {
        "family": "LockBit",
        "category": "ransomware",
        "strings": {
            "ascii": [
                "restore-my-files.txt",
                "lockbit",
                "Your data are stolen and encrypted",
                ".lockbit",
                "LockBit 3.0",
            ],
            "wide": [
                "vssadmin delete shadows",
                "bcdedit /set {default} recoveryenabled no",
            ],
            "hex": [
                "4C 6F 63 6B 42 69 74",  # "LockBit"
                "2E 6C 6F 63 6B 62 69 74",  # ".lockbit"
            ],
        },
        "pe_characteristics": {
            "imports": ["CryptEncrypt", "CryptDecrypt", "FindFirstFileW", "MoveFileExW"],
            "sections": [".text", ".rdata", ".data"],
            "entropy_threshold": 7.0,
        },
        # Regex-style patterns (note the trailing ".*"); see the backslash note above
        "mutex_patterns": ["Global\\\\LockBit.*", "Local\\\\LockBit.*"],
        "mitre": ["T1486", "T1490", "T1083"],
    },
    
    "blackcat": {
        "family": "BlackCat/ALPHV",
        "category": "ransomware",
        "strings": {
            "ascii": [
                "RECOVER-",
                "alphv",
                "ALPHV",
                "--access-token",
                "--paths",
            ],
            "wide": [
                "bcrypt.dll",
                "BCryptEncrypt",
            ],
            "hex": [
                "41 4C 50 48 56",  # "ALPHV"
            ],
        },
        "pe_characteristics": {
            "imports": ["BCryptEncrypt", "BCryptDecrypt", "NtQuerySystemInformation"],
            "compiler": "Rust",  # Written in Rust
            "sections": [".text", ".rdata", ".data", ".pdata"],
        },
        "mitre": ["T1486", "T1490", "T1082"],
    },
    
    # Loaders
    "bumblebee": {
        "family": "Bumblebee",
        "category": "loader",
        "strings": {
            "ascii": [
                "bumblebee",
                "/gate.php",
                "POST /",
            ],
            "wide": [
                "wab.exe",
                "odbcconf.exe",
            ],
            "base64": [
                "aHR0cHM6Ly8",  # "https://" base64 prefix
            ],
        },
        "pe_characteristics": {
            "imports": ["VirtualAlloc", "VirtualProtect", "CreateThread", "LoadLibraryA"],
            "is_dll": True,
            "exports": ["DllRegisterServer", "SetPath"],
        },
        "mitre": ["T1059.007", "T1218.011", "T1055"],
    },
    
    "qakbot": {
        "family": "QakBot/QBot",
        "category": "loader",
        "strings": {
            "ascii": [
                "/t5",
                "stager",
                "spx",
            ],
            # Non-list values: an int key and a bool flag live alongside the string lists
            "xor_key": 0x5A,  # Common XOR key
            "encrypted_strings": True,
        },
        "pe_characteristics": {
            "imports": ["CreateProcessInternalW", "NtCreateThreadEx", "RtlCreateUserThread"],
            "packed": True,
            "sections": [".text", ".rdata", "CODE"],
        },
        "config_extraction": {
            "c2_offset": 0x1000,
            "encryption": "RC4",
        },
        "mitre": ["T1055.012", "T1027", "T1140"],
    },
    
    # Stealers
    "redline": {
        "family": "RedLine Stealer",
        "category": "stealer",
        "strings": {
            "ascii": [
                "RedLine",
                # Path fragments; see the backslash note at the top of this table
                "Yandex\\\\YaAddon",
                "\\\\Login Data",
                "\\\\Cookies",
                "\\\\Web Data",
                "wallet.dat",
                "Electrum",
                "Exodus",
            ],
            "wide": [
                "SELECT * FROM Win32_Processor",
                "System.Management",
            ],
        },
        "pe_characteristics": {
            "dotnet": True,
            "imports": ["mscoree.dll"],
            "resources": ["Costura."],
        },
        "browser_targets": ["Chrome", "Firefox", "Edge", "Opera", "Brave"],
        "crypto_wallets": ["Bitcoin", "Ethereum", "Monero", "Electrum", "Exodus"],
        "mitre": ["T1555.003", "T1539", "T1005"],
    },
    
    "raccoon": {
        "family": "Raccoon Stealer",
        "category": "stealer",
        "strings": {
            "ascii": [
                "machineId=",
                "configId=",
                "Raccoon",
                "ews_",
                "grbrs_",
                "scrnsht_",
            ],
            "hex": [
                "52 61 63 63 6F 6F 6E",  # "Raccoon"
            ],
        },
        "pe_characteristics": {
            "imports": ["InternetOpenUrlA", "HttpSendRequestA", "CryptUnprotectData"],
            "sections": [".text", ".rdata", ".data", ".rsrc"],
        },
        "c2_patterns": ["/gate/", "/ews/", "/grbrs/"],
        "mitre": ["T1555.003", "T1539", "T1113"],
    },
    
    # RATs
    "remcos": {
        "family": "Remcos RAT",
        "category": "rat",
        "strings": {
            "ascii": [
                "Remcos",
                "Breaking-Security",
                "licence",
                "keylog",
                "screenshot",
                "webcam",
            ],
            "wide": [
                "Software\\\\Remcos",
                "exepath",
            ],
            "hex": [
                "52 65 6D 63 6F 73",  # "Remcos"
            ],
        },
        "pe_characteristics": {
            "imports": ["GetAsyncKeyState", "SetWindowsHookExA", "GetClipboardData"],
            "resources": ["SETTINGS", "RCDATA"],
            "entropy_threshold": 6.5,
        },
        "config_location": "RCDATA resource",
        "mitre": ["T1056.001", "T1113", "T1125"],
    },
    
    "asyncrat": {
        "family": "AsyncRAT",
        "category": "rat",
        "strings": {
            "ascii": [
                "AsyncRAT",
                "ABORIBG",
                "Pastebin=",
                "Anti=",
                "BSOD=",
            ],
            "wide": [
                "TcpClient",
                "SslStream",
                "GetExecutingAssembly",
            ],
        },
        "pe_characteristics": {
            "dotnet": True,
            "imports": ["mscoree.dll"],
            "obfuscators": ["ConfuserEx", "SmartAssembly"],
        },
        "mitre": ["T1056.001", "T1113", "T1219"],
    },
    
    # APT Tools
    "cobalt_strike": {
        "family": "Cobalt Strike",
        "category": "apt_tool",
        "strings": {
            "ascii": [
                "%s (admin)",
                "beacon",
                "ReflectiveLoader",
                "cobaltstrike",
            ],
            "hex": [
                "FC 48 83 E4 F0 E8",  # Shellcode prologue
                "4D 5A 41 52 55 48 89 E5",  # MZ header variant
            ],
            # Non-list value stored next to the string lists
            "watermark_offset": 0x1000,
        },
        "pe_characteristics": {
            "imports": ["VirtualAlloc", "CreateThread", "RtlMoveMemory"],
            "reflective": True,
            "malleable_c2": True,
        },
        "beacon_config": {
            "sleep_mask": True,
            "process_injection": ["CreateRemoteThread", "NtMapViewOfSection"],
        },
        "mitre": ["T1055.001", "T1071.001", "T1059.003"],
    },
    
    "sliver": {
        "family": "Sliver C2",
        "category": "apt_tool",
        "strings": {
            "ascii": [
                "sliver",
                "--mtls",
                "--dns",
                "implant",
            ],
            "go_build": True,  # Written in Go
        },
        "pe_characteristics": {
            "imports": ["GetProcAddress", "LoadLibraryA"],
            "compiler": "Go",
            "large_size": True,  # Go binaries are large
        },
        "mitre": ["T1055", "T1071.001", "T1071.004"],
    },
}

# String encoding/obfuscation patterns
#
# Keyed by technique name. Every entry has a human-readable "description"; only some
# carry machine-usable data: a regex under "pattern" (base64, stack_strings), a list of
# candidate key bytes under "common_keys" (xor_single_byte), or a list of algorithm
# names under "algorithms" (api_hashing). "xor_multi_byte" and "rot13" are
# description-only, so consumers must not assume "pattern" exists on every entry.
OBFUSCATION_PATTERNS = {
    "base64": {
        # Runs of 20 or more characters drawn from the Base64 alphabet (including '=')
        "pattern": r'[A-Za-z0-9+/=]{20,}',
        "description": "Base64 encoded strings",
    },
    "xor_single_byte": {
        "common_keys": [0x5A, 0x41, 0xFF, 0x69, 0x37],
        "description": "Single-byte XOR encoding",
    },
    "xor_multi_byte": {
        "description": "Multi-byte XOR with key",
    },
    "rot13": {
        "description": "ROT13 character substitution",
    },
    "stack_strings": {
        # Two "mov ... 0xNN" occurrences on one line; only lowercase hex digits match
        # unless the caller compiles with re.IGNORECASE.
        # NOTE(review): presumably meant for disassembly text, not raw bytes - confirm.
        "pattern": r'mov.*0x[0-9a-f]{2}.*mov.*0x[0-9a-f]{2}',
        "description": "Stack-based string construction",
    },
    "api_hashing": {
        "algorithms": ["CRC32", "DJB2", "SDBM", "ROR13"],
        "description": "API resolution via hash",
    },
}

# Report how many entries each table holds, then tally families per category.
print(f"Loaded {len(MALWARE_SIGNATURES)} malware family signatures")
print(f"Loaded {len(OBFUSCATION_PATTERNS)} obfuscation detection patterns")
print("\nCategories:")
categories = {}
for family, info in MALWARE_SIGNATURES.items():
    cat = info['category']
    if cat not in categories:
        categories[cat] = 0
    categories[cat] += 1
# Category names are unique dict keys, so sorting the keys gives the same
# order as sorting the (name, count) pairs.
for cat in sorted(categories):
    count = categories[cat]
    print(f"  {cat}: {count} families")

## 2. String Extractor

In [None]:
class StringExtractor:
    """Extract strings of interest from textual sample content.

    Two kinds of matches are collected: fragments matching any entry of
    ``SUSPICIOUS_PATTERNS`` (matched case-insensitively) and complete URLs.
    """

    # Suspicious patterns
    # NOTE: these are used with re.findall, which returns the captured groups
    # instead of the whole match when a pattern contains groups - keep the
    # patterns free of capturing groups.
    SUSPICIOUS_PATTERNS = [
        r'cmd\.exe',
        r'powershell',
        r'wscript',
        r'cscript',
        r'regsvr32',
        r'mshta',
        r'rundll32',
        r'http://|https://',
        r'\.exe|\.dll|\.bat|\.ps1',
        r'\\Users\\.*\\AppData',
        r'\\Temp\\',
    ]

    # A URL runs until whitespace, an angle bracket or a double quote.
    URL_PATTERN = r'https?://[^\s<>"]+'

    def extract(self, content: str) -> List[str]:
        """Return the unique suspicious strings found in ``content``.

        Args:
            content: Text to scan.

        Returns:
            The de-duplicated matches in first-seen order: pattern matches
            first (in ``SUSPICIOUS_PATTERNS`` order), then full URLs. Matches
            that differ only in letter case are kept as separate entries.
            An input with no matches yields an empty list.
        """
        found: List[str] = []

        for pattern in self.SUSPICIOUS_PATTERNS:
            found.extend(re.findall(pattern, content, re.IGNORECASE))

        # Extract URLs
        found.extend(re.findall(self.URL_PATTERN, content))

        # dict.fromkeys de-duplicates while keeping insertion order; a plain
        # set() would return the strings in an order that changes between
        # interpreter runs (string hash randomization).
        return list(dict.fromkeys(found))

# Demo: run the extractor over a small hand-written sample and list the hits.
sample_content = """
cmd.exe /c powershell -enc SGVsbG8=
Download from http://malware.evil.com/payload.exe
Save to C:\\Users\\victim\\AppData\\Local\\Temp\\update.exe
"""

extractor = StringExtractor()
extracted = extractor.extract(sample_content)
# One print call: the header and each bulleted hit are separated by newlines,
# which produces the same text as printing them one line at a time.
print("Extracted strings:", *(f"  - {hit}" for hit in extracted), sep="\n")

## 3. AI Rule Generator

In [None]:
class AIYaraGenerator:
    """Generate YARA rules using AI.

    When the ``anthropic`` package cannot be imported or its client cannot be
    constructed, the generator stays usable and returns a fixed mock rule
    instead of calling the API.

    Attributes:
        model: Model identifier sent with each API request.
        max_tokens: Upper bound on response tokens per request.
        client: The ``Anthropic`` client, or ``None`` when unavailable.
        available: Whether ``client`` was created successfully.
    """

    DEFAULT_MODEL = "claude-sonnet-4-20250514"
    DEFAULT_MAX_TOKENS = 1024

    def __init__(self, model: str = DEFAULT_MODEL, max_tokens: int = DEFAULT_MAX_TOKENS):
        """Try to create the API client; fall back to mock mode on failure.

        Args:
            model: Model identifier to request.
            max_tokens: Maximum number of tokens to request per rule.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.client = None  # always defined, even in mock mode
        try:
            from anthropic import Anthropic
            self.client = Anthropic()
            self.available = True
        except Exception:
            # Covers a missing package and client construction errors without
            # swallowing KeyboardInterrupt/SystemExit like a bare except would.
            self.available = False

    def generate_from_description(self, description: str, name: str = "Generated_Rule") -> str:
        """Generate YARA rule from description.

        Args:
            description: Free-text description of what the rule should detect.
            name: Rule identifier to use in the generated rule.

        Returns:
            The rule text: the model's reply when the client is available,
            otherwise the output of ``_mock_rule``.
        """
        if not self.available:
            return self._mock_rule(description, name)

        prompt = f"""Generate a YARA rule based on this description:

{description}

Requirements:
- Rule name: {name}
- Include meta section with author, description, date
- Use meaningful string identifiers
- Create an effective condition
- Minimize false positives

Return ONLY the YARA rule, no explanation."""

        response = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content[0].text

    def _mock_rule(self, description: str, name: str) -> str:
        """Build a fixed placeholder rule for use without API access.

        The first 50 characters of the whitespace-collapsed description are
        embedded in the ``meta`` section. Newlines are collapsed and
        backslashes/double quotes are escaped, because a raw line break or
        quote inside the quoted meta value would make the rule unparsable.
        """
        # Collapse all whitespace runs (including newlines) to single spaces,
        # truncate, and only then escape so truncation cannot split an escape.
        summary = " ".join(description.split())[:50]
        summary = summary.replace("\\", "\\\\").replace('"', '\\"')

        lines = [
            f"rule {name} {{",
            "    meta:",
            '        author = "AI Generator"',
            f'        description = "{summary}..."',
            '        date = "2024-01-15"',
            "    ",
            "    strings:",
            '        $s1 = "cmd.exe" nocase',
            '        $s2 = "powershell" nocase',
            '        $s3 = "-enc" nocase',
            r"        $url = /https?:\/\/[^\s]+/ nocase",
            "    ",
            "    condition:",
            "        uint16(0) == 0x5A4D and 2 of them",
            "}",
        ]
        return "\n".join(lines)

In [None]:
# Demo: describe the target behaviour, then ask the generator for a rule.
description = """
Detect Emotet malware that:
- Drops via macro-enabled Office documents
- Uses PowerShell with encoded commands
- Connects to C2 servers
- Creates persistence via registry
"""

generator = AIYaraGenerator()
rule = generator.generate_from_description(description, name="Emotet_Loader")
print(rule)

## 4. Rule Validator

In [None]:
def validate_yara_rule(rule_text: str) -> Dict:
    """Validate YARA rule syntax.

    Runs lightweight structural checks and, when the ``yara`` module can be
    imported, a real compilation of the rule.

    Args:
        rule_text: Source text of one or more YARA rules.

    Returns:
        A dict with three keys: ``"valid"`` (bool), ``"errors"`` (list of
        messages that make the rule invalid) and ``"warnings"`` (list of
        non-fatal remarks).
    """
    result = {"valid": True, "errors": [], "warnings": []}

    # Blank out double-quoted string literals before the structural checks so
    # that braces or section keywords appearing inside a string value cannot
    # skew them. The match never crosses a line break, so a stray quote
    # elsewhere can only affect its own line.
    structure = re.sub(r'"(?:\\.|[^"\\\n])*"', '""', rule_text)

    # Basic structure checks
    if not re.search(r'\brule\s+[A-Za-z_]\w*', structure):
        result["valid"] = False
        result["errors"].append("Missing 'rule' keyword")

    # Whitespace is allowed between a section keyword and its colon.
    if not re.search(r'\bcondition\s*:', structure):
        result["valid"] = False
        result["errors"].append("Missing condition section")

    # Check for balanced braces
    if structure.count("{") != structure.count("}"):
        result["valid"] = False
        result["errors"].append("Unbalanced braces")

    # Warnings
    if not re.search(r'\bstrings\s*:', structure):
        result["warnings"].append("No strings section - rule may be too broad")

    if not re.search(r'\bmeta\s*:', structure):
        result["warnings"].append("No meta section - consider adding metadata")

    # Try to compile with yara-python if available
    try:
        import yara
        yara.compile(source=rule_text)
    except ImportError:
        result["warnings"].append("yara-python not installed - syntax not verified")
    except Exception as e:
        # Deliberately broad: any compiler failure is reported, never raised.
        result["valid"] = False
        result["errors"].append(f"Compilation error: {str(e)}")

    return result

# Validate the generated rule and print each field of the report.
validation = validate_yara_rule(rule)
for label in ("Valid", "Errors", "Warnings"):
    # The report keys are the lowercase forms of the printed labels.
    print(f"{label}: {validation[label.lower()]}")

## Summary

We built an AI-powered YARA rule generator:

1. **Rule Structure** - Proper YARA syntax
2. **String Extraction** - Automated pattern detection
3. **AI Generation** - LLM-based rule creation
4. **Validation** - Syntax checking

### Next Steps:
1. Add more sophisticated extraction (PE headers, imports)
2. Implement rule testing against sample sets
3. Create rule optimization pipeline