# Lab 07: AI YARA Rule Generator

Use AI to generate YARA rules from malware samples and descriptions.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab07_yara_generator.ipynb)

## Learning Objectives
- YARA rule syntax and structure
- String and pattern extraction
- LLM-assisted rule generation
- Rule validation and testing

In [None]:
# !pip install anthropic yara-python

In [None]:
import re
from typing import List, Dict
from dataclasses import dataclass

## 1. YARA Rule Structure

In [None]:
@dataclass
class YARARule:
    """In-memory YARA rule that can be rendered to rule source text.

    Values in ``meta`` and in ``text``/``regex`` string entries are treated as
    raw content: ``to_string`` applies the escaping needed to keep the
    rendered rule syntactically valid.
    """
    # Identifier emitted after the ``rule`` keyword.
    name: str
    # Metadata key/value pairs; every value is rendered as a quoted string.
    meta: Dict[str, str]
    # String definitions; each dict carries "name", "type" ("text", "hex"
    # or "regex") and "value".
    strings: List[Dict]
    # Condition expression, emitted verbatim.
    condition: str

    @staticmethod
    def _escape_text(value) -> str:
        """Escape a raw value for use inside a double-quoted YARA string."""
        text = str(value)
        # Backslash must be handled first so later escapes are not doubled.
        for raw, escaped in (
            ("\\", "\\\\"),
            ('"', '\\"'),
            ("\n", "\\n"),
            ("\r", "\\r"),
            ("\t", "\\t"),
        ):
            text = text.replace(raw, escaped)
        return text

    @staticmethod
    def _escape_regex(value) -> str:
        """Escape unescaped ``/`` so it cannot terminate the regex early."""
        pieces = []
        after_backslash = False
        for ch in str(value):
            if after_backslash:
                # Character is already escaped by the caller; keep as-is.
                pieces.append(ch)
                after_backslash = False
            elif ch == "\\":
                pieces.append(ch)
                after_backslash = True
            elif ch == "/":
                pieces.append("\\/")
            else:
                pieces.append(ch)
        return "".join(pieces)

    def to_string(self) -> str:
        """Convert to YARA rule string.

        Returns:
            The rule source: a ``meta:`` section (when ``meta`` is non-empty),
            a ``strings:`` section (when ``strings`` is non-empty) and the
            ``condition:`` section, joined with newlines.

        Raises:
            ValueError: If a string entry has a ``type`` other than
                ``"text"``, ``"hex"`` or ``"regex"``. Dropping it silently
                would leave the condition referring to an undefined string.
        """
        lines = [f"rule {self.name} {{"]

        # Meta section
        if self.meta:
            lines.append("    meta:")
            for key, value in self.meta.items():
                lines.append(f'        {key} = "{self._escape_text(value)}"')

        # Strings section
        if self.strings:
            lines.append("    strings:")
            for entry in self.strings:
                kind = entry['type']
                ident = entry['name']
                value = entry['value']
                if kind == 'text':
                    lines.append(f'        ${ident} = "{self._escape_text(value)}"')
                elif kind == 'hex':
                    lines.append(f'        ${ident} = {{ {value} }}')
                elif kind == 'regex':
                    lines.append(f'        ${ident} = /{self._escape_regex(value)}/')
                else:
                    raise ValueError(f"Unsupported YARA string type: {kind!r}")

        # Condition
        lines.append("    condition:")
        lines.append(f"        {self.condition}")
        lines.append("}")

        return "\n".join(lines)

# Example rule: two text strings plus a hex pattern, gated on the value of
# the first two bytes of the file.
example_rule = YARARule(
    name="Emotet_Dropper",
    meta=dict(
        author="Security Team",
        description="Detects Emotet dropper",
        date="2024-01-15",
    ),
    strings=[
        dict(name="s1", type="text", value="cmd.exe /c"),
        dict(name="s2", type="text", value="powershell -enc"),
        dict(name="h1", type="hex", value="4D 5A 90 00"),
    ],
    condition="uint16(0) == 0x5A4D and any of them",
)

print(example_rule.to_string())

## 2. String Extractor

In [None]:
class StringExtractor:
    """Extract meaningful strings from samples.

    Matches a fixed list of regular expressions against the sample text and
    additionally collects complete URLs.
    """

    # Suspicious patterns (matched case-insensitively by ``extract``)
    SUSPICIOUS_PATTERNS = [
        r'cmd\.exe',
        r'powershell',
        r'wscript',
        r'cscript',
        r'regsvr32',
        r'mshta',
        r'rundll32',
        r'http://|https://',
        r'\.exe|\.dll|\.bat|\.ps1',
        r'\\Users\\.*\\AppData',
        r'\\Temp\\',
    ]

    def extract(self, content: str) -> List[str]:
        """Extract suspicious strings.

        Args:
            content: Sample text to scan. Empty or ``None`` yields ``[]``.

        Returns:
            Unique matches in first-seen order: matches of each entry of
            ``SUSPICIOUS_PATTERNS`` in list order, followed by full URLs.
        """
        if not content:
            return []

        found = []
        for pattern in self.SUSPICIOUS_PATTERNS:
            found.extend(re.findall(pattern, content, re.IGNORECASE))

        # Extract URLs
        found.extend(re.findall(r'https?://[^\s<>"]+', content))

        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # result is stable across runs (set ordering of strings is not).
        return list(dict.fromkeys(found))

# Run the extractor over a small inline sample and list what it found.
sample_content = """
cmd.exe /c powershell -enc SGVsbG8=
Download from http://malware.evil.com/payload.exe
Save to C:\\Users\\victim\\AppData\\Local\\Temp\\update.exe
"""

extractor = StringExtractor()
extracted = extractor.extract(sample_content)

print("Extracted strings:")
for s in extracted:
    print("  - {}".format(s))

## 3. AI Rule Generator

In [None]:
class AIYaraGenerator:
    """Generate YARA rules using AI.

    Falls back to a fixed template rule when the ``anthropic`` client cannot
    be imported or constructed.
    """

    # Maximum number of description characters copied into the mock rule.
    _MOCK_DESCRIPTION_LIMIT = 50

    def __init__(self):
        """Try to create the Anthropic client; record whether it succeeded."""
        self.client = None
        try:
            from anthropic import Anthropic
            self.client = Anthropic()
            self.available = True
        except Exception:
            # Missing package or failed client construction: use the mock
            # path. KeyboardInterrupt/SystemExit are deliberately not caught.
            self.available = False

    def generate_from_description(self, description: str, name: str = "Generated_Rule") -> str:
        """Generate YARA rule from description.

        Args:
            description: Free-text description of what the rule should detect.
            name: Rule name to request (and to use in the mock rule).

        Returns:
            The rule text. When the client is unavailable this is the mock
            template; otherwise it is the first content block of the model
            response with any surrounding markdown code fence removed.
        """
        if not self.available:
            return self._mock_rule(description, name)

        prompt = f"""Generate a YARA rule based on this description:

{description}

Requirements:
- Rule name: {name}
- Include meta section with author, description, date
- Use meaningful string identifiers
- Create an effective condition
- Minimize false positives

Return ONLY the YARA rule, no explanation."""

        response = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}]
        )

        return self._strip_code_fences(response.content[0].text)

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a markdown code fence wrapping the whole of *text*.

        Text that is not entirely wrapped in a fence is returned unchanged.
        """
        stripped = text.strip()
        if not (stripped.startswith("```") and stripped.endswith("```")):
            return text
        fence_lines = stripped.splitlines()
        # Need an opening fence line (optionally with a language tag) and a
        # closing fence on its own line.
        if len(fence_lines) < 2 or fence_lines[-1].strip() != "```":
            return text
        return "\n".join(fence_lines[1:-1]).strip()

    def _mock_rule(self, description: str, name: str) -> str:
        """Return a fixed template rule carrying a sanitized description.

        The description is collapsed onto one line, truncated to
        ``_MOCK_DESCRIPTION_LIMIT`` characters (with ``...`` appended only
        when something was cut) and escaped, so that it cannot break out of
        the quoted meta string.
        """
        summary = " ".join(description.split())
        if len(summary) > self._MOCK_DESCRIPTION_LIMIT:
            summary = summary[:self._MOCK_DESCRIPTION_LIMIT] + "..."
        # Escape after truncating so an escape sequence is never cut in half.
        summary = summary.replace("\\", "\\\\").replace('"', '\\"')

        return f"""rule {name} {{
    meta:
        author = "AI Generator"
        description = "{summary}"
        date = "2024-01-15"
    
    strings:
        $s1 = "cmd.exe" nocase
        $s2 = "powershell" nocase
        $s3 = "-enc" nocase
        $url = /https?:\\/\\/[^\\s]+/ nocase
    
    condition:
        uint16(0) == 0x5A4D and 2 of them
}}"""

In [None]:
# Create the generator and ask it for a rule matching the behaviour below.
generator = AIYaraGenerator()

description = """
Detect Emotet malware that:
- Drops via macro-enabled Office documents
- Uses PowerShell with encoded commands
- Connects to C2 servers
- Creates persistence via registry
"""

rule = generator.generate_from_description(
    description=description,
    name="Emotet_Loader",
)
print(rule)

## 4. Rule Validator

In [None]:
def validate_yara_rule(rule_text: str) -> Dict:
    """Validate YARA rule syntax.

    Runs lightweight textual checks and, when ``yara-python`` is importable,
    compiles the rule.

    Args:
        rule_text: YARA rule source to check.

    Returns:
        A dict with ``"valid"`` (bool), ``"errors"`` (list of str) and
        ``"warnings"`` (list of str). ``"valid"`` is False when any textual
        check fails or compilation raises.
    """
    result = {"valid": True, "errors": [], "warnings": []}

    # Basic structure checks
    if "rule " not in rule_text:
        result["valid"] = False
        result["errors"].append("Missing 'rule' keyword")

    if "condition:" not in rule_text:
        result["valid"] = False
        result["errors"].append("Missing condition section")

    # Check for balanced braces. Quoted string literals are blanked out first
    # so a brace inside e.g. $a = "function() {" is not counted as structure.
    # NOTE: braces inside /regex/ strings are still counted.
    structural = re.sub(r'"(?:\\.|[^"\\\n])*"', '""', rule_text)
    if structural.count("{") != structural.count("}"):
        result["valid"] = False
        result["errors"].append("Unbalanced braces")

    # Warnings
    if "strings:" not in rule_text:
        result["warnings"].append("No strings section - rule may be too broad")

    if "meta:" not in rule_text:
        result["warnings"].append("No meta section - consider adding metadata")

    # Try to compile with yara-python if available
    try:
        import yara
        yara.compile(source=rule_text)
    except ImportError:
        result["warnings"].append("yara-python not installed - syntax not verified")
    except Exception as e:
        result["valid"] = False
        result["errors"].append(f"Compilation error: {str(e)}")

    return result

# Check the generated rule and report the outcome of each category.
validation = validate_yara_rule(rule)
print("Valid: {}".format(validation["valid"]))
print("Errors: {}".format(validation["errors"]))
print("Warnings: {}".format(validation["warnings"]))

## Summary

We built an AI-powered YARA rule generator:

1. **Rule Structure** - Proper YARA syntax
2. **String Extraction** - Automated pattern detection
3. **AI Generation** - LLM-based rule creation
4. **Validation** - Syntax checking

### Next Steps:
1. Add more sophisticated extraction (PE headers, imports)
2. Implement rule testing against sample sets
3. Create rule optimization pipeline