# Step 2: Process Deterministic Findings

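This notebook splits the Step 1 findings into two groups: findings that can be deterministically converted to Ansible playbooks, which are turned into a playbook here, and findings that cannot, which are saved for LLM processing in Step 3.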

**Input:** 
- Enhanced findings JSON file (from Step 1)
- Ansible targets JSON file (from Step 1)

**Output:**
- Ansible playbooks for deterministic findings
- JSON file with findings that need LLM processing
- Processing summary

In [1]:
# Import required libraries (standard library)
import sys
import os
import json
from pathlib import Path
from datetime import datetime

# Add src to path so the project modules imported below can be found.
# NOTE(review): '../src' is a relative path, so this presumably expects the
# kernel's working directory to be the notebooks folder - confirm.
sys.path.insert(0, '../src')

# Import our modules
from ansible_playbook_generator import DeterministicPlaybookGenerator
# NOTE(review): load_findings_file and get_severity_counts are not referenced
# by any cell visible in this notebook - confirm whether they are still needed.
from shared.prompt_utils import load_findings_file, get_severity_counts

# Environment banner: reaching these prints means every import above succeeded;
# the interpreter version and working directory are shown for troubleshooting.
print("📦 Libraries imported successfully")
print(f"🐍 Python version: {sys.version.split()[0]}")
print(f"📁 Current working directory: {os.getcwd()}")

📦 Libraries imported successfully
🐍 Python version: 3.11.12
📁 Current working directory: /Users/wjackson/Developer/AI-Building-Blocks/ansible_playbook_from_stig/notebooks


In [2]:
# Configuration - the three values below come from the Step 1 notebook output.
# Replace them with what Step 1 printed before running the rest of this notebook.

RUN_TIMESTAMP = "20250714_120000"  # Update from Step 1
ENHANCED_FINDINGS_FILE = "../findings/node2.example.com-STIG-20250710162433_20250714_110147_enhanced_findings.json"  # Update from Step 1
ANSIBLE_TARGETS_FILE = "../findings/node2.example.com-STIG-20250710162433_20250714_110147_ansible_targets.json"  # Update from Step 1

# Output configuration: every run writes into its own timestamped folder
PLAYBOOKS_BASE_DIR = "../playbooks"
PLAYBOOKS_RUN_DIR = PLAYBOOKS_BASE_DIR + "/" + RUN_TIMESTAMP

# Echo the effective configuration, one setting per line
print(
    f"🕐 Processing run: {RUN_TIMESTAMP}",
    "📂 Input files:",
    f"   Enhanced findings: {ENHANCED_FINDINGS_FILE}",
    f"   Ansible targets: {ANSIBLE_TARGETS_FILE}",
    f"📁 Output directory: {PLAYBOOKS_RUN_DIR}",
    sep="\n",
)

# Verify input files exist; a missing file is reported but does not stop the cell
for file_path in (ENHANCED_FINDINGS_FILE, ANSIBLE_TARGETS_FILE):
    if not Path(file_path).exists():
        print(f"❌ Missing: {file_path}")
        print("Please update the file paths in the cell above from Step 1 output")
        continue
    print(f"✅ Found: {file_path}")

🕐 Processing run: 20250714_120000
📂 Input files:
   Enhanced findings: ../findings/node2.example.com-STIG-20250710162433_20250714_110147_enhanced_findings.json
   Ansible targets: ../findings/node2.example.com-STIG-20250710162433_20250714_110147_ansible_targets.json
📁 Output directory: ../playbooks/20250714_120000
✅ Found: ../findings/node2.example.com-STIG-20250710162433_20250714_110147_enhanced_findings.json
✅ Found: ../findings/node2.example.com-STIG-20250710162433_20250714_110147_ansible_targets.json


In [3]:
# Build the output tree for this run: the run folder itself plus one
# subfolder per processing route (deterministic vs. LLM).
playbooks_dir = Path(PLAYBOOKS_RUN_DIR)
playbooks_dir.mkdir(parents=True, exist_ok=True)

deterministic_dir = playbooks_dir / "deterministic"
llm_needed_dir = playbooks_dir / "llm_needed"

# The run folder exists at this point, so the subfolders need no parents=True
for run_subdir in (deterministic_dir, llm_needed_dir):
    run_subdir.mkdir(exist_ok=True)

print("📁 Created directory structure:")
for dir_label, dir_path in (
    ("Main", playbooks_dir),
    ("Deterministic", deterministic_dir),
    ("LLM needed", llm_needed_dir),
):
    print(f"   {dir_label}: {dir_path}")

📁 Created directory structure:
   Main: ../playbooks/20250714_120000
   Deterministic: ../playbooks/20250714_120000/deterministic
   LLM needed: ../playbooks/20250714_120000/llm_needed


In [4]:
# Load the findings and targets data produced by Step 1.
# Both files are opened with an explicit UTF-8 encoding so that parsing does
# not depend on the platform's default locale encoding.
print("📊 Loading findings data...")

# Load enhanced findings
with open(ENHANCED_FINDINGS_FILE, 'r', encoding='utf-8') as f:
    enhanced_data = json.load(f)

# Missing top-level keys fall back to empty containers instead of raising
enhanced_findings = enhanced_data.get('findings', [])
metadata = enhanced_data.get('metadata', {})
summary = enhanced_data.get('summary', {})

print(f"📈 Enhanced findings loaded: {len(enhanced_findings)}")
print(f"📄 Metadata: {metadata.get('format', 'Unknown format')}")

# Load ansible targets
with open(ANSIBLE_TARGETS_FILE, 'r', encoding='utf-8') as f:
    targets_data = json.load(f)

actionable_targets = targets_data.get('targets', [])
targets_metadata = targets_data.get('metadata', {})

print(f"🎯 Actionable targets loaded: {len(actionable_targets)}")
print(f"📊 Total actionable: {targets_metadata.get('total_actionable', 0)}")

# Show severity / target-type distributions only when the summary provides them
if 'by_severity' in summary:
    print(f"\n📈 Severity distribution: {summary['by_severity']}")
if 'by_target_type' in summary:
    print(f"🎯 Target type distribution: {summary['by_target_type']}")

📊 Loading findings data...
📈 Enhanced findings loaded: 1529
📄 Metadata: ARF
🎯 Actionable targets loaded: 435
📊 Total actionable: 435

📈 Severity distribution: {'medium': 1221, 'high': 69, 'low': 119, 'unknown': 120}
🎯 Target type distribution: {'unknown': 1094, 'package': 118, 'service': 78, 'mount': 50, 'file_ownership': 72, 'file_permission': 53, 'sysctl': 64}


In [5]:
# Analyze which findings can be processed deterministically.
# A finding is deterministic when an actionable target exists for its rule_id
# and that target's type is something other than 'unknown'; every other
# finding is routed to LLM processing.
print("🔍 Analyzing findings for deterministic processing...")

# Separate findings into categories
deterministic_findings = []
llm_needed_findings = []

# Create a lookup map of targets by rule_id
# (if several targets share a rule_id, the last one wins)
targets_by_rule = {target['rule_id']: target for target in actionable_targets}

for finding in enhanced_findings:
    rule_id = finding.get('rule_id', '')
    target = targets_by_rule.get(rule_id)

    if target is not None and target.get('target_type') != 'unknown':
        deterministic_findings.append({
            'finding': finding,
            'target': target
        })
    else:
        # No target info, or an 'unknown' target type: needs LLM processing
        llm_needed_findings.append(finding)

# Guard the ratio against an empty findings list, which would otherwise raise
# ZeroDivisionError; an empty input is reported as 0.0%.
deterministic_pct = (
    len(deterministic_findings) / len(enhanced_findings) * 100
    if enhanced_findings else 0
)

print(f"\n📊 Processing Analysis:")
print(f"   Total findings: {len(enhanced_findings)}")
print(f"   Deterministic processing: {len(deterministic_findings)}")
print(f"   LLM processing needed: {len(llm_needed_findings)}")
print(f"   Processing ratio: {deterministic_pct:.1f}% deterministic")

🔍 Analyzing findings for deterministic processing...

📊 Processing Analysis:
   Total findings: 1529
   Deterministic processing: 435
   LLM processing needed: 1094
   Processing ratio: 28.4% deterministic


In [6]:
# Show sample deterministic findings (at most the first three)
if deterministic_findings:
    print("🔍 Sample Deterministic Findings (first 3):")
    for i, item in enumerate(deterministic_findings[:3]):
        finding = item['finding']
        target = item['target']

        # Titles longer than 60 characters are cut and marked with an
        # ellipsis; shorter titles are shown unchanged so they do not look
        # truncated. A missing or empty title is shown as 'Unknown'.
        title = str(finding.get('title') or 'Unknown')
        title_preview = f"{title[:60]}..." if len(title) > 60 else title

        print(f"\n📋 Finding {i+1}:")
        print(f"   Rule ID: {finding.get('rule_id', 'Unknown')}")
        print(f"   Severity: {finding.get('severity', 'Unknown')}")
        print(f"   Title: {title_preview}")
        print(f"   Target Type: {target.get('target_type', 'Unknown')}")
        print(f"   Target Name: {target.get('target_name', 'Unknown')}")
        print(f"   Ansible Module: {target.get('ansible_module', 'Unknown')}")

🔍 Sample Deterministic Findings (first 3):

📋 Finding 1:
   Rule ID: xccdf_org.ssgproject.content_rule_package_prelink_removed
   Severity: medium
   Title: Package "prelink" Must not be Installed...
   Target Type: package
   Target Name: prelink
   Ansible Module: yum

📋 Finding 2:
   Rule ID: xccdf_org.ssgproject.content_rule_package_aide_installed
   Severity: medium
   Title: Install AIDE...
   Target Type: package
   Target Name: aide
   Ansible Module: yum

📋 Finding 3:
   Rule ID: xccdf_org.ssgproject.content_rule_package_dracut-fips_installed
   Severity: medium
   Title: Install the dracut-fips Package...
   Target Type: package
   Target Name: dracut-fips
   Ansible Module: yum


In [7]:
# Generate the deterministic Ansible playbook.
# The targets of the deterministic findings are first written to their own
# JSON file, and that file is then handed to the generator together with the
# path where the playbook should be written.
if not deterministic_findings:
    print("⚠️ No deterministic findings to process")
else:
    print(f"🚀 Generating deterministic Ansible playbooks for {len(deterministic_findings)} findings...")

    generator = DeterministicPlaybookGenerator()

    # Keep only the target half of each finding/target pair
    deterministic_targets = [entry['target'] for entry in deterministic_findings]

    deterministic_targets_file = deterministic_dir / "deterministic_targets.json"
    deterministic_targets_data = {
        'metadata': {
            'total_actionable': len(deterministic_targets),
            'extraction_date': datetime.now().isoformat(),
            'source': 'deterministic_processing',
            'run_timestamp': RUN_TIMESTAMP
        },
        'targets': deterministic_targets
    }

    with deterministic_targets_file.open('w') as f:
        json.dump(deterministic_targets_data, f, indent=2)

    print(f"💾 Saved deterministic targets to: {deterministic_targets_file}")

    playbook_file = deterministic_dir / f"deterministic_remediation_{RUN_TIMESTAMP}.yml"

    try:
        # The generator takes plain string paths: input targets, output playbook
        playbook = generator.generate_playbook_from_targets(
            str(deterministic_targets_file),
            str(playbook_file)
        )

        print(f"✅ Successfully generated deterministic playbook!")
        print(f"📄 Playbook saved to: {playbook_file}")

        # Totals across all plays; a play without the key contributes zero
        total_tasks = sum(len(play.get('tasks', [])) for play in playbook)
        total_handlers = sum(len(play.get('handlers', [])) for play in playbook)

        print(f"📊 Playbook Statistics:")
        print(f"   Total plays: {len(playbook)}")
        print(f"   Total tasks: {total_tasks}")
        print(f"   Total handlers: {total_handlers}")
        print(f"   File size: {playbook_file.stat().st_size / 1024:.1f} KB")

    except Exception as e:
        # Report the failure with a traceback and let the notebook continue
        print(f"❌ Error generating deterministic playbook: {e}")
        import traceback
        traceback.print_exc()

🚀 Generating deterministic Ansible playbooks for 435 findings...
💾 Saved deterministic targets to: ../playbooks/20250714_120000/deterministic/deterministic_targets.json
✅ Generated playbook with 435 tasks: ../playbooks/20250714_120000/deterministic/deterministic_remediation_20250714_120000.yml
✅ Successfully generated deterministic playbook!
📄 Playbook saved to: ../playbooks/20250714_120000/deterministic/deterministic_remediation_20250714_120000.yml
📊 Playbook Statistics:
   Total plays: 1
   Total tasks: 443
   Total handlers: 1
   File size: 112.6 KB


In [8]:
# Persist the findings that could not be handled deterministically so the next
# notebook can pick them up; LLM_NEEDED_FILE is None when there are none.
if not llm_needed_findings:
    print("✅ All findings processed deterministically - no LLM processing needed!")
    LLM_NEEDED_FILE = None
else:
    print(f"💾 Saving {len(llm_needed_findings)} findings that need LLM processing...")

    llm_needed_file = llm_needed_dir / f"llm_needed_findings_{RUN_TIMESTAMP}.json"

    # Findings are stored together with metadata describing where they came from
    llm_needed_data = {
        'metadata': {
            'total_findings': len(llm_needed_findings),
            'created_date': datetime.now().isoformat(),
            'source': 'deterministic_processing_step',
            'run_timestamp': RUN_TIMESTAMP,
            'description': 'Findings that could not be processed deterministically and require LLM classification/processing'
        },
        'findings': llm_needed_findings
    }

    with llm_needed_file.open('w') as f:
        json.dump(llm_needed_data, f, indent=2)

    print(f"💾 Saved LLM needed findings to: {llm_needed_file}")

    # Tally findings per severity; a finding without one counts as 'unknown'
    llm_severity_counts = {}
    for finding in llm_needed_findings:
        severity = finding.get('severity', 'unknown')
        llm_severity_counts.setdefault(severity, 0)
        llm_severity_counts[severity] += 1

    print(f"📈 LLM needed findings by severity: {llm_severity_counts}")

    # Store variable for next notebook
    LLM_NEEDED_FILE = str(llm_needed_file)
    print(f"\n🔄 Variable for next notebook:")
    print(f"   LLM_NEEDED_FILE = '{LLM_NEEDED_FILE}'")

💾 Saving 1094 findings that need LLM processing...
💾 Saved LLM needed findings to: ../playbooks/20250714_120000/llm_needed/llm_needed_findings_20250714_120000.json
📈 LLM needed findings by severity: {'medium': 909, 'high': 52, 'low': 56, 'unknown': 77}

🔄 Variable for next notebook:
   LLM_NEEDED_FILE = '../playbooks/20250714_120000/llm_needed/llm_needed_findings_20250714_120000.json'


In [9]:
# Assemble the Step 2 processing summary and write it to the run folder.

# Share of findings handled deterministically; 0 when there were no findings
if enhanced_findings:
    deterministic_percentage = len(deterministic_findings) / len(enhanced_findings) * 100
else:
    deterministic_percentage = 0

# The deterministic output paths are only recorded when there were
# deterministic findings; otherwise they are stored as null.
if deterministic_findings:
    deterministic_playbook_path = str(playbook_file)
    deterministic_targets_path = str(deterministic_targets_file)
else:
    deterministic_playbook_path = None
    deterministic_targets_path = None

processing_summary = {
    'run_timestamp': RUN_TIMESTAMP,
    'processing_date': datetime.now().isoformat(),
    'input_files': {
        'enhanced_findings': ENHANCED_FINDINGS_FILE,
        'ansible_targets': ANSIBLE_TARGETS_FILE
    },
    'statistics': {
        'total_findings': len(enhanced_findings),
        'deterministic_processed': len(deterministic_findings),
        'llm_processing_needed': len(llm_needed_findings),
        'deterministic_percentage': deterministic_percentage
    },
    'output_files': {
        'deterministic_playbook': deterministic_playbook_path,
        'deterministic_targets': deterministic_targets_path,
        'llm_needed_findings': LLM_NEEDED_FILE
    }
}

# Save processing summary
summary_file = playbooks_dir / f"processing_summary_step2_{RUN_TIMESTAMP}.json"
with summary_file.open('w') as f:
    json.dump(processing_summary, f, indent=2)

print(f"💾 Saved processing summary to: {summary_file}")

💾 Saved processing summary to: ../playbooks/20250714_120000/processing_summary_step2_20250714_120000.json


In [10]:
# Final summary and next steps: recap the counts, list the files written by
# this notebook, and print the variables the next notebook needs.

# Guard the success rate against an empty findings list, which would otherwise
# raise ZeroDivisionError; an empty input is reported as 0.0%.
deterministic_success_rate = (
    len(deterministic_findings) / len(enhanced_findings) * 100
    if enhanced_findings else 0
)

print("🎯 DETERMINISTIC PROCESSING SUMMARY")
print("=" * 50)
print(f"Run timestamp: {RUN_TIMESTAMP}")
print(f"Total findings processed: {len(enhanced_findings)}")
print(f"Deterministic playbooks generated: {len(deterministic_findings)}")
print(f"LLM processing needed: {len(llm_needed_findings)}")
print(f"Success rate: {deterministic_success_rate:.1f}% deterministic")

if deterministic_findings:
    print(f"\n✅ Deterministic Processing Results:")
    print(f"   📄 Playbook: {playbook_file}")
    print(f"   🎯 Targets: {deterministic_targets_file}")
    # total_tasks is only set when playbook generation succeeded
    print(f"   📊 Tasks generated: {total_tasks if 'total_tasks' in locals() else 'Unknown'}")

if llm_needed_findings:
    print(f"\n🤖 LLM Processing Needed:")
    print(f"   📄 Findings file: {LLM_NEEDED_FILE}")
    print(f"   📊 Count: {len(llm_needed_findings)}")
    # llm_severity_counts is only set when the save cell took its non-empty branch
    print(f"   📈 By severity: {llm_severity_counts if 'llm_severity_counts' in locals() else 'Unknown'}")
    print(f"\n🔄 Ready for Step 3: Batch process LLM findings")
    print(f"📝 Use these variables in the next notebook (03_process_llm_batch.ipynb):")
    print(f"   RUN_TIMESTAMP = '{RUN_TIMESTAMP}'")
    print(f"   LLM_NEEDED_FILE = '{LLM_NEEDED_FILE}'")
else:
    print(f"\n✅ All findings processed deterministically!")
    print(f"📝 No LLM processing needed - workflow complete")

print(f"\n📁 All outputs saved to: {playbooks_dir}")
print(f"📋 Processing summary: {summary_file}")

🎯 DETERMINISTIC PROCESSING SUMMARY
Run timestamp: 20250714_120000
Total findings processed: 1529
Deterministic playbooks generated: 435
LLM processing needed: 1094
Success rate: 28.4% deterministic

✅ Deterministic Processing Results:
   📄 Playbook: ../playbooks/20250714_120000/deterministic/deterministic_remediation_20250714_120000.yml
   🎯 Targets: ../playbooks/20250714_120000/deterministic/deterministic_targets.json
   📊 Tasks generated: 443

🤖 LLM Processing Needed:
   📄 Findings file: ../playbooks/20250714_120000/llm_needed/llm_needed_findings_20250714_120000.json
   📊 Count: 1094
   📈 By severity: {'medium': 909, 'high': 52, 'low': 56, 'unknown': 77}

🔄 Ready for Step 3: Batch process LLM findings
📝 Use these variables in the next notebook (03_process_llm_batch.ipynb):
   RUN_TIMESTAMP = '20250714_120000'
   LLM_NEEDED_FILE = '../playbooks/20250714_120000/llm_needed/llm_needed_findings_20250714_120000.json'

📁 All outputs saved to: ../playbooks/20250714_120000
📋 Processing summar