In [7]:
"""
KB1 Enrichment using KB2 Structural Analysis
Adds 5 missing fields to KB1 for hybrid RAG fusion
Scientific approach: Leverage KB2 structural insights to enhance KB1 textual entries
"""

import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from collections import Counter
import logging
from dataclasses import dataclass

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration paths
CONFIG = {
    'kb1_directory': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/raw/vulrag_kb",
    'kb2_path': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json",
    'output_directory': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1"
}

@dataclass
class EnrichmentMapping:
    """Data class for KB1-KB2 mapping results"""
    kb1_cve_id: str
    kb2_entry_key: str
    kb2_data: Dict
    mapping_confidence: float

print("KB1 ENRICHMENT WITH KB2 STRUCTURAL DATA")
print("=" * 45)
print("Loading KB1 and KB2 data for cross-reference...")

# Validate paths
for key, path in CONFIG.items():
    if not path.exists():
        print(f"⚠️  Warning: {key} path does not exist: {path}")
    else:
        print(f"✅ Found: {key} at {path}")

print(f"\nConfiguration loaded successfully")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")


KB1 ENRICHMENT WITH KB2 STRUCTURAL DATA
Loading KB1 and KB2 data for cross-reference...
✅ Found: kb1_directory at /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/raw/vulrag_kb
✅ Found: kb2_path at /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json
✅ Found: output_directory at /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1

Configuration loaded successfully
  kb1_directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/raw/vulrag_kb
  kb2_path: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json
  output_directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1


In [8]:

# Cell 1: Load KB1 and KB2 Data

def load_kb1_data(kb1_directory: Path) -> Dict[str, Dict]:
    """
    Load all KB1 CWE files and organize by CVE_id.
    
    Returns:
        Dictionary mapping CVE_id to KB1 entry data
        
    Scientific approach:
        - Consolidate multiple CWE files into unified structure
        - Preserve original CWE classification
        - Handle duplicate CVE_ids across different CWEs
    """
    
    kb1_data = {}
    cwe_files = list(kb1_directory.glob("gpt-4o-mini_CWE-*.json"))
    
    print(f"Loading KB1 from {len(cwe_files)} CWE files...")
    
    for cwe_file in cwe_files:
        cwe_name = cwe_file.stem.split('_')[1]  # Extract CWE-XXX
        
        try:
            with open(cwe_file, 'r') as f:
                cwe_data = json.load(f)
            
            entry_count = 0
            for cve_id, cve_instances in cwe_data.items():
                if isinstance(cve_instances, list):
                    for idx, instance in enumerate(cve_instances):
                        # Create unique key for each instance
                        instance_key = f"{cve_id}_{idx}"
                        
                        # Add CWE metadata
                        instance['source_cwe'] = cwe_name
                        instance['instance_index'] = idx
                        instance['total_instances'] = len(cve_instances)
                        
                        kb1_data[instance_key] = instance
                        entry_count += 1
                else:
                    # Single instance
                    cve_instances['source_cwe'] = cwe_name
                    cve_instances['instance_index'] = 0
                    cve_instances['total_instances'] = 1
                    kb1_data[cve_id] = cve_instances
                    entry_count += 1
            
            print(f"  {cwe_name}: {entry_count} entries loaded")
            
        except Exception as e:
            logger.error(f"Failed to load {cwe_file}: {e}")
    
    print(f"Total KB1 entries loaded: {len(kb1_data)}")
    return kb1_data

def load_kb2_data(kb2_path: Path) -> Dict[str, Dict]:
    """
    Load KB2 data with structural embeddings.
    
    Returns:
        Dictionary of KB2 entries organized by entry_key
    """
    
    print(f"Loading KB2 from: {kb2_path}")
    
    with open(kb2_path, 'r') as f:
        kb2_data = json.load(f)
    
    # Filter entries with valid embeddings
    valid_kb2 = {}
    for key, entry in kb2_data.items():
        if entry.get('embedding_computed', False):
            valid_kb2[key] = entry
    
    print(f"KB2 entries loaded: {len(valid_kb2)} (with embeddings)")
    return valid_kb2

# Execute loading
kb1_data = load_kb1_data(CONFIG['kb1_directory'])
kb2_data = load_kb2_data(CONFIG['kb2_path'])


Loading KB1 from 10 CWE files...
  CWE-125: 140 entries loaded
  CWE-787: 187 entries loaded
  CWE-20: 182 entries loaded
  CWE-264: 120 entries loaded
  CWE-416: 660 entries loaded
  CWE-401: 101 entries loaded
  CWE-476: 281 entries loaded
  CWE-362: 320 entries loaded
  CWE-119: 173 entries loaded
  CWE-200: 153 entries loaded
Total KB1 entries loaded: 2205
Loading KB2 from: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json
KB2 entries loaded: 4410 (with embeddings)


In [9]:

# Cell 2: CVE Mapping Analysis

def analyze_cve_mapping(kb1_data: Dict, kb2_data: Dict) -> Dict:
    """
    Analyze CVE overlap between KB1 and KB2.
    
    Scientific approach:
        - Identify exact CVE matches
        - Handle version variations (CVE-XXXX-XXXX vs CVE-XXXX-XXXX_X)
        - Quantify mapping coverage and confidence
    """
    
    print("ANALYZING CVE MAPPING BETWEEN KB1 AND KB2")
    print("=" * 45)
    
    # Extract CVE IDs from both datasets
    kb1_cves = set()
    for key, entry in kb1_data.items():
        cve_id = entry.get('CVE_id', '')
        if cve_id:
            kb1_cves.add(cve_id)
    
    kb2_cves = set()
    for key, entry in kb2_data.items():
        cve_id = entry.get('cve_id', '')
        if cve_id:
            # Handle KB2 format: CVE-XXXX-XXXX_X
            base_cve = cve_id.split('_')[0] if '_' in cve_id else cve_id
            kb2_cves.add(base_cve)
    
    # Analyze overlap
    exact_matches = kb1_cves & kb2_cves
    kb1_only = kb1_cves - kb2_cves
    kb2_only = kb2_cves - kb1_cves
    
    mapping_stats = {
        'kb1_total_cves': len(kb1_cves),
        'kb2_total_cves': len(kb2_cves),
        'exact_matches': len(exact_matches),
        'kb1_only': len(kb1_only),
        'kb2_only': len(kb2_only),
        'overlap_percentage': len(exact_matches) / len(kb1_cves) * 100 if kb1_cves else 0
    }
    
    print(f"KB1 CVEs: {mapping_stats['kb1_total_cves']}")
    print(f"KB2 CVEs: {mapping_stats['kb2_total_cves']}")
    print(f"Exact matches: {mapping_stats['exact_matches']}")
    print(f"Overlap: {mapping_stats['overlap_percentage']:.1f}%")
    print(f"KB1 only: {mapping_stats['kb1_only']}")
    print(f"KB2 only: {mapping_stats['kb2_only']}")
    
    # Show sample matches
    print(f"\nSample exact matches:")
    for i, cve in enumerate(sorted(exact_matches)[:5]):
        print(f"  {i+1}. {cve}")
    
    return mapping_stats, exact_matches

def create_kb1_kb2_mappings(kb1_data: Dict, kb2_data: Dict, exact_matches: set) -> List[EnrichmentMapping]:
    """
    Create detailed mappings between KB1 and KB2 entries.
    
    Returns:
        List of EnrichmentMapping objects for successful matches
    """
    
    mappings = []
    
    for kb1_key, kb1_entry in kb1_data.items():
        kb1_cve = kb1_entry.get('CVE_id', '')
        
        if kb1_cve in exact_matches:
            # Find corresponding KB2 entries (both vuln and patch)
            kb2_candidates = []
            
            for kb2_key, kb2_entry in kb2_data.items():
                kb2_cve = kb2_entry.get('cve_id', '').split('_')[0]
                
                if kb2_cve == kb1_cve:
                    kb2_candidates.append((kb2_key, kb2_entry))
            
            if kb2_candidates:
                # Prefer vulnerable version for analysis, fallback to patch
                vuln_entry = None
                patch_entry = None
                
                for kb2_key, kb2_entry in kb2_candidates:
                    if kb2_entry.get('file_type') == 'vuln':
                        vuln_entry = (kb2_key, kb2_entry)
                    elif kb2_entry.get('file_type') == 'patch':
                        patch_entry = (kb2_key, kb2_entry)
                
                # Choose best candidate
                chosen_entry = vuln_entry if vuln_entry else patch_entry
                
                if chosen_entry:
                    kb2_key, kb2_entry = chosen_entry
                    
                    mapping = EnrichmentMapping(
                        kb1_cve_id=kb1_key,
                        kb2_entry_key=kb2_key,
                        kb2_data=kb2_entry,
                        mapping_confidence=1.0  # Exact CVE match
                    )
                    mappings.append(mapping)
    
    print(f"\nCreated {len(mappings)} KB1-KB2 mappings")
    return mappings

# Execute mapping analysis
mapping_stats, exact_matches = analyze_cve_mapping(kb1_data, kb2_data)
kb1_kb2_mappings = create_kb1_kb2_mappings(kb1_data, kb2_data, exact_matches)


ANALYZING CVE MAPPING BETWEEN KB1 AND KB2
KB1 CVEs: 1154
KB2 CVEs: 1154
Exact matches: 1154
Overlap: 100.0%
KB1 only: 0
KB2 only: 0

Sample exact matches:
  1. CVE-2006-3635
  2. CVE-2007-6761
  3. CVE-2007-6762
  4. CVE-2008-7316
  5. CVE-2009-2692

Created 2205 KB1-KB2 mappings


In [10]:

# Cell 3: Generate New Fields from KB2 Data

def analyze_structural_patterns(kb2_entry: Dict) -> Dict:
    """
    Extract structural insights from KB2 entry for KB1 enrichment.
    
    Args:
        kb2_entry: KB2 entry with features and embeddings
        
    Returns:
        Dictionary of extracted structural patterns
    """
    
    features = kb2_entry.get('features', {})
    security_features = features.get('security_features', {})
    complexity_metrics = features.get('complexity_metrics', {})
    code_patterns = features.get('code_patterns', {})
    
    # Extract dangerous calls
    dangerous_calls = list(security_features.get('dangerous_calls', {}).keys())
    
    # Analyze complexity
    edge_density = complexity_metrics.get('edge_density', 0)
    call_density = complexity_metrics.get('call_to_vertex_ratio', 0)
    
    # Get graph statistics
    graph_stats = kb2_entry.get('graph_statistics', {})
    node_count = graph_stats.get('nodes', 0)
    edge_count = graph_stats.get('edges', 0)
    
    return {
        'dangerous_calls': dangerous_calls,
        'edge_density': edge_density,
        'call_density': call_density,
        'node_count': node_count,
        'edge_count': edge_count,
        'vertex_types': code_patterns.get('vertex_type_distribution', {}),
        'all_calls': list(code_patterns.get('all_calls', {}).keys())
    }

def generate_vulnerability_type(kb1_entry: Dict, structural_patterns: Dict) -> str:
    """
    Determine vulnerability type based on KB1 textual data and KB2 structural patterns.
    
    Scientific approach:
        - Combine textual analysis from KB1 
        - Structural pattern recognition from KB2
        - Rule-based classification with confidence scoring
    """
    
    # Extract textual indicators from KB1
    vulnerability_behavior = kb1_entry.get('vulnerability_behavior', {})
    code_before = kb1_entry.get('code_before_change', '').lower()
    solution = kb1_entry.get('solution', '').lower()
    
    # Combine all text for analysis
    text_content = ' '.join([
        vulnerability_behavior.get('specific_code_behavior_causing_vulnerability', ''),
        code_before,
        solution
    ]).lower()
    
    # Structural indicators from KB2
    dangerous_calls = structural_patterns.get('dangerous_calls', [])
    
    # Classification rules
    if any(call in dangerous_calls for call in ['strcpy', 'strcat', 'sprintf', 'gets']):
        if 'buffer' in text_content or 'overflow' in text_content:
            return 'buffer_overflow'
    
    if any(call in dangerous_calls for call in ['malloc', 'calloc', 'realloc', 'free']):
        if 'use after free' in text_content or 'double free' in text_content:
            return 'use_after_free'
        elif 'memory leak' in text_content or 'leak' in text_content:
            return 'memory_leak'
    
    if 'null' in text_content and 'pointer' in text_content:
        return 'null_pointer_dereference'
    
    if 'race' in text_content or 'concurrent' in text_content:
        return 'race_condition'
    
    if 'injection' in text_content or 'input' in text_content:
        return 'injection'
    
    if 'privilege' in text_content or 'permission' in text_content:
        return 'privilege_escalation'
    
    # Default based on CWE
    cwe = kb1_entry.get('source_cwe', '')
    cwe_mapping = {
        'CWE-119': 'buffer_overflow',
        'CWE-125': 'buffer_overflow', 
        'CWE-787': 'buffer_overflow',
        'CWE-416': 'use_after_free',
        'CWE-401': 'memory_leak',
        'CWE-476': 'null_pointer_dereference',
        'CWE-362': 'race_condition',
        'CWE-20': 'input_validation',
        'CWE-200': 'information_disclosure',
        'CWE-264': 'privilege_escalation'
    }
    
    return cwe_mapping.get(cwe, 'other')

def generate_fix_strategy_type(kb1_entry: Dict, structural_patterns: Dict) -> str:
    """
    Determine fix strategy based on vulnerability analysis.
    """
    
    solution = kb1_entry.get('solution', '').lower()
    vulnerability_type = generate_vulnerability_type(kb1_entry, structural_patterns)
    
    # Strategy mapping based on vulnerability type and solution text
    if 'bounds' in solution or 'check' in solution or 'validate' in solution:
        return 'bounds_checking'
    
    if 'sanitize' in solution or 'filter' in solution or 'escape' in solution:
        return 'input_validation'
    
    if 'lock' in solution or 'mutex' in solution or 'synchroniz' in solution:
        return 'synchronization'
    
    if 'null' in solution and 'check' in solution:
        return 'null_checking'
    
    if 'free' in solution or 'cleanup' in solution:
        return 'resource_management'
    
    # Default mapping by vulnerability type
    strategy_mapping = {
        'buffer_overflow': 'bounds_checking',
        'use_after_free': 'resource_management', 
        'memory_leak': 'resource_management',
        'null_pointer_dereference': 'null_checking',
        'race_condition': 'synchronization',
        'injection': 'input_validation',
        'privilege_escalation': 'access_control'
    }
    
    return strategy_mapping.get(vulnerability_type, 'other')

def generate_complexity_level(structural_patterns: Dict) -> str:
    """
    Determine complexity level based on structural metrics.
    
    Scientific approach:
        - Graph complexity metrics (nodes, edges, density)
        - Call complexity (dangerous calls, total calls)
        - Threshold-based classification
    """
    
    node_count = structural_patterns.get('node_count', 0)
    edge_density = structural_patterns.get('edge_density', 0)
    dangerous_calls_count = len(structural_patterns.get('dangerous_calls', []))
    
    # Complexity scoring
    complexity_score = 0
    
    # Node count contribution
    if node_count > 500:
        complexity_score += 3
    elif node_count > 200:
        complexity_score += 2
    elif node_count > 50:
        complexity_score += 1
    
    # Edge density contribution
    if edge_density > 10:
        complexity_score += 2
    elif edge_density > 5:
        complexity_score += 1
    
    # Dangerous calls contribution
    if dangerous_calls_count > 5:
        complexity_score += 2
    elif dangerous_calls_count > 2:
        complexity_score += 1
    
    # Classification
    if complexity_score >= 6:
        return 'complex'
    elif complexity_score >= 3:
        return 'moderate'
    else:
        return 'simple'

def generate_structural_description(kb1_entry: Dict, structural_patterns: Dict) -> str:
    """
    Generate human-readable structural description combining KB1 and KB2 insights.
    """
    
    dangerous_calls = structural_patterns.get('dangerous_calls', [])
    node_count = structural_patterns.get('node_count', 0)
    vertex_types = structural_patterns.get('vertex_types', {})
    
    # Build description components
    description_parts = []
    
    # Graph size
    if node_count > 300:
        description_parts.append(f"Large code structure with {node_count} nodes")
    elif node_count > 100:
        description_parts.append(f"Medium-sized code structure with {node_count} nodes")
    else:
        description_parts.append(f"Small code structure with {node_count} nodes")
    
    # Dangerous calls
    if dangerous_calls:
        if len(dangerous_calls) > 3:
            description_parts.append(f"multiple dangerous functions: {', '.join(dangerous_calls[:3])} and {len(dangerous_calls)-3} others")
        else:
            description_parts.append(f"dangerous functions: {', '.join(dangerous_calls)}")
    
    # Control structures
    call_count = vertex_types.get('CALL', 0)
    control_count = vertex_types.get('CONTROL_STRUCTURE', 0)
    
    if control_count > 5:
        description_parts.append(f"complex control flow with {control_count} control structures")
    elif control_count > 0:
        description_parts.append(f"moderate control flow with {control_count} control structures")
    
    if call_count > 20:
        description_parts.append(f"high function call density ({call_count} calls)")
    
    return "; ".join(description_parts) if description_parts else "Simple code structure"

def generate_kb2_link_id(mapping: EnrichmentMapping) -> str:
    """
    Generate KB2 link ID for cross-reference.
    """
    return mapping.kb2_entry_key

# Test field generation on sample mapping
if kb1_kb2_mappings:
    sample_mapping = kb1_kb2_mappings[0]
    sample_kb1 = kb1_data[sample_mapping.kb1_cve_id]
    sample_patterns = analyze_structural_patterns(sample_mapping.kb2_data)
    
    print(f"\nTesting field generation on: {sample_mapping.kb1_cve_id}")
    print(f"KB2 link: {sample_mapping.kb2_entry_key}")
    print(f"Dangerous calls found: {sample_patterns['dangerous_calls']}")
    print(f"Generated vulnerability type: {generate_vulnerability_type(sample_kb1, sample_patterns)}")
    print(f"Generated fix strategy: {generate_fix_strategy_type(sample_kb1, sample_patterns)}")
    print(f"Generated complexity: {generate_complexity_level(sample_patterns)}")
    print(f"Structural description: {generate_structural_description(sample_kb1, sample_patterns)[:100]}...")



Testing field generation on: CVE-2014-7825_0
KB2 link: CVE-2014-7825_0_vuln
Dangerous calls found: []
Generated vulnerability type: race_condition
Generated fix strategy: bounds_checking
Generated complexity: simple
Structural description: Medium-sized code structure with 118 nodes; moderate control flow with 5 control structures; high fu...


In [11]:

# Cell 4: Batch Enrichment of KB1

def enrich_kb1_entries(kb1_data: Dict, mappings: List[EnrichmentMapping]) -> Dict:
    """
    Enrich all KB1 entries with 5 new fields using KB2 structural analysis.
    
    Returns:
        Enriched KB1 data with new fields added
    """
    
    print("ENRICHING KB1 WITH STRUCTURAL FIELDS")
    print("=" * 40)
    
    enriched_kb1 = kb1_data.copy()
    enrichment_stats = {
        'total_entries': len(kb1_data),
        'enriched_entries': 0,
        'non_enriched_entries': 0,
        'field_generation_stats': {
            'vulnerability_types': Counter(),
            'fix_strategies': Counter(),
            'complexity_levels': Counter()
        }
    }
    
    # Create mapping lookup for fast access
    mapping_lookup = {mapping.kb1_cve_id: mapping for mapping in mappings}
    
    for kb1_key, kb1_entry in enriched_kb1.items():
        if kb1_key in mapping_lookup:
            # Has KB2 mapping - generate rich fields
            mapping = mapping_lookup[kb1_key]
            structural_patterns = analyze_structural_patterns(mapping.kb2_data)
            
            # Generate the 5 new fields
            vulnerability_type = generate_vulnerability_type(kb1_entry, structural_patterns)
            fix_strategy_type = generate_fix_strategy_type(kb1_entry, structural_patterns)
            complexity_level = generate_complexity_level(structural_patterns)
            structural_description = generate_structural_description(kb1_entry, structural_patterns)
            kb2_link_id = generate_kb2_link_id(mapping)
            
            # Add new fields to KB1 entry
            kb1_entry.update({
                'structural_description': structural_description,
                'vulnerability_type': vulnerability_type,
                'fix_strategy_type': fix_strategy_type,
                'complexity_level': complexity_level,
                'kb2_link_id': kb2_link_id,
                'enrichment_source': 'kb2_structural_analysis',
                'enrichment_confidence': mapping.mapping_confidence
            })
            
            # Update stats
            enrichment_stats['enriched_entries'] += 1
            enrichment_stats['field_generation_stats']['vulnerability_types'][vulnerability_type] += 1
            enrichment_stats['field_generation_stats']['fix_strategies'][fix_strategy_type] += 1
            enrichment_stats['field_generation_stats']['complexity_levels'][complexity_level] += 1
            
        else:
            # No KB2 mapping - generate minimal fields from KB1 only
            cwe = kb1_entry.get('source_cwe', '')
            
            # Basic fallback generation
            vulnerability_type = {
                'CWE-119': 'buffer_overflow',
                'CWE-125': 'buffer_overflow',
                'CWE-787': 'buffer_overflow',
                'CWE-416': 'use_after_free',
                'CWE-401': 'memory_leak',
                'CWE-476': 'null_pointer_dereference',
                'CWE-362': 'race_condition',
                'CWE-20': 'input_validation',
                'CWE-200': 'information_disclosure',
                'CWE-264': 'privilege_escalation'
            }.get(cwe, 'other')
            
            kb1_entry.update({
                'structural_description': 'Structural analysis not available - textual analysis only',
                'vulnerability_type': vulnerability_type,
                'fix_strategy_type': 'other',
                'complexity_level': 'unknown',
                'kb2_link_id': None,
                'enrichment_source': 'kb1_textual_fallback',
                'enrichment_confidence': 0.5
            })
            
            enrichment_stats['non_enriched_entries'] += 1
    
    # Print enrichment statistics
    print(f"Enrichment completed:")
    print(f"  Total entries: {enrichment_stats['total_entries']}")
    print(f"  Enriched with KB2: {enrichment_stats['enriched_entries']}")
    print(f"  Fallback only: {enrichment_stats['non_enriched_entries']}")
    print(f"  Enrichment rate: {enrichment_stats['enriched_entries']/enrichment_stats['total_entries']*100:.1f}%")
    
    print(f"\nField distribution:")
    print(f"  Vulnerability types: {dict(enrichment_stats['field_generation_stats']['vulnerability_types'].most_common(5))}")
    print(f"  Fix strategies: {dict(enrichment_stats['field_generation_stats']['fix_strategies'].most_common(5))}")
    print(f"  Complexity levels: {dict(enrichment_stats['field_generation_stats']['complexity_levels'])}")
    
    return enriched_kb1, enrichment_stats

def save_enriched_kb1(enriched_kb1: Dict, output_directory: Path) -> None:
    """
    Save enriched KB1 data back to CWE-organized files.
    
    Preserves original KB1 structure while adding new fields.
    """
    
    print(f"\nSaving enriched KB1 to: {output_directory}")
    output_directory.mkdir(exist_ok=True)
    
    # Organize entries back by CWE
    cwe_groups = {}
    for kb1_key, kb1_entry in enriched_kb1.items():
        cwe = kb1_entry.get('source_cwe', 'unknown')
        if cwe not in cwe_groups:
            cwe_groups[cwe] = {}
        
        # Extract original CVE_id for grouping
        cve_id = kb1_entry.get('CVE_id', kb1_key)
        instance_idx = kb1_entry.get('instance_index', 0)
        
        if cve_id not in cwe_groups[cwe]:
            cwe_groups[cwe][cve_id] = []
        
        # Remove metadata fields before saving
        clean_entry = {k: v for k, v in kb1_entry.items() 
                      if k not in ['source_cwe', 'instance_index', 'total_instances']}
        
        cwe_groups[cwe][cve_id].append(clean_entry)
    
    # Save each CWE file
    saved_files = []
    for cwe, cve_data in cwe_groups.items():
        if cwe != 'unknown':
            # Convert single-item lists back to single entries (preserve original format)
            processed_data = {}
            for cve_id, instances in cve_data.items():
                if len(instances) == 1:
                    processed_data[cve_id] = instances[0]
                else:
                    processed_data[cve_id] = instances
            
            output_file = output_directory / f"gpt-4o-mini_{cwe}_316_enriched.json"
            with open(output_file, 'w') as f:
                json.dump(processed_data, f, indent=2)
            
            saved_files.append(output_file.name)
            print(f"  Saved: {output_file.name} ({len(cve_data)} CVEs)")
    
    # Save enrichment summary
    summary_file = output_directory / "enrichment_summary.json"
    summary_data = {
        'enrichment_date': '2025-06-13',
        'kb2_source': str(CONFIG['kb2_path']),
        'total_cwe_files': len(saved_files),
        'saved_files': saved_files,
        'enrichment_stats': enrichment_stats
    }
    
    with open(summary_file, 'w') as f:
        json.dump(summary_data, f, indent=2)
    
    print(f"  Saved: enrichment_summary.json")
    print(f"\nEnrichment complete! {len(saved_files)} CWE files saved.")

# Execute enrichment
enriched_kb1, enrichment_stats = enrich_kb1_entries(kb1_data, kb1_kb2_mappings)
save_enriched_kb1(enriched_kb1, CONFIG['output_directory'])

print(f"\nKB1 ENRICHMENT COMPLETED SUCCESSFULLY")
print("=" * 40)
print(f"Original KB1 entries: {len(kb1_data)}")
print(f"KB2-enhanced entries: {enrichment_stats['enriched_entries']}")
print(f"Output directory: {CONFIG['output_directory']}")
print(f"Ready for hybrid RAG integration!")

ENRICHING KB1 WITH STRUCTURAL FIELDS
Enrichment completed:
  Total entries: 2205
  Enriched with KB2: 2205
  Fallback only: 0
  Enrichment rate: 100.0%

Field distribution:
  Vulnerability types: {'null_pointer_dereference': 529, 'race_condition': 521, 'use_after_free': 306, 'buffer_overflow': 277, 'injection': 153}
  Fix strategies: {'bounds_checking': 1228, 'resource_management': 420, 'synchronization': 316, 'other': 126, 'null_checking': 74}
  Complexity levels: {'simple': 1563, 'moderate': 629, 'complex': 13}

Saving enriched KB1 to: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1
  Saved: gpt-4o-mini_CWE-125_316_enriched.json (85 CVEs)
  Saved: gpt-4o-mini_CWE-787_316_enriched.json (99 CVEs)
  Saved: gpt-4o-mini_CWE-416_316_enriched.json (271 CVEs)
  Saved: gpt-4o-mini_CWE-200_316_enriched.json (92 CVEs)
  Saved: gpt-4o-mini_CWE-362_316_enriched.json (157 CVEs)
  Saved: gpt-4o-mini_CWE-20_316_enriched.json (79 CVEs)
  Saved: g