In [2]:
"""
KB2 Construction with Graph Embeddings for Vulnerability Detection
Scientific implementation for hybrid RAG system
"""

import json
import networkx as nx
import numpy as np
from pathlib import Path
from collections import Counter
from typing import Dict, List, Tuple, Optional
import logging

# Configure logging for scientific reproducibility
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration constants
CONFIG = {
    'embedding_dimensions': 128,
    'batch_size': 100,
    'cpg_data_path': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json",
    'kb2_input_path': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_complete.json",
    'kb2_output_path': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json"
}

logger.info("Configuration loaded successfully")
print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

2025-06-13 14:07:14,587 - INFO - Configuration loaded successfully


Configuration:
  embedding_dimensions: 128
  batch_size: 100
  cpg_data_path: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json
  kb2_input_path: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_complete.json
  kb2_output_path: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json


In [3]:
def graphson_to_networkx(graphson_data: Dict) -> nx.Graph:
    """
    Convert GraphSON format CPG to NetworkX graph.
    
    Args:
        graphson_data: Dictionary containing GraphSON formatted CPG data
        
    Returns:
        NetworkX Graph object representing the CPG
        
    Scientific rationale:
        - Preserves graph topology for structural analysis
        - Maintains node labels for semantic embedding generation
        - Uses undirected graph to focus on connectivity patterns
    """
    vertices = graphson_data['@value']['vertices']
    edges = graphson_data['@value']['edges']
    
    G = nx.Graph()  # Undirected for structural pattern analysis
    
    # Add vertices with essential attributes
    for vertex in vertices:
        vertex_id = vertex['id']['@value'] if '@value' in vertex['id'] else vertex['id']
        vertex_label = vertex.get('label', 'UNKNOWN')
        G.add_node(vertex_id, label=vertex_label)
    
    # Add edges
    for edge in edges:
        source = edge['outV']['@value'] if '@value' in edge['outV'] else edge['outV']
        target = edge['inV']['@value'] if '@value' in edge['inV'] else edge['inV']
        G.add_edge(source, target)
    
    return G

# Test conversion on sample file
def test_graphson_conversion() -> Tuple[nx.Graph, Dict]:
    """Test GraphSON conversion with validation metrics."""
    
    sample_files = list(CONFIG['cpg_data_path'].rglob("*.json"))
    if not sample_files:
        raise FileNotFoundError("No CPG files found in specified directory")
    
    test_file = sample_files[0]
    logger.info(f"Testing conversion with file: {test_file.name}")
    
    with open(test_file) as f:
        graphson_data = json.load(f)
    
    G = graphson_to_networkx(graphson_data)
    
    # Validation metrics
    metrics = {
        'nodes': G.number_of_nodes(),
        'edges': G.number_of_edges(),
        'density': nx.density(G),
        'connected_components': len(list(nx.connected_components(G)))
    }
    
    logger.info(f"Conversion successful: {metrics}")
    return G, metrics

# Execute test
test_graph, test_metrics = test_graphson_conversion()
print("GraphSON Conversion Test Results:")
for metric, value in test_metrics.items():
    print(f"  {metric}: {value}")

2025-06-13 14:07:14,747 - INFO - Testing conversion with file: patch_cpg.json
2025-06-13 14:07:14,753 - INFO - Conversion successful: {'nodes': 124, 'edges': 574, 'density': 0.07526881720430108, 'connected_components': 1}


GraphSON Conversion Test Results:
  nodes: 124
  edges: 574
  density: 0.07526881720430108
  connected_components: 1


In [4]:
def compute_structural_graph_embedding(G: nx.Graph, dimensions: int = 128) -> np.ndarray:
    """
    Compute structural graph embedding using multi-feature approach.
    
    Args:
        G: NetworkX graph representing CPG
        dimensions: Target dimensionality of embedding vector
        
    Returns:
        Normalized embedding vector capturing structural patterns
        
    Scientific approach:
        1. Topological features (connectivity patterns)
        2. Node label distribution (semantic structure)
        3. Local clustering patterns (code organization)
        4. Degree distribution analysis (complexity metrics)
        5. L2 normalization for similarity computation
    """
    
    if G.number_of_nodes() == 0:
        return np.zeros(dimensions, dtype=np.float32)
    
    features = []
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    
    # 1. Global topological features
    edge_node_ratio = n_edges / max(n_nodes, 1)
    density = nx.density(G)
    component_ratio = len(list(nx.connected_components(G))) / max(n_nodes, 1)
    
    features.extend([edge_node_ratio, density, component_ratio])
    
    # 2. Node label distribution analysis
    label_counts = Counter([G.nodes[n].get('label', 'UNKNOWN') for n in G.nodes()])
    total_nodes = max(sum(label_counts.values()), 1)
    
    # Key CPG node types for vulnerability analysis
    vulnerability_relevant_labels = [
        'CALL', 'IDENTIFIER', 'CONTROL_STRUCTURE', 'BLOCK',
        'LOCAL', 'METHOD_PARAMETER_IN', 'METHOD_PARAMETER_OUT',
        'LITERAL', 'RETURN', 'METHOD'
    ]
    
    for label in vulnerability_relevant_labels:
        proportion = label_counts.get(label, 0) / total_nodes
        features.append(proportion)
    
    # 3. Degree distribution analysis
    degrees = [d for n, d in G.degree()]
    if degrees:
        deg_mean = np.mean(degrees)
        deg_std = np.std(degrees)
        deg_range = np.max(degrees) - np.min(degrees)
        
        # Normalized degree statistics
        features.extend([
            deg_mean / max(n_nodes, 1),
            deg_std / max(deg_mean, 1) if deg_mean > 0 else 0,
            deg_range / max(deg_mean, 1) if deg_mean > 0 else 0
        ])
        
        # Degree distribution histogram
        degree_hist, _ = np.histogram(degrees, bins=5, density=True)
        features.extend(degree_hist.tolist())
        
        # Hub analysis (high-degree nodes)
        high_degree_threshold = np.percentile(degrees, 80)
        hub_proportion = sum(1 for d in degrees if d >= high_degree_threshold) / len(degrees)
        features.append(hub_proportion)
    else:
        features.extend([0] * 10)
    
    # 4. Structural pattern analysis (label connectivity)
    label_adjacency_patterns = []
    for u, v in G.edges():
        label_u = G.nodes[u].get('label', 'UNKNOWN')
        label_v = G.nodes[v].get('label', 'UNKNOWN')
        pattern = tuple(sorted([label_u, label_v]))
        label_adjacency_patterns.append(pattern)
    
    pattern_counts = Counter(label_adjacency_patterns)
    total_edges = max(len(label_adjacency_patterns), 1)
    
    # Important vulnerability-related patterns
    critical_patterns = [
        ('CALL', 'IDENTIFIER'), ('CALL', 'CONTROL_STRUCTURE'),
        ('CALL', 'LITERAL'), ('IDENTIFIER', 'IDENTIFIER'),
        ('BLOCK', 'CALL'), ('CONTROL_STRUCTURE', 'BLOCK')
    ]
    
    for pattern in critical_patterns:
        proportion = pattern_counts.get(pattern, 0) / total_edges
        features.append(proportion)
    
    # 5. Clustering analysis by node type
    for label in vulnerability_relevant_labels[:5]:  # Top 5 to manage dimensions
        nodes_with_label = [n for n in G.nodes() if G.nodes[n].get('label') == label]
        if len(nodes_with_label) > 1:
            subgraph = G.subgraph(nodes_with_label)
            clustering = nx.average_clustering(subgraph) if subgraph.number_of_edges() > 0 else 0
        else:
            clustering = 0
        features.append(clustering)
    
    # Convert to numpy array and normalize
    features = np.array(features, dtype=np.float32)
    
    # L2 normalization for cosine similarity computation
    if np.linalg.norm(features) > 0:
        features = features / np.linalg.norm(features)
    
    # Ensure exact dimensionality
    if len(features) < dimensions:
        features = np.concatenate([features, np.zeros(dimensions - len(features))])
    else:
        features = features[:dimensions]
    
    return features

# Test embedding computation
def test_embedding_computation() -> None:
    """Test embedding computation with diversity analysis."""
    
    # Test on multiple files to verify diversity
    test_files = list(CONFIG['cpg_data_path'].rglob("*.json"))[:3]
    embeddings = []
    file_info = []
    
    print("Testing embedding computation:")
    
    for cpg_file in test_files:
        with open(cpg_file) as f:
            graphson_data = json.load(f)
        
        G = graphson_to_networkx(graphson_data)
        embedding = compute_structural_graph_embedding(G, CONFIG['embedding_dimensions'])
        
        embeddings.append(embedding)
        file_info.append({
            'file': cpg_file.name,
            'instance': cpg_file.parent.name,
            'nodes': G.number_of_nodes(),
            'edges': G.number_of_edges(),
            'embedding_norm': np.linalg.norm(embedding)
        })
        
        print(f"  {cpg_file.parent.name}: nodes={G.number_of_nodes()}, "
              f"edges={G.number_of_edges()}, embedding_norm={np.linalg.norm(embedding):.3f}")
    
    # Compute pairwise similarities
    print("\nPairwise similarities:")
    for i in range(len(embeddings)):
        for j in range(i+1, len(embeddings)):
            similarity = np.dot(embeddings[i], embeddings[j])
            print(f"  {file_info[i]['instance']} <-> {file_info[j]['instance']}: {similarity:.3f}")

# Execute embedding test
test_embedding_computation()

Testing embedding computation:
  CVE-2017-7533_0: nodes=124, edges=574, embedding_norm=1.000
  CVE-2017-7533_0: nodes=124, edges=571, embedding_norm=1.000
  CVE-2021-0935_0: nodes=388, edges=1734, embedding_norm=1.000

Pairwise similarities:
  CVE-2017-7533_0 <-> CVE-2017-7533_0: 1.000
  CVE-2017-7533_0 <-> CVE-2021-0935_0: 0.968
  CVE-2017-7533_0 <-> CVE-2021-0935_0: 0.969


In [5]:
def analyze_vulnerability_patterns(cpg_dir: Path, max_instances: int = 5) -> Dict:
    """
    Analyze structural patterns between vulnerable and patched code versions.
    
    Args:
        cpg_dir: Directory containing CPG files organized by CVE
        max_instances: Maximum number of CVE instances to analyze
        
    Returns:
        Dictionary containing pattern analysis results
        
    Scientific objective:
        Quantify structural differences between vulnerable and patched code
        to validate embedding discriminative power for vulnerability detection.
    """
    
    analysis_results = {
        'instances_analyzed': 0,
        'similarity_distributions': [],
        'structural_changes': [],
        'embedding_statistics': {}
    }
    
    instances_processed = 0
    
    print("Vulnerability Pattern Analysis:")
    print("Instance                    | Vuln Nodes | Patch Nodes | Similarity | Structural Change")
    print("-" * 85)
    
    for instance_dir in cpg_dir.iterdir():
        if not instance_dir.is_dir() or instances_processed >= max_instances:
            continue
            
        vuln_file = instance_dir / "vuln_cpg.json"
        patch_file = instance_dir / "patch_cpg.json"
        
        if not (vuln_file.exists() and patch_file.exists()):
            continue
        
        try:
            # Load and process vulnerable version
            with open(vuln_file) as f:
                vuln_data = json.load(f)
            vuln_graph = graphson_to_networkx(vuln_data)
            vuln_embedding = compute_structural_graph_embedding(vuln_graph, CONFIG['embedding_dimensions'])
            
            # Load and process patched version
            with open(patch_file) as f:
                patch_data = json.load(f)
            patch_graph = graphson_to_networkx(patch_data)
            patch_embedding = compute_structural_graph_embedding(patch_graph, CONFIG['embedding_dimensions'])
            
            # Compute similarity
            similarity = np.dot(vuln_embedding, patch_embedding)
            
            # Analyze structural changes
            node_change = patch_graph.number_of_nodes() - vuln_graph.number_of_nodes()
            edge_change = patch_graph.number_of_edges() - vuln_graph.number_of_edges()
            
            structural_change_magnitude = abs(node_change) + abs(edge_change)
            
            # Store results
            analysis_results['similarity_distributions'].append(similarity)
            analysis_results['structural_changes'].append({
                'instance': instance_dir.name,
                'node_change': node_change,
                'edge_change': edge_change,
                'magnitude': structural_change_magnitude,
                'similarity': similarity
            })
            
            # Display results
            change_indicator = "Major" if structural_change_magnitude > 20 else "Minor"
            print(f"{instance_dir.name:<25} | {vuln_graph.number_of_nodes():>9} | "
                  f"{patch_graph.number_of_nodes():>10} | {similarity:>10.3f} | {change_indicator:>15}")
            
            instances_processed += 1
            
        except Exception as e:
            logger.warning(f"Failed to process {instance_dir.name}: {e}")
            continue
    
    analysis_results['instances_analyzed'] = instances_processed
    
    # Compute statistics
    similarities = analysis_results['similarity_distributions']
    if similarities:
        analysis_results['embedding_statistics'] = {
            'mean_similarity': np.mean(similarities),
            'std_similarity': np.std(similarities),
            'min_similarity': np.min(similarities),
            'max_similarity': np.max(similarities)
        }
        
        print(f"\nStatistical Summary:")
        print(f"  Mean vuln-patch similarity: {np.mean(similarities):.3f}")
        print(f"  Standard deviation: {np.std(similarities):.3f}")
        print(f"  Range: [{np.min(similarities):.3f}, {np.max(similarities):.3f}]")
    
    return analysis_results

# Execute vulnerability pattern analysis
pattern_analysis = analyze_vulnerability_patterns(CONFIG['cpg_data_path'], max_instances=10)

Vulnerability Pattern Analysis:
Instance                    | Vuln Nodes | Patch Nodes | Similarity | Structural Change
-------------------------------------------------------------------------------------
CVE-2017-7533_0           |       124 |        124 |      1.000 |           Minor
CVE-2021-0935_0           |       370 |        388 |      1.000 |           Major
CVE-2017-14156_0          |        44 |         44 |      1.000 |           Minor
CVE-2023-20928_3          |        28 |         40 |      0.995 |           Major
CVE-2019-15221_0          |       226 |        248 |      1.000 |           Major
CVE-2013-1763_0           |        58 |         69 |      0.999 |           Major
CVE-2020-27786_0          |       115 |        131 |      0.999 |           Major
CVE-2016-6786_6           |        48 |         39 |      0.997 |           Major
CVE-2017-15102_0          |       432 |        432 |      1.000 |           Minor
CVE-2023-38430_1          |       313 |        313 |    

In [6]:
def process_kb2_batch(entry_batch: List[Tuple[str, Dict]], cpg_dir: Path) -> Tuple[int, int, List[str]]:
    """
    Process a batch of KB2 entries to add graph embeddings.
    
    Args:
        entry_batch: List of (entry_key, entry_data) tuples
        cpg_dir: Directory containing CPG files
        
    Returns:
        Tuple of (success_count, error_count, error_messages)
        
    Scientific approach:
        Batch processing for memory efficiency and progress tracking
        with comprehensive error handling and validation.
    """
    
    success_count = 0
    error_count = 0
    error_messages = []
    
    for entry_key, entry_data in entry_batch:
        # Skip entries that failed initial extraction
        if not entry_data.get('extraction_success', True):
            continue
            
        cve_id = entry_data['cve_id']
        file_type = entry_data['file_type']
        
        # Construct CPG file path
        cpg_file = cpg_dir / cve_id / f"{file_type}_cpg.json"
        
        if not cpg_file.exists():
            error_message = f"CPG file not found: {cpg_file}"
            error_messages.append(error_message)
            entry_data['embedding_error'] = error_message
            error_count += 1
            continue
        
        try:
            # Load CPG data
            with open(cpg_file) as f:
                graphson_data = json.load(f)
            
            # Convert to NetworkX and compute embedding
            G = graphson_to_networkx(graphson_data)
            embedding = compute_structural_graph_embedding(G, CONFIG['embedding_dimensions'])
            
            # Validate embedding
            if np.isnan(embedding).any() or np.isinf(embedding).any():
                raise ValueError("Invalid embedding values (NaN or Inf)")
            
            # Add embedding and metadata to entry
            entry_data['graph_embedding'] = embedding.tolist()
            entry_data['embedding_method'] = 'structural_multi_feature'
            entry_data['embedding_dimensions'] = CONFIG['embedding_dimensions']
            entry_data['graph_statistics'] = {
                'nodes': int(G.number_of_nodes()),
                'edges': int(G.number_of_edges()),
                'density': float(nx.density(G)),
                'connected_components': len(list(nx.connected_components(G)))
            }
            entry_data['embedding_computed'] = True
            
            success_count += 1
            
        except Exception as e:
            error_message = f"Embedding computation failed for {entry_key}: {str(e)}"
            error_messages.append(error_message)
            entry_data['embedding_error'] = str(e)
            entry_data['embedding_computed'] = False
            error_count += 1
    
    return success_count, error_count, error_messages

# Test batch processing on small sample
def test_batch_processing() -> None:
    """Test batch processing functionality with validation."""
    
    # Load KB2 data
    with open(CONFIG['kb2_input_path']) as f:
        kb2_data = json.load(f)
    
    # Take small sample for testing
    sample_size = 5
    sample_items = list(kb2_data.items())[:sample_size]
    
    print(f"Testing batch processing on {sample_size} entries:")
    
    success, errors, error_msgs = process_kb2_batch(sample_items, CONFIG['cpg_data_path'])
    
    print(f"  Success: {success}/{sample_size}")
    print(f"  Errors: {errors}/{sample_size}")
    
    if error_msgs:
        print("  Error messages:")
        for msg in error_msgs[:3]:  # Show first 3 errors
            print(f"    {msg}")
    
    # Validate a successful embedding
    for entry_key, entry_data in sample_items:
        if entry_data.get('embedding_computed', False):
            embedding = np.array(entry_data['graph_embedding'])
            print(f"  Sample embedding for {entry_key}:")
            print(f"    Shape: {embedding.shape}")
            print(f"    Norm: {np.linalg.norm(embedding):.3f}")
            print(f"    Non-zero elements: {np.count_nonzero(embedding)}")
            break

# Execute batch processing test
test_batch_processing()

Testing batch processing on 5 entries:
  Success: 5/5
  Errors: 0/5
  Sample embedding for CVE-2017-7533_0_patch:
    Shape: (128,)
    Norm: 1.000
    Non-zero elements: 27


In [7]:
def process_complete_kb2_with_embeddings(kb2_input_path: Path, cpg_dir: Path, output_path: Path) -> Dict:
    """
    Process complete KB2 dataset to add graph embeddings with comprehensive logging.
    
    Args:
        kb2_input_path: Path to input KB2 JSON file
        cpg_dir: Directory containing CPG files
        output_path: Path for output KB2 with embeddings
        
    Returns:
        Dictionary containing processing statistics and results
        
    Scientific approach:
        - Batch processing for memory efficiency
        - Comprehensive error handling and validation
        - Progress tracking for large datasets
        - Statistical analysis of embedding quality
    """
    
    print("COMPLETE KB2 EMBEDDING PROCESSING")
    print("=" * 45)
    
    # Load existing KB2 data
    print(f"Loading KB2 data from: {kb2_input_path}")
    with open(kb2_input_path) as f:
        kb2_data = json.load(f)
    
    print(f"Total KB2 entries to process: {len(kb2_data)}")
    
    # Initialize processing statistics
    stats = {
        'total_entries': len(kb2_data),
        'successful_embeddings': 0,
        'failed_embeddings': 0,
        'processing_errors': [],
        'embedding_statistics': {
            'mean_norm': 0.0,
            'std_norm': 0.0,
            'zero_embeddings': 0,
            'nan_embeddings': 0
        },
        'processing_time': 0,
        'batch_results': []
    }
    
    # Convert to list for batch processing
    entry_items = list(kb2_data.items())
    batch_size = CONFIG['batch_size']
    total_batches = (len(entry_items) + batch_size - 1) // batch_size
    
    print(f"Processing in {total_batches} batches of size {batch_size}")
    
    # Track embedding norms for quality analysis
    embedding_norms = []
    
    start_time = time.time()
    
    # Process in batches with progress tracking
    for batch_idx in range(total_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(entry_items))
        batch = entry_items[start_idx:end_idx]
        
        print(f"Processing batch {batch_idx + 1}/{total_batches} "
              f"(entries {start_idx + 1}-{end_idx})")
        
        # Process current batch
        batch_success, batch_errors, batch_error_msgs = process_kb2_batch(batch, cpg_dir)
        
        # Update statistics
        stats['successful_embeddings'] += batch_success
        stats['failed_embeddings'] += batch_errors
        stats['processing_errors'].extend(batch_error_msgs)
        
        # Collect embedding norms for quality analysis
        for entry_key, entry_data in batch:
            if entry_data.get('embedding_computed', False):
                embedding = np.array(entry_data['graph_embedding'])
                norm = np.linalg.norm(embedding)
                embedding_norms.append(norm)
                
                # Check for problematic embeddings
                if norm == 0:
                    stats['embedding_statistics']['zero_embeddings'] += 1
                if np.isnan(embedding).any() or np.isinf(embedding).any():
                    stats['embedding_statistics']['nan_embeddings'] += 1
        
        # Store batch results
        stats['batch_results'].append({
            'batch_idx': batch_idx,
            'success_count': batch_success,
            'error_count': batch_errors,
            'success_rate': batch_success / len(batch) if len(batch) > 0 else 0
        })
        
        # Progress update
        overall_progress = (batch_idx + 1) / total_batches * 100
        print(f"  Batch complete: {batch_success}/{len(batch)} successful "
              f"({batch_success/len(batch)*100:.1f}%) | "
              f"Overall progress: {overall_progress:.1f}%")
    
    # Calculate final statistics
    processing_time = time.time() - start_time
    stats['processing_time'] = processing_time
    
    if embedding_norms:
        stats['embedding_statistics']['mean_norm'] = float(np.mean(embedding_norms))
        stats['embedding_statistics']['std_norm'] = float(np.std(embedding_norms))
    
    # Save processed KB2 data
    print(f"\nSaving enhanced KB2 to: {output_path}")
    with open(output_path, 'w') as f:
        json.dump(kb2_data, f, indent=2)
    
    # Generate processing report
    print(f"\nPROCESSING COMPLETE")
    print(f"=" * 25)
    print(f"Total processing time: {processing_time:.1f} seconds")
    print(f"Successful embeddings: {stats['successful_embeddings']}")
    print(f"Failed embeddings: {stats['failed_embeddings']}")
    print(f"Success rate: {stats['successful_embeddings']/stats['total_entries']*100:.1f}%")
    print(f"Output file size: {output_path.stat().st_size / (1024*1024):.1f} MB")
    
    if embedding_norms:
        print(f"\nEMBEDDING QUALITY ANALYSIS")
        print(f"Mean embedding norm: {stats['embedding_statistics']['mean_norm']:.3f}")
        print(f"Std embedding norm: {stats['embedding_statistics']['std_norm']:.3f}")
        print(f"Zero embeddings: {stats['embedding_statistics']['zero_embeddings']}")
        print(f"NaN/Inf embeddings: {stats['embedding_statistics']['nan_embeddings']}")
    
    if stats['processing_errors']:
        print(f"\nFirst 5 processing errors:")
        for i, error in enumerate(stats['processing_errors'][:5]):
            print(f"  {i+1}. {error}")
    
    return stats

# Import required modules
import time

# Execute complete processing
logger.info("Starting complete KB2 processing with embeddings")
processing_stats = process_complete_kb2_with_embeddings(
    CONFIG['kb2_input_path'], 
    CONFIG['cpg_data_path'], 
    CONFIG['kb2_output_path']
)

2025-06-13 14:07:15,251 - INFO - Starting complete KB2 processing with embeddings


COMPLETE KB2 EMBEDDING PROCESSING
Loading KB2 data from: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_complete.json
Total KB2 entries to process: 4410
Processing in 45 batches of size 100
Processing batch 1/45 (entries 1-100)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 2.2%
Processing batch 2/45 (entries 101-200)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 4.4%
Processing batch 3/45 (entries 201-300)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 6.7%
Processing batch 4/45 (entries 301-400)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 8.9%
Processing batch 5/45 (entries 401-500)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 11.1%
Processing batch 6/45 (entries 501-600)
  Batch complete: 100/100 successful (100.0%) | Overall progress: 13.3%
Processing batch 7/45 (entries 601-700)
  Batch complete: 100/100 successful (100.0%) | Overal

In [8]:
class KB2SimilaritySearchEngine:
    """
    Efficient similarity search engine for KB2 structural embeddings.
    
    Scientific approach:
        - Cosine similarity for normalized embeddings
        - Hybrid scoring combining structural and feature-based similarity
        - Configurable similarity thresholds
        - Performance optimization for large datasets
    """
    
    def __init__(self, kb2_path: Path, similarity_threshold: float = 0.7):
        """
        Initialize search engine with KB2 data.
        
        Args:
            kb2_path: Path to KB2 file with embeddings
            similarity_threshold: Minimum similarity for candidate selection
        """
        self.similarity_threshold = similarity_threshold
        self.kb2_data = {}
        self.embeddings_matrix = None
        self.entry_keys = []
        
        self._load_kb2_data(kb2_path)
        self._build_embedding_matrix()
        
        logger.info(f"KB2 Search Engine initialized with {len(self.kb2_data)} entries")
    
    def _load_kb2_data(self, kb2_path: Path) -> None:
        """Load KB2 data and filter entries with valid embeddings."""
        
        print(f"Loading KB2 data from: {kb2_path}")
        with open(kb2_path) as f:
            raw_data = json.load(f)
        
        # Filter entries with valid embeddings
        valid_entries = 0
        for key, entry in raw_data.items():
            if (entry.get('embedding_computed', False) and 
                'graph_embedding' in entry and 
                entry['graph_embedding'] is not None):
                self.kb2_data[key] = entry
                valid_entries += 1
        
        print(f"Loaded {valid_entries} entries with valid embeddings out of {len(raw_data)} total")
    
    def _build_embedding_matrix(self) -> None:
        """Build numpy matrix of embeddings for efficient similarity computation."""
        
        if not self.kb2_data:
            raise ValueError("No valid KB2 data loaded")
        
        self.entry_keys = list(self.kb2_data.keys())
        embeddings_list = []
        
        for key in self.entry_keys:
            embedding = np.array(self.kb2_data[key]['graph_embedding'], dtype=np.float32)
            embeddings_list.append(embedding)
        
        self.embeddings_matrix = np.vstack(embeddings_list)
        
        print(f"Built embedding matrix: {self.embeddings_matrix.shape}")
        print(f"Embedding dimension: {self.embeddings_matrix.shape[1]}")
        print(f"Memory usage: {self.embeddings_matrix.nbytes / (1024*1024):.1f} MB")
    
    def compute_structural_features_similarity(self, query_features: Dict, candidate_key: str) -> float:
        """
        Compute similarity based on structural features (non-embedding).
        
        Args:
            query_features: Dictionary of query structural features
            candidate_key: Key of candidate entry in KB2
            
        Returns:
            Feature-based similarity score [0, 1]
        """
        
        candidate_entry = self.kb2_data[candidate_key]
        candidate_features = candidate_entry['features']
        
        # Initialize similarity components
        similarities = []
        
        # 1. Dangerous calls similarity (Jaccard)
        query_dangerous = set(query_features.get('dangerous_calls', []))
        candidate_dangerous = set(candidate_features['security_features']['dangerous_calls'].keys())
        
        if query_dangerous or candidate_dangerous:
            jaccard_dangerous = len(query_dangerous & candidate_dangerous) / max(len(query_dangerous | candidate_dangerous), 1)
            similarities.append(('dangerous_calls', jaccard_dangerous, 0.3))
        
        # 2. Complexity similarity (normalized difference)
        query_complexity = query_features.get('complexity_score', 0)
        candidate_complexity = candidate_features['complexity_metrics']['edge_density']
        
        if max(query_complexity, candidate_complexity) > 0:
            complexity_sim = 1 - abs(query_complexity - candidate_complexity) / max(query_complexity, candidate_complexity)
            similarities.append(('complexity', complexity_sim, 0.2))
        
        # 3. Function calls similarity (top calls overlap)
        query_calls = set(query_features.get('top_calls', []))
        candidate_calls = set(list(candidate_features['code_patterns']['all_calls'].keys())[:10])
        
        if query_calls or candidate_calls:
            calls_jaccard = len(query_calls & candidate_calls) / max(len(query_calls | candidate_calls), 1)
            similarities.append(('function_calls', calls_jaccard, 0.3))
        
        # 4. Vertex type distribution similarity
        query_types = query_features.get('vertex_types', {})
        candidate_types = candidate_features['code_patterns']['vertex_type_distribution']
        
        if query_types and candidate_types:
            # Compute cosine similarity of type distributions
            all_types = set(query_types.keys()) | set(candidate_types.keys())
            query_vec = np.array([query_types.get(t, 0) for t in all_types])
            candidate_vec = np.array([candidate_types.get(t, 0) for t in all_types])
            
            if np.linalg.norm(query_vec) > 0 and np.linalg.norm(candidate_vec) > 0:
                types_sim = np.dot(query_vec, candidate_vec) / (np.linalg.norm(query_vec) * np.linalg.norm(candidate_vec))
                similarities.append(('vertex_types', types_sim, 0.2))
        
        # Compute weighted average
        if similarities:
            total_weight = sum(weight for _, _, weight in similarities)
            weighted_sim = sum(sim * weight for _, sim, weight in similarities) / total_weight
            return weighted_sim
        else:
            return 0.0
    
    def search_similar_graphs(self, query_embedding: np.ndarray, 
                            query_features: Dict = None, 
                            top_k: int = 10, 
                            hybrid_weight: float = 0.7) -> List[Tuple[str, float, Dict]]:
        """
        Search for similar graphs using hybrid similarity.
        
        Args:
            query_embedding: Query graph embedding vector
            query_features: Optional structural features for hybrid scoring
            top_k: Number of top candidates to return
            hybrid_weight: Weight for embedding similarity vs feature similarity
            
        Returns:
            List of (entry_key, similarity_score, metadata) tuples
        """
        
        if self.embeddings_matrix is None:
            raise ValueError("Embedding matrix not built")
        
        # Normalize query embedding
        query_norm = np.linalg.norm(query_embedding)
        if query_norm == 0:
            logger.warning("Query embedding has zero norm")
            return []
        
        normalized_query = query_embedding / query_norm
        
        # Compute cosine similarities with all embeddings
        cosine_similarities = np.dot(self.embeddings_matrix, normalized_query)
        
        # Get initial candidates above threshold
        candidate_indices = np.where(cosine_similarities >= self.similarity_threshold)[0]
        
        if len(candidate_indices) == 0:
            logger.info(f"No candidates found above similarity threshold {self.similarity_threshold}")
            # Lower threshold and take top candidates
            candidate_indices = np.argsort(cosine_similarities)[-min(top_k*2, len(cosine_similarities)):]
        
        # Compute hybrid scores if features provided
        results = []
        for idx in candidate_indices:
            entry_key = self.entry_keys[idx]
            embedding_sim = float(cosine_similarities[idx])
            
            # Compute final similarity score
            if query_features is not None:
                feature_sim = self.compute_structural_features_similarity(query_features, entry_key)
                final_sim = hybrid_weight * embedding_sim + (1 - hybrid_weight) * feature_sim
            else:
                final_sim = embedding_sim
                feature_sim = 0.0
            
            # Collect metadata
            entry_data = self.kb2_data[entry_key]
            metadata = {
                'cve_id': entry_data['cve_id'],
                'file_type': entry_data['file_type'],
                'embedding_similarity': embedding_sim,
                'feature_similarity': feature_sim,
                'final_similarity': final_sim,
                'graph_stats': entry_data.get('graph_statistics', {}),
                'dangerous_calls': list(entry_data['features']['security_features']['dangerous_calls'].keys())
            }
            
            results.append((entry_key, final_sim, metadata))
        
        # Sort by final similarity and return top-k
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]
    
    def get_embedding_statistics(self) -> Dict:
        """Get statistics about the embedding matrix for analysis."""
        
        if self.embeddings_matrix is None:
            return {}
        
        norms = np.linalg.norm(self.embeddings_matrix, axis=1)
        pairwise_sims = np.dot(self.embeddings_matrix, self.embeddings_matrix.T)
        
        # Remove diagonal (self-similarities)
        mask = ~np.eye(pairwise_sims.shape[0], dtype=bool)
        off_diagonal_sims = pairwise_sims[mask]
        
        return {
            'total_embeddings': self.embeddings_matrix.shape[0],
            'embedding_dimension': self.embeddings_matrix.shape[1],
            'norm_statistics': {
                'mean': float(np.mean(norms)),
                'std': float(np.std(norms)),
                'min': float(np.min(norms)),
                'max': float(np.max(norms))
            },
            'similarity_statistics': {
                'mean_pairwise_similarity': float(np.mean(off_diagonal_sims)),
                'std_pairwise_similarity': float(np.std(off_diagonal_sims)),
                'max_pairwise_similarity': float(np.max(off_diagonal_sims)),
                'min_pairwise_similarity': float(np.min(off_diagonal_sims))
            }
        }

# Test the search engine
def test_kb2_search_engine():
    """Test KB2 search engine functionality."""
    
    print("TESTING KB2 SEARCH ENGINE")
    print("=" * 30)
    
    # Initialize search engine
    search_engine = KB2SimilaritySearchEngine(CONFIG['kb2_output_path'])
    
    # Get embedding statistics
    stats = search_engine.get_embedding_statistics()
    print("Embedding Statistics:")
    for category, values in stats.items():
        if isinstance(values, dict):
            print(f"  {category}:")
            for key, value in values.items():
                print(f"    {key}: {value:.3f}" if isinstance(value, float) else f"    {key}: {value}")
        else:
            print(f"  {category}: {values}")
    
    # Test search with a random embedding
    print(f"\nTesting similarity search...")
    test_embedding = np.random.randn(CONFIG['embedding_dimensions'])
    test_embedding = test_embedding / np.linalg.norm(test_embedding)  # Normalize
    
    # Test features for hybrid search
    test_features = {
        'dangerous_calls': ['malloc', 'strcpy'],
        'complexity_score': 0.5,
        'top_calls': ['malloc', 'free', 'strcpy'],
        'vertex_types': {'CALL': 10, 'IDENTIFIER': 20, 'BLOCK': 5}
    }
    
    # Perform search
    results = search_engine.search_similar_graphs(
        test_embedding, 
        test_features, 
        top_k=5, 
        hybrid_weight=0.7
    )
    
    print(f"Found {len(results)} similar entries:")
    for i, (entry_key, similarity, metadata) in enumerate(results):
        print(f"  {i+1}. {entry_key}")
        print(f"     CVE: {metadata['cve_id']}")
        print(f"     Type: {metadata['file_type']}")
        print(f"     Final similarity: {similarity:.3f}")
        print(f"     Embedding sim: {metadata['embedding_similarity']:.3f}")
        print(f"     Feature sim: {metadata['feature_similarity']:.3f}")
        print(f"     Dangerous calls: {metadata['dangerous_calls']}")
        print()
    
    return search_engine

# Execute test
kb2_search_engine = test_kb2_search_engine()

2025-06-13 14:08:08,259 - INFO - KB2 Search Engine initialized with 4410 entries


TESTING KB2 SEARCH ENGINE
Loading KB2 data from: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json
Loaded 4410 entries with valid embeddings out of 4410 total
Built embedding matrix: (4410, 128)
Embedding dimension: 128
Memory usage: 2.2 MB


2025-06-13 14:08:08,373 - INFO - No candidates found above similarity threshold 0.7


Embedding Statistics:
  total_embeddings: 4410
  embedding_dimension: 128
  norm_statistics:
    mean: 1.000
    std: 0.000
    min: 1.000
    max: 1.000
  similarity_statistics:
    mean_pairwise_similarity: 0.844
    std_pairwise_similarity: 0.274
    max_pairwise_similarity: 1.000
    min_pairwise_similarity: 0.108

Testing similarity search...
Found 5 similar entries:
  1. CVE-2021-43057_1_patch
     CVE: CVE-2021-43057_1
     Type: patch
     Final similarity: 0.190
     Embedding sim: 0.199
     Feature sim: 0.169
     Dangerous calls: []

  2. CVE-2021-43057_1_vuln
     CVE: CVE-2021-43057_1
     Type: vuln
     Final similarity: 0.190
     Embedding sim: 0.199
     Feature sim: 0.169
     Dangerous calls: []

  3. CVE-2019-19252_0_vuln
     CVE: CVE-2019-19252_0
     Type: vuln
     Final similarity: 0.156
     Embedding sim: 0.201
     Feature sim: 0.051
     Dangerous calls: []

  4. CVE-2016-10088_0_patch
     CVE: CVE-2016-10088_0
     Type: patch
     Final similarity: 0.1

In [9]:
# Fonction manquante depuis notebook 03
def analyze_full_cpg(cpg_file):
    """Analyze full CPG file for features extraction."""
    
    with open(cpg_file) as f:
        data = json.load(f)
    
    vertices = data['@value']['vertices']
    edges = data['@value']['edges']
    
    analysis = {
        'dangerous_calls': {},
        'all_calls': {},
        'vertex_types': {},
        'file_info': {
            'vertex_count': len(vertices),
            'edge_count': len(edges)
        }
    }
    
    # Extract dangerous calls and vertex types
    dangerous_functions = ['strcpy', 'strcat', 'sprintf', 'malloc', 'free', 'memcpy', 'memset']
    
    for vertex in vertices:
        label = vertex.get('label', 'UNKNOWN')
        analysis['vertex_types'][label] = analysis['vertex_types'].get(label, 0) + 1
        
        if label == 'CALL':
            # Extract call name
            props = vertex.get('properties', {})
            if 'NAME' in props:
                name_prop = props['NAME']
                if isinstance(name_prop, dict) and '@value' in name_prop:
                    value_data = name_prop['@value']
                    if isinstance(value_data, dict) and '@value' in value_data:
                        name = value_data['@value'][0] if isinstance(value_data['@value'], list) else value_data['@value']
                        
                        analysis['all_calls'][name] = analysis['all_calls'].get(name, 0) + 1
                        
                        # Check if dangerous
                        for dangerous in dangerous_functions:
                            if dangerous in str(name).lower():
                                analysis['dangerous_calls'][name] = analysis['dangerous_calls'].get(name, 0) + 1
    
    return analysis

In [10]:
def test_real_cpg_query_pipeline():
    """
    Test complete pipeline with real CPG files to validate search accuracy.
    
    Scientific validation:
        - Query known vulnerable code against KB2
        - Measure retrieval precision for vulnerable vs patched versions
        - Analyze similarity distributions across different CVE types
        - Validate that similar vulnerabilities are correctly identified
    """
    
    print("REAL CPG QUERY PIPELINE TESTING")
    print("=" * 40)
    
    # Select test CPG files from different CVEs
    test_cpg_files = []
    cve_patterns = {}
    
    # Find diverse test cases
    for instance_dir in CONFIG['cpg_data_path'].iterdir():
        if instance_dir.is_dir():
            vuln_file = instance_dir / "vuln_cpg.json"
            patch_file = instance_dir / "patch_cpg.json"
            
            if vuln_file.exists() and patch_file.exists():
                cve_id = instance_dir.name
                test_cpg_files.append({
                    'cve_id': cve_id,
                    'vuln_file': vuln_file,
                    'patch_file': patch_file
                })
                
                if len(test_cpg_files) >= 5:  # Test with 5 different CVEs
                    break
    
    print(f"Testing with {len(test_cpg_files)} CVE instances")
    
    test_results = {
        'query_accuracy': [],
        'vuln_patch_similarity': [],
        'cross_cve_similarity': [],
        'retrieval_precision': [],
        'detailed_results': []
    }
    
    for i, test_case in enumerate(test_cpg_files):
        print(f"\nTesting CVE {i+1}: {test_case['cve_id']}")
        print("-" * 30)
        
        try:
            # Process vulnerable version as query
            print("Processing vulnerable version...")
            with open(test_case['vuln_file']) as f:
                vuln_data = json.load(f)
            
            vuln_graph = graphson_to_networkx(vuln_data)
            vuln_embedding = compute_structural_graph_embedding(vuln_graph, CONFIG['embedding_dimensions'])
            
            # Extract query features
            vuln_analysis = analyze_full_cpg(test_case['vuln_file'])
            query_features = {
                'dangerous_calls': list(vuln_analysis['dangerous_calls'].keys()),
                'complexity_score': vuln_graph.number_of_edges() / max(vuln_graph.number_of_nodes(), 1),
                'top_calls': list(vuln_analysis['all_calls'].keys())[:10],
                'vertex_types': vuln_analysis['vertex_types']
            }
            
            print(f"  Query graph: {vuln_graph.number_of_nodes()} nodes, {vuln_graph.number_of_edges()} edges")
            print(f"  Dangerous calls: {query_features['dangerous_calls']}")
            
            # Search in KB2
            search_results = kb2_search_engine.search_similar_graphs(
                vuln_embedding, 
                query_features, 
                top_k=10, 
                hybrid_weight=0.7
            )
            
            # Analyze results
            print(f"  Found {len(search_results)} similar entries")
            
            # Check if patch version is in top results
            patch_key = f"{test_case['cve_id']}_patch"
            vuln_key = f"{test_case['cve_id']}_vuln"
            
            patch_rank = None
            vuln_rank = None
            same_cve_count = 0
            
            for rank, (entry_key, similarity, metadata) in enumerate(search_results):
                if entry_key == patch_key:
                    patch_rank = rank + 1
                if entry_key == vuln_key:
                    vuln_rank = rank + 1
                if metadata['cve_id'] == test_case['cve_id']:
                    same_cve_count += 1
                
                print(f"    {rank+1:2d}. {entry_key:<25} | Sim: {similarity:.3f} | "
                      f"CVE: {metadata['cve_id']:<15} | Type: {metadata['file_type']}")
            
            # Calculate metrics
            retrieval_precision = same_cve_count / len(search_results) if search_results else 0
            
            # Measure vuln-patch similarity specifically
            patch_embedding = None
            if patch_key in kb2_search_engine.kb2_data:
                patch_embedding = np.array(kb2_search_engine.kb2_data[patch_key]['graph_embedding'])
                vuln_patch_sim = np.dot(vuln_embedding, patch_embedding)
            else:
                # Compute patch embedding if not in KB2
                with open(test_case['patch_file']) as f:
                    patch_data = json.load(f)
                patch_graph = graphson_to_networkx(patch_data)
                patch_embedding = compute_structural_graph_embedding(patch_graph, CONFIG['embedding_dimensions'])
                vuln_patch_sim = np.dot(vuln_embedding, patch_embedding)
            
            # Store detailed results
            test_result = {
                'cve_id': test_case['cve_id'],
                'query_graph_stats': {
                    'nodes': vuln_graph.number_of_nodes(),
                    'edges': vuln_graph.number_of_edges(),
                    'dangerous_calls_count': len(query_features['dangerous_calls'])
                },
                'search_results_count': len(search_results),
                'patch_rank': patch_rank,
                'vuln_rank': vuln_rank,
                'same_cve_in_top10': same_cve_count,
                'retrieval_precision': retrieval_precision,
                'vuln_patch_similarity': float(vuln_patch_sim),
                'top_similarities': [float(sim) for _, sim, _ in search_results[:3]]
            }
            
            test_results['detailed_results'].append(test_result)
            test_results['retrieval_precision'].append(retrieval_precision)
            test_results['vuln_patch_similarity'].append(vuln_patch_sim)
            
            print(f"  Patch rank in results: {patch_rank if patch_rank else 'Not found'}")
            print(f"  Vuln-patch similarity: {vuln_patch_sim:.3f}")
            print(f"  Same CVE precision: {retrieval_precision:.3f}")
            
        except Exception as e:
            print(f"  Error processing {test_case['cve_id']}: {e}")
            logger.error(f"Error in real CPG testing for {test_case['cve_id']}: {e}")
    
    # Summary statistics
    print(f"\nTEST SUMMARY STATISTICS")
    print("=" * 25)
    
    if test_results['retrieval_precision']:
        mean_precision = np.mean(test_results['retrieval_precision'])
        mean_vuln_patch_sim = np.mean(test_results['vuln_patch_similarity'])
        
        print(f"Mean retrieval precision: {mean_precision:.3f}")
        print(f"Mean vuln-patch similarity: {mean_vuln_patch_sim:.3f}")
        
        # Count how many times patch was in top-5
        patches_in_top5 = sum(1 for r in test_results['detailed_results'] 
                             if r['patch_rank'] and r['patch_rank'] <= 5)
        print(f"Patch versions found in top-5: {patches_in_top5}/{len(test_results['detailed_results'])}")
        
        # Average top similarity scores
        all_top_sims = []
        for r in test_results['detailed_results']:
            all_top_sims.extend(r['top_similarities'])
        
        if all_top_sims:
            print(f"Mean top-3 similarity: {np.mean(all_top_sims):.3f}")
    
    return test_results

def analyze_kb2_coverage_and_quality():
    """
    Analyze KB2 dataset coverage and embedding quality.
    
    Quality metrics:
        - CVE distribution analysis
        - Embedding norm distribution
        - Pairwise similarity analysis
        - Feature diversity assessment
    """
    
    print("KB2 COVERAGE AND QUALITY ANALYSIS")
    print("=" * 35)
    
    # Load KB2 data for analysis
    with open(CONFIG['kb2_output_path']) as f:
        kb2_data = json.load(f)
    
    # CVE distribution analysis
    cve_counts = {}
    file_type_counts = {'vuln': 0, 'patch': 0}
    embedding_counts = {'with_embedding': 0, 'without_embedding': 0}
    
    dangerous_calls_global = Counter()
    complexity_scores = []
    graph_sizes = []
    
    for entry_key, entry_data in kb2_data.items():
        # Count CVEs
        if 'cve_id' in entry_data:
            cve_id = entry_data['cve_id']
            cve_counts[cve_id] = cve_counts.get(cve_id, 0) + 1
        
        # Count file types
        if 'file_type' in entry_data:
            file_type_counts[entry_data['file_type']] += 1
        
        # Count embeddings
        if entry_data.get('embedding_computed', False):
            embedding_counts['with_embedding'] += 1
            
            # Analyze features if available
            if 'features' in entry_data:
                features = entry_data['features']
                
                # Dangerous calls
                dangerous_calls = features['security_features']['dangerous_calls']
                for call, count in dangerous_calls.items():
                    dangerous_calls_global[call] += count
                
                # Complexity
                complexity = features['complexity_metrics']['edge_density']
                complexity_scores.append(complexity)
                
                # Graph size
                if 'graph_statistics' in entry_data:
                    graph_stats = entry_data['graph_statistics']
                    graph_sizes.append(graph_stats.get('nodes', 0))
        else:
            embedding_counts['without_embedding'] += 1
    
    # Print analysis results
    print(f"Total KB2 entries: {len(kb2_data)}")
    print(f"Unique CVEs: {len(cve_counts)}")
    print(f"File type distribution: {file_type_counts}")
    print(f"Embedding distribution: {embedding_counts}")
    
    print(f"\nTop 10 most frequent dangerous calls:")
    for call, count in dangerous_calls_global.most_common(10):
        print(f"  {call}: {count}")
    
    if complexity_scores:
        print(f"\nComplexity statistics:")
        print(f"  Mean: {np.mean(complexity_scores):.3f}")
        print(f"  Std: {np.std(complexity_scores):.3f}")
        print(f"  Range: [{np.min(complexity_scores):.3f}, {np.max(complexity_scores):.3f}]")
    
    if graph_sizes:
        print(f"\nGraph size statistics:")
        print(f"  Mean nodes: {np.mean(graph_sizes):.1f}")
        print(f"  Std nodes: {np.std(graph_sizes):.1f}")
        print(f"  Range: [{np.min(graph_sizes)}, {np.max(graph_sizes)}]")
    
    # Check for CVEs with both vuln and patch
    complete_cves = []
    for cve_id, count in cve_counts.items():
        if count >= 2:  # Likely has both vuln and patch
            complete_cves.append(cve_id)
    
    print(f"\nCVEs with both vuln and patch: {len(complete_cves)}")
    
    return {
        'total_entries': len(kb2_data),
        'unique_cves': len(cve_counts),
        'complete_cves': len(complete_cves),
        'embedding_coverage': embedding_counts['with_embedding'] / len(kb2_data),
        'dangerous_calls_stats': dict(dangerous_calls_global.most_common(20)),
        'complexity_stats': {
            'mean': float(np.mean(complexity_scores)) if complexity_scores else 0,
            'std': float(np.std(complexity_scores)) if complexity_scores else 0
        }
    }

# Execute comprehensive testing
print("EXECUTING COMPREHENSIVE KB2 TESTING")
print("=" * 45)

# Test real CPG queries
real_query_results = test_real_cpg_query_pipeline()

print("\n" + "="*50 + "\n")

# Analyze KB2 quality
kb2_quality_analysis = analyze_kb2_coverage_and_quality()

EXECUTING COMPREHENSIVE KB2 TESTING
REAL CPG QUERY PIPELINE TESTING
Testing with 5 CVE instances

Testing CVE 1: CVE-2017-7533_0
------------------------------
Processing vulnerable version...
  Query graph: 124 nodes, 571 edges
  Dangerous calls: ['fsnotify_oldname_free']
  Found 10 similar entries
     1. CVE-2017-7533_0_vuln      | Sim: 0.930 | CVE: CVE-2017-7533_0 | Type: vuln
     2. CVE-2017-7533_1_vuln      | Sim: 0.844 | CVE: CVE-2017-7533_1 | Type: vuln
     3. CVE-2017-7374_0_patch     | Sim: 0.822 | CVE: CVE-2017-7374_0 | Type: patch
     4. CVE-2017-7374_4_patch     | Sim: 0.822 | CVE: CVE-2017-7374_4 | Type: patch
     5. CVE-2017-7533_0_patch     | Sim: 0.821 | CVE: CVE-2017-7533_0 | Type: patch
     6. CVE-2021-0941_1_vuln      | Sim: 0.819 | CVE: CVE-2021-0941_1 | Type: vuln
     7. CVE-2016-10147_0_patch    | Sim: 0.819 | CVE: CVE-2016-10147_0 | Type: patch
     8. CVE-2016-10147_0_vuln     | Sim: 0.818 | CVE: CVE-2016-10147_0 | Type: vuln
     9. CVE-2017-7374_0_vuln 

In [11]:
def generate_final_kb2_report():
    """
    Generate comprehensive final report for KB2 construction and validation.
    
    Report includes:
        - Processing statistics and performance metrics
        - Embedding quality analysis and validation results
        - Search engine performance evaluation
        - Recommendations for production deployment
        - Technical specifications for integration
    """
    
    print("GENERATING FINAL KB2 REPORT")
    print("=" * 30)
    
    # Collect all analysis results
    report_data = {
        'system_info': {
            'notebook_version': '05_kb2_construction_final',
            'generation_date': '2025-06-13',
            'joern_cpg_version': 'GraphSON format',
            'embedding_method': 'structural_multi_feature',
            'embedding_dimensions': CONFIG['embedding_dimensions']
        },
        'processing_statistics': processing_stats,
        'search_engine_stats': kb2_search_engine.get_embedding_statistics(),
        'quality_analysis': kb2_quality_analysis,
        'query_validation': real_query_results
    }
    
    # Generate summary metrics
    summary_metrics = {
        'total_kb2_entries': report_data['processing_statistics']['total_entries'],
        'successful_embeddings': report_data['processing_statistics']['successful_embeddings'],
        'success_rate': report_data['processing_statistics']['successful_embeddings'] / 
                       report_data['processing_statistics']['total_entries'],
        'embedding_coverage': report_data['quality_analysis']['embedding_coverage'],
        'unique_cves_count': report_data['quality_analysis']['unique_cves'],
        'mean_embedding_norm': report_data['search_engine_stats']['norm_statistics']['mean'],
        'mean_retrieval_precision': np.mean(report_data['query_validation']['retrieval_precision']) 
                                   if report_data['query_validation']['retrieval_precision'] else 0
    }
    
    # Print executive summary
    print("EXECUTIVE SUMMARY")
    print("-" * 20)
    print(f"KB2 Construction Status: {'SUCCESS' if summary_metrics['success_rate'] > 0.95 else 'PARTIAL'}")
    print(f"Total entries processed: {summary_metrics['total_kb2_entries']:,}")
    print(f"Embedding success rate: {summary_metrics['success_rate']:.1%}")
    print(f"Unique CVEs covered: {summary_metrics['unique_cves_count']:,}")
    print(f"Mean retrieval precision: {summary_metrics['mean_retrieval_precision']:.3f}")
    
    # Technical specifications
    print(f"\nTECHNICAL SPECIFICATIONS")
    print("-" * 25)
    print(f"Embedding dimension: {CONFIG['embedding_dimensions']}")
    print(f"Similarity metric: Cosine similarity + feature-based hybrid")
    print(f"Graph representation: NetworkX undirected graphs")
    print(f"Feature extraction: Multi-layer structural analysis")
    print(f"Storage format: JSON with numpy array serialization")
    
    # Performance analysis
    processing_time = report_data['processing_statistics']['processing_time']
    entries_per_second = summary_metrics['total_kb2_entries'] / processing_time if processing_time > 0 else 0
    
    print(f"\nPERFORMANCE METRICS")
    print("-" * 20)
    print(f"Total processing time: {processing_time:.1f} seconds")
    print(f"Processing rate: {entries_per_second:.1f} entries/second")
    print(f"KB2 file size: {CONFIG['kb2_output_path'].stat().st_size / (1024*1024):.1f} MB")
    print(f"Memory usage (embeddings): {report_data['search_engine_stats']['total_embeddings'] * CONFIG['embedding_dimensions'] * 4 / (1024*1024):.1f} MB")
    
    # Quality assessment
    print(f"\nQUALITY ASSESSMENT")
    print("-" * 20)
    
    norm_stats = report_data['search_engine_stats']['norm_statistics']
    sim_stats = report_data['search_engine_stats']['similarity_statistics']
    
    print(f"Embedding norm distribution:")
    print(f"  Mean: {norm_stats['mean']:.3f}")
    print(f"  Std: {norm_stats['std']:.3f}")
    print(f"  Range: [{norm_stats['min']:.3f}, {norm_stats['max']:.3f}]")
    
    print(f"Pairwise similarity distribution:")
    print(f"  Mean: {sim_stats['mean_pairwise_similarity']:.3f}")
    print(f"  Std: {sim_stats['std_pairwise_similarity']:.3f}")
    print(f"  Range: [{sim_stats['min_pairwise_similarity']:.3f}, {sim_stats['max_pairwise_similarity']:.3f}]")
    
    # Validation results
    if report_data['query_validation']['detailed_results']:
        print(f"\nVALIDATION RESULTS")
        print("-" * 20)
        
        patches_found = sum(1 for r in report_data['query_validation']['detailed_results'] 
                           if r['patch_rank'] and r['patch_rank'] <= 10)
        total_tests = len(report_data['query_validation']['detailed_results'])
        
        print(f"Patch detection rate (top-10): {patches_found}/{total_tests} ({patches_found/total_tests:.1%})")
        
        mean_vuln_patch_sim = np.mean(report_data['query_validation']['vuln_patch_similarity'])
        print(f"Mean vuln-patch similarity: {mean_vuln_patch_sim:.3f}")
        
        precision_scores = report_data['query_validation']['retrieval_precision']
        print(f"Retrieval precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
    
    # Recommendations
    print(f"\nRECOMMENDATIONS FOR PRODUCTION")
    print("-" * 32)
    
    if summary_metrics['success_rate'] > 0.95:
        print("✓ KB2 ready for production deployment")
    else:
        print("⚠ Review failed embeddings before production")
    
    if summary_metrics['mean_retrieval_precision'] > 0.7:
        print("✓ Search precision suitable for hybrid RAG")
    else:
        print("⚠ Consider tuning similarity thresholds")
    
    if norm_stats['std'] / norm_stats['mean'] < 0.3:
        print("✓ Embedding normalization stable")
    else:
        print("⚠ High embedding variance detected")
    
    # Integration guidelines
    print(f"\nINTEGRATION GUIDELINES")
    print("-" * 23)
    print("1. Load KB2 using KB2SimilaritySearchEngine class")
    print("2. Generate CPG for new code using Joern")
    print("3. Compute embedding using compute_structural_graph_embedding()")
    print("4. Search similar graphs with hybrid scoring (recommended weight: 0.7)")
    print("5. Combine with KB1 textual results for final ranking")
    
    # Save detailed report
    report_path = CONFIG['kb2_output_path'].parent / "kb2_construction_report.json"
    with open(report_path, 'w') as f:
        json.dump(report_data, f, indent=2, default=lambda x: float(x) if isinstance(x, np.floating) else str(x))
    
    print(f"\nDetailed report saved: {report_path}")
    
    return report_data, summary_metrics

def export_kb2_for_production():
    """
    Export KB2 components for production integration.
    
    Exports:
        - Embedding matrix in efficient format
        - Metadata index for fast lookup
        - Configuration file for system integration
        - Example code for integration
    """
    
    print("EXPORTING KB2 FOR PRODUCTION")
    print("=" * 30)
    
    export_dir = CONFIG['kb2_output_path'].parent / "kb2_production_export"
    export_dir.mkdir(exist_ok=True)
    
    # 1. Export embedding matrix as numpy binary
    embeddings_path = export_dir / "embeddings_matrix.npy"
    np.save(embeddings_path, kb2_search_engine.embeddings_matrix)
    print(f"Embeddings matrix exported: {embeddings_path}")
    
    # 2. Export entry keys mapping
    keys_path = export_dir / "entry_keys.json"
    with open(keys_path, 'w') as f:
        json.dump(kb2_search_engine.entry_keys, f, indent=2)
    print(f"Entry keys mapping exported: {keys_path}")
    
    # 3. Export metadata index
    metadata_index = {}
    for key, entry in kb2_search_engine.kb2_data.items():
        metadata_index[key] = {
            'cve_id': entry['cve_id'],
            'file_type': entry['file_type'],
            'dangerous_calls': list(entry['features']['security_features']['dangerous_calls'].keys()),
            'complexity': entry['features']['complexity_metrics']['edge_density'],
            'graph_stats': entry.get('graph_statistics', {})
        }
    
    metadata_path = export_dir / "metadata_index.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata_index, f, indent=2)
    print(f"Metadata index exported: {metadata_path}")
    
    # 4. Export configuration
    config_export = {
        'embedding_dimensions': CONFIG['embedding_dimensions'],
        'similarity_threshold': 0.7,
        'hybrid_weight': 0.7,
        'top_k_default': 10,
        'files': {
            'embeddings_matrix': 'embeddings_matrix.npy',
            'entry_keys': 'entry_keys.json',
            'metadata_index': 'metadata_index.json',
            'full_kb2': '../kb2_final_with_embeddings.json'
        }
    }
    
    config_path = export_dir / "kb2_config.json"
    with open(config_path, 'w') as f:
        json.dump(config_export, f, indent=2)
    print(f"Configuration exported: {config_path}")
    
    # 5. Export integration example
    integration_code = '''
"""
KB2 Integration Example
Usage example for production RAG system
"""
import numpy as np
import json
from pathlib import Path

class ProductionKB2Engine:
    def __init__(self, export_dir):
        # Load precomputed embeddings
        self.embeddings = np.load(export_dir / "embeddings_matrix.npy")
        
        with open(export_dir / "entry_keys.json") as f:
            self.entry_keys = json.load(f)
            
        with open(export_dir / "metadata_index.json") as f:
            self.metadata = json.load(f)
            
        with open(export_dir / "kb2_config.json") as f:
            self.config = json.load(f)
    
    def search_similar(self, query_embedding, top_k=10):
        # Normalize query
        query_norm = query_embedding / np.linalg.norm(query_embedding)
        
        # Compute similarities
        similarities = np.dot(self.embeddings, query_norm)
        
        # Get top-k
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            entry_key = self.entry_keys[idx]
            similarity = float(similarities[idx])
            metadata = self.metadata[entry_key]
            results.append((entry_key, similarity, metadata))
        
        return results

# Usage:
# kb2_engine = ProductionKB2Engine(Path("kb2_production_export"))
# results = kb2_engine.search_similar(your_query_embedding)
'''
    
    integration_path = export_dir / "integration_example.py"
    with open(integration_path, 'w') as f:
        f.write(integration_code)
    print(f"Integration example exported: {integration_path}")
    
    # Calculate export sizes
    total_size = sum(f.stat().st_size for f in export_dir.rglob("*") if f.is_file())
    print(f"\nTotal export size: {total_size / (1024*1024):.1f} MB")
    
    print(f"Production export complete: {export_dir}")
    
    return export_dir

# Generate final report and export
print("FINALIZING KB2 CONSTRUCTION")
print("=" * 35)

final_report, summary = generate_final_kb2_report()

print("\n" + "="*50 + "\n")

production_export = export_kb2_for_production()

print(f"\nKB2 CONSTRUCTION COMPLETE")
print("=" * 25)
print(f"Success rate: {summary['success_rate']:.1%}")
print(f"Total CVEs: {summary['unique_cves_count']:,}")
print(f"Ready for integration with hybrid RAG system")

logger.info("KB2 construction and validation completed successfully")

FINALIZING KB2 CONSTRUCTION
GENERATING FINAL KB2 REPORT


2025-06-13 14:08:09,112 - INFO - KB2 construction and validation completed successfully


EXECUTIVE SUMMARY
--------------------
KB2 Construction Status: SUCCESS
Total entries processed: 4,410
Embedding success rate: 100.0%
Unique CVEs covered: 2,205
Mean retrieval precision: 0.140

TECHNICAL SPECIFICATIONS
-------------------------
Embedding dimension: 128
Similarity metric: Cosine similarity + feature-based hybrid
Graph representation: NetworkX undirected graphs
Feature extraction: Multi-layer structural analysis
Storage format: JSON with numpy array serialization

PERFORMANCE METRICS
--------------------
Total processing time: 52.4 seconds
Processing rate: 84.2 entries/second
KB2 file size: 20.1 MB
Memory usage (embeddings): 2.2 MB

QUALITY ASSESSMENT
--------------------
Embedding norm distribution:
  Mean: 1.000
  Std: 0.000
  Range: [1.000, 1.000]
Pairwise similarity distribution:
  Mean: 0.844
  Std: 0.274
  Range: [0.108, 1.000]

VALIDATION RESULTS
--------------------
Patch detection rate (top-10): 3/5 (60.0%)
Mean vuln-patch similarity: 0.999
Retrieval precision: 