In [None]:
# Cell 1: Examine the REAL structure of the data
import json
from pathlib import Path

cpg_dir = Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json"
sample_file = list(cpg_dir.rglob("*vuln_cpg.json"))[0]

print("INSPECTING GRAPHSON DATA STRUCTURE")
print("=" * 45)

with open(sample_file) as f:
    data = json.load(f)

print(f"File: {sample_file.name}")
print(f"Root keys: {list(data.keys())}")

vertices = data['@value']['vertices']
edges = data['@value']['edges']

print(f"{len(vertices)} vertices, {len(edges)} edges")

# Examine ONE vertex in detail
print("\nVERTEX STRUCTURE:")
print("=" * 25)
first_vertex = vertices[0]
print(f"Type: {type(first_vertex)}")
print(f"Keys: {list(first_vertex.keys()) if isinstance(first_vertex, dict) else 'Not a dict'}")

# Show the complete structure of the first vertex
import json
print("Full structure of the first vertex:")
print(json.dumps(first_vertex, indent=2)[:1000] + "...")

# Examine ONE edge in detail  
print("\nEDGE STRUCTURE:")
print("=" * 20)
first_edge = edges[0]
print(f"Type: {type(first_edge)}")
print(f"Keys: {list(first_edge.keys()) if isinstance(first_edge, dict) else 'Not a dict'}")

print("Full structure of the first edge:")
print(json.dumps(first_edge, indent=2)[:500] + "...")

INSPECTING GRAPHSON DATA STRUCTURE
File: vuln_cpg.json
Root keys: ['@type', '@value']
124 vertices, 1095 edges

VERTEX STRUCTURE:
Type: <class 'dict'>
Keys: ['@type', 'id', 'label', 'properties']
Full structure of the first vertex:
{
  "@type": "g:Vertex",
  "id": {
    "@type": "g:Int64",
    "@value": 30064771075
  },
  "label": "CALL",
  "properties": {
    "DISPATCH_TYPE": {
      "@type": "g:VertexProperty",
      "@value": {
        "@type": "g:List",
        "@value": [
          "STATIC_DISPATCH"
        ]
      },
      "id": {
        "@type": "g:Int64",
        "@value": 3
      }
    },
    "NAME": {
      "@type": "g:VertexProperty",
      "@value": {
        "@type": "g:List",
        "@value": [
          "<operator>.logicalOr"
        ]
      },
      "id": {
        "@type": "g:Int64",
        "@value": 6
      }
    },
    "METHOD_FULL_NAME": {
      "@type": "g:VertexProperty",
      "@value": {
        "@type": "g:List",
        "@value": [
          "<operator>.log

In [None]:
# Cell 2: data extraction
def extract_vertex_info(vertex):
    """Extract vertex info with the real GraphSON structure"""
    info = {
        'label': vertex.get('label', ''),
        'properties': {}
    }
    
    # Extract properties
    if 'properties' in vertex:
        for prop_name, prop_data in vertex['properties'].items():
            if isinstance(prop_data, dict) and '@value' in prop_data:
                value_data = prop_data['@value']
                if isinstance(value_data, dict) and '@value' in value_data:
                    # Structure: properties.NAME.@value.@value = [value]
                    values = value_data['@value']
                    if isinstance(values, list) and values:
                        info['properties'][prop_name] = values[0]  # Take the first
                    else:
                        info['properties'][prop_name] = values
    
    return info

def extract_edge_info(edge):
    """Extract edge info"""
    return {
        'label': edge.get('label', ''),
        'from_type': edge.get('outVLabel', ''),
        'to_type': edge.get('inVLabel', ''),
        'from_id': edge.get('outV', {}).get('@value', '') if isinstance(edge.get('outV'), dict) else edge.get('outV', ''),
        'to_id': edge.get('inV', {}).get('@value', '') if isinstance(edge.get('inV'), dict) else edge.get('inV', '')
    }

# Test on the first file
sample_file = list(cpg_dir.rglob("*vuln_cpg.json"))[0]

with open(sample_file) as f:
    data = json.load(f)

vertices = data['@value']['vertices']
edges = data['@value']['edges']

print("TEST  EXTRACTION")
print("=" * 30)

# Analyze the first vertices
print("\nVERTEX ANALYSIS:")
vertex_types = []
vertex_names = []
dangerous_calls = []

dangerous_functions = ['strcpy', 'strcat', 'sprintf', 'scanf', 'gets', 'strncpy', 
                      'malloc', 'free', 'calloc', 'realloc', 'memcpy', 'memmove']

for i, vertex in enumerate(vertices[:10]):
    info = extract_vertex_info(vertex)
    vertex_types.append(info['label'])
    
    print(f"Vertex {i+1}:")
    print(f"  Label: {info['label']}")
    print(f"  Properties: {list(info['properties'].keys())}")
    
    # Look for interesting names/calls
    if 'NAME' in info['properties']:
        name = info['properties']['NAME']
        vertex_names.append(name)
        print(f"  Name: {name}")
        
        # Check if it's a dangerous call
        if info['label'] == 'CALL':
            for dangerous in dangerous_functions:
                if dangerous in str(name).lower():
                    dangerous_calls.append(name)
                    print(f"  DANGEROUS CALL: {name}")
    
    if 'METHOD_FULL_NAME' in info['properties']:
        print(f"  Method: {info['properties']['METHOD_FULL_NAME']}")
    
    print()

print(f"Vertex types found: {set(vertex_types)}")
print(f"Dangerous calls found: {dangerous_calls}")

TEST CORRECTED EXTRACTION

VERTEX ANALYSIS:
Vertex 1:
  Label: CALL
  Properties: ['DISPATCH_TYPE', 'NAME', 'METHOD_FULL_NAME', 'SIGNATURE', 'TYPE_FULL_NAME', 'COLUMN_NUMBER', 'ARGUMENT_INDEX', 'ORDER', 'CODE', 'LINE_NUMBER']
  Name: <operator>.logicalOr
  Method: <operator>.logicalOr

Vertex 2:
  Label: CALL
  Properties: ['DISPATCH_TYPE', 'NAME', 'METHOD_FULL_NAME', 'SIGNATURE', 'TYPE_FULL_NAME', 'COLUMN_NUMBER', 'ARGUMENT_INDEX', 'ORDER', 'CODE', 'LINE_NUMBER']
  Name: dput
  Method: dput

Vertex 3:
  Label: IDENTIFIER
  Properties: ['NAME', 'TYPE_FULL_NAME', 'COLUMN_NUMBER', 'ARGUMENT_INDEX', 'ORDER', 'CODE', 'LINE_NUMBER']
  Name: new_dir

Vertex 4:
  Label: IDENTIFIER
  Properties: ['NAME', 'TYPE_FULL_NAME', 'COLUMN_NUMBER', 'ARGUMENT_INDEX', 'ORDER', 'CODE', 'LINE_NUMBER']
  Name: dentry

Vertex 5:
  Label: CALL
  Properties: ['DISPATCH_TYPE', 'NAME', 'METHOD_FULL_NAME', 'SIGNATURE', 'TYPE_FULL_NAME', 'COLUMN_NUMBER', 'ARGUMENT_INDEX', 'ORDER', 'CODE', 'LINE_NUMBER']
  Name: <op

In [None]:
# Cell 3: Full analysis of a CPG file 
def analyze_full_cpg(cpg_file):
    """Full analysis of a CPG file with the real structure"""
    
    with open(cpg_file) as f:
        data = json.load(f)
    
    vertices = data['@value']['vertices']
    edges = data['@value']['edges']
    
    analysis = {
        'file_info': {
            'name': cpg_file.name,
            'vertex_count': len(vertices),
            'edge_count': len(edges),
            'size_kb': cpg_file.stat().st_size / 1024
        },
        'vertex_types': {},
        'edge_types': {},
        'dangerous_calls': [],
        'all_calls': [],
        'methods': [],
        'identifiers': [],
        'literals': [],
        'control_structures': []
    }
    
    # Extended dangerous functions
    dangerous_functions = [
        'strcpy', 'strcat', 'sprintf', 'scanf', 'gets', 'strncpy',
        'malloc', 'free', 'calloc', 'realloc', 'memcpy', 'memmove',
        'memset', 'alloca', 'delete', 'new'
    ]
    
    # Analyze all vertices
    for vertex in vertices:
        info = extract_vertex_info(vertex)
        label = info['label']
        
        # Count types
        analysis['vertex_types'][label] = analysis['vertex_types'].get(label, 0) + 1
        
        # Extract by type
        if label == 'CALL' and 'NAME' in info['properties']:
            name = info['properties']['NAME']
            analysis['all_calls'].append(name)
            
            # Check if dangerous (flexible search)
            name_lower = str(name).lower()
            for dangerous in dangerous_functions:
                if dangerous in name_lower:
                    analysis['dangerous_calls'].append(name)
                    break
        
        elif label == 'METHOD' and 'NAME' in info['properties']:
            analysis['methods'].append(info['properties']['NAME'])
        
        elif label == 'IDENTIFIER' and 'NAME' in info['properties']:
            analysis['identifiers'].append(info['properties']['NAME'])
        
        elif label == 'LITERAL' and 'CODE' in info['properties']:
            analysis['literals'].append(info['properties']['CODE'])
        
        elif label in ['CONTROL_STRUCTURE', 'IF', 'FOR', 'WHILE', 'BLOCK']:
            analysis['control_structures'].append(label)
    
    # Analyze all edges
    for edge in edges:
        edge_info = extract_edge_info(edge)
        label = edge_info['label']
        analysis['edge_types'][label] = analysis['edge_types'].get(label, 0) + 1
    
    # Deduplication and counting
    from collections import Counter
    analysis['dangerous_calls'] = dict(Counter(analysis['dangerous_calls']).most_common(10))
    analysis['all_calls'] = dict(Counter(analysis['all_calls']).most_common(20))
    analysis['methods'] = dict(Counter(analysis['methods']).most_common(10))
    analysis['identifiers'] = dict(Counter(analysis['identifiers']).most_common(15))
    analysis['control_structures'] = dict(Counter(analysis['control_structures']))
    
    return analysis

# Analyze the sample file
print("FULL FILE ANALYSIS")
print("=" * 30)

analysis = analyze_full_cpg(sample_file)

print(f"File: {analysis['file_info']['name']}")
print(f"{analysis['file_info']['vertex_count']} vertices, {analysis['file_info']['edge_count']} edges ({analysis['file_info']['size_kb']:.1f} KB)")

print(f"\nVertex types:")
for vtype, count in sorted(analysis['vertex_types'].items(), key=lambda x: x[1], reverse=True):
    print(f"   {vtype}: {count}")

print(f"\nEdge types (top 10):")
edge_items = sorted(analysis['edge_types'].items(), key=lambda x: x[1], reverse=True)[:10]
for etype, count in edge_items:
    print(f"   {etype}: {count}")

print(f"\nDangerous calls: {analysis['dangerous_calls']}")
print(f"\nAll calls (top 15):")
for call, count in list(analysis['all_calls'].items())[:15]:
    print(f"   {call}: {count}")

print(f"\nControl structures: {analysis['control_structures']}")
print(f"\nTop identifiers: {dict(list(analysis['identifiers'].items())[:8])}")

FULL FILE ANALYSIS
File: vuln_cpg.json
124 vertices, 1095 edges (790.5 KB)

Vertex types:
   IDENTIFIER: 47
   CALL: 41
   CONTROL_STRUCTURE: 9
   BLOCK: 6
   LOCAL: 5
   METHOD_PARAMETER_OUT: 4
   METHOD_PARAMETER_IN: 4
   RETURN: 2
   FIELD_IDENTIFIER: 2
   METHOD: 1
   LITERAL: 1
   JUMP_TARGET: 1
   METHOD_RETURN: 1

Edge types (top 10):
   REACHING_DEF: 315
   AST: 123
   CDG: 111
   CFG: 110
   CONTAINS: 109
   POST_DOMINATE: 99
   DOMINATE: 99
   ARGUMENT: 73
   REF: 47
   CONDITION: 5

Dangerous calls: {'fsnotify_oldname_free': 2}

All calls (top 15):
   <operator>.logicalOr: 5
   <operator>.assignment: 5
   d_inode: 4
   d_really_is_negative: 3
   dput: 2
   fsnotify_oldname_free: 2
   IS_ERR: 2
   <operator>.equals: 2
   unlock_rename: 2
   <operator>.indirectFieldAccess: 1
   fsnotify_oldname_init: 1
   <operator>.logicalAnd: 1
   fsnotify_move: 1
   lookup_one_len: 1
   lock_rename: 1

Control structures: {'BLOCK': 6, 'CONTROL_STRUCTURE': 9}

Top identifiers: {'dentry': 11,

In [16]:
# Cell 4: Vulnerable vs Patch Comparison
def compare_vuln_vs_patch_corrected(instance_name):
    """Compare patterns between vulnerable and patched versions"""
    
    vuln_file = cpg_dir / instance_name / "vuln_cpg.json"
    patch_file = cpg_dir / instance_name / "patch_cpg.json"
    
    if not (vuln_file.exists() and patch_file.exists()):
        return None
    
    print(f"Analyzing {instance_name}...")
    vuln_analysis = analyze_full_cpg(vuln_file)
    patch_analysis = analyze_full_cpg(patch_file)
    
    # Compare basic metrics
    comparison = {
        'instance': instance_name,
        'metrics_change': {
            'vertices': {
                'vuln': vuln_analysis['file_info']['vertex_count'],
                'patch': patch_analysis['file_info']['vertex_count'],
                'diff': patch_analysis['file_info']['vertex_count'] - vuln_analysis['file_info']['vertex_count']
            },
            'edges': {
                'vuln': vuln_analysis['file_info']['edge_count'], 
                'patch': patch_analysis['file_info']['edge_count'],
                'diff': patch_analysis['file_info']['edge_count'] - vuln_analysis['file_info']['edge_count']
            }
        },
        'dangerous_calls': {
            'vuln': set(vuln_analysis['dangerous_calls'].keys()),
            'patch': set(patch_analysis['dangerous_calls'].keys()),
        },
        'all_calls': {
            'vuln': set(vuln_analysis['all_calls'].keys()),
            'patch': set(patch_analysis['all_calls'].keys()),
        },
        'vertex_types': {
            'vuln': vuln_analysis['vertex_types'],
            'patch': patch_analysis['vertex_types']
        }
    }
    
    # Calculate differences
    comparison['dangerous_calls']['removed'] = comparison['dangerous_calls']['vuln'] - comparison['dangerous_calls']['patch']
    comparison['dangerous_calls']['added'] = comparison['dangerous_calls']['patch'] - comparison['dangerous_calls']['vuln']
    comparison['dangerous_calls']['common'] = comparison['dangerous_calls']['vuln'] & comparison['dangerous_calls']['patch']
    
    comparison['all_calls']['removed'] = comparison['all_calls']['vuln'] - comparison['all_calls']['patch']
    comparison['all_calls']['added'] = comparison['all_calls']['patch'] - comparison['all_calls']['vuln'] 
    comparison['all_calls']['common'] = comparison['all_calls']['vuln'] & comparison['all_calls']['patch']
    
    return comparison

# Analyze the file we just examined
instance_name = sample_file.parent.name
comparison = compare_vuln_vs_patch_corrected(instance_name)

if comparison:
    print(f"\nVULNERABLE vs PATCH COMPARISON - {comparison['instance']}")
    print("=" * 50)
    
    metrics = comparison['metrics_change']
    print(f"Vertices: {metrics['vertices']['vuln']} → {metrics['vertices']['patch']} ({metrics['vertices']['diff']:+d})")
    print(f"Edges: {metrics['edges']['vuln']} → {metrics['edges']['patch']} ({metrics['edges']['diff']:+d})")
    
    dangerous = comparison['dangerous_calls']
    print(f"\nDangerous calls:")
    print(f"   Removed: {dangerous['removed']}")
    print(f"   Added: {dangerous['added']}")
    print(f"   Unchanged: {dangerous['common']}")
    
    calls = comparison['all_calls']
    print(f"\nFunction calls:")
    print(f"   Removed: {list(calls['removed'])[:5]}")
    print(f"   Added: {list(calls['added'])[:5]}")
    print(f"   Total vuln: {len(calls['vuln'])}, Total patch: {len(calls['patch'])}")
    
    # Differences in vertex types
    vuln_types = comparison['vertex_types']['vuln']
    patch_types = comparison['vertex_types']['patch']
    
    print(f"\nChanges in vertex types:")
    for vtype in set(vuln_types.keys()) | set(patch_types.keys()):
        vuln_count = vuln_types.get(vtype, 0)
        patch_count = patch_types.get(vtype, 0)
        if vuln_count != patch_count:
            print(f"   {vtype}: {vuln_count} → {patch_count} ({patch_count - vuln_count:+d})")

Analyzing CVE-2017-7533_0...

VULNERABLE vs PATCH COMPARISON - CVE-2017-7533_0
Vertices: 124 → 124 (+0)
Edges: 1095 → 1103 (+8)

Dangerous calls:
   Removed: {'fsnotify_oldname_free'}
   Added: set()
   Unchanged: set()

Function calls:
   Removed: ['fsnotify_oldname_init', 'd_move', '<operator>.indirectFieldAccess', 'fsnotify_oldname_free', '<operator>.fieldAccess']
   Added: ['simple_rename', 'release_dentry_name_snapshot', 'd_mountpoint', 'take_dentry_name_snapshot', '<operator>.addressOf']
   Total vuln: 20, Total patch: 20

Changes in vertex types:
   CALL: 41 → 42 (+1)
   FIELD_IDENTIFIER: 2 → 1 (-1)


In [17]:
# Cell 5: Analyze multiple instances to see patterns
print("\nANALYSIS OF MULTIPLE INSTANCES")
print("=" * 35)

# Take 3 different instances
sample_instances = [f.parent.name for f in list(cpg_dir.rglob("*vuln_cpg.json"))[:3]]

for instance in sample_instances:
    comparison = compare_vuln_vs_patch_corrected(instance)
    if comparison:
        print(f"\nFile: {instance}:")
        
        metrics = comparison['metrics_change']
        print(f"   Complexity change: {metrics['vertices']['diff']:+d} vertices, {metrics['edges']['diff']:+d} edges")
        
        dangerous = comparison['dangerous_calls']
        if dangerous['removed'] or dangerous['added']:
            print(f"   Dangerous calls - Removed: {dangerous['removed']}, Added: {dangerous['added']}")
        else:
            print(f"   No dangerous calls changed")
        
        calls = comparison['all_calls']
        if calls['removed'] or calls['added']:
            print(f"   Calls changed: -{len(calls['removed'])}, +{len(calls['added'])}")
        else:
            print(f"   No function calls changed")


ANALYSIS OF MULTIPLE INSTANCES
Analyzing CVE-2017-7533_0...

File: CVE-2017-7533_0:
   Complexity change: +0 vertices, +8 edges
   Dangerous calls - Removed: {'fsnotify_oldname_free'}, Added: set()
   Calls changed: -5, +5
Analyzing CVE-2021-0935_0...

File: CVE-2021-0935_0:
   Complexity change: +18 vertices, +153 edges
   Dangerous calls - Removed: {'memset'}, Added: set()
   No function calls changed
Analyzing CVE-2017-14156_0...

File: CVE-2017-14156_0:
   Complexity change: +0 vertices, +0 edges
   No dangerous calls changed
   No function calls changed


In [18]:
# Cell 6: Features for KB2 based on our findings
def extract_kb2_features(cpg_file):
    """Extract optimal features for KB2 based on the analysis"""
    
    analysis = analyze_full_cpg(cpg_file)
    
    # Features based on our findings
    features = {
        # Basic metadata
        'file_info': {
            'source_file': cpg_file.name,
            'size_kb': round(cpg_file.stat().st_size / 1024, 1),
            'vertex_count': analysis['file_info']['vertex_count'],
            'edge_count': analysis['file_info']['edge_count']
        },
        
        # Critical security features
        'security_features': {
            'dangerous_calls': analysis['dangerous_calls'],
            'dangerous_call_count': len(analysis['dangerous_calls']),
            'has_malloc_family': any('malloc' in call or 'calloc' in call or 'realloc' in call 
                                   for call in analysis['all_calls'].keys()),
            'has_string_functions': any(func in call.lower() 
                                      for call in analysis['all_calls'].keys()
                                      for func in ['strcpy', 'strcat', 'sprintf', 'scanf']),
            'has_memory_functions': any(func in call.lower()
                                      for call in analysis['all_calls'].keys() 
                                      for func in ['memset', 'memcpy', 'memmove', 'free'])
        },
        
        # Code patterns
        'code_patterns': {
            'all_calls': dict(list(analysis['all_calls'].items())[:20]),  # Top 20
            'call_count': len(analysis['all_calls']),
            'control_structure_count': sum(analysis['control_structures'].values()),
            'identifier_count': len(analysis['identifiers']),
            'vertex_type_distribution': analysis['vertex_types']
        },
        
        # Structural complexity
        'complexity_metrics': {
            'call_to_vertex_ratio': len(analysis['all_calls']) / analysis['file_info']['vertex_count'] if analysis['file_info']['vertex_count'] > 0 else 0,
            'edge_density': analysis['file_info']['edge_count'] / analysis['file_info']['vertex_count'] if analysis['file_info']['vertex_count'] > 0 else 0,
            'control_flow_complexity': analysis['vertex_types'].get('CONTROL_STRUCTURE', 0) + analysis['vertex_types'].get('BLOCK', 0)
        },
        
        # Signatures for retrieval
        'signatures': {
            'dangerous_call_signature': sorted(analysis['dangerous_calls'].keys()),
            'top_calls_signature': sorted(list(analysis['all_calls'].keys())[:10]),
            'vertex_type_signature': sorted([(k, v) for k, v in analysis['vertex_types'].items() if v > 2])
        }
    }
    
    return features

# Test feature extraction on our sample
print("KB2 FEATURE EXTRACTION")
print("=" * 30)

kb2_features = extract_kb2_features(sample_file)

print(f"File: {kb2_features['file_info']['source_file']}")
print(f"Basic metrics: {kb2_features['file_info']['vertex_count']} vertices, {kb2_features['file_info']['edge_count']} edges")

print("\nSecurity features:")
security = kb2_features['security_features']
print(f"   Dangerous calls: {security['dangerous_calls']}")
print(f"   Has malloc family: {security['has_malloc_family']}")
print(f"   Has string functions: {security['has_string_functions']}")
print(f"   Has memory functions: {security['has_memory_functions']}")

print("\nCode patterns:")
patterns = kb2_features['code_patterns']
print(f"   Total unique calls: {patterns['call_count']}")
print(f"   Control structures: {patterns['control_structure_count']}")
print(f"   Top calls: {dict(list(patterns['all_calls'].items())[:5])}")

print("\nComplexity:")
complexity = kb2_features['complexity_metrics']
print(f"   Call density: {complexity['call_to_vertex_ratio']:.3f}")
print(f"   Edge density: {complexity['edge_density']:.3f}")
print(f"   Control flow complexity: {complexity['control_flow_complexity']}")

print("\nSignatures for retrieval:")
sigs = kb2_features['signatures']
print(f"   Dangerous call signature: {sigs['dangerous_call_signature']}")
print(f"   Top calls signature: {sigs['top_calls_signature']}")

KB2 FEATURE EXTRACTION
File: vuln_cpg.json
Basic metrics: 124 vertices, 1095 edges

Security features:
   Dangerous calls: {'fsnotify_oldname_free': 2}
   Has malloc family: False
   Has string functions: False
   Has memory functions: True

Code patterns:
   Total unique calls: 20
   Control structures: 15
   Top calls: {'<operator>.logicalOr': 5, '<operator>.assignment': 5, 'd_inode': 4, 'd_really_is_negative': 3, 'dput': 2}

Complexity:
   Call density: 0.161
   Edge density: 8.831
   Control flow complexity: 15

Signatures for retrieval:
   Dangerous call signature: ['fsnotify_oldname_free']
   Top calls signature: ['<operator>.assignment', '<operator>.equals', '<operator>.indirectFieldAccess', '<operator>.logicalOr', 'IS_ERR', 'd_inode', 'd_really_is_negative', 'dput', 'fsnotify_oldname_free', 'unlock_rename']


In [19]:
# Cell 7: Construction of KB2 for the entire dataset
import json
from pathlib import Path
from tqdm import tqdm
import pickle

def build_complete_kb2():
    """Build complete KB2 for all CPG files"""
    
    cpg_dir = Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json"
    kb2_output = Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_complete.json"
    
    print("CONSTRUCTION OF COMPLETE KB2")
    print("=" * 35)
    
    # Find all CPG files
    all_cpg_files = list(cpg_dir.rglob("*.json"))
    print(f"Found {len(all_cpg_files)} CPG files to process")
    
    kb2_data = {}
    success_count = 0
    error_count = 0
    
    # Process all files with progress bar
    for cpg_file in tqdm(all_cpg_files, desc="Processing CPG files"):
        try:
            # Extract CVE ID and type (vuln/patch)
            instance_id = cpg_file.parent.name
            file_type = 'vuln' if 'vuln_cpg' in cpg_file.name else 'patch'
            entry_key = f"{instance_id}_{file_type}"
            
            # Extract features
            features = extract_kb2_features(cpg_file)
            
            # Add metadata for KB2
            kb2_entry = {
                'cve_id': instance_id,
                'file_type': file_type,
                'features': features,
                'extraction_success': True,
                'extraction_date': '2025-06-13'
            }
            
            kb2_data[entry_key] = kb2_entry
            success_count += 1
            
        except Exception as e:
            error_count += 1
            print(f"Error with {cpg_file}: {e}")
            
            # Add error entry
            kb2_data[f"{cpg_file.parent.name}_{cpg_file.stem}"] = {
                'extraction_success': False,
                'error': str(e),
                'file_path': str(cpg_file)
            }
    
    print(f"\nRESULTS:")
    print(f"   Success: {success_count}")
    print(f"   Errors: {error_count}")
    print(f"   KB2 Entries: {len(kb2_data)}")
    
    # Save KB2
    print(f"\nSaving KB2...")
    with open(kb2_output, 'w') as f:
        json.dump(kb2_data, f, indent=2)
    
    print(f"KB2 saved: {kb2_output}")
    print(f"File size: {kb2_output.stat().st_size / (1024*1024):.1f} MB")
    
    return kb2_data, kb2_output

# Start the construction
# WARNING: This will take 10-15 minutes to process 4410 files
print("KB2 CONSTRUCTION - Estimated duration: 10-15 minutes")
print("Do you want to continue? (type 'yes' to confirm)")

# For now, just test on 50 files
test_files = list(cpg_dir.rglob("*.json"))[:50]
print(f"\nTEST on {len(test_files)} files first...")

kb2_test = {}
for cpg_file in test_files[:10]:  # Just 10 for now
    try:
        instance_id = cpg_file.parent.name
        file_type = 'vuln' if 'vuln_cpg' in cpg_file.name else 'patch'
        entry_key = f"{instance_id}_{file_type}"
        
        features = extract_kb2_features(cpg_file)
        kb2_test[entry_key] = {
            'cve_id': instance_id,
            'file_type': file_type,
            'features': features
        }
        print(f"Success: {entry_key}")
    except Exception as e:
        print(f"Error: {cpg_file.name}: {e}")

print(f"\nTest KB2: {len(kb2_test)} entries created")

# Show an example KB2 entry
if kb2_test:
    sample_key = list(kb2_test.keys())[0]
    sample_entry = kb2_test[sample_key]
    print(f"\nExample KB2 entry ({sample_key}):")
    print(f"   CVE ID: {sample_entry['cve_id']}")
    print(f"   Type: {sample_entry['file_type']}")
    print(f"   Dangerous calls: {sample_entry['features']['security_features']['dangerous_calls']}")
    print(f"   Complexity: {sample_entry['features']['complexity_metrics']['edge_density']:.2f}")

KB2 CONSTRUCTION - Estimated duration: 10-15 minutes
Do you want to continue? (type 'yes' to confirm)

TEST on 50 files first...
Success: CVE-2017-7533_0_patch
Success: CVE-2017-7533_0_vuln
Success: CVE-2021-0935_0_patch
Success: CVE-2021-0935_0_vuln
Success: CVE-2017-14156_0_patch
Success: CVE-2017-14156_0_vuln
Success: CVE-2023-20928_3_patch
Success: CVE-2023-20928_3_vuln
Success: CVE-2019-15221_0_patch
Success: CVE-2019-15221_0_vuln

Test KB2: 10 entries created

Example KB2 entry (CVE-2017-7533_0_patch):
   CVE ID: CVE-2017-7533_0
   Type: patch
   Dangerous calls: {}
   Complexity: 8.90


In [20]:
# Full KB2 construction in background
print("LAUNCHING FULL KB2 CONSTRUCTION")
print("You can continue working while this runs...")

# In tmux or as a background process
kb2_data, kb2_file = build_complete_kb2()

LAUNCHING FULL KB2 CONSTRUCTION
You can continue working while this runs...
CONSTRUCTION OF COMPLETE KB2
Found 4410 CPG files to process


Processing CPG files: 100%|██████████| 4410/4410 [00:32<00:00, 136.76it/s]



RESULTS:
   Success: 4410
   Errors: 0
   KB2 Entries: 4410

Saving KB2...
KB2 saved: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_complete.json
File size: 11.2 MB
