# VulRAG CPG Extraction - Complete Pipeline
## Optimized Version with Parallelization

This notebook:
1. Extracts all unique instances from the VulRAG KB dataset
2. Generates .c files in an organized structure
3. Launches parallel CPG extraction using Joern
4. Produces the KB2 dataset with CPGs in JSON format

# Cell 1: Imports and Configuration

In [2]:
import json
import os
import sys
import subprocess
import shutil
import time
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime
import pandas as pd
from IPython.display import display, HTML, clear_output
import warnings
import hashlib
warnings.filterwarnings('ignore')

# Path configuration: anchor every location at the project root, whether the
# kernel was started from the root itself or from its notebooks/ subfolder.
_launch_dir = Path.cwd()
PROJECT_ROOT = _launch_dir.parent if _launch_dir.name == 'notebooks' else _launch_dir

_data_root = PROJECT_ROOT / 'data'
VULRAG_KB_DIR = _data_root / 'raw' / 'vulrag_kb'
DATA_TMP = _data_root / 'tmp'
CPG_JSON_DIR = DATA_TMP / 'cpg_json'
KB2_OUTPUT = _data_root / 'processed'
SCRIPTS_DIR = PROJECT_ROOT / 'scripts'
RESULTS_DIR = PROJECT_ROOT / 'results' / 'cpg_extraction'

# Make sure every working/output directory exists (the raw KB directory is an
# input and is deliberately not created here).
for dir_path in (DATA_TMP, CPG_JSON_DIR, KB2_OUTPUT, SCRIPTS_DIR, RESULTS_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

# Echo the resolved layout so a reader can confirm where files will land.
print("VulRAG CPG Extraction Pipeline")
print("=" * 50)
print(f"Project root: {PROJECT_ROOT}")
print(f"VulRAG KB: {VULRAG_KB_DIR}")
print(f"CPG output: {CPG_JSON_DIR}")
print(f"KB2 final: {KB2_OUTPUT}")
print(f"Results: {RESULTS_DIR}")


VulRAG CPG Extraction Pipeline
Project root: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System
VulRAG KB: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/raw/vulrag_kb
CPG output: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json
KB2 final: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed
Results: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/results/cpg_extraction


# KB1 Dataset Validation

In [3]:

print("\nSTEP 1: VALIDATE KB1 DATASET")
print("=" * 50)

# Abort early: every later step reads from the raw knowledge-base directory.
if not VULRAG_KB_DIR.exists():
    print("ERROR: VulRAG KB directory not found!")
    print(f"Expected at: {VULRAG_KB_DIR}")
    raise FileNotFoundError("Please run download_data.sh first")

kb_files = [*VULRAG_KB_DIR.glob("*.json")]
print(f"Found {len(kb_files)} JSON files:")

# Report each file (alphabetically) with its size and keep a running total in MB.
total_size_mb = 0
for file in sorted(kb_files):
    file_size_mb = file.stat().st_size / (1024 * 1024)
    total_size_mb = total_size_mb + file_size_mb
    print(f"   {file.name} ({file_size_mb:.1f} MB)")

print(f"\nTotal dataset size: {total_size_mb:.1f} MB")


STEP 1: VALIDATE KB1 DATASET
Found 10 JSON files:
   gpt-4o-mini_CWE-119_316.json (1.7 MB)
   gpt-4o-mini_CWE-125_316.json (1.5 MB)
   gpt-4o-mini_CWE-200_316.json (1.3 MB)
   gpt-4o-mini_CWE-20_316.json (1.7 MB)
   gpt-4o-mini_CWE-264_316.json (1.1 MB)
   gpt-4o-mini_CWE-362_316.json (2.9 MB)
   gpt-4o-mini_CWE-401_316.json (0.8 MB)
   gpt-4o-mini_CWE-416_316.json (5.7 MB)
   gpt-4o-mini_CWE-476_316.json (2.6 MB)
   gpt-4o-mini_CWE-787_316.json (2.0 MB)

Total dataset size: 21.2 MB


In [4]:
def _code_file_record(cwe, filename, filepath, code):
    """Build the metadata entry describing one written source file."""
    return {
        'filename': filename,
        'full_path': str(filepath),
        'relative_path': f"{cwe}/{filename}",
        'size_bytes': len(code.encode('utf-8')),
        'line_count': len(code.split('\n'))
    }


def extract_code_to_files(vulrag_kb_dir, output_dir=None):
    """
    Extract every code instance of the VulRAG KB into standalone .c files.

    Each ``*.json`` file in ``vulrag_kb_dir`` is expected to be named
    ``<prefix>_<CWE>_<suffix>.json`` and to map CVE ids to a list of instance
    dicts. For every instance two files are written under
    ``<output_dir>/<CWE>/``: ``<cve>_<idx>_vuln.c`` (``code_before_change``)
    and ``<cve>_<idx>_patch.c`` (``code_after_change``). A missing, null or
    blank code field is replaced by a one-line placeholder comment and the
    instance is counted in ``empty_code_instances``.

    Args:
        vulrag_kb_dir: Directory holding the KB JSON files (str or Path).
        output_dir: Destination directory (str or Path). When omitted it
            resolves, at call time, to ``DATA_TMP / "temp_code_files"``.

    Returns:
        dict: The metadata that is also saved as ``<output_dir>/metadata.json``,
        with the keys ``extraction_info``, ``statistics``, ``file_mappings``
        (keyed by ``<CWE>_<filename>``) and ``instance_mappings`` (keyed by
        ``<cve>_<idx>``).

    Raises:
        OSError / json.JSONDecodeError: propagated unchanged when a KB file
        cannot be read or parsed, or an output file cannot be written.
    """

    print("Extracting code files from VulRAG KB...")
    print("=" * 50)

    # Accept plain strings as well as Path objects for both directories.
    kb_path = Path(vulrag_kb_dir)
    if output_dir is None:
        # Resolved lazily so a re-run of the configuration cell is honoured.
        output_dir = DATA_TMP / "temp_code_files"
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    # Global metadata
    metadata = {
        'extraction_info': {
            'timestamp': str(pd.Timestamp.now()),
            'source_directory': str(vulrag_kb_dir),
            'output_directory': str(output_path)
        },
        'statistics': {
            'total_files_processed': 0,
            'total_instances': 0,
            'total_code_files_created': 0,
            'empty_code_instances': 0,
            'by_cwe': {}
        },
        'file_mappings': {},
        'instance_mappings': {}
    }
    stats = metadata['statistics']

    global_instance_counter = 0

    # Process each CWE file
    for kb_file in sorted(kb_path.glob("*.json")):
        # The CWE id is the second underscore-separated token of the file name;
        # a file that does not follow that convention is skipped, not fatal.
        stem_parts = kb_file.stem.split('_')
        if len(stem_parts) < 2:
            print(f"Skipping {kb_file.name}: cannot derive a CWE id from the file name")
            continue
        cwe = stem_parts[1]

        print(f"Processing {kb_file.name}...")

        # Create CWE directory
        cwe_dir = output_path / cwe
        cwe_dir.mkdir(exist_ok=True)

        with open(kb_file, 'r', encoding='utf-8') as f:
            cwe_data = json.load(f)

        cwe_stats = {
            'source_file': kb_file.name,
            'cve_count': len(cwe_data),
            'instance_count': 0,
            'code_files_created': 0,
            'empty_code_instances': 0,
            'instances': {}
        }

        # Process each CVE
        for cve_id, instances in cwe_data.items():
            cve_instances = []

            for idx, instance in enumerate(instances):
                global_instance_counter += 1
                cwe_stats['instance_count'] += 1

                # `or ''` also covers fields that are present but null.
                vuln_code = (instance.get('code_before_change') or '').strip()
                patch_code = (instance.get('code_after_change') or '').strip()

                # Handle empty code instances: still emit both files so every
                # instance yields a vuln/patch pair.
                is_empty = not vuln_code or not patch_code
                if is_empty:
                    cwe_stats['empty_code_instances'] += 1
                    stats['empty_code_instances'] += 1
                    if not vuln_code:
                        vuln_code = "// Empty vulnerable code"
                    if not patch_code:
                        patch_code = "// Empty patched code"

                # File names are derived from the instance id.
                instance_id = f"{cve_id}_{idx}"
                vuln_filename = f"{instance_id}_vuln.c"
                patch_filename = f"{instance_id}_patch.c"

                vuln_filepath = cwe_dir / vuln_filename
                patch_filepath = cwe_dir / patch_filename

                # Write files
                with open(vuln_filepath, 'w', encoding='utf-8') as f:
                    f.write(vuln_code)

                with open(patch_filepath, 'w', encoding='utf-8') as f:
                    f.write(patch_code)

                cwe_stats['code_files_created'] += 2

                # Instance metadata
                instance_metadata = {
                    'instance_id': instance_id,
                    'cve_id': cve_id,
                    'cwe': cwe,
                    'instance_index': idx,
                    'global_counter': global_instance_counter,
                    'source_file': kb_file.name,
                    'has_empty_code': is_empty,
                    'files': {
                        'vulnerable_code': _code_file_record(
                            cwe, vuln_filename, vuln_filepath, vuln_code),
                        'patched_code': _code_file_record(
                            cwe, patch_filename, patch_filepath, patch_code)
                    },
                    'additional_info': {
                        'vulnerability_type': instance.get('preconditions_for_vulnerability', ''),
                        'trigger_condition': instance.get('trigger_condition', ''),
                        'solution': instance.get('solution', ''),
                        'gpt_purpose': instance.get('GPT_purpose', ''),
                        'modified_lines': instance.get('modified_lines', {})
                    }
                }

                cve_instances.append(instance_metadata)

                # File mappings are prefixed with the CWE so the same file name
                # under two CWE directories does not collide.
                metadata['file_mappings'][f"{cwe}_{vuln_filename}"] = instance_metadata
                metadata['file_mappings'][f"{cwe}_{patch_filename}"] = instance_metadata
                # NOTE(review): instance_id is NOT CWE-qualified, so a CVE that
                # appears in two CWE files keeps only the last entry here.
                # Left as-is to preserve the key format; confirm with consumers.
                metadata['instance_mappings'][instance_id] = instance_metadata

            if cve_instances:
                cwe_stats['instances'][cve_id] = cve_instances

        stats['by_cwe'][cwe] = cwe_stats
        stats['total_files_processed'] += 1
        stats['total_instances'] += cwe_stats['instance_count']
        stats['total_code_files_created'] += cwe_stats['code_files_created']

        print(f"   Created {cwe_stats['code_files_created']} files from {cwe_stats['instance_count']} instances")

    # Save metadata
    metadata_file = output_path / "metadata.json"
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    # Summary
    print("\nExtraction completed")
    print(f"Output directory: {output_path}")
    print(f"Files processed: {stats['total_files_processed']}")
    print(f"Total instances: {stats['total_instances']}")
    print(f"Code files created: {stats['total_code_files_created']}")
    print(f"Empty code instances: {stats['empty_code_instances']}")

    return metadata

# Extract the raw KB into per-CWE .c files (default output directory) and keep
# the returned metadata index in the notebook namespace.
metadata = extract_code_to_files(vulrag_kb_dir=VULRAG_KB_DIR)


Extracting code files from VulRAG KB...
Processing gpt-4o-mini_CWE-119_316.json...
   Created 346 files from 173 instances
Processing gpt-4o-mini_CWE-125_316.json...
   Created 280 files from 140 instances
Processing gpt-4o-mini_CWE-200_316.json...
   Created 306 files from 153 instances
Processing gpt-4o-mini_CWE-20_316.json...
   Created 364 files from 182 instances
Processing gpt-4o-mini_CWE-264_316.json...
   Created 240 files from 120 instances
Processing gpt-4o-mini_CWE-362_316.json...
   Created 640 files from 320 instances
Processing gpt-4o-mini_CWE-401_316.json...
   Created 202 files from 101 instances
Processing gpt-4o-mini_CWE-416_316.json...
   Created 1320 files from 660 instances
Processing gpt-4o-mini_CWE-476_316.json...
   Created 562 files from 281 instances
Processing gpt-4o-mini_CWE-787_316.json...
   Created 374 files from 187 instances

Extraction completed
Output directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/t

In [5]:
def run_cpg_extraction(code_files_dir=None, cpg_output_dir=None, extract_script=None):
    """Run the CPG extraction shell script and report whether it succeeded.

    The script is invoked as ``<script> <code_files_dir> <cpg_output_dir>``
    with its own directory as working directory. Its output is not captured
    by this function, so progress is reported by the script itself.

    Args:
        code_files_dir: Directory with the extracted .c files (str or Path).
            Defaults to ``DATA_TMP / 'temp_code_files'``.
        cpg_output_dir: Directory handed to the script as output location
            (str or Path). Defaults to ``CPG_JSON_DIR``.
        extract_script: Path of the extraction script (str or Path).
            Defaults to ``SCRIPTS_DIR / 'extract_cpg.sh'``.

    Returns:
        bool: True only when the script ran and exited with status 0. False
        when a precondition fails (script missing or not executable, input
        directory missing), the script exits non-zero, the run is interrupted
        from the keyboard, or launching it raises an exception.
    """

    # Resolve defaults at call time and accept plain strings for every path.
    code_files_dir = Path(code_files_dir) if code_files_dir is not None else DATA_TMP / 'temp_code_files'
    cpg_output_dir = Path(cpg_output_dir) if cpg_output_dir is not None else CPG_JSON_DIR
    extract_script = Path(extract_script) if extract_script is not None else SCRIPTS_DIR / 'extract_cpg.sh'

    if not extract_script.exists():
        print(f"❌ Script not found: {extract_script}")
        return False

    if not os.access(extract_script, os.X_OK):
        print(f"❌ Script not executable: {extract_script}")
        print(f"Run: chmod +x {extract_script}")
        return False

    # Check the input before starting a long-running job on nothing.
    if not code_files_dir.is_dir():
        print(f"❌ Input directory not found: {code_files_dir}")
        return False

    print(f"🚀 Launching CPG extraction...")
    print(f"Script: {extract_script}")
    print("=" * 50)
    print(f"TEMP_CODE_FILES: {code_files_dir}")

    try:
        # Argument list (no shell) so paths containing spaces are passed intact.
        result = subprocess.run(
            [str(extract_script), str(code_files_dir), str(cpg_output_dir)],
            cwd=extract_script.parent,
        )

        print("=" * 50)
        if result.returncode == 0:
            print("✅ CPG extraction completed successfully!")
        else:
            print(f"❌ CPG extraction failed with code {result.returncode}")

        return result.returncode == 0

    except KeyboardInterrupt:
        # Interrupting the kernel is an expected way to stop a long extraction.
        print("\n⏹️ Extraction interrupted by user")
        return False
    except Exception as e:
        # Best effort: report the launch failure instead of raising in the notebook.
        print(f"❌ Error running extraction: {e}")
        return False

# Launch the extraction script; `success` is True only when it exits with status 0.
success = run_cpg_extraction()


🚀 Launching CPG extraction...
Script: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/scripts/extract_cpg.sh
TEMP_CODE_FILES: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/temp_code_files
=== FULL CPG EXTRACTION ===
Input: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/temp_code_files
Output: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json
Started: Tue Aug  5 08:18:05 EDT 2025
Found 2317 pairs to process

Processing CWE-119...
  [50/2317] Progress: N/A pairs/min
  [100/2317] Progress: N/A pairs/min
  [150/2317] Progress: N/A pairs/min
Processing CWE-125...
  [200/2317] Progress: N/A pairs/min
  [250/2317] Progress: N/A pairs/min
  [300/2317] Progress: 30000.00 pairs/min
Processing CWE-20...
  [350/2317] Progress: 35000.00 pairs/min
  [400/2317] Progress: 40000.00 pairs/min
  [450/2317] Progress: 45000.00 p