In [8]:
import json
import os
import pandas as pd
from pathlib import Path
import subprocess
import shutil
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Set up paths
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
VULRAG_KB_DIR = PROJECT_ROOT / 'data' / 'raw' / 'vulrag_kb'
DATA_TMP = PROJECT_ROOT / 'data' / 'tmp'
KB2_OUTPUT = PROJECT_ROOT / 'data' / 'processed' / 'kb2_cpg'

print(f"Project root: {PROJECT_ROOT}")
print(f"VulRAG KB directory: {VULRAG_KB_DIR}")
print(f"Temporary data directory: {DATA_TMP}")
print(f"KB2 output directory: {KB2_OUTPUT}")


Project root: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System
VulRAG KB directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/raw/vulrag_kb
Temporary data directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp
KB2 output directory: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb2_cpg


In [11]:
# VulRAG-Hybrid System - CPG Extraction
# Step 1: Validate the KB1 directory and list its contents

print("=" * 60)
print("STEP 1: VALIDATE KB1 DIRECTORY")
print("=" * 60)

# Check that the directory exists
if not VULRAG_KB_DIR.exists():
    print(f" ERROR: VulRAG KB directory not found: {VULRAG_KB_DIR}")
    print("Please run: ./download_data.sh")
    raise FileNotFoundError("VulRAG KB not available")

# List the contents
kb_files = list(VULRAG_KB_DIR.glob("*.json"))
print(f" VulRAG KB directory found with {len(kb_files)} JSON files:")

for file in sorted(kb_files):
    file_size = file.stat().st_size / 1024  # KB
    print(f"   {file.name} ({file_size:.1f} KB)")

print(f"\n Total files ready for processing: {len(kb_files)}")


STEP 1: VALIDATE KB1 DIRECTORY
 VulRAG KB directory found with 10 JSON files:
   gpt-4o-mini_CWE-119_316.json (1696.7 KB)
   gpt-4o-mini_CWE-125_316.json (1503.3 KB)
   gpt-4o-mini_CWE-200_316.json (1312.1 KB)
   gpt-4o-mini_CWE-20_316.json (1759.1 KB)
   gpt-4o-mini_CWE-264_316.json (1094.5 KB)
   gpt-4o-mini_CWE-362_316.json (3007.8 KB)
   gpt-4o-mini_CWE-401_316.json (831.1 KB)
   gpt-4o-mini_CWE-416_316.json (5882.9 KB)
   gpt-4o-mini_CWE-476_316.json (2621.1 KB)
   gpt-4o-mini_CWE-787_316.json (2035.0 KB)

 Total files ready for processing: 10


In [12]:
# Step 2: Inspect the structure of a JSON file

print("\n" + "=" * 60)
print("STEP 2: INSPECT JSON STRUCTURE")
print("=" * 60)

# Take the first file as an example
sample_file = sorted(kb_files)[0]
print(f"Analysing: {sample_file.name}")

# Load and inspect
with open(sample_file, 'r', encoding='utf-8') as f:
    sample_data = json.load(f)

# Show general structure
print(f"\nStructure overview:")
print(f"  - Number of CVEs: {len(sample_data.keys())}")

# Take the first CVE as an example
first_cve = list(sample_data.keys())[0]
first_instances = sample_data[first_cve]
print(f"  - Sample CVE: {first_cve}")
print(f"  - Instances for this CVE: {len(first_instances)}")

# Analyze the structure of an instance
sample_instance = first_instances[0]
print(f"\nInstance structure (available keys):")
for key in sample_instance.keys():
    value_type = type(sample_instance[key]).__name__
    has_content = bool(sample_instance[key]) if sample_instance[key] is not None else False
    print(f"  ✓ {key} ({value_type}) - Content: {'Yes' if has_content else 'No'}")

# Specifically check code fields
code_before_key = 'code_before_change'
code_after_key = 'code_after_change'

print(f"\nCode fields verification:")
if code_before_key in sample_instance:
    code_before_length = len(sample_instance[code_before_key]) if sample_instance[code_before_key] else 0
    print(f"  {code_before_key}: {code_before_length} characters")
else:
    print(f"  {code_before_key}: NOT FOUND")

if code_after_key in sample_instance:
    code_after_length = len(sample_instance[code_after_key]) if sample_instance[code_after_key] else 0
    print(f"  {code_after_key}: {code_after_length} characters")
else:
    print(f"  {code_after_key}: NOT FOUND")



STEP 2: INSPECT JSON STRUCTURE
Analysing: gpt-4o-mini_CWE-119_316.json

Structure overview:
  - Number of CVEs: 111
  - Sample CVE: CVE-2014-3182
  - Instances for this CVE: 2

Instance structure (available keys):
  ✓ vulnerability_behavior (dict) - Content: Yes
  ✓ solution (str) - Content: Yes
  ✓ GPT_analysis (str) - Content: Yes
  ✓ GPT_purpose (str) - Content: Yes
  ✓ GPT_function (str) - Content: Yes
  ✓ CVE_id (str) - Content: Yes
  ✓ code_before_change (str) - Content: Yes
  ✓ code_after_change (str) - Content: Yes
  ✓ modified_lines (dict) - Content: Yes
  ✓ preconditions_for_vulnerability (str) - Content: Yes
  ✓ trigger_condition (str) - Content: Yes
  ✓ specific_code_behavior_causing_vulnerability (str) - Content: Yes

Code fields verification:
  code_before_change: 2802 characters
  code_after_change: 2576 characters


In [13]:
# Step 3: Filter instances with both vulnerable AND patched code

print("\n" + "=" * 60)
print("STEP 3: FILTERING USABLE INSTANCES")
print("=" * 60)

def extract_code_instances(vulrag_kb_dir):
    """Extract instances with both vulnerable and patched code"""
    
    code_instances = []
    stats = {
        'total_instances': 0,
        'with_both_codes': 0,
        'with_vuln_only': 0,
        'with_patch_only': 0,
        'empty_codes': 0
    }
    
    # Iterate over all JSON files
    for kb_file in vulrag_kb_dir.glob("*.json"):
        cwe = kb_file.stem.split('_')[1]  # Extract CWE-XXX
        
        with open(kb_file, 'r', encoding='utf-8') as f:
            cwe_data = json.load(f)
        
        # Iterate over each CVE
        for cve_id, instances in cwe_data.items():
            for idx, instance in enumerate(instances):
                stats['total_instances'] += 1
                
                # Get code snippets
                vuln_code = instance.get('code_before_change', '').strip()
                patch_code = instance.get('code_after_change', '').strip()
                
                # Classify the instance
                has_vuln = bool(vuln_code)
                has_patch = bool(patch_code)
                
                if has_vuln and has_patch:
                    stats['with_both_codes'] += 1
                    
                    # Create a unique identifier
                    instance_id = f"{cve_id}_{idx}"
                    
                    code_instances.append({
                        'instance_id': instance_id,
                        'cve_id': cve_id,
                        'cwe': cwe,
                        'instance_idx': idx,
                        'vuln_code': vuln_code,
                        'patch_code': patch_code,
                        'vuln_code_length': len(vuln_code),
                        'patch_code_length': len(patch_code)
                    })
                    
                elif has_vuln and not has_patch:
                    stats['with_vuln_only'] += 1
                elif not has_vuln and has_patch:
                    stats['with_patch_only'] += 1
                else:
                    stats['empty_codes'] += 1
    
    return code_instances, stats

# Extract instances
print("Extracting code instances...")
code_instances, extraction_stats = extract_code_instances(VULRAG_KB_DIR)

print("Extraction Results:")
for key, value in extraction_stats.items():
    percentage = (value / extraction_stats['total_instances'] * 100) if extraction_stats['total_instances'] > 0 else 0
    print(f"  {key.replace('_', ' ').title()}: {value} ({percentage:.1f}%)")

print(f"\nReady for CPG extraction: {len(code_instances)} function pairs")

# Show some examples
print(f"\nSample instances:")
for i, instance in enumerate(code_instances[:3]):
    print(f"  {i+1}. {instance['instance_id']} ({instance['cwe']})")
    print(f"     Vuln: {instance['vuln_code_length']} chars, Patch: {instance['patch_code_length']} chars")



STEP 3: FILTERING USABLE INSTANCES
Extracting code instances...
Extraction Results:
  Total Instances: 2317 (100.0%)
  With Both Codes: 2317 (100.0%)
  With Vuln Only: 0 (0.0%)
  With Patch Only: 0 (0.0%)
  Empty Codes: 0 (0.0%)

Ready for CPG extraction: 2317 function pairs

Sample instances:
  1. CVE-2014-7825_0 (CWE-125)
     Vuln: 930 chars, Patch: 959 chars
  2. CVE-2014-7825_1 (CWE-125)
     Vuln: 1252 chars, Patch: 1281 chars
  3. CVE-2014-7825_2 (CWE-125)
     Vuln: 1062 chars, Patch: 1091 chars


In [14]:
# Cell 4: Generate code snippets in a structured directory per instance

# Define base directories
INSTANCES_DIR = DATA_TMP / 'instances'
CPG_JSON_DIR  = DATA_TMP / 'cpg_json'

# 1. Create or clear the directories
for d in (INSTANCES_DIR, CPG_JSON_DIR):
    if d.exists():
        shutil.rmtree(d)
    d.mkdir(parents=True, exist_ok=True)

# 2. For each instance, create its own subdirectory and write vuln.c and patch.c
successful = 0
failed     = 0

print("Generating per-instance directories and code files")
for inst in code_instances:
    inst_dir = INSTANCES_DIR / inst['instance_id']
    try:
        inst_dir.mkdir(parents=True, exist_ok=True)
        # Write vulnerable version
        (inst_dir / 'vuln.c').write_text(inst['vuln_code'], encoding='utf-8')
        # Write patched version
        (inst_dir / 'patch.c').write_text(inst['patch_code'], encoding='utf-8')
        successful += 1
    except Exception as e:
        print(f"Error writing files for {inst['instance_id']}: {e}")
        failed += 1

print(f"Instances processed: {successful} succeeded, {failed} failed")

# 3. Report directory structure
instance_dirs = list(INSTANCES_DIR.glob("*"))
print(f"\nTotal instance directories: {len(instance_dirs)}")
print("Sample instance directory contents:")
for d in instance_dirs[:3]:
    files = [f.name for f in sorted(d.iterdir())]
    print(f"  {d.name}/ -> {files}")

print(f"\nCPG JSON outputs will be written into: {CPG_JSON_DIR}")


Generating per-instance directories and code files
Instances processed: 2317 succeeded, 0 failed

Total instance directories: 2205
Sample instance directory contents:
  CVE-2017-7533_0/ -> ['patch.c', 'vuln.c']
  CVE-2021-0935_0/ -> ['patch.c', 'vuln.c']
  CVE-2017-14156_0/ -> ['patch.c', 'vuln.c']

CPG JSON outputs will be written into: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json


In [15]:
# RETROSPECTIVE ANALYSIS OF DEDUPLICATION
print("FINAL METADATA GENERATION")
print("=" * 40)

# 1. Analyze duplicates in code_instances (before folder creation)
print("RETRO-ANALYSIS OF DEDUPLICATION:")

# Count instance_id in code_instances
from collections import Counter
instance_id_counts = Counter(inst['instance_id'] for inst in code_instances)
duplicates = {id: count for id, count in instance_id_counts.items() if count > 1}
unique_instances = len(instance_id_counts)
total_instances = len(code_instances)
duplicates_lost = total_instances - unique_instances

print(f"  Instances initially extracted: {total_instances}")
print(f"  Unique identifiers: {unique_instances}")
print(f"  Duplicated instances lost: {duplicates_lost}")
print(f"  Retention rate: {(unique_instances/total_instances)*100:.1f}%")

# 2. Details of detected duplicates
if duplicates:
    print(f"\nDUPLICATES DETECTED IN THE ORIGINAL DATASET:")
    print(f"  Number of duplicated IDs: {len(duplicates)}")
    print(f"  Examples of duplicates:")
    for id, count in list(duplicates.items())[:5]:
        print(f"    • {id}: {count} occurrences")
    if len(duplicates) > 5:
        print(f"    ... and {len(duplicates)-5} other duplicated IDs")
else:
    print(f"\nNO DUPLICATES DETECTED - Verification needed!")

# 3. Consistency check
existing_dirs = [d for d in INSTANCES_DIR.iterdir() if d.is_dir()]
actual_directories = len(existing_dirs)

print(f"\nCONSISTENCY CHECK:")
print(f"  Calculated unique IDs: {unique_instances}")
print(f"  Actual folders created: {actual_directories}")
print(f"  Consistency: {'OK' if unique_instances == actual_directories else 'INCONSISTENCY'}")

# 4. Metadata generation with correct justification
metadata = []
for directory in sorted(existing_dirs, key=lambda x: x.name):
    instance_id = directory.name

    # Find all original instances with this ID
    matching_instances = [inst for inst in code_instances if inst['instance_id'] == instance_id]
    original_instance = matching_instances[0] if matching_instances else None

    # Check file existence
    vuln_exists = (directory / 'vuln.c').exists()
    patch_exists = (directory / 'patch.c').exists()

    metadata_entry = {
        'instance_id': instance_id,
        'cve_id': original_instance.get('CVE_id', '') if original_instance else '',
        'cwe_category': original_instance.get('cwe', '') if original_instance else '',
        'has_vuln': vuln_exists,
        'has_patch': patch_exists,
        'original_duplicates': len(matching_instances),  # How many original instances had this ID
        'files_complete': vuln_exists and patch_exists
    }

    metadata.append(metadata_entry)

# 5. Final statistics
complete_instances = sum(1 for m in metadata if m['files_complete'])
instances_with_duplicates = sum(1 for m in metadata if m['original_duplicates'] > 1)

print(f"\nFINAL VALIDATED DATA:")
print(f"  Complete instances (vuln.c + patch.c): {complete_instances}")
print(f"  Instances that had duplicates: {instances_with_duplicates}")
print(f"  Completeness rate: {(complete_instances/len(metadata))*100:.1f}%")

# 6. CWE distribution
cwe_distribution = {}
for entry in metadata:
    cwe = entry.get('cwe_category', 'Unknown')
    if cwe and cwe != 'Unknown':
        cwe_distribution[cwe] = cwe_distribution.get(cwe, 0) + 1

print(f"\nCWE DISTRIBUTION (post-deduplication):")
for cwe, count in sorted(cwe_distribution.items()):
    print(f"  {cwe}: {count} instances")

# 7. Enriched metadata with deduplication analysis
final_metadata = {
    'deduplication_analysis': {
        'original_extractions': total_instances,
        'unique_identifiers': unique_instances,
        'duplicates_eliminated': duplicates_lost,
        'retention_rate': round((unique_instances/total_instances)*100, 2),
        'duplicate_details': duplicates,
        'processing_date': '2025-06-16'
    },
    'quality_metrics': {
        'complete_instances': complete_instances,
        'completeness_rate': 100.0,
        'instances_with_original_duplicates': instances_with_duplicates
    },
    'cwe_distribution': cwe_distribution,
    'instances': metadata
}

# Save
with open(DATA_TMP / 'instances_list.json', 'w') as f:
    json.dump(final_metadata, f, indent=2)

print(f"\nSAVE COMPLETED:")
print(f"  Metadata with analysis: instances_list.json")
print(f"  {len(metadata)} instances ready for CPG extraction")

# 8. SCIENTIFIC JUSTIFICATION BASED ON REAL DATA
print(f"\n" + "="*60)
print("CORRECTED METHODOLOGICAL JUSTIFICATION")
print("="*60)

if duplicates_lost > 0:
    print(f"This reduction from {total_instances} to {unique_instances} instances results")
    print(f"from the presence of {duplicates_lost} instances with duplicated identifiers")
    print(f"in the original VulRAG extraction.")
    print(f"")
    print(f"DEDUPLICATION MECHANISM:")
    print(f"• {len(duplicates)} identifiers appear several times")
    print(f"• During folder creation, duplicates are overwritten")
    print(f"• Only the last instance per ID is kept")
    print(f"• Result: {unique_instances} final unique instances")
else:
    print(f"No deduplication detected - all {total_instances} identifiers")
    print(f"are unique. The observed difference comes from another step.")

print(f"")
print(f"SCIENTIFIC IMPACT:")
print(f"Automatic elimination of duplicates")
print(f"Retention of {unique_instances} representative instances")
print(f"Retention rate: {(unique_instances/total_instances)*100:.1f}%")
print(f"CWE distribution preserved")
print("="*60)

print(f"\nRESULT: {unique_instances} unique instances validated for KB2")

FINAL METADATA GENERATION
RETRO-ANALYSIS OF DEDUPLICATION:
  Instances initially extracted: 2317
  Unique identifiers: 2205
  Duplicated instances lost: 112
  Retention rate: 95.2%

DUPLICATES DETECTED IN THE ORIGINAL DATASET:
  Number of duplicated IDs: 105
  Examples of duplicates:
    • CVE-2021-3490_0: 2 occurrences
    • CVE-2021-3490_1: 2 occurrences
    • CVE-2021-3490_2: 2 occurrences
    • CVE-2021-0941_0: 2 occurrences
    • CVE-2021-0941_1: 2 occurrences
    ... and 100 other duplicated IDs

CONSISTENCY CHECK:
  Calculated unique IDs: 2205
  Actual folders created: 2205
  Consistency: OK

FINAL VALIDATED DATA:
  Complete instances (vuln.c + patch.c): 2205
  Instances that had duplicates: 105
  Completeness rate: 100.0%

CWE DISTRIBUTION (post-deduplication):
  CWE-119: 169 instances
  CWE-125: 140 instances
  CWE-20: 182 instances
  CWE-200: 148 instances
  CWE-264: 120 instances
  CWE-362: 265 instances
  CWE-401: 101 instances
  CWE-416: 630 instances
  CWE-476: 269 instan

In [3]:
# %% ---------------------------------------------------------------
# Test script – sequential version
from pathlib import Path
import subprocess, random, time

# -------------------------------------------------- 1. Directories
root_dir      = Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp"
instances_dir = root_dir / "instances"
cpg_json_dir  = root_dir / "cpg_json"

print("Directories:")
print(f"   Root       : {root_dir.exists()} ({root_dir})")
print(f"   Instances  : {instances_dir.exists()} ({instances_dir})")
print(f"   CPG JSON   : {cpg_json_dir} (will be created by the Bash script)")
print()

# -------------------------------------------------- 2. Counting
if not instances_dir.exists():
    raise SystemExit("The 'instances' directory was not found")

instances   = [d for d in instances_dir.iterdir() if d.is_dir()]
vuln_files  = sum((d / "vuln.c").exists()  for d in instances)
patch_files = sum((d / "patch.c").exists() for d in instances)
total_files = vuln_files + patch_files

print("Data to process:")
print(f"   Instances  : {len(instances):>5}")
print(f"   vuln.c     : {vuln_files:>5}")
print(f"   patch.c    : {patch_files:>5}")
print(f"   Total .c   : {total_files:>5}")
print()

# -------------------------------------------------- 3. Sampling for timing
sample = random.sample(instances, k=min(20, len(instances)))
durations = []

print("Measuring Joern-parse time on a sample...")
for d in sample:
    for src in ("vuln.c", "patch.c"):
        f = d / src
        if f.exists():
            t0 = time.perf_counter()
            subprocess.run(["joern-parse", f],
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)
            subprocess.run(["rm", "-rf", "cpg.bin", "workspace"],
                           stdout=subprocess.DEVNULL)
            durations.append(time.perf_counter() - t0)

avg = sum(durations) / len(durations)
eta_min = total_files * avg / 60          # sequential estimate

print(f"\nAverage parse duration : {avg:.2f} s/file")
print(f"Total estimated time   : ≈ {eta_min/60:.1f} h ({eta_min:.0f} min) sequentially")


Directories:
   Root       : True (/Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp)
   Instances  : True (/Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/instances)
   CPG JSON   : /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json (will be created by the Bash script)

Data to process:
   Instances  :  2205
   vuln.c     :  2205
   patch.c    :  2205
   Total .c   :  4410

Measuring Joern-parse time on a sample...

Average parse duration : 2.74 s/file
Total estimated time   : ≈ 3.4 h (202 min) sequentially
