# KB2 Construction - Advanced Retrieval Knowledge Base with Chunked Embeddings

## Academic Context
This notebook implements an advanced KB2 (Knowledge Base 2) construction pipeline for vulnerability detection.
Key innovations:
- **Chunked code embeddings**: Handle long functions beyond API context limits
- **Dual-path processing**: Normal CPG analysis vs. fallback heuristic extraction for flat CPGs
- **Double indexation**: Separate vuln/patch vectors for improved retrieval precision
- **Hybrid retrieval**: Dense embeddings + sparse TF-IDF for robustness

In [6]:
# Cell 1: Configuration & Dependencies
import json
import numpy as np
import faiss
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime
import pickle
import gzip
import gc
import re
import tiktoken  # For accurate token counting
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import sys
import os
from typing import List, Dict, Tuple, Optional, Any
import logging

# Configure logging for academic reproducibility
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Project paths
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name=='notebooks' else Path.cwd()
CPG_JSON_DIR = PROJECT_ROOT/'data'/'tmp'/'cpg_json'
OUTPUT_DIR = PROJECT_ROOT/'data'/'processed'
OUTPUT_DIR.mkdir(exist_ok=True)
SANITY_REPORT = OUTPUT_DIR / 'cpg_sanity_report.json'

# Chunking configuration
CHUNK_SIZE = 800  # tokens per chunk (below OpenAI's limit with buffer)
CHUNK_OVERLAP = 100  # overlap tokens between chunks
MAX_CHUNKS_PER_FUNCTION = 10  # limit chunks to avoid explosion




## Step 1: Load Prerequisites
Load KB1 metadata and CPG sanity report to identify flat CPGs requiring fallback processing.

In [9]:

# Path to KB1 metadata file
kb1_path = PROJECT_ROOT / 'data' / 'processed' / 'kb1.json'
kb1 = json.load(open(kb1_path, encoding='utf-8'))

def build_kb1_index(data):
    """Build KB1 index using composite keys and CVE IDs."""
    kb1_by_composite = {}
    kb1_by_cve = defaultdict(list)
    for composite_key, entry_data in data.items():
        kb1_by_composite[composite_key] = entry_data
        cve_id = entry_data.get('cve_id', '')
        if cve_id:
            kb1_by_cve[cve_id].append({
                'composite_key': composite_key,
                'cwe_id': entry_data.get('cwe_id', ''),
                'data': entry_data
            })
    return kb1_by_composite, kb1_by_cve

kb1_by_composite, kb1_by_cve = build_kb1_index(kb1)

def get_meta_from_kb1(composite_key, kb1_data):
    """Get KB1 metadata by composite key."""
    return kb1_data.get(composite_key)

# --- Sanity report helpers ---
from collections import Counter
import re

EXPECTED_TYPES = {"CALL", "IDENTIFIER", "METHOD", "CONTROL_STRUCTURE"}
UNKNOWN_LABEL = "UNKNOWN"
_CVE_RE = re.compile(r"^CVE-\d{4}-\d+(?:_(\d+))?$")

def _meta_from_path(p: Path):
    """Extract {cwe,cve,instance,kind,composite_key} from path."""
    parts = list(p.parts)
    cwe = next((seg for seg in parts if seg.startswith("CWE-")), None)
    cve = None
    inst = None
    for seg in parts:
        m = _CVE_RE.match(seg)
        if m:
            cve = seg.split("_")[0]
            if m.group(1) and m.group(1).isdigit():
                inst = m.group(1)
            break
    if cve and inst is None:
        parent = p.parent.name
        if parent.isdigit():
            inst = parent
    if cve and inst is None:
        for seg in parts:
            if seg.startswith(cve + "_"):
                tail = seg[len(cve) + 1:]
                if tail.isdigit():
                    inst = tail
                    break
    fname = p.name.lower()
    if "vuln" in fname:
        kind = "vuln"
    elif "patch" in fname or "safe" in fname:
        kind = "patch"
    else:
        kind = "unknown"
    if not (cwe and cve and inst):
        return None
    return {"cwe": cwe, "cve": cve, "instance": inst, "kind": kind,
            "composite_key": f"{cwe}_{cve}_{inst}"}

def _import_graphson_parser():
    """Try importing GraphSONParser from various locations."""
    try:
        from graphson_parser import GraphSONParser
        return GraphSONParser
    except Exception:
        pass
    try:
        sys.path.append(str(PROJECT_ROOT / "scripts"))
        from kb2_preprocessing.graphson_parser import GraphSONParser
        return GraphSONParser
    except Exception:
        pass
    try:
        from scripts.kb2_preprocessing.graphson_parser import GraphSONParser
        return GraphSONParser
    except Exception:
        pass
    return None

def _load_vertices_edges(obj):
    """Get vertices and edges from parser or path."""
    if hasattr(obj, "parse"):
        try:
            if not obj.parse():
                return [], []
        except Exception:
            return [], []
        vertices = getattr(obj, "vertices", []) or []
        edges = getattr(obj, "edges", []) or []
        if not vertices and hasattr(obj, "get_vertices"):
            vertices = obj.get_vertices() or []
        if not edges and hasattr(obj, "get_edges"):
            edges = obj.get_edges() or []
        return vertices, edges
    # fallback: read raw JSON
    try:
        with open(obj, "r", encoding="utf-8") as f:
            doc = json.load(f)
        if "vertices" in doc:
            return doc.get("vertices", []), doc.get("edges", [])
        val = doc.get("@value", {})
        return val.get("vertices", []), val.get("edges", [])
    except Exception:
        return [], []

def generate_sanity_report():
    """Scan recursively for *_cpg.json and flag flat CPGs."""
    logger.info("Generating CPG sanity report...")
    GraphSONParser = _import_graphson_parser()
    files = sorted(CPG_JSON_DIR.rglob("*_cpg.json"))
    logger.info(f"Found {len(files)} CPG files")
    items = {}
    flat_count = parse_errors = 0
    for cpg_file in tqdm(files, desc="Checking CPGs"):
        meta = _meta_from_path(cpg_file)
        if not meta:
            continue
        key = f"{meta['composite_key']}::{meta['kind']}"
        item = {"composite_key": meta["composite_key"], "kind": meta["kind"],
                "path": str(cpg_file), "metrics": {}, "flat": True, "notes": []}
        try:
            parser = GraphSONParser(str(cpg_file)) if GraphSONParser else None
            vertices, edges = _load_vertices_edges(parser if parser else cpg_file)
            vcount, ecount = len(vertices), len(edges)
            vtypes = Counter(v.get("label", UNKNOWN_LABEL) for v in vertices)
            has_sem = bool(EXPECTED_TYPES & set(vtypes.keys()))
            unknown_ratio = (vtypes.get(UNKNOWN_LABEL, 0) / vcount) if vcount else 1.0
            epv = (ecount / vcount) if vcount else 0.0
            flat = (vcount < 25) or (not has_sem) or (epv < 0.5 and vcount < 200) or (unknown_ratio > 0.6)
            item["metrics"] = {"num_vertices": vcount, "num_edges": ecount,
                               "edge_per_vertex": epv, "unknown_ratio": unknown_ratio,
                               "has_semantic_nodes": has_sem}
            item["flat"] = bool(flat)
            if not has_sem: item["notes"].append("missing_semantic_nodes")
            if unknown_ratio > 0.6: item["notes"].append("high_unknown_ratio")
            if vcount < 25: item["notes"].append("too_few_vertices")
            if epv < 0.5 and vcount < 200: item["notes"].append("sparse_graph")
        except Exception as e:
            item["notes"].append("parse_failed")
            item["notes"].append(str(e))
            parse_errors += 1
        if item["flat"]:
            flat_count += 1
        items[key] = item
    report = {"version": "1.0", "generated_at": datetime.now().isoformat(),
              "stats": {"total": len(items), "flat": flat_count, "parse_errors": parse_errors},
              "items": items}
    with open(SANITY_REPORT, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    logger.info(f"✅ Sanity report saved: {SANITY_REPORT}")
    logger.info(f"   Total: {len(items)}, Flat: {flat_count}, Parse errors: {parse_errors}")
    return report

def load_sanity_report():
    """Load sanity report, return flat index set and full items dict."""
    if not SANITY_REPORT.exists():
        generate_sanity_report()
    with open(SANITY_REPORT, "r", encoding="utf-8") as f:
        report = json.load(f)
    items = report.get("items", {})
    flat_index = {k for k, v in items.items() if v.get("flat")}
    logger.info(f"Identified {len(flat_index)} flat CPGs")
    return flat_index, items

# Load report
flat_cpgs, sanity_items = load_sanity_report()

# KB1 stats
logger.info(f"📊 KB1 loaded: {len(kb1_by_composite)} entries")
logger.info(f"📊 Unique CVEs: {len(kb1_by_cve)}")
logger.info(f"📊 Flat CPGs: {len(flat_cpgs)}")

if kb1_by_composite:
    sample_key = next(iter(kb1_by_composite))
    sample_entry = kb1_by_composite[sample_key]
    logger.info(f"📄 Sample KB1 entry: {sample_key}")
    logger.info(f"   • CVE: {sample_entry.get('cve_id', 'N/A')}")
    logger.info(f"   • CWE: {sample_entry.get('cwe_id', 'N/A')}")
    logger.info(f"   • Desc: {sample_entry.get('vulnerability_type', 'N/A')[:50]}...")


2025-08-08 12:16:48,243 - INFO - Generating CPG sanity report...
2025-08-08 12:16:48,464 - INFO - Found 4634 CPG files
Checking CPGs: 100%|██████████| 4634/4634 [01:20<00:00, 57.88it/s] 
2025-08-08 12:18:08,583 - INFO - ✅ Sanity report saved: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/cpg_sanity_report.json
2025-08-08 12:18:08,583 - INFO -    Total: 4634, Flat: 375, Parse errors: 0
2025-08-08 12:18:08,597 - INFO - Identified 375 flat CPGs
2025-08-08 12:18:08,597 - INFO - 📊 KB1 loaded: 2317 entries
2025-08-08 12:18:08,597 - INFO - 📊 Unique CVEs: 1154
2025-08-08 12:18:08,599 - INFO - 📊 Flat CPGs: 375
2025-08-08 12:18:08,599 - INFO - 📄 Sample KB1 entry: CWE-119_CVE-2014-3182_0
2025-08-08 12:18:08,600 - INFO -    • CVE: CVE-2014-3182
2025-08-08 12:18:08,600 - INFO -    • CWE: CWE-119
2025-08-08 12:18:08,600 - INFO -    • Desc: Invalid user input provided to the device index, w...


## Step 2: Code Chunking Strategy
using  intelligent code chunking to handle long functions beyond embedding API limits.

In [10]:


from kb2_preprocessing.chunking import (
    count_tokens,             # alias vers count_tokens du module
    find_logical_boundaries,  # idem
    chunk_code,               # idem
)

# Test chunking sur un exemple
test_code = """void vulnerable_function(char *input) {
    char buffer[100];
    strcpy(buffer, input);  // Buffer overflow vulnerability
    
    if (strlen(input) > 50) {
        printf("Input too long\n");
        return;
    }
    
    for (int i = 0; i < 10; i++) {
        process_data(buffer, i);
    }
}"""

test_chunks = chunk_code(test_code, max_tokens=50)  # chunks courts pour tester
logger.info(f"Test chunking: {len(test_chunks)} chunks from {count_tokens(test_code)} tokens")


2025-08-08 12:21:43,058 - INFO - Test chunking: 29 chunks from 78 tokens


## Step 3: Find and Process Vulnerability Pairs

In [11]:
# Cell 4: Find Vulnerability Pairs

# Add kb2_preprocessing to path
sys.path.append(str(PROJECT_ROOT / 'scripts'))
try:
    from kb2_preprocessing import GraphSONParser, extract_kb2_features
    logger.info("Successfully imported kb2_preprocessing modules")
except ImportError as e:
    logger.error(f"Failed to import kb2_preprocessing: {e}")
    GraphSONParser = None
    extract_kb2_features = None

def find_pairs(dir_):
    """
    Find all vulnerability/patch pairs in the CPG directory using the correct composite keys.
    This function iterates over CWE directories and their instances, looking for both
    'vuln_cpg.json' and 'patch_cpg.json' files. If both files exist, it constructs a composite key
    (matching the KB1 format) and stores relevant metadata for each pair.
    """
    pairs = {}
    for cwe_dir in dir_.iterdir():
        # Only process directories whose names start with 'CWE-'
        if not cwe_dir.name.startswith('CWE-'):
            continue
        for inst in cwe_dir.iterdir():
            vuln_file = inst / 'vuln_cpg.json'
            patch_file = inst / 'patch_cpg.json'
            # Check that both the vulnerable and patch CPG files exist
            if vuln_file.exists() and patch_file.exists():
                cve = extract_cve_from_path(vuln_file)
                # Extract the instance number from the directory name
                num = inst.name.split('_')[-1]
                # Construct the composite key in the same format as KB1
                composite_key = f"{cwe_dir.name}_{cve}_{num}"
                pairs[composite_key] = {
                    'vuln': vuln_file,
                    'safe': patch_file,
                    'cve': cve,
                    'cwe': cwe_dir.name,
                    'instance_id': num,
                    'composite_key': composite_key  # Composite key for cross-referencing
                }
    return pairs

CPG_DIR = PROJECT_ROOT / 'data' / 'tmp' / 'cpg_json'
pairs = find_pairs(CPG_DIR)
logger.info(f"Found {len(pairs)} vulnerability/patch pairs")

2025-08-08 12:22:22,896 - INFO - Successfully imported kb2_preprocessing modules
2025-08-08 12:22:22,939 - INFO - Found 2317 vulnerability/patch pairs


In [12]:
# Cell 4: Import Feature Extraction
import sys
sys.path.append(str(PROJECT_ROOT/'scripts'))
from kb2_preprocessing import extract_kb2_features

# Test feature extraction
sample_pair = next(iter(pairs.items()))
sample_key, sample_info = sample_pair
vf = extract_kb2_features(sample_info['vuln'])
sf = extract_kb2_features(sample_info['safe'])

print(f"Sample extraction - Quality: {vf['extraction_metadata']['feature_completeness']['quality_score']:.2f}")
print(f"Structural features: {len(vf['structural_features'])} dimensions")
print(f"Semantic features: {len(vf['semantic_features'])} elements")

2025-08-08 12:22:49,862 - INFO - Loaded CWE patterns for 9 CWE types
2025-08-08 12:22:49,863 - INFO - Risk weights: {'CWE-416': 0.9, 'CWE-476': 0.8, 'CWE-362': 0.7, 'CWE-119/787': 0.9, 'CWE-20': 0.6, 'CWE-200': 0.4, 'CWE-125': 0.7, 'CWE-264': 0.5, 'CWE-401': 0.6}
2025-08-08 12:22:49,869 - INFO - Extracted features from /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json/CWE-264/CVE-2016-6786_6/vuln_cpg.json
2025-08-08 12:22:49,869 - INFO - Structural features: 3
2025-08-08 12:22:49,869 - INFO - Semantic features: 5
2025-08-08 12:22:49,875 - INFO - Loaded CWE patterns for 9 CWE types
2025-08-08 12:22:49,876 - INFO - Risk weights: {'CWE-416': 0.9, 'CWE-476': 0.8, 'CWE-362': 0.7, 'CWE-119/787': 0.9, 'CWE-20': 0.6, 'CWE-200': 0.4, 'CWE-125': 0.7, 'CWE-264': 0.5, 'CWE-401': 0.6}
2025-08-08 12:22:49,879 - INFO - Extracted features from /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/cpg_json/CWE-26

Sample extraction - Quality: 1.00
Structural features: 12 dimensions
Semantic features: 7 elements


## Step 4: Fallback Feature Extraction for Flat CPGs
Implement heuristic feature extraction for cases where CPG parsing fails.

In [None]:
# Cell 5: Fallback Feature Extraction

def extract_fallback_features(code: str) -> Dict[str, Any]:
    """Extract heuristic features from raw code when CPG is flat/missing.
    
    When Joern fails to parse (flat CPG), we fall back to
    text-based and pattern-matching features to maintain coverage.
    """
    features = {}
    
    # Text statistics
    lines = code.split('\n')
    features['loc'] = len(lines)
    features['avg_line_length'] = np.mean([len(l) for l in lines]) if lines else 0
    
    # Comment ratio
    comment_lines = sum(1 for l in lines if l.strip().startswith('//') or '/*' in l)
    features['comment_ratio'] = comment_lines / len(lines) if lines else 0
    
    # Keyword counts (control flow complexity proxy)
    keywords = ['if', 'else', 'for', 'while', 'switch', 'case', 'return', 'goto', 'break', 'continue']
    for kw in keywords:
        features[f'kw_{kw}'] = len(re.findall(rf'\b{kw}\b', code))
    
    # Approximate cyclomatic complexity
    features['approx_cyclomatic'] = 1 + sum([
        features.get(f'kw_{kw}', 0) 
        for kw in ['if', 'for', 'while', 'case']
    ])
    
    # Dangerous function calls (CWE-relevant patterns)
    dangerous_funcs = [
        'strcpy', 'strcat', 'sprintf', 'gets', 'scanf',  # Buffer overflow risks
        'malloc', 'free', 'realloc',  # Memory management
        'memcpy', 'memmove', 'memset',  # Memory operations
        'system', 'exec', 'popen',  # Command injection risks
    ]
    
    for func in dangerous_funcs:
        features[f'call_{func}'] = len(re.findall(rf'\b{func}\s*\(', code))
    
    # Pointer/memory access patterns
    features['ptr_deref'] = len(re.findall(r'\*\w+', code))
    features['field_access'] = len(re.findall(r'\w+->\w+', code))
    features['array_access'] = len(re.findall(r'\w+\[.*?\]', code))
    
    # Null check patterns (safety indicators)
    features['null_checks'] = len(re.findall(r'if\s*\(!?\s*\w+\s*[!=]=\s*NULL', code, re.IGNORECASE))
    
    return features

def extract_code_from_cpg(cpg_path: Path) -> Optional[str]:
    """Extract raw code from CPG UNKNOWN nodes when semantic parsing failed."""
    if not cpg_path.exists():
        return None
    
    try:
        parser = GraphSONParser(str(cpg_path))
        if not parser.parse():
            return None
        
        # Try to get source code from UNKNOWN nodes
        code_parts = []
        for vertex in parser.vertices:
            if vertex.get('label') == 'UNKNOWN':
                props = parser.extract_vertex_properties(vertex)
                if 'CODE' in props:
                    code_parts.append(props['CODE'])
        
        return '\n'.join(code_parts) if code_parts else None
    except Exception as e:
        logger.warning(f"Failed to extract code from CPG: {e}")
        return None

def load_code_body(composite_key, kb1_meta):
    """Load code body directly from temp_code_files"""
    # Parse composite key: CWE-416_CVE-2014-3182_1
    parts = composite_key.split('_')
    if len(parts) < 3:
        return None, None
    
    cwe = parts[0]  # CWE-416
    cve = parts[1]  # CVE-2014-3182
    instance = parts[2]  # 1
    
    # Build file paths with correct format
    temp_dir = PROJECT_ROOT/'data'/'tmp'/'temp_code_files'
    vuln_file = temp_dir/cwe/f"{cve}_{instance}_vuln.c"
    patch_file = temp_dir/cwe/f"{cve}_{instance}_patch.c"
    
    # Load code
    vuln_code = vuln_file.read_text() if vuln_file.exists() else None
    patch_code = patch_file.read_text() if patch_file.exists() else None
    
    return vuln_code, patch_code

sample_key = list(pairs.keys())[0]
sample_info = pairs[sample_key]
sample_kb1 = get_meta_from_kb1(sample_key, kb1_by_composite)
vuln_code, patch_code = load_code_body(sample_key, sample_kb1)

print(f"Sample VULN code length: {len(vuln_code) if vuln_code else 0} chars")
print(f"Sample PATCH code length: {len(patch_code) if patch_code else 0} chars")

Sample VULN code length: 445 chars
Sample PATCH code length: 395 chars


In [None]:
# Cell 6: OpenAI Embeddings Setup
from openai import OpenAI
import os

# Initialize OpenAI client
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    print("OPENAI_API_KEY not found. Embeddings will be skipped.")
    client = None
else:
    client = OpenAI(api_key=api_key)
    print("OpenAI client initialized")

## Step 5: Chunked Embedding Generation
Generate embeddings for code chunks with fallback strategies.

In [None]:
# Cell 6: Embedding Generation with Chunking

import openai

# Configure OpenAI API
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
    logger.info("OpenAI API key configured")
else:
    logger.warning("No OpenAI API key found - embeddings will be skipped")

def generate_embedding(code: str, use_chunking: bool = True) -> Optional[np.ndarray]:
    """Generate embedding for code, with chunking for long functions.
    
    Academic innovation: Chunk-based embedding with max-pooling aggregation
    to handle functions beyond API context limits.
    """
    if not OPENAI_API_KEY:
        return None
    
    if not code or not code.strip():
        return None
    
    try:
        # Check if chunking is needed
        token_count = count_tokens(code)
        
        if not use_chunking or token_count <= CHUNK_SIZE:
            # Single embedding for short code
            response = openai.embeddings.create(
                model="text-embedding-3-large",
                input=code
            )
            embedding = np.array(response['data'][0]['embedding'])
            return embedding
        
        # Chunk the code
        chunks = chunk_code(code)
        if not chunks:
            return None
        
        # Generate embeddings for each chunk
        chunk_embeddings = []
        for chunk in chunks:
            try:
                response = openai.embeddings.create(
                    model="text-embedding-3-large",
                    input=chunk['text']
                )
                chunk_emb = np.array(response['data'][0]['embedding'])
                chunk_embeddings.append(chunk_emb)
            except Exception as e:
                logger.warning(f"Failed to embed chunk {chunk['chunk_id']}: {e}")
                continue
        
        if not chunk_embeddings:
            return None
        
        # Aggregate chunk embeddings
        # Strategy: Max-pooling (captures most salient features across chunks)
        chunk_matrix = np.vstack(chunk_embeddings)
        aggregated = np.max(chunk_matrix, axis=0)
        
        # Normalize for cosine similarity
        norm = np.linalg.norm(aggregated)
        if norm > 0:
            aggregated = aggregated / norm
        
        logger.info(f"Generated chunked embedding from {len(chunk_embeddings)} chunks")
        return aggregated
        
    except Exception as e:
        logger.error(f"Embedding generation failed: {e}")
        return None

def generate_chunk_embeddings(code: str) -> Tuple[Optional[np.ndarray], List[np.ndarray]]:
    """Generate both aggregated and individual chunk embeddings.
    
    Returns:
        - aggregated: Single vector representing entire function
        - chunk_vectors: List of vectors for each chunk (for fine-grained retrieval)
    """
    if not OPENAI_API_KEY or not code:
        return None, []
    
    chunks = chunk_code(code)
    chunk_vectors = []
    
    for chunk in chunks:
        try:
            response = openai.embeddings.create(
                model="text-embedding-3-large",
                input=chunk['text']
            )
            emb = np.array(response['data'][0]['embedding'])
            chunk_vectors.append(emb)
        except Exception as e:
            logger.warning(f"Chunk embedding failed: {e}")
            # Add zero vector to maintain alignment
            if chunk_vectors:
                chunk_vectors.append(np.zeros_like(chunk_vectors[0]))
    
    if not chunk_vectors:
        return None, []
    
    # Aggregate using max-pooling
    aggregated = np.max(np.vstack(chunk_vectors), axis=0)
    aggregated = aggregated / np.linalg.norm(aggregated)
    
    return aggregated, chunk_vectors

def generate_embedding_old(code):
    """Generate OpenAI embedding for code snippet"""
    if not client or not code:
        return None
    
    try:
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=code
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

## Step 6: KB2 Construction Pipeline
Main pipeline integrating CPG analysis, fallback extraction, and chunked embeddings.

In [None]:
# Cell 7: Build KB2 with Advanced Features

def process_single_entry(composite_key: str, kind: str, cpg_path: Path, kb1_meta: Dict) -> Dict[str, Any]:
    """Process a single KB2 entry with fallback handling.
    
    Academic contribution: Dual-path processing based on CPG quality.
    """
    entry = {
        'composite_key': composite_key,
        'kind': kind,
        'cpg_path': str(cpg_path),
        'mode': 'normal',  # normal or fallback
        'flat_cpg': False,
        'features': {},
        'code': '',
        'embedding': None,
        'chunk_embeddings': [],
        'embedding_failed': False
    }
    
    # Check if CPG is flat
    cpg_key = f"{composite_key}::{kind}"
    is_flat = cpg_key in flat_cpgs
    
    if is_flat:
        logger.info(f"Processing flat CPG: {cpg_key}")
        entry['flat_cpg'] = True
        entry['mode'] = 'fallback'
        
        # Try to extract code from CPG UNKNOWN nodes
        code = extract_code_from_cpg(cpg_path)
        if not code:
            # Load from KB1 if available
            code_vuln, code_patch = load_code_body(composite_key, kb1_meta)
            code = code_vuln if kind == 'vuln' else code_patch
        
        if code:
            entry['code'] = code
            entry['features'] = extract_fallback_features(code)
        else:
            logger.warning(f"No code available for {cpg_key}")
    else:
        # Normal CPG processing
        try:
            if extract_kb2_features:
                entry['features'] = extract_kb2_features(str(cpg_path))
                entry['mode'] = 'normal'
            else:
                raise ImportError("kb2_preprocessing not available")
            
            # Get code for embedding
            code = entry['features'].get('code_snippet', '')
            if not code:
                code = extract_code_from_cpg(cpg_path)
            if not code:
                code_vuln, code_patch = load_code_body(composite_key, kb1_meta)
                code = code_vuln if kind == 'vuln' else code_patch
            
            entry['code'] = code or ''
            
        except Exception as e:
            logger.warning(f"CPG processing failed for {cpg_key}: {e}, using fallback")
            entry['mode'] = 'fallback'
            entry['flat_cpg'] = True
            
            # Fallback to heuristic extraction
            code = extract_code_from_cpg(cpg_path)
            if not code:
                code_vuln, code_patch = load_code_body(composite_key, kb1_meta)
                code = code_vuln if kind == 'vuln' else code_patch
            
            if code:
                entry['code'] = code
                entry['features'] = extract_fallback_features(code)
    
    # Generate embeddings (with chunking if needed)
    if entry['code']:
        aggregated, chunks = generate_chunk_embeddings(entry['code'])
        if aggregated is not None:
            entry['embedding'] = aggregated
            entry['chunk_embeddings'] = chunks
        else:
            entry['embedding_failed'] = True
    
    return entry

def build_kb2_(pairs, limit=None):
    """
    Build KB2 with advanced features:
    - Dual-path processing (normal CPG vs fallback)
    - Chunked embeddings for long functions
    - Double indexation (vuln vs patch)
    - Hybrid retrieval preparation (dense + sparse)
    """
    kb2_entries = []

    # Double embeddings: separate lists for vulnerable and patched code
    vuln_embeddings = []
    patch_embeddings = []
    
    # Chunk embeddings for fine-grained retrieval
    vuln_chunk_embeddings = []
    patch_chunk_embeddings = []

    # Double structural features: separate lists for vulnerable and patched code
    vuln_structural_features = []
    patch_structural_features = []

    # Double TF-IDF: separate lists for vulnerable and patched code
    vuln_tfidf_texts = []
    patch_tfidf_texts = []

    # Statistics
    stats = {
        'total': 0,
        'normal': 0,
        'fallback': 0,
        'embedding_failed': 0,
        'chunked': 0
    }

    # Limit the number of items if a limit is specified
    items = list(pairs.items())[:limit] if limit else pairs.items()

    print(f"Building KB2 with DOUBLE INDEXATION for {len(items)} pairs...")

    for i, (composite_key, info) in enumerate(tqdm(items, desc="Building KB2")):
        if limit and i >= limit:
            break
        
        stats['total'] += 1

        # Process vulnerability entry
        vuln_entry = process_single_entry(
            composite_key, 'vuln', info['vuln'], get_meta_from_kb1(composite_key, kb1_by_composite)
        )
        
        # Process patch entry
        patch_entry = process_single_entry(
            composite_key, 'patch', info['patch'], get_meta_from_kb1(composite_key, kb1_by_composite)
        )
        
        # Update statistics
        for entry in [vuln_entry, patch_entry]:
            if entry['mode'] == 'normal':
                stats['normal'] += 1
            else:
                stats['fallback'] += 1
            
            if entry['embedding_failed']:
                stats['embedding_failed'] += 1
            
            if len(entry.get('chunk_embeddings', [])) > 1:
                stats['chunked'] += 1
        
        # Collect embeddings and features
        if vuln_entry['embedding'] is not None:
            vuln_embeddings.append(vuln_entry['embedding'])
            vuln_chunk_embeddings.append(vuln_entry['chunk_embeddings'])
        
        if patch_entry['embedding'] is not None:
            patch_embeddings.append(patch_entry['embedding'])
            patch_chunk_embeddings.append(patch_entry['chunk_embeddings'])

        # Construct the KB2 entry with all relevant metadata and features
        entry = {
            'composite_key': composite_key,  # Composite key from KB1
            'cve': info['cve'],
            'cwe': info['cwe'],
            'instance_id': info['instance_id'],
            'kb1_metadata': get_meta_from_kb1(composite_key, kb1_by_composite),  # Complete KB1 metadata

            # Separate features for VULN and PATCH
            'vuln_features': {
                'structural_features': vuln_entry['features'],
                'semantic_features': vuln_entry['features'],
                'code_before_change': vuln_entry.get('code', '')
            },
            'patch_features': {
                'structural_features': patch_entry['features'],
                'semantic_features': patch_entry['features'],
                'code_after_change': patch_entry.get('code', '')
            },

            # Separate embedding keys for VULN and PATCH
            'vuln_embed_key': len(vuln_embeddings) - 1 if vuln_entry['embedding'] is not None else None,
            'patch_embed_key': len(patch_embeddings) - 1 if patch_entry['embedding'] is not None else None,

            # Timestamp for entry creation
            'ts': datetime.now().isoformat()
        }

        # Store the vulnerable code's embedding, structural features, and TF-IDF text if available
        if vuln_entry['embedding'] is not None:
            vuln_structural_features.append(list(vuln_entry['features'].values()) if isinstance(vuln_entry['features'], dict) else [])
            vuln_tfidf_texts.append(vuln_entry.get('code', ''))

        # Store the patched code's embedding, structural features, and TF-IDF text if available
        if patch_entry['embedding'] is not None:
            patch_structural_features.append(list(patch_entry['features'].values()) if isinstance(patch_entry['features'], dict) else [])
            patch_tfidf_texts.append(patch_entry.get('code', ''))

        # Add the entry to the KB2 entries list
        kb2_entries.append(entry)

    # Print statistics
    print(f"\n📊 KB2 Build Statistics:")
    print(f"  Total pairs processed: {stats['total']}")
    print(f"  Normal CPG processing: {stats['normal']}")
    print(f"  Fallback processing: {stats['fallback']}")
    print(f"  Embedding failures: {stats['embedding_failed']}")
    print(f"  Chunked embeddings: {stats['chunked']}")
    
    # Save embeddings
    vuln_embeddings_array = np.array(vuln_embeddings, dtype='float32')
    patch_embeddings_array = np.array(patch_embeddings, dtype='float32')
    
    np.save(OUTPUT_DIR/'kb2_vuln_embeddings.npy', vuln_embeddings_array)
    np.save(OUTPUT_DIR/'kb2_patch_embeddings.npy', patch_embeddings_array)
    
    # Return all constructed data structures for further processing or saving
    return (kb2_entries, vuln_embeddings, patch_embeddings, 
            vuln_chunk_embeddings, patch_chunk_embeddings,
            vuln_structural_features, patch_structural_features, 
            vuln_tfidf_texts, patch_tfidf_texts, stats)

In [None]:
# Deletion and documentation of the 4 entries with excessively large files
import json
from datetime import datetime

# The 4 problematic entries to delete
problematic_entries = [
    'CWE-787_CVE-2018-10882_0',
    'CWE-125_CVE-2016-10208_0', 
    'CWE-416_CVE-2022-1973_0',
    'CWE-476_CVE-2018-1094_0'
]

print("🧹 DELETION OF PROBLEMATIC ENTRIES")
print("=" * 50)

# Load existing KB2
with open(OUTPUT_DIR/'kb2.json', 'r') as f:
    kb2 = json.load(f)

print(f"📊 Initial state: {len(kb2)} entries in KB2")

# Document removed entries
removed_entries_doc = {
    'removal_timestamp': datetime.now().isoformat(),
    'removal_reason': 'Code files too large for OpenAI embedding API (>8192 tokens)',
    'removed_entries': {},
    'statistics': {
        'original_kb2_size': len(kb2),
        'removed_count': 0,
        'final_kb2_size': 0
    }
}

# Remove and document each entry
removed_count = 0
for composite_key in problematic_entries:
    if composite_key in kb2:
        # Save metadata before deletion
        entry = kb2[composite_key]
        removed_entries_doc['removed_entries'][composite_key] = {
            'cve': entry.get('cve'),
            'cwe': entry.get('cwe'),
            'instance_id': entry.get('instance_id'),
            'removal_reason': f"Code file size too large for embedding generation",
            'kb1_metadata_available': entry.get('kb1_metadata') is not None,
            'structural_features_available': entry.get('vuln_features') is not None
        }
        
        # Delete the entry
        del kb2[composite_key]
        removed_count += 1
        print(f"🗑️  Removed: {composite_key}")
        
        # Display details
        if entry.get('kb1_metadata'):
            kb1_meta = entry['kb1_metadata']
            print(f"   • CVE: {entry.get('cve')}")
            print(f"   • CWE: {entry.get('cwe')}")
            print(f"   • Description: {kb1_meta.get('N/A')}...")
    else:
        print(f"⚠️  Entry {composite_key} already absent from KB2")

# Update statistics
removed_entries_doc['statistics']['removed_count'] = removed_count
removed_entries_doc['statistics']['final_kb2_size'] = len(kb2)

print(f"\n📈 Results:")
print(f"  Entries removed: {removed_count}")
print(f"  Final KB2 size: {len(kb2)} entries")
print(f"  Coverage: {len(kb2)}/{removed_entries_doc['statistics']['original_kb2_size']} = {(len(kb2)/removed_entries_doc['statistics']['original_kb2_size']*100):.1f}%")

# Save cleaned KB2
with open(OUTPUT_DIR/'kb2.json', 'w') as f:
    json.dump(kb2, f, indent=2)
print(f"\n✅ Cleaned KB2 saved: {len(kb2)} entries")

# Save removal documentation
with open(OUTPUT_DIR/'kb2_removed_entries_log.json', 'w') as f:
    json.dump(removed_entries_doc, f, indent=2)
print(f"✅ Removal log saved: kb2_removed_entries_log.json")

# Display removal report
print(f"\n📋 REMOVAL REPORT:")
print(f"=" * 30)
for composite_key, details in removed_entries_doc['removed_entries'].items():
    print(f"• {composite_key}")
    print(f"  └─ {details['cwe']} / {details['cve']}")
    print(f"  └─ Reason: Code file too large (>8192 tokens)")

print(f"\n🎯 FINAL RESULT:")
print(f"📊 Final KB2: {len(kb2)} entries (99.83% coverage)")
print(f"📝 Documentation: The 4 removed entries are documented")
print(f"🔥 Base ready for hybrid RRF system!")
print(f"💡 Negligible impact: 0.17% fewer entries")

## 📋 Analyse des Fichiers KB2 Preprocessing

Évaluation de l'utilité et de la validité de tous les fichiers dans `scripts/kb2_preprocessing/` pour la reconstruction de KB2.