In [None]:
"""
Hybrid RAG Detection & Repair System - Following Original Architecture
Pipeline: Code → KB1(3) + KB2(3) → Fusion → Best Match → Context → LLM Detection/Repair → Evaluation Agent
"""

import json
import numpy as np
import requests
import openai
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from collections import Counter
import logging
import time
from sentence_transformers import SentenceTransformer

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configurations
# Central settings dictionary. The search engines read the top-k and text-model
# entries; the fusion engine reads the fusion weights and the coherence bonus.
CONFIG = {
    # Knowledge-base locations, resolved under the current user's home directory
    'kb1_enriched_dir': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1",
    'kb2_path': Path.home() / "Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json",

    # Pipeline parameters
    'kb1_top_k': 3,              # Number of candidates kept from the KB1 search
    'kb2_top_k': 3,              # Number of candidates kept from the KB2 search
    'fusion_alpha': 0.7,         # Weight applied to the KB1 candidate's similarity score
    'fusion_beta': 0.3,          # Weight applied to the KB2 candidate's similarity score
    'coherence_bonus': 0.1,      # Added to the fused score when both KBs return the same CVE id

    # LLM configuration
    # NOTE(review): these three keys are not read anywhere in the visible part
    # of the file; presumably the later LLM detection/repair stage uses them.
    'gpt4_model': 'gpt-4',
    'qwen_model': 'qwen2.5-coder:7b',
    'ollama_url': 'http://localhost:11434',

    # Name of the SentenceTransformer model used to embed KB1 text and queries
    'text_model': 'all-MiniLM-L6-v2'
}

@dataclass
class KB1Candidate:
    """KB1 search result candidate.

    One ranked hit from the textual knowledge base, as built by
    ``KB1SearchEngine.get_top_3_candidates``.
    """
    entry_id: str                # key of the entry in the engine's kb1_data mapping
    cve_id: str                  # the entry's 'CVE_id' value; '' when the entry has none
    similarity_score: float      # cosine similarity between the query and entry text embeddings
    vulnerability_type: str      # the entry's 'vulnerability_type'; 'unknown' when absent
    fix_strategy_type: str       # the entry's 'fix_strategy_type'; 'unknown' when absent
    complexity_level: str        # the entry's 'complexity_level'; 'unknown' when absent
    kb2_link_id: Optional[str]   # the entry's 'kb2_link_id'; None when absent
    entry_data: Dict             # the full KB1 entry this candidate was built from

@dataclass
class KB2Candidate:
    """KB2 search result candidate.

    One ranked hit from the structural knowledge base, as built by
    ``KB2SearchEngine.get_top_3_candidates``.
    """
    entry_key: str               # key of the entry in the KB2 JSON mapping
    cve_id: str                  # the entry's 'cve_id' value; '' when the entry has none
    similarity_score: float      # dot product of the stored graph embedding with the query embedding
    file_type: str               # the entry's 'file_type'; 'unknown' when absent
    dangerous_calls: List[str]   # keys of features -> security_features -> dangerous_calls
    graph_stats: Dict            # the entry's 'graph_statistics' mapping ({} when absent)
    entry_data: Dict             # the full KB2 entry this candidate was built from

@dataclass
class FusionResult:
    """Best match after fusion analysis.

    Produced by ``FusionAnalysisEngine.analyze_fusion`` from the KB1 and KB2
    candidate lists.
    """
    cve_id: str                             # CVE id of the selected match; "NO_MATCH" when nothing was selected
    final_score: float                      # fused score of the selected match (0.0 for "NO_MATCH")
    kb1_candidate: Optional[KB1Candidate]   # KB1 side of the match; None for a KB2-only or empty result
    kb2_candidate: Optional[KB2Candidate]   # KB2 side of the match; None for a KB1-only or empty result
    coherence_bonus: float                  # bonus that was added to the fused score (0.0 when none applied)
    confidence: float                       # derived from the candidates' similarity scores by analyze_fusion

@dataclass
class EnrichedContext:
    """Enriched context from the KB1+KB2 join.

    Built by ``FusionAnalysisEngine.build_enriched_context``; every field is an
    empty dict when the corresponding candidate is missing from the fusion result.
    """
    kb1_context: Dict            # textual fields copied from the KB1 entry and candidate
    kb2_context: Dict            # structural fields copied from the KB2 entry and candidate
    structural_insights: Dict    # flags and stats derived from the KB2 candidate's dangerous calls and graph stats
    behavioral_patterns: Dict    # the KB1 entry's 'vulnerability_behavior' mapping
    fix_examples: Dict           # KB1 'code_before_change', 'code_after_change' and 'modified_lines'

@dataclass
class DetectionResult:
    """LLM detection task result.

    NOTE(review): the code that fills this record is not in the visible part of
    the file; the field notes below are read off the names and types only and
    should be confirmed against the producer.
    """
    is_vulnerable: bool              # detection verdict
    vulnerability_types: List[str]   # presumably the vulnerability categories reported by the LLM
    confidence_score: float          # range not established here (TODO confirm, likely 0..1)
    explanation: str
    reasoning: str
    llm_model: str                   # presumably the identifier of the model that produced the result

@dataclass
class RepairResult:
    """LLM repair task result.

    NOTE(review): the code that fills this record is not in the visible part of
    the file; the field notes below are read off the names and types only and
    should be confirmed against the producer.
    """
    fixed_code: str              # presumably the repaired source text
    explanation: str
    changes_made: List[str]      # presumably one item per change applied
    fix_strategy: str
    llm_model: str               # presumably the identifier of the model that produced the result

@dataclass
class EvaluationResult:
    """Evaluation agent assessment.

    NOTE(review): the code that fills this record is not in the visible part of
    the file; score ranges and how ``overall_score`` relates to the other
    scores are not established here and should be confirmed against the producer.
    """
    detection_accuracy: float
    repair_quality: float
    security_improvement: float
    code_functionality: float
    overall_score: float
    feedback: str                # free-text feedback accompanying the scores

# Startup banner: echo the active pipeline settings in a single write.
print("\n".join([
    "HYBRID RAG DETECTION & REPAIR SYSTEM",
    "=" * 45,
    "Following YOUR original pipeline architecture...",
    f"KB1 candidates: {CONFIG['kb1_top_k']}",
    f"KB2 candidates: {CONFIG['kb2_top_k']}",
    f"Fusion weights: α={CONFIG['fusion_alpha']}, β={CONFIG['fusion_beta']}",
    "LLM models: GPT-4 baseline + Qwen2.5-coder via Ollama",
]))

# Cell 1: KB1 and KB2 Search Engines (YOUR 3+3 Architecture)

class KB1SearchEngine:
    """
    KB1 textual search engine.

    Loads the enriched KB1 JSON files, embeds one combined text per entry with
    a SentenceTransformer model and ranks entries against a query by cosine
    similarity. By default ``CONFIG['kb1_top_k']`` candidates are returned.
    """

    def __init__(self, kb1_enriched_dir: Path):
        """Load KB1 from ``kb1_enriched_dir`` and precompute all entry embeddings."""
        self.kb1_data = {}
        self.text_embeddings = {}
        self.text_model = SentenceTransformer(CONFIG['text_model'])

        self._load_enriched_kb1(kb1_enriched_dir)
        self._compute_text_embeddings()

        logger.info(f"KB1 Search Engine: {len(self.kb1_data)} entries loaded")

    def _load_enriched_kb1(self, kb1_dir: Path) -> None:
        """Load every ``gpt-4o-mini_CWE-*_enriched.json`` file found in ``kb1_dir``.

        Each file maps a CVE id either to a list of instances, stored under
        ``"<cve_id>_<index>"``, or to a single instance, stored under the CVE
        id itself.
        """
        print("Loading enriched KB1...")
        # Sorted so that load order (and therefore tie order and overwrite
        # order between files) does not depend on the filesystem.
        cwe_files = sorted(kb1_dir.glob("gpt-4o-mini_CWE-*_enriched.json"))

        for cwe_file in cwe_files:
            # Explicit encoding: the platform default is not always UTF-8.
            with open(cwe_file, 'r', encoding='utf-8') as f:
                cwe_data = json.load(f)

            for cve_id, cve_instances in cwe_data.items():
                if isinstance(cve_instances, list):
                    for idx, instance in enumerate(cve_instances):
                        self.kb1_data[f"{cve_id}_{idx}"] = instance
                else:
                    self.kb1_data[cve_id] = cve_instances

        print(f"KB1 loaded: {len(self.kb1_data)} entries")

    @staticmethod
    def _entry_text(entry: Dict) -> str:
        """Join the non-empty textual fields of a KB1 entry into one string."""
        text_parts = [
            entry.get('vulnerability_behavior', {}).get('specific_code_behavior_causing_vulnerability', ''),
            entry.get('solution', ''),
            entry.get('code_before_change', ''),
            entry.get('structural_description', '')
        ]
        return ' '.join(part for part in text_parts if part)

    def _compute_text_embeddings(self) -> None:
        """Precompute one text embedding per KB1 entry."""
        print("Computing KB1 text embeddings...")

        entry_keys = list(self.kb1_data)
        if entry_keys:
            texts = [self._entry_text(self.kb1_data[key]) for key in entry_keys]
            # One batched call instead of one model call per entry.
            vectors = self.text_model.encode(texts)
            self.text_embeddings.update(zip(entry_keys, vectors))

        print(f"KB1 embeddings: {len(self.text_embeddings)}")

    def _rank_by_cosine(self, query_embedding: np.ndarray, top_k: int) -> List[Tuple[str, float]]:
        """Return the ``top_k`` ``(entry_key, cosine_similarity)`` pairs, best first.

        A zero-norm query or entry embedding scores 0.0 rather than NaN, so the
        ordering stays well defined.
        """
        query_norm = np.linalg.norm(query_embedding)
        scored = []
        for entry_key, entry_embedding in self.text_embeddings.items():
            denominator = query_norm * np.linalg.norm(entry_embedding)
            if denominator > 0:
                score = float(np.dot(query_embedding, entry_embedding) / denominator)
            else:
                score = 0.0
            scored.append((entry_key, score))

        # Stable sort: entries with equal scores keep their load order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:max(top_k, 0)]

    def get_top_3_candidates(self, query_text: str, top_k: Optional[int] = None) -> "List[KB1Candidate]":
        """
        Get the best candidates from KB1 for ``query_text``.

        Args:
            query_text: Text to embed and compare against every KB1 entry.
            top_k: Number of candidates to return; defaults to
                ``CONFIG['kb1_top_k']`` (3 in the shipped configuration).

        Returns:
            Candidates ordered by descending cosine similarity. Fewer than
            ``top_k`` are returned when KB1 holds fewer entries.
        """
        limit = CONFIG['kb1_top_k'] if top_k is None else top_k
        query_embedding = self.text_model.encode([query_text])[0]

        candidates = []
        for entry_key, similarity in self._rank_by_cosine(query_embedding, limit):
            entry = self.kb1_data[entry_key]
            candidates.append(KB1Candidate(
                entry_id=entry_key,
                cve_id=entry.get('CVE_id', ''),
                similarity_score=similarity,
                vulnerability_type=entry.get('vulnerability_type', 'unknown'),
                fix_strategy_type=entry.get('fix_strategy_type', 'unknown'),
                complexity_level=entry.get('complexity_level', 'unknown'),
                kb2_link_id=entry.get('kb2_link_id'),
                entry_data=entry
            ))

        logger.info(f"KB1 top-3 candidates: {[c.cve_id for c in candidates]}")
        return candidates


class KB2SearchEngine:
    """
    KB2 structural search engine.

    Loads KB2 entries that carry a precomputed graph embedding, stacks those
    embeddings into one matrix and ranks entries against a query embedding by
    dot product. By default ``CONFIG['kb2_top_k']`` candidates are returned.
    """

    def __init__(self, kb2_path: Path):
        """Load KB2 from the JSON file at ``kb2_path`` and build the embedding matrix."""
        print("Loading KB2...")
        # Explicit encoding: the platform default is not always UTF-8.
        with open(kb2_path, 'r', encoding='utf-8') as f:
            self.kb2_data = json.load(f)

        # Keep only entries flagged as having a computed embedding; entry_keys
        # and the matrix rows share the same order.
        self.valid_entries = {}
        embeddings_list = []
        self.entry_keys = []

        for key, entry in self.kb2_data.items():
            if entry.get('embedding_computed', False):
                self.valid_entries[key] = entry
                embeddings_list.append(np.array(entry['graph_embedding'], dtype=np.float32))
                self.entry_keys.append(key)

        if embeddings_list:
            self.embeddings_matrix = np.vstack(embeddings_list)
        else:
            # np.vstack([]) raises; an empty matrix lets searches return [] instead.
            logger.warning("KB2 contains no entries with a computed embedding")
            self.embeddings_matrix = np.empty((0, 0), dtype=np.float32)
        logger.info(f"KB2 Search Engine: {len(self.valid_entries)} entries loaded")

    def generate_mock_cpg_embedding(self, code: str, dim: int = 128) -> np.ndarray:
        """
        Mock CPG embedding generation for demo.
        In production: Code → Joern → CPG → NetworkX → Structural embedding

        Args:
            code: Source text to summarise.
            dim: Length of the returned vector; features are zero-padded or
                truncated to fit.

        Returns:
            A float32 vector of length ``dim``, L2-normalised unless it is all zeros.
        """
        code_lower = code.lower()

        # Dangerous function detection: one 0/1 flag per name, by substring match
        dangerous_funcs = ['strcpy', 'strcat', 'sprintf', 'malloc', 'free', 'memcpy', 'gets']
        features = [1.0 if func in code_lower else 0.0 for func in dangerous_funcs]

        # Code complexity indicators (scaled raw counts)
        features.extend([
            len(code.split('\n')) / 100.0,  # Line count
            code.count('{') / 20.0,         # Block count
            code.count('*') / 10.0,         # Pointer usage
            code.count('if') / 10.0,        # Conditionals
            code.count('for') / 10.0        # Loops
        ])

        # Zero-pad (or truncate) to the requested dimension
        embedding = np.zeros(dim, dtype=np.float32)
        used = min(len(features), dim)
        embedding[:used] = features[:used]

        # Normalize
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm

        return embedding

    def get_top_3_candidates(self, code: str, top_k: Optional[int] = None) -> "List[KB2Candidate]":
        """
        Get the best candidates from KB2 for ``code``.

        Args:
            code: Source text; it is converted to a mock structural embedding.
            top_k: Number of candidates to return; defaults to
                ``CONFIG['kb2_top_k']`` (3 in the shipped configuration).

        Returns:
            Candidates ordered by descending similarity; an empty list when KB2
            has no usable entries or ``top_k`` is not positive.
        """
        limit = CONFIG['kb2_top_k'] if top_k is None else top_k
        # A slice of [-0:] would select every entry, so handle 0 explicitly.
        if limit <= 0 or not self.entry_keys:
            return []

        # Match the query length to the stored embeddings instead of assuming 128.
        query_embedding = self.generate_mock_cpg_embedding(code, dim=self.embeddings_matrix.shape[1])

        # NOTE(review): a plain dot product equals cosine similarity only if the
        # stored graph embeddings are unit-length — confirm against the KB2 builder.
        similarities = np.dot(self.embeddings_matrix, query_embedding)

        # Indices of the highest scores, best first
        top_indices = np.argsort(similarities)[-limit:][::-1]

        candidates = []
        for idx in top_indices:
            entry_key = self.entry_keys[idx]
            entry = self.valid_entries[entry_key]
            security_features = entry.get('features', {}).get('security_features', {})

            candidates.append(KB2Candidate(
                entry_key=entry_key,
                cve_id=entry.get('cve_id', ''),
                similarity_score=float(similarities[idx]),
                file_type=entry.get('file_type', 'unknown'),
                # list() yields the keys of a mapping and also accepts a plain list
                dangerous_calls=list(security_features.get('dangerous_calls', {})),
                graph_stats=entry.get('graph_statistics', {}),
                entry_data=entry
            ))

        logger.info(f"KB2 top-3 candidates: {[c.cve_id for c in candidates]}")
        return candidates


# Initialize search engines
# Both constructors do their work eagerly: KB1 reads its JSON files and embeds
# every entry, KB2 reads its JSON file and stacks the stored embeddings.
print("\nInitializing search engines...")
kb1_engine = KB1SearchEngine(CONFIG['kb1_enriched_dir'])
kb2_engine = KB2SearchEngine(CONFIG['kb2_path'])

# Cell 2: Fusion Analysis Engine (YOUR Architecture)

class FusionAnalysisEngine:
    """
    Fusion analysis over the KB1 and KB2 candidate lists.

    Input: KB1 candidates + KB2 candidates (3 + 3 in the shipped configuration)
    Output: 1 best match, from which an enriched context can be built
    """

    def __init__(self):
        logger.info("Fusion Analysis Engine initialized")

    @staticmethod
    def _best_per_cve(candidates) -> Dict[str, Any]:
        """Map each non-empty CVE id to its highest-scoring candidate."""
        best = {}
        for candidate in candidates:
            if not candidate.cve_id:
                continue
            current = best.get(candidate.cve_id)
            if current is None or candidate.similarity_score > current.similarity_score:
                best[candidate.cve_id] = candidate
        return best

    @staticmethod
    def _cve_base(cve_id: str) -> str:
        """Return the part of ``cve_id`` before the first underscore."""
        return cve_id.split('_')[0]

    @staticmethod
    def _clamp_unit(value: float) -> float:
        """Clamp ``value`` to the interval [0.0, 1.0]."""
        return max(0.0, min(value, 1.0))

    def analyze_fusion(self, kb1_candidates: "List[KB1Candidate]",
                      kb2_candidates: "List[KB2Candidate]") -> "FusionResult":
        """
        Find the best match among the KB1 and KB2 candidates.

        Strategy:
        1. Pair candidates that share a CVE id; these get the coherence bonus.
        2. If there is no such pair, score every KB1 x KB2 combination, with a
           small bonus when the CVE ids share the same base.
        3. If one list is empty, fall back to the best candidate of the other.

        Args:
            kb1_candidates: KB1 candidates, best first.
            kb2_candidates: KB2 candidates, best first.

        Returns:
            The highest-scoring ``FusionResult``; a ``"NO_MATCH"`` result with
            zero scores when both lists are empty.
        """

        print("Running fusion analysis...")

        alpha = CONFIG['fusion_alpha']
        beta = CONFIG['fusion_beta']

        # Tuples of (score, cve_id, kb1_candidate, kb2_candidate, bonus, confidence)
        fusion_scores = []

        # Strategy 1: Direct CVE matches (highest priority).
        # When a CVE id repeats inside one list, the best-scoring candidate
        # represents it (a plain dict comprehension would keep the last one).
        kb1_cves = self._best_per_cve(kb1_candidates)
        kb2_cves = self._best_per_cve(kb2_candidates)

        for cve_id in kb1_cves.keys() & kb2_cves.keys():
            kb1_cand = kb1_cves[cve_id]
            kb2_cand = kb2_cves[cve_id]

            coherence_bonus = CONFIG['coherence_bonus']
            final_score = (alpha * kb1_cand.similarity_score +
                           beta * kb2_cand.similarity_score) + coherence_bonus
            confidence = self._clamp_unit(
                (kb1_cand.similarity_score + kb2_cand.similarity_score) / 2 + coherence_bonus)

            fusion_scores.append((final_score, cve_id, kb1_cand, kb2_cand, coherence_bonus, confidence))

        # Strategy 2: If no direct matches, cross-product all combinations
        if not fusion_scores:
            for kb1_cand in kb1_candidates:
                kb1_base = self._cve_base(kb1_cand.cve_id)
                for kb2_cand in kb2_candidates:
                    kb2_base = self._cve_base(kb2_cand.cve_id)

                    # Two missing CVE ids must not count as "related".
                    base_bonus = 0.05 if kb1_base and kb1_base == kb2_base else 0.0

                    final_score = (alpha * kb1_cand.similarity_score +
                                   beta * kb2_cand.similarity_score + base_bonus)
                    confidence = self._clamp_unit(
                        (kb1_cand.similarity_score + kb2_cand.similarity_score) / 2)

                    fusion_scores.append((final_score, kb1_cand.cve_id, kb1_cand, kb2_cand, base_bonus, confidence))

        # Strategy 3: KB1 only or KB2 only (fallback)
        if not fusion_scores:
            if kb1_candidates:
                best_kb1 = kb1_candidates[0]
                fusion_scores.append((best_kb1.similarity_score, best_kb1.cve_id, best_kb1, None,
                                      0.0, self._clamp_unit(best_kb1.similarity_score)))
            if kb2_candidates:
                best_kb2 = kb2_candidates[0]
                fusion_scores.append((best_kb2.similarity_score, best_kb2.cve_id, None, best_kb2,
                                      0.0, self._clamp_unit(best_kb2.similarity_score)))

        # No valid fusion found
        if not fusion_scores:
            return FusionResult(
                cve_id="NO_MATCH",
                final_score=0.0,
                kb1_candidate=None,
                kb2_candidate=None,
                coherence_bonus=0.0,
                confidence=0.0
            )

        # Select best fusion result (the first one wins on equal scores)
        best_score, cve_id, kb1_cand, kb2_cand, coherence_bonus, confidence = max(
            fusion_scores, key=lambda item: item[0])

        result = FusionResult(
            cve_id=cve_id,
            final_score=best_score,
            kb1_candidate=kb1_cand,
            kb2_candidate=kb2_cand,
            coherence_bonus=coherence_bonus,
            confidence=confidence
        )

        print(f"Fusion result: {cve_id} (score: {best_score:.3f}, confidence: {confidence:.3f})")
        return result

    def build_enriched_context(self, fusion_result: "FusionResult") -> "EnrichedContext":
        """
        Build the enriched context from the KB1 and KB2 sides of a fusion result.

        Args:
            fusion_result: Result of ``analyze_fusion``; either candidate may be None.

        Returns:
            An ``EnrichedContext`` whose sections are empty dicts for any side
            that is missing from ``fusion_result``.
        """

        print("Building enriched context via KB1+KB2 jointure...")

        kb1_context = {}
        kb2_context = {}
        structural_insights = {}
        behavioral_patterns = {}
        fix_examples = {}

        # Extract KB1 context
        kb1_cand = fusion_result.kb1_candidate
        if kb1_cand:
            kb1_data = kb1_cand.entry_data

            kb1_context = {
                'vulnerability_behavior': kb1_data.get('vulnerability_behavior', {}),
                'solution': kb1_data.get('solution', ''),
                'gpt_analysis': kb1_data.get('GPT_analysis', ''),
                'vulnerability_type': kb1_cand.vulnerability_type,
                'fix_strategy_type': kb1_cand.fix_strategy_type,
                'complexity_level': kb1_cand.complexity_level,
                'structural_description': kb1_data.get('structural_description', '')
            }

            fix_examples = {
                'code_before': kb1_data.get('code_before_change', ''),
                'code_after': kb1_data.get('code_after_change', ''),
                'modified_lines': kb1_data.get('modified_lines', {})
            }

            behavioral_patterns = kb1_data.get('vulnerability_behavior', {})

        # Extract KB2 context
        kb2_cand = fusion_result.kb2_candidate
        if kb2_cand:
            features = kb2_cand.entry_data.get('features', {})
            dangerous_calls = kb2_cand.dangerous_calls
            graph_stats = kb2_cand.graph_stats

            kb2_context = {
                'cve_id': kb2_cand.cve_id,
                'file_type': kb2_cand.file_type,
                'dangerous_calls': dangerous_calls,
                'graph_statistics': graph_stats,
                'security_features': features.get('security_features', {}),
                'complexity_metrics': features.get('complexity_metrics', {}),
                'code_patterns': features.get('code_patterns', {})
            }

            structural_insights = {
                'dangerous_functions_detected': dangerous_calls,
                'graph_complexity': graph_stats,
                'has_memory_operations': any(call in dangerous_calls
                                             for call in ['malloc', 'free', 'calloc', 'realloc']),
                'has_string_operations': any(call in dangerous_calls
                                             for call in ['strcpy', 'strcat', 'sprintf', 'gets']),
                # More than 200 graph nodes is labelled 'high', anything else 'medium'
                'complexity_level': 'high' if graph_stats.get('nodes', 0) > 200 else 'medium'
            }

        enriched_context = EnrichedContext(
            kb1_context=kb1_context,
            kb2_context=kb2_context,
            structural_insights=structural_insights,
            behavioral_patterns=behavioral_patterns,
            fix_examples=fix_examples
        )

        print("Enriched context built successfully")
        return enriched_context


# Initialize fusion engine
# The engine holds no state of its own; its constructor only logs a message.
fusion_engine = FusionAnalysisEngine()

# Status messages marking the end of this notebook cell
print("✅ Phase 1 complete: KB1+KB2 search engines and fusion analysis ready")
print("Next: LLM Detection/Repair + Evaluation Agent...")

HYBRID RAG FUSION SYSTEM
Initializing multimodal vulnerability detection...
✅ kb1_enriched_dir: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/processed/kb1
✅ kb2_path: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_final_with_embeddings.json
✅ kb2_export_dir: /Users/vernetemmanueladjobi/Documents/RessourcesStages/Projets/VulRAG-Hybrid-System/data/tmp/kb2_production_export

Fusion Configuration:
  KB1 weight: 0.4
  KB2 weight: 0.6
  Text model: all-MiniLM-L6-v2
  Final top-k: 5
