# RAG Evaluation Notebook

This notebook provides tools for evaluating the performance of a Retrieval-Augmented Generation (RAG) system.

## Features
- RAG system evaluation metrics
- Retrieval accuracy testing
- Response quality assessment
- A/B testing framework
- Performance benchmarking
- Response time analysis
- Quality scoring


## Setup and Configuration


In [1]:
# Import required libraries
import os
import sys
from pathlib import Path
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any, Tuple

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src.*` packages below resolve. The membership check
# keeps the cell idempotent: re-running it does not append a duplicate entry.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


# Import our modules
from src.utils.config import config_manager
from src.rag.chain import create_rag_chain, create_databricks_rag_chain

print("✅ Setup complete!")


✅ Setup complete!


## Evaluation Framework Setup


In [2]:
# Build the RAG chain under evaluation; configuration selects the backend.
print("🔧 Initializing RAG chain for evaluation...")

if not config_manager.data.use_databricks:
    # Local backend: pass the configured index directory to the factory.
    chain = create_rag_chain(
        use_databricks=False,
        index_dir=config_manager.data.index_dir,
    )
    print("✅ Local RAG chain initialized")
else:
    # Databricks backend: the factory takes no arguments here.
    chain = create_databricks_rag_chain()
    print("✅ Databricks RAG chain initialized")

# Evaluation test cases. Each entry pairs a query with the topic keywords the
# answer is expected to mention, plus a category label used for grouping.
test_cases = [
    dict(query=question, expected_topics=list(topics), category=label)
    for question, topics, label in (
        (
            "What is the project charter about?",
            ("project", "charter", "vision", "goals"),
            "general",
        ),
        (
            "What technology stack is used?",
            ("technology", "stack", "databricks", "unity catalog"),
            "technical",
        ),
        (
            "How do I set up the environment?",
            ("setup", "environment", "installation", "configuration"),
            "setup",
        ),
        (
            "What are the main features?",
            ("features", "capabilities", "functionality"),
            "features",
        ),
        (
            "What is the architecture?",
            ("architecture", "design", "components", "layers"),
            "architecture",
        ),
    )
]

print(f"📋 Loaded {len(test_cases)} test cases for evaluation")


🔧 Initializing RAG chain for evaluation...


  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)


✅ Local RAG chain initialized
📋 Loaded 5 test cases for evaluation


## Performance Evaluation


In [3]:
# Run performance evaluation
def evaluate_rag_performance(
    test_cases: List[Dict],
    chain,
    min_comprehensive_length: int = 100,
    max_concise_length: int = 1000,
) -> pd.DataFrame:
    """Run every test case through ``chain`` and collect simple quality metrics.

    Args:
        test_cases: Dicts with the keys ``"query"`` (str), ``"expected_topics"``
            (list of keyword strings) and ``"category"`` (str).
        chain: Object exposing ``invoke(query)``. The return value is treated
            as a string (``len()`` and ``.lower()`` are applied to it).
        min_comprehensive_length: A response is flagged ``is_comprehensive``
            when it is strictly longer than this many characters.
        max_concise_length: A response is flagged ``is_concise`` when it is
            strictly shorter than this many characters.

    Returns:
        A DataFrame with one row per test case and the columns ``query``,
        ``category``, ``response_time`` (seconds), ``response_length``,
        ``topic_coverage`` (fraction of expected topics found),
        ``topics_found``, ``total_topics``, ``has_sources``,
        ``is_comprehensive``, ``is_concise`` and ``response``. The columns are
        present even when ``test_cases`` is empty.

        A case whose evaluation raises is not propagated: it is recorded as a
        row with ``response_time=None``, zeroed metrics, and ``response`` set
        to ``"Error: <message>"``.
    """
    columns = [
        "query",
        "category",
        "response_time",
        "response_length",
        "topic_coverage",
        "topics_found",
        "total_topics",
        "has_sources",
        "is_comprehensive",
        "is_concise",
        "response",
    ]
    results = []
    total_cases = len(test_cases)

    print("🧪 Running performance evaluation...")

    for case_number, test_case in enumerate(test_cases, start=1):
        query = test_case["query"]
        expected_topics = test_case["expected_topics"]
        category = test_case["category"]

        # A real newline here; the previous "\\n" printed a literal backslash-n.
        print(f"\nTesting {case_number}/{total_cases}: {query}")

        try:
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            response = chain.invoke(query)
            response_time = time.perf_counter() - start_time

            response_length = len(response)
            response_lower = response.lower()

            # Simple topic coverage: case-insensitive substring match.
            topics_found = sum(
                1 for topic in expected_topics if topic.lower() in response_lower
            )
            # An empty topic list means nothing to cover; report 0.0 rather
            # than letting a ZeroDivisionError mark a good response as failed.
            topic_coverage = (
                topics_found / len(expected_topics) if expected_topics else 0.0
            )

            # Quality indicators
            has_sources = "[source:" in response_lower
            is_comprehensive = response_length > min_comprehensive_length
            is_concise = response_length < max_concise_length

            results.append({
                "query": query,
                "category": category,
                "response_time": response_time,
                "response_length": response_length,
                "topic_coverage": topic_coverage,
                "topics_found": topics_found,
                "total_topics": len(expected_topics),
                "has_sources": has_sources,
                "is_comprehensive": is_comprehensive,
                "is_concise": is_concise,
                "response": response
            })

            print(f"  ✅ Response time: {response_time:.2f}s, Coverage: {topic_coverage:.2f}")

        except Exception as e:
            # Best-effort evaluation: one failing query (e.g. a timeout) must
            # not abort the remaining cases, so record it and continue.
            print(f"  ❌ Error: {e}")
            results.append({
                "query": query,
                "category": category,
                "response_time": None,
                "response_length": 0,
                "topic_coverage": 0,
                "topics_found": 0,
                "total_topics": len(expected_topics),
                "has_sources": False,
                "is_comprehensive": False,
                "is_concise": False,
                "response": f"Error: {e}"
            })

    return pd.DataFrame(results, columns=columns)

# Run the evaluation over all test cases and keep the per-case metrics.
results_df = evaluate_rag_performance(test_cases, chain)

# A real newline here; the previous "\\n" printed a literal backslash-n.
print(f"\n📊 Evaluation complete! Processed {len(results_df)} test cases.")


🧪 Running performance evaluation...
\nTesting 1/5: What is the project charter about?
  ✅ Response time: 49.73s, Coverage: 0.50
\nTesting 2/5: What technology stack is used?
  ✅ Response time: 39.83s, Coverage: 1.00
\nTesting 3/5: How do I set up the environment?
  ✅ Response time: 57.54s, Coverage: 0.25
\nTesting 4/5: What are the main features?
  ❌ Error: Request timed out.
\nTesting 5/5: What is the architecture?
  ✅ Response time: 50.70s, Coverage: 0.25
\n📊 Evaluation complete! Processed 5 test cases.
