In [11]:
import json
import time
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import openai
import requests
from pathlib import Path

In [12]:
@dataclass
class TestQuery:
    """Test query with ground truth and evaluation criteria"""
    id: int
    query: str
    category: str
    difficulty: str
    ground_truth_answer: str
    expected_sources: List[str]
    evaluation_criteria: Dict[str, str]
    min_answer_length: int
    must_include_keywords: List[str]

In [13]:
TEST_DATASET = [
    TestQuery(
        id=1,
        query="What are hiking trails in British Columbia?",
        category="location",
        difficulty="easy",
        ground_truth_answer="British Columbia has numerous hiking trails including Summit Trail (Wilderness), Fisher-Thunder-Park-Cascade Cross-Park Trek, and Meeting of Waters Trail. These trails offer diverse wilderness experiences in BC South region.",
        expected_sources=["Summit Trail", "Fisher-Thunder-Park", "Meeting of Waters"],
        evaluation_criteria={
            "accuracy": "Must mention actual BC trails from the database",
            "completeness": "Should list at least 2-3 specific trails",
            "relevance": "Must focus on BC region specifically",
            "citations": "Should cite sources properly"
        },
        min_answer_length=200,
        must_include_keywords=["British Columbia", "trail"]
    ),
    
    TestQuery(
        id=2,
        query="Find trails with concrete surface in Quebec",
        category="surface_attribute",
        difficulty="medium",
        ground_truth_answer="Boulevard des Laurentides is a footway trail with a concrete surface located in Quebec South, Canada. It is suitable for walking and provides a paved surface for accessibility.",
        expected_sources=["Boulevard des Laurentides"],
        evaluation_criteria={
            "accuracy": "Must identify concrete surface trails specifically",
            "specificity": "Should mention exact trail names",
            "relevance": "Must filter by Quebec region AND concrete surface",
            "citations": "Should cite source for each trail mentioned"
        },
        min_answer_length=150,
        must_include_keywords=["concrete", "Quebec"]
    ),
    
    TestQuery(
        id=3,
        query="Are there wheelchair accessible trails in Ontario?",
        category="accessibility",
        difficulty="easy",
        ground_truth_answer="Yes, Ontario has wheelchair accessible trails including 'Wheelchairs only' and 'Wheelchair loading' trails in Ontario South, as well as the Accessible Trail in York Region which provides opportunities for persons with disabilities.",
        expected_sources=["Wheelchairs only", "Wheelchair loading", "Accessible Trail"],
        evaluation_criteria={
            "accuracy": "Must confirm accessibility features",
            "completeness": "Should mention multiple accessible trails",
            "helpfulness": "Should provide specific trail names and locations",
            "citations": "Should cite sources for each trail"
        },
        min_answer_length=200,
        must_include_keywords=["wheelchair", "accessible", "Ontario"]
    ),
    
    TestQuery(
        id=4,
        query="What trails allow bicycles?",
        category="activity",
        difficulty="easy",
        ground_truth_answer="Several trails permit bicycles including 'I Will Allow It' in Ontario South and 'Eazy Does It' in Alberta South. Both are path trails with dirt/rock surfaces suitable for mountain biking.",
        expected_sources=["I Will Allow It", "Eazy Does It"],
        evaluation_criteria={
            "accuracy": "Must identify bicycle-permitted trails",
            "completeness": "Should list at least 2 trails",
            "specificity": "Should mention trail names and locations",
            "citations": "Should cite sources"
        },
        min_answer_length=150,
        must_include_keywords=["bicycle"]
    ),
    
    TestQuery(
        id=5,
        query="Tell me about Banff National Park",
        category="park_information",
        difficulty="easy",
        ground_truth_answer="Banff National Park is Canada's first national park, established in 1885, and is part of the Canadian Rocky Mountain Parks UNESCO World Heritage Site. Located in Alberta, it features Rocky Mountain peaks, glacial lakes, and diverse wildlife. The park offers camping, hiking, and other outdoor activities. Important regulations include prohibition of drones (fines up to $25,000) and ice safety guidelines for winter activities.",
        expected_sources=["Banff National Park"],
        evaluation_criteria={
            "accuracy": "Must include correct historical facts (first park, 1885, UNESCO)",
            "completeness": "Should cover features, activities, and regulations",
            "organization": "Should present information in logical structure",
            "citations": "Should cite source information"
        },
        min_answer_length=300,
        must_include_keywords=["Banff", "national park", "Alberta"]
    ),
    
    TestQuery(
        id=6,
        query="What are challenging mountain trails in Alberta?",
        category="difficulty_filter",
        difficulty="hard",
        ground_truth_answer="Challenging mountain trails in Alberta include Plateau Mountain Alternate Route. However, the database has limited information on difficulty ratings. For comprehensive challenging trail information, consulting Parks Canada or Alberta Parks websites is recommended.",
        expected_sources=["Plateau Mountain"],
        evaluation_criteria={
            "accuracy": "Must acknowledge limited information in database",
            "honesty": "Should not hallucinate trail names not in sources",
            "helpfulness": "Should suggest where to find more information",
            "citations": "Should cite available sources"
        },
        min_answer_length=200,
        must_include_keywords=["Alberta", "trail"]
    ),
    
    TestQuery(
        id=7,
        query="Find beginner-friendly trails near Toronto",
        category="multi_criteria",
        difficulty="medium",
        ground_truth_answer="Beginner-friendly trails near Toronto include Maple Trail and Maple City Trail in Ontario South/Central regions. These trails feature accessible surfaces (grass, paved) and permit bicycles, making them suitable for beginners. They offer easy terrain within the greater Toronto area.",
        expected_sources=["Maple Trail", "Maple City Trail"],
        evaluation_criteria={
            "accuracy": "Must identify trails in Toronto area (Ontario South/Central)",
            "relevance": "Must filter by beginner-friendly difficulty",
            "specificity": "Should mention trail names and features",
            "helpfulness": "Should explain why they're beginner-friendly",
            "citations": "Should cite sources"
        },
        min_answer_length=200,
        must_include_keywords=["Toronto", "beginner", "trail"]
    ),
]


In [14]:
# =============================================================================
# LLM-AS-JUDGE EVALUATOR
# =============================================================================

class LLMJudge:
    """
    Uses an LLM to evaluate RAG pipeline answers
    Supports: GPT-4o, Claude, or free alternatives (Groq)
    """
    
    def __init__(
        self, 
        judge_provider: str = "groq",  # "openai", "anthropic", "groq"
        judge_model: str = "llama-3.1-8b-instant"
    ):
        self.provider = judge_provider
        self.model = judge_model
        
        if judge_provider == "openai":
            self.client = openai.OpenAI()
        elif judge_provider == "groq":
            import os
            self.api_key = os.getenv("GROQ_API_KEY")
    
    def evaluate_answer(
        self, 
        test_query: TestQuery,
        generated_answer: str,
        retrieved_sources: List[str]
    ) -> Dict:
        """
        Evaluate a generated answer against ground truth
        Returns scores and detailed feedback
        """
        
        evaluation_prompt = self._build_evaluation_prompt(
            test_query, 
            generated_answer, 
            retrieved_sources
        )
        
        # Get LLM evaluation
        evaluation_response = self._call_judge(evaluation_prompt)
        
        # Parse evaluation scores
        scores = self._parse_evaluation(evaluation_response)
        
        # Add keyword check
        scores['keyword_match'] = self._check_keywords(
            generated_answer, 
            test_query.must_include_keywords
        )
        
        # Add length check
        scores['length_adequate'] = len(generated_answer) >= test_query.min_answer_length
        
        # Calculate overall score
        scores['overall_score'] = self._calculate_overall_score(scores)
        
        return {
            'scores': scores,
            'feedback': evaluation_response,
            'passed': scores['overall_score'] >= 0.7
        }
    
    def _build_evaluation_prompt(
        self, 
        test_query: TestQuery,
        generated_answer: str,
        retrieved_sources: List[str]
    ) -> str:
        """Build comprehensive evaluation prompt"""
        
        return f"""You are an expert evaluator for a RAG (Retrieval-Augmented Generation) system about Canadian trails and parks.

            **TASK**: Evaluate the quality of the generated answer against the ground truth and criteria.

            **USER QUERY**: 
            {test_query.query}

            **GROUND TRUTH ANSWER** (Reference):
            {test_query.ground_truth_answer}

            **GENERATED ANSWER** (To Evaluate):
            {generated_answer}

            **RETRIEVED SOURCES**:
            {', '.join(retrieved_sources) if retrieved_sources else 'None'}

            **EXPECTED SOURCES**:
            {', '.join(test_query.expected_sources)}

            **EVALUATION CRITERIA**:
            {json.dumps(test_query.evaluation_criteria, indent=2)}

            **INSTRUCTIONS**:
            Evaluate the generated answer on these dimensions (score each 0.0-1.0):

            1. **Accuracy** (0-1): Are the facts correct? Does it match ground truth?
            2. **Completeness** (0-1): Does it cover all important aspects?
            3. **Relevance** (0-1): Is it directly answering the query?
            4. **Citation Quality** (0-1): Are sources properly cited?
            5. **Clarity** (0-1): Is the answer well-structured and clear?
            6. **Hallucination Check** (0-1): 1.0 if no hallucinations, 0.0 if major hallucinations

            **IMPORTANT**: 
            - Hallucination = mentioning trail names or facts NOT in retrieved sources
            - The answer should match the query category: {test_query.category}
            - Difficulty level: {test_query.difficulty}

            **OUTPUT FORMAT** (JSON only, no markdown):
            {{
                "accuracy": 0.0-1.0,
                "completeness": 0.0-1.0,
                "relevance": 0.0-1.0,
                "citation_quality": 0.0-1.0,
                "clarity": 0.0-1.0,
                "hallucination_check": 0.0-1.0,
                "reasoning": "Brief explanation of scores",
                "strengths": "What the answer does well",
                "weaknesses": "What could be improved"
            }}"""
    
    def _call_judge(self, prompt: str) -> str:
        """Call the LLM judge"""
        
        if self.provider == "groq":
            response = requests.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": self.model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.1,  # Low temp for consistent evaluation
                    "max_tokens": 1000
                },
                timeout=30
            )
            return response.json()['choices'][0]['message']['content']
        
        elif self.provider == "openai":
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=1000
            )
            return response.choices[0].message.content
        
        else:
            raise ValueError(f"Unsupported judge provider: {self.provider}")
    
    def _parse_evaluation(self, response: str) -> Dict[str, float]:
        """Parse LLM evaluation response"""
        try:
            # Try to extract JSON from response
            import re
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                eval_dict = json.loads(json_match.group())
                
                return {
                    'accuracy': float(eval_dict.get('accuracy', 0.5)),
                    'completeness': float(eval_dict.get('completeness', 0.5)),
                    'relevance': float(eval_dict.get('relevance', 0.5)),
                    'citation_quality': float(eval_dict.get('citation_quality', 0.5)),
                    'clarity': float(eval_dict.get('clarity', 0.5)),
                    'hallucination_check': float(eval_dict.get('hallucination_check', 0.5)),
                    'reasoning': eval_dict.get('reasoning', ''),
                    'strengths': eval_dict.get('strengths', ''),
                    'weaknesses': eval_dict.get('weaknesses', '')
                }
        except Exception as e:
            print(f"‚ö†Ô∏è  Error parsing evaluation: {e}")
        
        # Fallback: default scores
        return {
            'accuracy': 0.5,
            'completeness': 0.5,
            'relevance': 0.5,
            'citation_quality': 0.5,
            'clarity': 0.5,
            'hallucination_check': 0.5,
            'reasoning': 'Failed to parse evaluation',
            'strengths': '',
            'weaknesses': ''
        }
    
    def _check_keywords(self, answer: str, keywords: List[str]) -> float:
        """Check if required keywords are present"""
        if not keywords:
            return 1.0
        
        answer_lower = answer.lower()
        matches = sum(1 for kw in keywords if kw.lower() in answer_lower)
        return matches / len(keywords)
    
    def _calculate_overall_score(self, scores: Dict) -> float:
        """Calculate weighted overall score"""
        weights = {
            'accuracy': 0.25,
            'completeness': 0.15,
            'relevance': 0.20,
            'citation_quality': 0.10,
            'clarity': 0.10,
            'hallucination_check': 0.20
        }
        
        total = sum(
            scores.get(metric, 0) * weight 
            for metric, weight in weights.items()
        )
        
        # Penalize if keywords missing or length inadequate
        if not scores.get('keyword_match', 1.0) >= 0.5:
            total *= 0.8
        if not scores.get('length_adequate', True):
            total *= 0.9
        
        return min(1.0, total)


In [15]:
# =============================================================================
# AUTOMATED EVALUATION RUNNER
# =============================================================================

class RAGEvaluator:
    """
    Complete evaluation system for RAG pipelines
    """
    
    def __init__(self, judge_provider: str = "groq"):
        self.judge = LLMJudge(judge_provider)
        self.results = []
    
    def evaluate_pipeline(
        self, 
        pipeline_name: str,
        pipeline_function,
        test_queries: List[TestQuery] = None
    ) -> Dict:
        """
        Evaluate a RAG pipeline on test dataset
        
        Args:
            pipeline_name: Name of the pipeline
            pipeline_function: Function that takes a query and returns answer dict
            test_queries: List of test queries (uses TEST_DATASET if None)
        """
        
        if test_queries is None:
            test_queries = TEST_DATASET
        
        print(f"\n{'='*70}")
        print(f"üß™ EVALUATING: {pipeline_name}")
        print(f"{'='*70}")
        
        pipeline_results = {
            'pipeline_name': pipeline_name,
            'timestamp': datetime.now().isoformat(),
            'query_results': [],
            'aggregate_metrics': {}
        }
        
        for test_query in test_queries:
            print(f"\n[{test_query.id}/{len(test_queries)}] {test_query.query[:50]}...")
            
            try:
                # Get answer from pipeline
                start_time = time.time()
                answer_result = pipeline_function(test_query.query)
                query_time = time.time() - start_time
                
                # Evaluate answer
                evaluation = self.judge.evaluate_answer(
                    test_query,
                    answer_result['answer'],
                    answer_result.get('sources', [])
                )
                
                # Store results
                query_result = {
                    'query_id': test_query.id,
                    'query': test_query.query,
                    'category': test_query.category,
                    'difficulty': test_query.difficulty,
                    'generated_answer': answer_result['answer'],
                    'sources': answer_result.get('sources', []),
                    'query_time': query_time,
                    'evaluation': evaluation,
                    'passed': evaluation['passed']
                }
                
                pipeline_results['query_results'].append(query_result)
                
                # Print quick feedback
                score = evaluation['scores']['overall_score']
                status = "‚úÖ PASS" if evaluation['passed'] else "‚ùå FAIL"
                print(f"   {status} | Score: {score:.2f} | Time: {query_time:.2f}s")
                
            except Exception as e:
                print(f"   ‚ùå ERROR: {str(e)}")
                pipeline_results['query_results'].append({
                    'query_id': test_query.id,
                    'query': test_query.query,
                    'error': str(e),
                    'passed': False
                })
        
        # Calculate aggregate metrics
        pipeline_results['aggregate_metrics'] = self._calculate_aggregate_metrics(
            pipeline_results['query_results']
        )
        
        self.results.append(pipeline_results)
        return pipeline_results
    
    def _calculate_aggregate_metrics(self, query_results: List[Dict]) -> Dict:
        """Calculate aggregate metrics across all queries"""
        
        successful = [r for r in query_results if 'evaluation' in r]
        
        if not successful:
            return {'error': 'No successful evaluations'}
        
        # Average scores
        avg_scores = {}
        score_keys = ['accuracy', 'completeness', 'relevance', 'citation_quality', 
                     'clarity', 'hallucination_check', 'overall_score']
        
        for key in score_keys:
            scores = [r['evaluation']['scores'][key] for r in successful]
            avg_scores[f'avg_{key}'] = sum(scores) / len(scores)
        
        # Pass rate
        passed = sum(1 for r in successful if r['passed'])
        pass_rate = passed / len(successful)
        
        # Average time
        avg_time = sum(r['query_time'] for r in successful) / len(successful)
        
        # Category breakdown
        category_scores = {}
        for result in successful:
            cat = result['category']
            if cat not in category_scores:
                category_scores[cat] = []
            category_scores[cat].append(result['evaluation']['scores']['overall_score'])
        
        category_avg = {
            cat: sum(scores) / len(scores) 
            for cat, scores in category_scores.items()
        }
        
        return {
            **avg_scores,
            'pass_rate': pass_rate,
            'avg_query_time': avg_time,
            'total_queries': len(query_results),
            'successful_queries': len(successful),
            'category_breakdown': category_avg
        }
    
    def compare_pipelines(self) -> Dict:
        """Compare all evaluated pipelines"""
        
        if len(self.results) < 2:
            print("‚ö†Ô∏è  Need at least 2 pipelines to compare")
            return {}
        
        print(f"\n{'='*70}")
        print("üìä PIPELINE COMPARISON")
        print(f"{'='*70}\n")
        
        # Comparison table
        print(f"{'Pipeline':<30} {'Score':<10} {'Pass Rate':<12} {'Avg Time'}")
        print("-" * 70)
        
        for result in self.results:
            metrics = result['aggregate_metrics']
            print(f"{result['pipeline_name']:<30} "
                  f"{metrics['avg_overall_score']:.3f}    "
                  f"{metrics['pass_rate']*100:.0f}%{'':<8} "
                  f"{metrics['avg_query_time']:.2f}s")
        
        # Find winners
        best_quality = max(self.results, key=lambda x: x['aggregate_metrics']['avg_overall_score'])
        fastest = min(self.results, key=lambda x: x['aggregate_metrics']['avg_query_time'])
        best_pass_rate = max(self.results, key=lambda x: x['aggregate_metrics']['pass_rate'])
        
        print(f"\nüèÜ WINNERS:")
        print(f"   Best Quality: {best_quality['pipeline_name']} "
              f"(score: {best_quality['aggregate_metrics']['avg_overall_score']:.3f})")
        print(f"   Fastest: {fastest['pipeline_name']} "
              f"(time: {fastest['aggregate_metrics']['avg_query_time']:.2f}s)")
        print(f"   Best Pass Rate: {best_pass_rate['pipeline_name']} "
              f"(rate: {best_pass_rate['aggregate_metrics']['pass_rate']*100:.0f}%)")
        
        return {
            'best_quality': best_quality['pipeline_name'],
            'fastest': fastest['pipeline_name'],
            'best_pass_rate': best_pass_rate['pipeline_name'],
            'recommendation': self._get_recommendation()
        }
    
    def _get_recommendation(self) -> str:
        """Get final recommendation based on all metrics"""
        
        if not self.results:
            return "No pipelines evaluated"
        
        # Calculate balanced score (quality * speed factor)
        scores = []
        for result in self.results:
            metrics = result['aggregate_metrics']
            quality = metrics['avg_overall_score']
            time_factor = 1.0 / (1.0 + metrics['avg_query_time'] / 10.0)
            balanced = quality * (0.7 + 0.3 * time_factor)
            
            scores.append({
                'name': result['pipeline_name'],
                'balanced_score': balanced,
                'quality': quality,
                'time': metrics['avg_query_time']
            })
        
        best = max(scores, key=lambda x: x['balanced_score'])
        
        return f"{best['name']} (Quality: {best['quality']:.3f}, Time: {best['time']:.2f}s)"
    
    def save_results(self, filename: str = None):
        """Save evaluation results to JSON"""
        
        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"rag_evaluation_{timestamp}.json"
        
        with open(filename, 'w') as f:
            json.dump({
                'evaluation_results': self.results,
                'timestamp': datetime.now().isoformat(),
                'test_dataset_size': len(TEST_DATASET)
            }, f, indent=2)
        
        print(f"\nüíæ Results saved to: {filename}")
        return filename


In [16]:
import subprocess

# Convert pipeline.ipynb to pipeline.py
subprocess.run([
    "jupyter", "nbconvert", 
    "--to", "python", 
    "pipeline.ipynb"
])

print("‚úÖ Created pipeline.py")
from pipeline import FreeRAGPipeline, FREE_PIPELINES

‚úÖ Created pipeline.py


In [17]:
def example_usage():
    """
    Example: Evaluate your free RAG pipelines
    """
    
    # Initialize evaluator
    evaluator = RAGEvaluator(judge_provider="groq")  # Free judge!
    
    # Pipelines to evaluate
    pipelines_to_test = [
        ("Groq Fast XLarge", FREE_PIPELINES["groq_fast_xlarge"]),
        ("Ollama quality", FREE_PIPELINES["ollama_quality"]),
        ("Ollama Speed", FREE_PIPELINES["ollama_speed"])
    ]
    
    # Evaluate each pipeline
    for name, config in pipelines_to_test:
        # Create pipeline
        pipeline = FreeRAGPipeline(config)
        
        # Wrapper function for evaluator
        def pipeline_func(query: str) -> Dict:
            result = pipeline.query(query)
            return {
                'answer': result['answer'],
                'sources': result['sources']
            }
        
        # Evaluate
        evaluator.evaluate_pipeline(name, pipeline_func)
    
    # Compare results
    comparison = evaluator.compare_pipelines()
    
    # Save results
    evaluator.save_results()
    
    print(f"\n‚úÖ EVALUATION COMPLETE!")
    print(f"üèÜ Recommended Pipeline: {comparison['recommendation']}")


if __name__ == "__main__":
    example_usage()

üìÅ loading collection from: data/vector_db/extra_large_minilm
‚ö° Using Groq: llama-3.1-8b-instant

üß™ EVALUATING: Groq Fast XLarge

[1/7] What are hiking trails in British Columbia?...

üöÄ üÜì Groq Fast - XLarge (Llama 3.1)
‚ùì Query: What are hiking trails in British Columbia?

üì• Retrieving documents...
üó∫Ô∏è  Detected province: british columbia -> ['British Columbia South', 'British Columbia North']
   ‚úÖ Retrieved 5 results
üí¨ Generating answer...

‚è±Ô∏è  Total: 6.69s | üí∞ Cost: $0.00 (FREE!)

üìä Answer:
There are several hiking trails in British Columbia South, Canada. Some of the notable trails include:

1. **Summit Trail (Wilderness)**: This is a hiking trail located in the Okanogan-Wenatchee National Forest, part of the lwn network. For more information, visit https://www.fs.usda.gov/recarea/okawen/recreation/hiking/recarea/?recid=57353 [Source 1].

2. **Fisher-Thunder-Park-Cascade Cross-Park Trek**: This is a hiking trail located in North Cascades National 

## Result visualization

In [18]:
"""
Evaluation Dashboard - Visualize RAG Pipeline Performance
Creates comprehensive charts and reports
"""

import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime


class EvaluationDashboard:
    """
    Create beautiful visualizations of evaluation results
    """
    
    def __init__(self, results_file: str):
        """Load evaluation results"""
        with open(results_file, 'r') as f:
            data = json.load(f)
        
        self.results = data['evaluation_results']
        self.timestamp = data['timestamp']
        
        # Set style
        sns.set_style("whitegrid")
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.size'] = 10
    
    def create_all_charts(self):
        """Generate all visualization charts"""
        print("\n" + "="*70)
        print("üìä GENERATING EVALUATION DASHBOARD")
        print("="*70 + "\n")
        
        self.create_overall_scores_chart()
        self.create_metric_breakdown_radar()
        self.create_category_performance()
        self.create_pass_rate_chart()
        self.create_time_vs_quality_scatter()
        self.create_detailed_comparison_table()
        
        print("\n" + "="*70)
        print("‚úÖ ALL CHARTS GENERATED!")
        print("="*70)
    
    def create_overall_scores_chart(self):
        """Overall quality scores comparison"""
        pipelines = [r['pipeline_name'] for r in self.results]
        scores = [r['aggregate_metrics']['avg_overall_score'] for r in self.results]
        
        fig, ax = plt.subplots(figsize=(12, 6))
        
        colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(pipelines)))
        bars = ax.barh(pipelines, scores, color=colors, alpha=0.8)
        
        # Add value labels
        for bar, score in zip(bars, scores):
            width = bar.get_width()
            ax.text(width + 0.01, bar.get_y() + bar.get_height()/2,
                   f'{score:.3f}',
                   ha='left', va='center', fontweight='bold', fontsize=11)
        
        # Add target line
        ax.axvline(0.7, color='red', linestyle='--', alpha=0.5, label='Target (0.7)')
        
        ax.set_xlabel('Overall Quality Score', fontsize=12, fontweight='bold')
        ax.set_title('üèÜ Overall Pipeline Quality Comparison', fontsize=14, fontweight='bold', pad=20)
        ax.set_xlim(0, 1.0)
        ax.legend()
        ax.grid(axis='x', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig('eval_overall_scores.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_overall_scores.png")
        plt.close()
    
    def create_metric_breakdown_radar(self):
        """Radar chart comparing all metrics"""
        metrics = ['accuracy', 'completeness', 'relevance', 
                  'citation_quality', 'clarity', 'hallucination_check']
        
        num_pipelines = len(self.results)
        if num_pipelines > 4:
            num_pipelines = 4  # Limit for readability
        
        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))
        
        angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
        angles += angles[:1]  # Complete the circle
        
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
        
        for idx, result in enumerate(self.results[:num_pipelines]):
            values = [result['aggregate_metrics'][f'avg_{m}'] for m in metrics]
            values += values[:1]  # Complete the circle
            
            ax.plot(angles, values, 'o-', linewidth=2, 
                   label=result['pipeline_name'], color=colors[idx])
            ax.fill(angles, values, alpha=0.15, color=colors[idx])
        
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics], fontsize=10)
        ax.set_ylim(0, 1)
        ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
        ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=8)
        ax.grid(True, alpha=0.3)
        
        ax.set_title('üìä Metric Breakdown Comparison', 
                    fontsize=14, fontweight='bold', pad=20, y=1.08)
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
        
        plt.tight_layout()
        plt.savefig('eval_metric_radar.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_metric_radar.png")
        plt.close()
    
    def create_category_performance(self):
        """Performance by query category"""
        categories = set()
        for result in self.results:
            categories.update(result['aggregate_metrics']['category_breakdown'].keys())
        
        categories = sorted(categories)
        
        fig, ax = plt.subplots(figsize=(14, 6))
        
        x = np.arange(len(categories))
        width = 0.8 / len(self.results)
        
        for idx, result in enumerate(self.results):
            scores = [
                result['aggregate_metrics']['category_breakdown'].get(cat, 0)
                for cat in categories
            ]
            
            offset = (idx - len(self.results)/2) * width + width/2
            ax.bar(x + offset, scores, width, 
                  label=result['pipeline_name'], alpha=0.8)
        
        ax.set_xlabel('Query Category', fontsize=12, fontweight='bold')
        ax.set_ylabel('Average Score', fontsize=12, fontweight='bold')
        ax.set_title('üìÇ Performance by Query Category', fontsize=14, fontweight='bold', pad=20)
        ax.set_xticks(x)
        ax.set_xticklabels([c.replace('_', ' ').title() for c in categories], rotation=45, ha='right')
        ax.set_ylim(0, 1.0)
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig('eval_category_performance.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_category_performance.png")
        plt.close()
    
    def create_pass_rate_chart(self):
        """Pass rate comparison"""
        pipelines = [r['pipeline_name'] for r in self.results]
        pass_rates = [r['aggregate_metrics']['pass_rate'] * 100 for r in self.results]
        
        fig, ax = plt.subplots(figsize=(12, 6))
        
        colors = ['#2ECC71' if pr >= 70 else '#E74C3C' for pr in pass_rates]
        bars = ax.barh(pipelines, pass_rates, color=colors, alpha=0.8)
        
        # Add value labels
        for bar, rate in zip(bars, pass_rates):
            width = bar.get_width()
            ax.text(width + 1, bar.get_y() + bar.get_height()/2,
                   f'{rate:.0f}%',
                   ha='left', va='center', fontweight='bold', fontsize=11)
        
        # Add target line
        ax.axvline(70, color='orange', linestyle='--', alpha=0.7, 
                  linewidth=2, label='Target (70%)')
        
        ax.set_xlabel('Pass Rate (%)', fontsize=12, fontweight='bold')
        ax.set_title('‚úÖ Query Success Pass Rate', fontsize=14, fontweight='bold', pad=20)
        ax.set_xlim(0, 105)
        ax.legend()
        ax.grid(axis='x', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig('eval_pass_rate.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_pass_rate.png")
        plt.close()
    
    def create_time_vs_quality_scatter(self):
        """Speed vs Quality tradeoff"""
        fig, ax = plt.subplots(figsize=(12, 8))
        
        for result in self.results:
            metrics = result['aggregate_metrics']
            quality = metrics['avg_overall_score']
            time = metrics['avg_query_time']
            
            ax.scatter(time, quality, s=500, alpha=0.6, edgecolors='black', linewidth=2)
            ax.annotate(result['pipeline_name'], (time, quality),
                       xytext=(10, 10), textcoords='offset points',
                       fontsize=10, fontweight='bold',
                       bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8))
        
        # Add quadrant lines
        if self.results:
            median_time = np.median([r['aggregate_metrics']['avg_query_time'] for r in self.results])
            median_quality = np.median([r['aggregate_metrics']['avg_overall_score'] for r in self.results])
            
            ax.axvline(median_time, color='gray', linestyle='--', alpha=0.5)
            ax.axhline(median_quality, color='gray', linestyle='--', alpha=0.5)
            
            # Quadrant labels
            ax.text(0.02, 0.98, 'Fast & High Quality\n(IDEAL)', 
                   transform=ax.transAxes, fontsize=10, va='top', 
                   bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.3))
            ax.text(0.98, 0.98, 'Slow but High Quality', 
                   transform=ax.transAxes, fontsize=10, va='top', ha='right',
                   bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.3))
            ax.text(0.02, 0.02, 'Fast but Low Quality', 
                   transform=ax.transAxes, fontsize=10, va='bottom',
                   bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.3))
            ax.text(0.98, 0.02, 'Slow & Low Quality\n(WORST)', 
                   transform=ax.transAxes, fontsize=10, va='bottom', ha='right',
                   bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.3))
        
        ax.set_xlabel('‚ö° Average Query Time (seconds) - Lower is Better', 
                     fontsize=12, fontweight='bold')
        ax.set_ylabel('üìö Quality Score - Higher is Better', 
                     fontsize=12, fontweight='bold')
        ax.set_title('üéØ Speed vs Quality Tradeoff', fontsize=14, fontweight='bold', pad=20)
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 1.05)
        
        plt.tight_layout()
        plt.savefig('eval_speed_vs_quality.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_speed_vs_quality.png")
        plt.close()
    
    def create_detailed_comparison_table(self):
        """Comprehensive comparison table"""
        data = []
        for result in self.results:
            metrics = result['aggregate_metrics']
            data.append([
                result['pipeline_name'],
                f"{metrics['avg_overall_score']:.3f}",
                f"{metrics['avg_accuracy']:.3f}",
                f"{metrics['avg_completeness']:.3f}",
                f"{metrics['avg_hallucination_check']:.3f}",
                f"{metrics['pass_rate']*100:.0f}%",
                f"{metrics['avg_query_time']:.2f}s"
            ])
        
        fig, ax = plt.subplots(figsize=(16, len(data) + 2))
        ax.axis('tight')
        ax.axis('off')
        
        columns = ['Pipeline', 'Overall', 'Accuracy', 'Complete', 
                  'No Halluc.', 'Pass Rate', 'Avg Time']
        
        table = ax.table(cellText=data, colLabels=columns, 
                        cellLoc='center', loc='center',
                        colWidths=[0.25, 0.12, 0.12, 0.12, 0.12, 0.12, 0.15])
        
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 2.5)
        
        # Style header
        for i in range(len(columns)):
            table[(0, i)].set_facecolor('#4A90E2')
            table[(0, i)].set_text_props(weight='bold', color='white', fontsize=11)
        
        # Style cells - color code by performance
        for i in range(1, len(data) + 1):
            # Overall score cell
            overall_score = float(data[i-1][1])
            if overall_score >= 0.8:
                table[(i, 1)].set_facecolor('#90EE90')  # Green
            elif overall_score >= 0.7:
                table[(i, 1)].set_facecolor('#FFE4B5')  # Yellow
            else:
                table[(i, 1)].set_facecolor('#FFB6C1')  # Red
            
            # Alternating row colors
            if i % 2 == 0:
                for j in range(len(columns)):
                    if j != 1:  # Don't override overall score color
                        table[(i, j)].set_facecolor('#F5F5F5')
        
        plt.title('üìä Detailed Pipeline Comparison', 
                 fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.savefig('eval_comparison_table.png', dpi=300, bbox_inches='tight')
        print("‚úÖ Saved: eval_comparison_table.png")
        plt.close()
    
    def generate_report(self):
        """Generate text report"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        report_file = f"evaluation_report_{timestamp}.txt"
        
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("="*70 + "\n")
            f.write("RAG PIPELINE EVALUATION REPORT\n")
            f.write("="*70 + "\n\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Pipelines Evaluated: {len(self.results)}\n\n")
            
            # Overall rankings
            f.write("="*70 + "\n")
            f.write("üèÜ OVERALL RANKINGS\n")
            f.write("="*70 + "\n\n")
            
            sorted_by_quality = sorted(
                self.results, 
                key=lambda x: x['aggregate_metrics']['avg_overall_score'], 
                reverse=True
            )
            
            for idx, result in enumerate(sorted_by_quality, 1):
                metrics = result['aggregate_metrics']
                f.write(f"{idx}. {result['pipeline_name']}\n")
                f.write(f"   Quality Score: {metrics['avg_overall_score']:.3f}\n")
                f.write(f"   Pass Rate: {metrics['pass_rate']*100:.0f}%\n")
                f.write(f"   Avg Time: {metrics['avg_query_time']:.2f}s\n\n")
            
            # Detailed breakdown
            f.write("\n" + "="*70 + "\n")
            f.write("üìä DETAILED BREAKDOWN\n")
            f.write("="*70 + "\n\n")
            
            for result in self.results:
                metrics = result['aggregate_metrics']
                f.write(f"\n{result['pipeline_name']}\n")
                f.write("-" * 70 + "\n")
                f.write(f"Overall Score:        {metrics['avg_overall_score']:.3f}\n")
                f.write(f"Accuracy:             {metrics['avg_accuracy']:.3f}\n")
                f.write(f"Completeness:         {metrics['avg_completeness']:.3f}\n")
                f.write(f"Relevance:            {metrics['avg_relevance']:.3f}\n")
                f.write(f"Citation Quality:     {metrics['avg_citation_quality']:.3f}\n")
                f.write(f"Clarity:              {metrics['avg_clarity']:.3f}\n")
                f.write(f"Hallucination Check:  {metrics['avg_hallucination_check']:.3f}\n")
                f.write(f"Pass Rate:            {metrics['pass_rate']*100:.0f}%\n")
                f.write(f"Avg Query Time:       {metrics['avg_query_time']:.2f}s\n")
                
                # Category breakdown
                f.write("\nCategory Performance:\n")
                for cat, score in metrics['category_breakdown'].items():
                    f.write(f"  {cat.replace('_', ' ').title():<20} {score:.3f}\n")
            
            # Recommendations
            f.write("\n" + "="*70 + "\n")
            f.write("üí° RECOMMENDATIONS\n")
            f.write("="*70 + "\n\n")
            
            best_quality = max(self.results, 
                             key=lambda x: x['aggregate_metrics']['avg_overall_score'])
            fastest = min(self.results, 
                         key=lambda x: x['aggregate_metrics']['avg_query_time'])
            best_pass = max(self.results, 
                          key=lambda x: x['aggregate_metrics']['pass_rate'])
            
            f.write(f"üèÜ Best Quality: {best_quality['pipeline_name']}\n")
            f.write(f"   Score: {best_quality['aggregate_metrics']['avg_overall_score']:.3f}\n\n")
            
            f.write(f"‚ö° Fastest: {fastest['pipeline_name']}\n")
            f.write(f"   Time: {fastest['aggregate_metrics']['avg_query_time']:.2f}s\n\n")
            
            f.write(f"‚úÖ Best Pass Rate: {best_pass['pipeline_name']}\n")
            f.write(f"   Rate: {best_pass['aggregate_metrics']['pass_rate']*100:.0f}%\n\n")
            
            # Calculate balanced recommendation
            scores = []
            for result in self.results:
                metrics = result['aggregate_metrics']
                quality = metrics['avg_overall_score']
                time_factor = 1.0 / (1.0 + metrics['avg_query_time'] / 10.0)
                balanced = quality * (0.7 + 0.3 * time_factor)
                scores.append((result['pipeline_name'], balanced, quality, metrics['avg_query_time']))
            
            best_balanced = max(scores, key=lambda x: x[1])
            
            f.write(f"‚öñÔ∏è  Recommended (Balanced): {best_balanced[0]}\n")
            f.write(f"   Balanced Score: {best_balanced[1]:.3f}\n")
            f.write(f"   Quality: {best_balanced[2]:.3f} | Time: {best_balanced[3]:.2f}s\n")
        
        print(f"‚úÖ Saved: {report_file}")
        return report_file


def create_dashboard(results_file: str):
    """Create complete evaluation dashboard"""
    dashboard = EvaluationDashboard(results_file)
    dashboard.create_all_charts()
    dashboard.generate_report()
    
    print("\n" + "="*70)
    print("üìä DASHBOARD COMPLETE")
    print("="*70)
    print("\nGenerated files:")
    print("  üìä eval_overall_scores.png")
    print("  üìä eval_metric_radar.png")
    print("  üìä eval_category_performance.png")
    print("  üìä eval_pass_rate.png")
    print("  üìä eval_speed_vs_quality.png")
    print("  üìä eval_comparison_table.png")
    print("  üìù evaluation_report_*.txt")


In [19]:
def find_latest_evaluation_file():
    """Find most recent evaluation results file"""
    eval_files = list(Path(".").glob("rag_evaluation_*.json"))
    
    if not eval_files:
        print("‚ùå No evaluation files found!")
        print("   Run the evaluation system first to generate results.")
        return None
    
    latest_file = max(eval_files, key=lambda x: x.stat().st_mtime)
    print(f"üìÇ Found latest evaluation: {latest_file}")
    return str(latest_file)


def run_dashboard(results_file: str = None):
    """
    Run the evaluation dashboard
    
    Usage:
        run_dashboard()                          # Auto-finds latest file
        run_dashboard("rag_evaluation_*.json")   # Use specific file
    """
    
    if results_file is None:
        results_file = find_latest_evaluation_file()
        if results_file is None:
            return
    
    print(f"\nüìä Creating dashboard from: {results_file}")
    create_dashboard(results_file)


# Run automatically when cell is executed
# Comment out this line if you don't want auto-run
print("\n" + "="*70)
print("üìä EVALUATION DASHBOARD")
print("="*70)

run_dashboard()


üìä EVALUATION DASHBOARD
üìÇ Found latest evaluation: rag_evaluation_20260111_123308.json

üìä Creating dashboard from: rag_evaluation_20260111_123308.json

üìä GENERATING EVALUATION DASHBOARD



  plt.tight_layout()
  plt.savefig('eval_overall_scores.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_overall_scores.png


  plt.tight_layout()
  plt.savefig('eval_metric_radar.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_metric_radar.png


  plt.tight_layout()
  plt.savefig('eval_category_performance.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_category_performance.png


  plt.tight_layout()
  plt.savefig('eval_pass_rate.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_pass_rate.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig('eval_speed_vs_quality.png', dpi=300, bbox_inches='tight')
  plt.savefig('eval_speed_vs_quality.png', dpi=300, bbox_inches='tight')
  plt.savefig('eval_speed_vs_quality.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_speed_vs_quality.png


  plt.tight_layout()
  plt.savefig('eval_comparison_table.png', dpi=300, bbox_inches='tight')


‚úÖ Saved: eval_comparison_table.png

‚úÖ ALL CHARTS GENERATED!
‚úÖ Saved: evaluation_report_20260111_123316.txt

üìä DASHBOARD COMPLETE

Generated files:
  üìä eval_overall_scores.png
  üìä eval_metric_radar.png
  üìä eval_category_performance.png
  üìä eval_pass_rate.png
  üìä eval_speed_vs_quality.png
  üìä eval_comparison_table.png
  üìù evaluation_report_*.txt
