# 🤖 Multi-Agent Research System Evaluation Notebook

This notebook provides a comprehensive interface for running and evaluating the multi-agent research system with Phoenix observability.

## 📋 Features
- Interactive evaluation controls
- Real-time progress tracking
- Phoenix tracing integration
- Comprehensive result analysis
- Quality metrics visualization


## 🔧 Setup and Imports

In [None]:
import sys
import os
import asyncio
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from typing import Dict, List, Any
import json
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path.
project_root = Path().absolute().parent
sys.path[:0] = [str(project_root)]

print("📁 Project root: {}".format(project_root))
print("🐍 Python path updated")

In [None]:
# Import multi-agent system components
from agents.multi_agents import MultiAgentResearchSystem, initialize_system
from agents.supervisor import SupervisorAgent
from agents.search import SearchAgent
from agents.citation import CitationAgent
from config.settings import settings, ReasoningEffort, Verbosity
from evaluation_dataset import EVALUATION_QUERIES, get_queries_by_complexity
from evaluation.phoenix_integration import phoenix_integration, start_evaluation_session

print("✅ Multi-agent system imports successful")

In [None]:
# Root logging for the notebook: INFO and above through one stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Raise the threshold of chatty library loggers to WARNING.
for _noisy_library in ('httpx', 'openai'):
    logging.getLogger(_noisy_library).setLevel(logging.WARNING)

# Logger used by the evaluation helpers defined in later cells.
logger = logging.getLogger(__name__)
print("📝 Logging configured")

## 🚀 System Initialization

In [None]:
# Initialize the multi-agent research system
# The supervisor gets MEDIUM reasoning effort and verbosity; the search and
# citation agents get LOW reasoning effort; Phoenix tracing is switched on.
# NOTE(review): `initialize_system` is imported above but never called, while
# the final status cell reads `system.is_initialized` — confirm that
# constructing the object is all the setup that is required.
system = MultiAgentResearchSystem(
    supervisor_reasoning=ReasoningEffort.MEDIUM,
    supervisor_verbosity=Verbosity.MEDIUM,
    search_reasoning=ReasoningEffort.LOW,
    citation_reasoning=ReasoningEffort.LOW,
    enable_phoenix_tracing=True
)

print("🤖 Multi-Agent Research System Initialized")
# Lists the keys of the supervisor's `sub_agents` mapping.
print(f"📊 Agents: {list(system.supervisor.sub_agents.keys())}")
print(f"🔍 Phoenix Tracing: {'Enabled' if system.enable_phoenix_tracing else 'Disabled'}")

## 🧪 Phoenix Integration Setup

In [None]:
# Start Phoenix evaluation session
# The session name carries a second-resolution local timestamp.
# NOTE: the top-level `await` below needs an async-aware shell such as
# IPython/Jupyter; it is a syntax error in a plain Python script.
session_name = f"notebook_evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
session_id = await start_evaluation_session(session_name)

print(f"🔥 Phoenix Session Started: {session_id}")
# NOTE(review): the UI address is hard-coded here rather than read from the
# Phoenix integration — confirm it matches the running Phoenix instance.
print(f"📍 Access Phoenix UI at: http://localhost:6006")

## 📊 Evaluation Dataset Overview

In [None]:
# Display evaluation dataset summary
def analyze_dataset():
    """Summarise ``EVALUATION_QUERIES`` as plain count dictionaries.

    Returns:
        A dict with the keys ``'total'`` (number of queries), ``'complexity'``
        (counts per ``'expected_model'`` value), ``'types'`` (counts per
        ``'type'``), ``'domains'`` (counts per ``'domain'``) and
        ``'current_info'`` (number of queries with a truthy
        ``'requires_current_info'``). Missing fields are counted under
        ``'unknown'``.
    """
    def tally(field):
        # Frequency table for one query field, in first-seen order.
        counts = {}
        for entry in EVALUATION_QUERIES:
            label = entry.get(field, 'unknown')
            counts[label] = counts.get(label, 0) + 1
        return counts

    needs_current_info = sum(
        1 for entry in EVALUATION_QUERIES
        if entry.get('requires_current_info', False)
    )

    return {
        'total': len(EVALUATION_QUERIES),
        'complexity': tally('expected_model'),
        'types': tally('type'),
        'domains': tally('domain'),
        'current_info': needs_current_info
    }

dataset_stats = analyze_dataset()

# Ten most frequent domains, largest first; later cells reuse this list.
sorted_domains = sorted(
    dataset_stats['domains'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

print("📈 Evaluation Dataset Analysis")
print("=" * 50)
print(f"📊 Total Queries: {dataset_stats['total']}")
print(f"🔍 Require Current Info: {dataset_stats['current_info']}")

# Each breakdown is printed as a blank line, a heading, then one row per label.
for _heading, _rows in (
    ("📋 By Complexity:", dataset_stats['complexity'].items()),
    ("📝 By Type:", dataset_stats['types'].items()),
    ("🏷️ By Domain (top 10):", sorted_domains),
):
    print()
    print(_heading)
    for _label, _count in _rows:
        print(f"  {_label}: {_count} queries")

In [None]:
# Visualize dataset distribution
# Draws a 2x2 grid from the module-level `dataset_stats` and `sorted_domains`
# produced by the dataset-summary cell: three pie charts and one bar chart.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Evaluation Dataset Distribution', fontsize=16, fontweight='bold')

# Complexity distribution
# (labels are the 'expected_model' values counted by analyze_dataset)
complexities = list(dataset_stats['complexity'].keys())
complexity_counts = list(dataset_stats['complexity'].values())
axes[0,0].pie(complexity_counts, labels=complexities, autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Distribution by Complexity')

# Query type distribution
types = list(dataset_stats['types'].keys())
type_counts = list(dataset_stats['types'].values())
axes[0,1].pie(type_counts, labels=types, autopct='%1.1f%%', startangle=90)
axes[0,1].set_title('Distribution by Query Type')

# Top domains
# `sorted_domains` is already capped at 10 entries; only the first 8 are drawn.
top_domains = sorted_domains[:8]
domain_names = [d[0] for d in top_domains]
domain_counts = [d[1] for d in top_domains]
axes[1,0].bar(range(len(domain_names)), domain_counts)
axes[1,0].set_xticks(range(len(domain_names)))
axes[1,0].set_xticklabels(domain_names, rotation=45, ha='right')
axes[1,0].set_title('Top 8 Domains')
axes[1,0].set_ylabel('Number of Queries')

# Current info requirement
# Second slice is every query that does not require current information.
current_info_data = ['Requires Current Info', 'Historical Info OK']
current_info_counts = [dataset_stats['current_info'], dataset_stats['total'] - dataset_stats['current_info']]
axes[1,1].pie(current_info_counts, labels=current_info_data, autopct='%1.1f%%', startangle=90)
axes[1,1].set_title('Current Information Requirements')

plt.tight_layout()
plt.show()

## 🎮 Interactive Evaluation Controls

In [None]:
# Interactive widgets for evaluation control
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

# Evaluation control widgets
# Upper bound on how many matching queries one run evaluates (1-20, default 5).
max_queries_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=20,
    step=1,
    description='Max Queries:',
    style={'description_width': 'initial'},
    layout={'width': '400px'}
)

# Multi-select compared against each query's 'expected_model' by the run handler.
# NOTE(review): the options are hard-coded model names, so a query whose
# 'expected_model' is anything else can never be selected — confirm they
# match the dataset.
complexity_filter = widgets.SelectMultiple(
    options=['gpt-5-nano', 'gpt-5-mini', 'gpt-5'],
    value=['gpt-5-nano', 'gpt-5-mini', 'gpt-5'],
    description='Complexity Levels:',
    style={'description_width': 'initial'},
    layout={'width': '400px', 'height': '80px'}
)

# 'All' plus the domain names from `sorted_domains` (at most ten).
domain_filter = widgets.Dropdown(
    options=['All'] + [d[0] for d in sorted_domains[:10]],
    value='All',
    description='Domain Filter:',
    style={'description_width': 'initial'},
    layout={'width': '400px'}
)

# The click handler is attached in a later cell.
run_button = widgets.Button(
    description='🚀 Run Evaluation',
    button_style='success',
    layout={'width': '200px', 'height': '40px'}
)

# Percentage bar (0-100) updated by run_evaluation_batch.
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=100,
    description='Progress:',
    bar_style='info',
    style={'bar_color': '#1f77b4', 'description_width': 'initial'},
    layout={'width': '400px'}
)

# Output area the evaluation helpers print their status messages into.
status_output = widgets.Output()

print("🎮 Interactive Evaluation Controls")
display(widgets.VBox([
    widgets.HBox([max_queries_slider]),
    widgets.HBox([complexity_filter]),
    widgets.HBox([domain_filter]),
    widgets.HBox([run_button]),
    widgets.HBox([progress_bar]),
    status_output
]))

In [None]:
# Evaluation results storage
# One result dict per evaluated query; the run-button handler appends to it.
evaluation_results = []
# Session id of the most recent interactive run (None until the first run).
current_evaluation_session = None

async def run_single_evaluation(query_data: Dict, session_id: str) -> Dict:
    """Run one query through ``system.process_query`` and collect metrics.

    Args:
        query_data: Query record. ``'query'`` is required; ``'id'``,
            ``'expected_model'``, ``'domain'``, ``'type'`` and
            ``'expected_sources'`` are optional.
        session_id: Session identifier forwarded to the system and used as
            the prefix of the trace id.

    Returns:
        A flat result dict. If processing raises, the dict has
        ``'status': 'error'``, zeroed metrics, ``'success': False`` and an
        extra ``'error'`` key holding the exception text.

    Raises:
        KeyError: If ``query_data`` has no ``'query'`` entry.
    """
    start_time = time.time()
    query_id = query_data.get('id', 'unknown')

    # Identification fields shared by the success and the failure record.
    record = {
        'query_id': query_id,
        'query': query_data['query'],
        'expected_complexity': query_data.get('expected_model', 'unknown'),
        'domain': query_data.get('domain', 'unknown'),
        'type': query_data.get('type', 'unknown'),
    }

    try:
        # Process query through multi-agent system
        result = await system.process_query(
            query=record['query'],
            trace_id=f"{session_id}_{query_id}",
            session_id=session_id
        )

        execution_time = time.time() - start_time

        # The response may be a plain string or a dict carrying the text under
        # 'extracted_content' (the custom-query cell handles both); measure
        # the text either way instead of counting dict keys.
        response = result.get('response') or ''
        if isinstance(response, dict) and 'extracted_content' in response:
            response = response['extracted_content'] or ''
        response_length = len(response if isinstance(response, str) else str(response))

        citations_count = len(result.get('citations') or [])
        total_tokens = result.get('total_tokens') or 0

        # Simple quality metrics (would be enhanced with proper evaluation models)
        quality_score = min(1.0, response_length / 500)  # Basic length-based score

        # A query that expects no sources is treated as fully cited; this also
        # avoids dividing by zero.
        expected_sources = query_data.get('expected_sources', 5)
        if not expected_sources or expected_sources <= 0:
            citation_completeness = 1.0
        else:
            citation_completeness = min(1.0, citations_count / expected_sources)

        record.update({
            'status': result.get('status', 'unknown'),
            'response_length': response_length,
            'citations_count': citations_count,
            'total_tokens': total_tokens,
            'execution_time': execution_time,
            'quality_score': quality_score,
            'citation_completeness': citation_completeness,
            'agents_used': result.get('agents_used', []),
            'model_used': result.get('model_used', 'unknown'),
            'timestamp': datetime.now().isoformat(),
            'success': result.get('status') == 'success'
        })
        return record

    except Exception as e:
        # Deliberate catch-all: one failing query must not abort the batch.
        execution_time = time.time() - start_time
        logger.error("Evaluation failed for query %s: %s", query_id, e)

        record.update({
            'status': 'error',
            'response_length': 0,
            'citations_count': 0,
            'total_tokens': 0,
            'execution_time': execution_time,
            'quality_score': 0.0,
            'citation_completeness': 0.0,
            'agents_used': [],
            'model_used': 'error',
            'timestamp': datetime.now().isoformat(),
            'success': False,
            'error': str(e)
        })
        return record

async def run_evaluation_batch(queries: List[Dict], session_id: str) -> List[Dict]:
    """Evaluate ``queries`` one after another, reporting progress as it goes.

    Status messages are written into ``status_output`` and ``progress_bar``
    is moved from 0 to 100. Each query is handed to ``run_single_evaluation``
    and followed by a 0.5 s pause.

    Args:
        queries: Query records to evaluate, in order.
        session_id: Session identifier passed on to every evaluation.

    Returns:
        The result dicts, in the same order as ``queries``.
    """
    outcomes = []
    n_queries = len(queries)

    with status_output:
        clear_output(wait=True)
        print(f"🚀 Starting evaluation of {n_queries} queries...")

    done = 0
    for entry in queries:
        # Progress reflects the queries finished before this one starts.
        progress_bar.value = int((done / n_queries) * 100)

        with status_output:
            clear_output(wait=True)
            print(f"📝 Processing query {done + 1}/{n_queries}: {entry['query'][:50]}...")

        outcomes.append(await run_single_evaluation(entry, session_id))
        done += 1

        # Short pause between queries to avoid overwhelming the API.
        await asyncio.sleep(0.5)

    progress_bar.value = 100
    with status_output:
        clear_output(wait=True)
        print(f"✅ Evaluation complete! Processed {len(outcomes)} queries")

    return outcomes

print("📊 Evaluation functions defined")

In [None]:
# Button click handler
async def on_run_button_clicked(b):
    """Run an evaluation over the queries selected by the filter widgets.

    Reads the slider and filter widgets, takes the first ``max_queries``
    entries of ``EVALUATION_QUERIES`` that match, runs them through
    ``run_evaluation_batch`` under a fresh timestamped session id, appends the
    results to the module-level ``evaluation_results`` and prints a summary
    into ``status_output``. Failures are logged and shown in
    ``status_output`` instead of being lost with the background task.

    Args:
        b: The clicked button (unused).
    """
    global evaluation_results, current_evaluation_session

    # The button stays disabled for the whole run; ignore a second invocation
    # that arrives while one is active (e.g. the handler was registered twice
    # because this cell was executed more than once).
    if run_button.disabled:
        return

    # Get filter settings
    max_queries = max_queries_slider.value
    selected_complexities = set(complexity_filter.value)
    selected_domain = domain_filter.value

    # Filter queries based on settings
    filtered_queries = []
    for query in EVALUATION_QUERIES:
        # Filter by complexity
        if query.get('expected_model', 'unknown') not in selected_complexities:
            continue

        # Filter by domain
        if selected_domain != 'All' and query.get('domain', 'unknown') != selected_domain:
            continue

        filtered_queries.append(query)

        # Limit to max queries
        if len(filtered_queries) >= max_queries:
            break

    if not filtered_queries:
        with status_output:
            clear_output(wait=True)
            print("⚠️ No queries match the selected filters!")
        return

    # Start new evaluation session
    eval_session_id = f"interactive_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    current_evaluation_session = eval_session_id

    # Run evaluation
    try:
        run_button.disabled = True
        run_button.description = '⏳ Running...'

        results = await run_evaluation_batch(filtered_queries, eval_session_id)
        evaluation_results.extend(results)

        # Show summary (results is non-empty because filtered_queries is)
        successful = sum(1 for r in results if r['success'])
        failed = len(results) - successful
        avg_time = sum(r['execution_time'] for r in results) / len(results)
        total_tokens = sum(r['total_tokens'] for r in results)

        with status_output:
            clear_output(wait=True)
            print(f"🎉 Evaluation Complete!")
            print(f"✅ Successful: {successful}")
            print(f"❌ Failed: {failed}")
            print(f"⏱️ Avg Time: {avg_time:.2f}s")
            print(f"🪙 Total Tokens: {total_tokens:,}")
            print(f"📊 Results stored in evaluation_results")

    except Exception as exc:
        # This coroutine runs as a fire-and-forget task, so nothing else would
        # ever report the failure.
        logger.exception("Interactive evaluation %s failed", eval_session_id)
        with status_output:
            clear_output(wait=True)
            print(f"❌ Evaluation failed: {exc}")

    finally:
        run_button.disabled = False
        run_button.description = '🚀 Run Evaluation'


# The event loop only keeps weak references to tasks, so hold a strong one
# until the handler has finished.
_run_button_tasks = set()


def _schedule_run_evaluation(b):
    """Start the async click handler as a task and keep it alive until done."""
    task = asyncio.create_task(on_run_button_clicked(b))
    _run_button_tasks.add(task)
    task.add_done_callback(_run_button_tasks.discard)


# Connect button to handler
run_button.on_click(_schedule_run_evaluation)

print("🎮 Button handler connected")

## 🧪 Quick Test Evaluation

Let's run a quick test with a few sample queries to verify everything works:

In [None]:
# Quick test evaluation
test_queries = [
    {
        'id': 'test_1',
        'query': 'What is artificial intelligence?',
        'expected_model': 'gpt-5-nano',
        'domain': 'technology',
        'type': 'qa',
        'expected_sources': 3
    },
    {
        'id': 'test_2', 
        'query': 'Explain the latest developments in quantum computing and their implications for cryptography',
        'expected_model': 'gpt-5',
        'domain': 'technology',
        'type': 'research',
        'expected_sources': 5
    }
]

print("🧪 Running quick test evaluation...")
test_session_id = f"test_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
test_results = await run_evaluation_batch(test_queries, test_session_id)

# Display test results
print("\n📊 Test Results:")
for result in test_results:
    status_emoji = "✅" if result['success'] else "❌"
    print(f"{status_emoji} {result['query_id']}: {result['execution_time']:.2f}s, {result['total_tokens']} tokens, {result['citations_count']} citations")

print(f"\n🎯 Test completed! {len([r for r in test_results if r['success']])}/{len(test_results)} successful")

## 📈 Results Analysis and Visualization

In [None]:
# Results analysis functions
def analyze_evaluation_results(results: List[Dict]) -> Dict[str, Any]:
    """Compute summary metrics over a list of evaluation result dicts.

    Args:
        results: Result dicts as produced by ``run_single_evaluation``. Each
            needs ``'success'``; successful ones also need
            ``'execution_time'``, ``'total_tokens'``, ``'response_length'``,
            ``'citations_count'``, ``'quality_score'`` and
            ``'expected_complexity'``.

    Returns:
        ``{"error": "No results to analyze"}`` for empty input. Otherwise a
        dict with ``total_queries``, ``successful``, ``failed`` and
        ``success_rate``; when at least one result succeeded it also holds
        timing, token, length, citation and quality aggregates plus a
        ``by_complexity`` breakdown, all computed over successful results only.
    """
    # Local import keeps this cell self-contained.
    from statistics import median

    if not results:
        return {"error": "No results to analyze"}

    successful = [r for r in results if r['success']]

    # Basic metrics
    metrics = {
        'total_queries': len(results),
        'successful': len(successful),
        'failed': len(results) - len(successful),
        'success_rate': len(successful) / len(results),
    }

    if not successful:
        return metrics

    def mean(values):
        # Arithmetic mean of a non-empty list.
        return sum(values) / len(values)

    # Performance metrics
    execution_times = [r['execution_time'] for r in successful]
    token_counts = [r['total_tokens'] for r in successful]

    metrics.update({
        'avg_execution_time': mean(execution_times),
        # True median: averages the two middle values for even-length input
        # instead of returning the upper one.
        'median_execution_time': median(execution_times),
        'max_execution_time': max(execution_times),
        'min_execution_time': min(execution_times),
        'total_tokens': sum(token_counts),
        'avg_tokens': mean(token_counts),
        'avg_response_length': mean([r['response_length'] for r in successful]),
        'avg_citations': mean([r['citations_count'] for r in successful]),
        'avg_quality_score': mean([r['quality_score'] for r in successful]),
    })

    # Performance by complexity
    by_complexity = {}
    for result in successful:
        by_complexity.setdefault(result['expected_complexity'], []).append(result)

    metrics['by_complexity'] = {
        complexity: {
            'count': len(group),
            'avg_time': mean([r['execution_time'] for r in group]),
            'avg_tokens': mean([r['total_tokens'] for r in group]),
            'avg_quality': mean([r['quality_score'] for r in group]),
        }
        for complexity, group in by_complexity.items()
    }

    return metrics

def visualize_results(results: List[Dict]):
    """Draw a 2x3 grid of charts summarising evaluation results.

    Panel 1 shows the success/failure split over all results. Panels 2-6
    (execution-time histogram, tokens by complexity, response length vs
    citations, quality by domain, execution time by complexity) use only the
    successful results and stay empty when there are none.

    Args:
        results: Result dicts as produced by ``run_single_evaluation``.
            Prints a message and returns without plotting when empty.
    """
    if not results:
        print("No results to visualize")
        return

    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(results)
    success_mask = df['success'].astype(bool)
    successful_df = df[success_mask]

    # Create subplots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Multi-Agent System Evaluation Results', fontsize=16, fontweight='bold')

    # 1. Success Rate
    # Count, label and colour travel together, and empty slices are dropped,
    # so a run with mostly (or only) failures is still labelled correctly.
    outcome_slices = [
        (int(success_mask.sum()), 'Success', '#2ecc71'),
        (int((~success_mask).sum()), 'Failed', '#e74c3c'),
    ]
    outcome_slices = [piece for piece in outcome_slices if piece[0] > 0]
    axes[0,0].pie([piece[0] for piece in outcome_slices],
                  labels=[piece[1] for piece in outcome_slices],
                  autopct='%1.1f%%',
                  colors=[piece[2] for piece in outcome_slices],
                  startangle=90)
    axes[0,0].set_title('Success Rate')

    if not successful_df.empty:
        # 2. Execution Time Distribution
        mean_time = successful_df['execution_time'].mean()
        axes[0,1].hist(successful_df['execution_time'], bins=15, alpha=0.7, color='#3498db')
        axes[0,1].set_xlabel('Execution Time (seconds)')
        axes[0,1].set_ylabel('Frequency')
        axes[0,1].set_title('Execution Time Distribution')
        axes[0,1].axvline(mean_time, color='red', linestyle='--',
                          label=f'Mean: {mean_time:.2f}s')
        axes[0,1].legend()

        # 3. Token Usage by Complexity
        complexity_tokens = successful_df.groupby('expected_complexity')['total_tokens'].mean()
        axes[0,2].bar(complexity_tokens.index, complexity_tokens.values, color='#9b59b6')
        axes[0,2].set_xlabel('Complexity Level')
        axes[0,2].set_ylabel('Average Tokens')
        axes[0,2].set_title('Token Usage by Complexity')
        axes[0,2].tick_params(axis='x', rotation=45)

        # 4. Response Length vs Citations (colour encodes execution time)
        scatter = axes[1,0].scatter(successful_df['response_length'], successful_df['citations_count'],
                                    c=successful_df['execution_time'], cmap='viridis', alpha=0.6)
        axes[1,0].set_xlabel('Response Length (characters)')
        axes[1,0].set_ylabel('Number of Citations')
        axes[1,0].set_title('Response Length vs Citations')
        plt.colorbar(scatter, ax=axes[1,0], label='Execution Time (s)')

        # 5. Performance by Domain (ten best average quality scores)
        domain_performance = (
            successful_df.groupby('domain')['quality_score']
            .mean()
            .sort_values(ascending=False)
            .head(10)
        )
        axes[1,1].barh(range(len(domain_performance)), domain_performance.values, color='#e67e22')
        axes[1,1].set_yticks(range(len(domain_performance)))
        axes[1,1].set_yticklabels(domain_performance.index)
        axes[1,1].set_xlabel('Average Quality Score')
        axes[1,1].set_title('Performance by Domain (Top 10)')

        # 6. Execution Time by Complexity
        complexity_labels = list(successful_df['expected_complexity'].unique())
        complexity_times = [
            successful_df[successful_df['expected_complexity'] == complexity]['execution_time'].values
            for complexity in complexity_labels
        ]
        # Tick labels are set separately so the call does not rely on
        # boxplot's `labels` keyword.
        # NOTE(review): newer Matplotlib releases appear to rename that keyword
        # to `tick_labels` — confirm against the installed version.
        axes[1,2].boxplot(complexity_times)
        axes[1,2].set_xticklabels(complexity_labels)
        axes[1,2].set_ylabel('Execution Time (seconds)')
        axes[1,2].set_xlabel('Complexity Level')
        axes[1,2].set_title('Execution Time by Complexity')
        axes[1,2].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

print("📊 Analysis and visualization functions ready")

In [None]:
# Report on whatever has accumulated in `evaluation_results` so far.
if not evaluation_results:
    print("📝 No evaluation results yet. Use the controls above to run an evaluation!")
else:
    print("📈 Analyzing Current Results")
    print("=" * 40)

    metrics = analyze_evaluation_results(evaluation_results)

    print(f"📊 Total Queries: {metrics['total_queries']}")
    print(f"✅ Successful: {metrics['successful']} ({metrics['success_rate']:.1%})")
    print(f"❌ Failed: {metrics['failed']}")

    # The aggregate keys only exist when at least one query succeeded.
    if metrics['successful'] > 0:
        print(f"⏱️ Avg Execution Time: {metrics['avg_execution_time']:.2f}s")
        print(f"🪙 Total Tokens: {metrics['total_tokens']:,}")
        print(f"📝 Avg Response Length: {metrics['avg_response_length']:.0f} chars")
        print(f"📚 Avg Citations: {metrics['avg_citations']:.1f}")
        print(f"⭐ Avg Quality Score: {metrics['avg_quality_score']:.2f}")

        if 'by_complexity' in metrics:
            print("\n📋 Performance by Complexity:")
            for _level, _level_stats in metrics['by_complexity'].items():
                print(f"  {_level}: {_level_stats['avg_time']:.2f}s, {_level_stats['avg_tokens']:.0f} tokens")

    # Charts are drawn even when every query failed.
    print("\n📊 Generating Visualizations...")
    visualize_results(evaluation_results)

## 💾 Export Results

In [None]:
# Export functions
def export_results_to_csv(results: List[Dict], filename: str = None):
    """Write ``results`` to a CSV file, one row per result dict.

    Args:
        results: Result dicts to export.
        filename: Target path. When ``None`` a timestamped
            ``evaluation_results_<YYYYmmdd_HHMMSS>.csv`` name is used.

    Returns:
        The path written to, or ``None`` when ``results`` is empty (a message
        is printed instead).
    """
    if not results:
        print("No results to export")
        return

    target = filename
    if target is None:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        target = f"evaluation_results_{stamp}.csv"

    # index=False keeps the DataFrame's row numbers out of the file.
    pd.DataFrame(results).to_csv(target, index=False)
    print(f"📁 Results exported to: {target}")
    return target

def export_results_to_json(results: List[Dict], filename: str = None):
    """Write ``results`` to a JSON file as an indented array.

    Args:
        results: Result dicts to export.
        filename: Target path. When ``None`` a timestamped
            ``evaluation_results_<YYYYmmdd_HHMMSS>.json`` name is used.

    Returns:
        The path written to, or ``None`` when ``results`` is empty (a message
        is printed instead).
    """
    if not results:
        print("No results to export")
        return

    target = (
        filename
        if filename is not None
        else f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    )

    with open(target, 'w') as handle:
        # default=str: values json cannot encode natively are written as str().
        json.dump(results, handle, indent=2, default=str)

    print(f"📁 Results exported to: {target}")
    return target

# Export widgets
export_csv_button = widgets.Button(
    description='💾 Export CSV',
    button_style='info',
    layout={'width': '150px'}
)

export_json_button = widgets.Button(
    description='💾 Export JSON',
    button_style='info',
    layout={'width': '150px'}
)

# Messages printed from a button callback are routed into this Output widget,
# matching how the evaluation and custom-query handlers report their status;
# text printed outside an Output widget may not be shown by every front end.
export_output = widgets.Output()


def on_export_csv_clicked(b):
    """Export the accumulated results to CSV and report inside ``export_output``."""
    with export_output:
        clear_output(wait=True)
        # The export helper prints "No results to export" itself when empty.
        export_results_to_csv(evaluation_results)


def on_export_json_clicked(b):
    """Export the accumulated results to JSON and report inside ``export_output``."""
    with export_output:
        clear_output(wait=True)
        # The export helper prints "No results to export" itself when empty.
        export_results_to_json(evaluation_results)


export_csv_button.on_click(on_export_csv_clicked)
export_json_button.on_click(on_export_json_clicked)

print("💾 Export Controls")
display(widgets.VBox([
    widgets.HBox([export_csv_button, export_json_button]),
    export_output
]))

## 🎯 Custom Query Testing

Test the multi-agent system with your own custom queries:

In [None]:
# Custom query testing interface
# Free-text input; its value is read when the test button is clicked.
custom_query_text = widgets.Textarea(
    placeholder='Enter your custom query here...',
    description='Custom Query:',
    layout={'width': '100%', 'height': '100px'},
    style={'description_width': 'initial'}
)

# The click handler is attached further down in this cell.
test_button = widgets.Button(
    description='🧪 Test Query',
    button_style='warning',
    layout={'width': '150px'}
)

# Output area that test_custom_query prints progress and results into.
custom_output = widgets.Output()

async def test_custom_query(query: str):
    """Run one ad-hoc query through the system and show the outcome.

    Everything, including the "empty query" hint, is printed into
    ``custom_output``. The response is shown truncated to 500 characters;
    when it is a dict carrying ``'extracted_content'`` that field is shown
    instead of the dict itself.

    Args:
        query: The query text; blank or whitespace-only input is rejected.
    """
    if not query.strip():
        with custom_output:
            clear_output(wait=True)
            print("Please enter a query to test")
        return

    with custom_output:
        clear_output(wait=True)
        print(f"🧪 Testing query: {query}")
        print("⏳ Processing...")

    start_time = time.time()

    try:
        result = await system.process_query(
            query=query,
            trace_id=f"custom_test_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            session_id="custom_testing"
        )

        execution_time = time.time() - start_time

        # Normalise optional fields so a missing/None value or a non-string
        # agent entry does not turn a successful query into an error report.
        total_tokens = result.get('total_tokens') or 0
        citations = result.get('citations') or []
        agents_used = ', '.join(str(agent) for agent in (result.get('agents_used') or []))

        response = result.get('response', 'No response available')
        if isinstance(response, dict) and 'extracted_content' in response:
            response = response['extracted_content']
        response_text = str(response)
        if len(response_text) > 500:
            response_text = response_text[:500] + "..."

        with custom_output:
            clear_output(wait=True)
            print(f"🧪 Query: {query}")
            print(f"✅ Status: {result.get('status', 'unknown')}")
            print(f"⏱️ Execution Time: {execution_time:.2f}s")
            print(f"🪙 Total Tokens: {total_tokens:,}")
            print(f"📚 Citations: {len(citations)}")
            print(f"🤖 Agents Used: {agents_used}")
            print("\n📄 Response:")
            print("-" * 50)
            print(response_text)

    except Exception as e:
        # Deliberate catch-all: any failure is reported in the output widget.
        execution_time = time.time() - start_time
        with custom_output:
            clear_output(wait=True)
            print(f"❌ Error testing query: {str(e)}")
            print(f"⏱️ Time before error: {execution_time:.2f}s")

async def on_test_button_clicked(b):
    """Run the text currently in ``custom_query_text`` through ``test_custom_query``.

    Args:
        b: The clicked button (unused).
    """
    query = custom_query_text.value
    await test_custom_query(query)


# The event loop only keeps weak references to tasks, so hold a strong one
# until the handler has finished.
_custom_query_tasks = set()


def _schedule_custom_query(b):
    """Start the async click handler as a task and keep it alive until done."""
    task = asyncio.create_task(on_test_button_clicked(b))
    _custom_query_tasks.add(task)
    task.add_done_callback(_custom_query_tasks.discard)


test_button.on_click(_schedule_custom_query)

print("🧪 Custom Query Testing")
display(widgets.VBox([
    custom_query_text,
    test_button,
    custom_output
]))

## 🧹 Cleanup and Session Management

In [None]:
# Cleanup functions
def clear_results():
    """Rebind the module-level ``evaluation_results`` to a new empty list."""
    global evaluation_results
    evaluation_results = list()
    print("🧹 Evaluation results cleared")

async def close_phoenix_session():
    """Await ``phoenix_integration.close_session()`` and print what it returned.

    Any exception is caught and printed as a warning instead of propagating.
    """
    try:
        closing_summary = await phoenix_integration.close_session()
        print(f"🔥 Phoenix session closed: {closing_summary}")
    except Exception as exc:
        print(f"⚠️ Error closing Phoenix session: {exc}")

async def system_shutdown():
    """Await ``system.shutdown()`` and print a confirmation.

    Any exception is caught and printed as a warning instead of propagating.
    """
    try:
        await system.shutdown()
        print("🤖 Multi-agent system shutdown complete")
    except Exception as exc:
        print(f"⚠️ Error during system shutdown: {exc}")

# Cleanup buttons
clear_button = widgets.Button(
    description='🧹 Clear Results',
    button_style='danger',
    layout={'width': '150px'}
)

close_session_button = widgets.Button(
    description='🔥 Close Phoenix',
    button_style='warning',
    layout={'width': '150px'}
)

shutdown_button = widgets.Button(
    description='🛑 Shutdown System',
    button_style='danger',
    layout={'width': '150px'}
)

# The event loop only keeps weak references to tasks, so hold a strong one
# until each cleanup coroutine has finished.
_cleanup_tasks = set()


def _schedule_cleanup(coroutine_function):
    """Return a click callback that runs ``coroutine_function()`` as a retained task."""
    def _on_click(b):
        task = asyncio.create_task(coroutine_function())
        _cleanup_tasks.add(task)
        task.add_done_callback(_cleanup_tasks.discard)
    return _on_click


clear_button.on_click(lambda b: clear_results())
close_session_button.on_click(_schedule_cleanup(close_phoenix_session))
shutdown_button.on_click(_schedule_cleanup(system_shutdown))

print("🧹 Cleanup Controls")
display(widgets.HBox([clear_button, close_session_button, shutdown_button]))

## 📚 Usage Instructions

### 🚀 Getting Started
1. **Initialize**: Run the setup cells to initialize the multi-agent system
2. **Configure**: Use the interactive controls to set evaluation parameters
3. **Run**: Click "🚀 Run Evaluation" to start the evaluation process
4. **Analyze**: Re-run the cells under "Results Analysis and Visualization" to see metrics and charts for the accumulated results
5. **Export**: Save results to CSV or JSON for further analysis

### 🎮 Interactive Controls
- **Max Queries**: Limit the number of queries to evaluate
- **Complexity Filter**: Select which model complexity levels to test
- **Domain Filter**: Focus on specific domains or test all
- **Progress Bar**: Real-time progress tracking during evaluation

### 🧪 Custom Testing
- Use the "Custom Query Testing" section to test individual queries
- Great for debugging or exploring system behavior

### 📊 Results Analysis
- Comprehensive metrics including success rate, execution time, token usage
- Visual analysis with multiple chart types
- Performance breakdown by complexity level and domain

### 🔥 Phoenix Integration
- Real-time tracing and observability
- Visit http://localhost:6006 to view Phoenix UI
- Session management and cleanup functions

### 💡 Tips
- Start with small batches (5-10 queries) to test functionality
- Monitor Phoenix UI for detailed trace information
- Export results regularly for backup
- Use custom queries to test edge cases


In [None]:
# Final status check
# Reads module-level state created by earlier cells: `system`, the Phoenix
# `session_id`, `EVALUATION_QUERIES` and `evaluation_results`.
print("🎉 Multi-Agent Evaluation Notebook Ready!")
print("=" * 50)
# NOTE(review): `is_initialized` is never set in this notebook; presumably it
# is maintained by MultiAgentResearchSystem — confirm.
print(f"🤖 System Status: {'✅ Active' if system.is_initialized else '❌ Inactive'}")
print(f"🔥 Phoenix Session: {session_id}")
print(f"📊 Dataset Size: {len(EVALUATION_QUERIES)} queries")
print(f"📈 Current Results: {len(evaluation_results)} evaluations")
print("\n🚀 Ready to run evaluations!")