# Amp Model Evaluation Results Aggregation

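This notebook loads the JSON result files produced by the model evaluation suite (by default from `../results`), aggregates success rate, latency, and token usage per model, and plots a side-by-side comparison.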

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Load evaluation results
def load_eval_results(results_dir="../results"):
    """Load every ``*.json`` evaluation result file found in a directory.

    Each file is parsed as JSON and the parsed object gets an ``eval_name``
    key set to the file name without its extension (any ``eval_name`` key
    already present in the file is overwritten).

    Args:
        results_dir: Directory to scan (non-recursively) for ``*.json`` files.

    Returns:
        A list with one parsed payload per file, ordered by file path so
        repeated runs over the same directory produce the same order.

    Raises:
        json.JSONDecodeError: If a matched file does not contain valid JSON.
    """
    results_path = Path(results_dir)
    all_results = []

    # Path.glob yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the returned list (and anything derived from it) stable.
    for result_file in sorted(results_path.glob("*.json")):
        # JSON text is UTF-8 by specification; do not rely on the platform
        # default encoding.
        with open(result_file, encoding="utf-8") as f:
            data = json.load(f)
        data['eval_name'] = result_file.stem
        all_results.append(data)

    return all_results

In [None]:
# Aggregate metrics by model
def aggregate_by_model(results):
    """Summarize evaluation records per model.

    Args:
        results: Records convertible to a DataFrame; each must provide the
            ``model``, ``success_rate``, ``latency_s``, ``tokens`` and
            ``eval_name`` fields.

    Returns:
        A DataFrame indexed by ``model`` with the columns ``success_rate``,
        ``avg_latency``, ``latency_std``, ``avg_tokens``, ``tokens_std`` and
        ``total_runs``, rounded to three decimals.
    """
    frame = pd.DataFrame(results)
    per_model = frame.groupby('model')

    # Named aggregation: each output column is declared together with the
    # source column and reducer that produce it.
    summary = per_model.agg(
        success_rate=('success_rate', 'mean'),
        avg_latency=('latency_s', 'mean'),
        latency_std=('latency_s', 'std'),
        avg_tokens=('tokens', 'mean'),
        tokens_std=('tokens', 'std'),
        total_runs=('eval_name', 'count'),
    )

    return summary.round(3)

In [None]:
# Load the result files and print the per-model summary table.
# Any failure while loading or aggregating is reported as a message rather
# than a traceback, so the notebook keeps running when no results exist yet.
try:
    results = load_eval_results()
    if not results:
        print("No evaluation results found. Run evaluations first.")
    else:
        # model_comparison is only defined when there is data; the plotting
        # cell checks for its existence.
        model_comparison = aggregate_by_model(results)
        print("Model Performance Summary:")
        print(model_comparison)
except Exception as exc:
    print(f"Error loading results: {exc}")
    print("Make sure to run evaluations first to generate results.")

In [None]:
# Visualization functions
def plot_model_comparison(model_stats):
    """Draw a 2x2 grid of bar charts comparing models.

    Args:
        model_stats: Table indexed by model that provides the columns
            ``success_rate``, ``avg_latency``, ``avg_tokens`` and
            ``total_runs``.

    Returns:
        The matplotlib figure holding the four panels.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # One entry per panel: (column to plot, panel title, y-axis label).
    # Listed in row-major order to line up with ``axes.flat``.
    panels = (
        ('success_rate', 'Success Rate by Model', 'Success Rate'),
        ('avg_latency', 'Average Latency by Model', 'Latency (seconds)'),
        ('avg_tokens', 'Average Tokens by Model', 'Tokens'),
        ('total_runs', 'Total Evaluation Runs', 'Number of Runs'),
    )

    for ax, (column, title, y_label) in zip(axes.flat, panels):
        ax.bar(model_stats.index, model_stats[column])
        ax.set_title(title)
        ax.set_ylabel(y_label)

    plt.tight_layout()
    return fig

In [None]:
# Plot the comparison only when the summary table exists and has rows;
# model_comparison is undefined if the loading cell found no results or failed.
if 'model_comparison' not in locals() or model_comparison.empty:
    print("No data available for plotting. Run evaluations first.")
else:
    fig = plot_model_comparison(model_comparison)
    plt.show()

## Next Steps

1. Run the evaluation suite: `openai tools evaluate amp-eval/evals/tool_calling_micro.yaml --registry amp-eval/adapters`
2. Rerun this notebook to see the results
3. Add more sophisticated analysis as needed