# Benchmarking LLMs with LLM Lab

This notebook demonstrates how to run comprehensive benchmarks across multiple LLM providers.

## Table of Contents
1. [Setup](#setup)
2. [Running Standard Benchmarks](#standard-benchmarks)
3. [Custom Benchmark Creation](#custom-benchmarks)
4. [Results Analysis](#results-analysis)
5. [Performance Comparison](#performance-comparison)

## Setup <a id='setup'></a>

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Setup path: make the project root importable. The root is taken to be two
# directories above the current working directory (assumes the notebook is
# launched from its own folder - TODO confirm), and it is placed first on
# sys.path so the `src.*` imports below resolve against it.
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

# Import LLM Lab modules
from src.benchmarks import BenchmarkRunner
from src.evaluation.improved_evaluation import calculate_accuracy, calculate_f1_score
from src.logging.results_logger import ResultsLogger
from src.providers import AnthropicProvider, GoogleProvider, OpenAIProvider

# Plot defaults shared by every figure in this notebook: seaborn's
# "whitegrid" theme and a 12x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

print("Setup complete!")

## Running Standard Benchmarks <a id='standard-benchmarks'></a>

LLM Lab includes several standard benchmarks:
- **TruthfulQA**: Measures truthfulness and accuracy
- **GSM8K**: Grade school math problems
- **HellaSwag**: Commonsense reasoning
- **ARC**: AI2 Reasoning Challenge
- **MMLU**: Massive Multitask Language Understanding

In [None]:
# Initialize providers: one entry per backend whose API key is present in the
# environment. Each row is (environment variable, provider class, model name).
provider_specs = (
    ("OPENAI_API_KEY", OpenAIProvider, "gpt-3.5-turbo"),
    ("GOOGLE_API_KEY", GoogleProvider, "gemini-1.5-flash"),
)

providers = []
for env_var, provider_cls, model_name in provider_specs:
    api_key = os.getenv(env_var)
    if api_key:  # skip backends with a missing or empty key
        providers.append(provider_cls(api_key=api_key, model=model_name))

print(f"Initialized {len(providers)} providers")

In [None]:
# Quick truthfulness benchmark against the first configured provider
if not providers:
    print("No providers available. Please set API keys.")
else:
    runner = BenchmarkRunner(providers[0])

    # Only a handful of samples: this cell is a demonstration, not a full run
    demo_settings = {"benchmark_name": "truthfulness", "num_samples": 5}
    results = runner.run_benchmark(**demo_settings)

    print("Benchmark Results:")
    print(f"Accuracy: {results.get('accuracy', 0):.2%}")
    print(f"Average response time: {results.get('avg_time', 0):.2f}s")
    print(f"Total tokens used: {results.get('total_tokens', 0)}")

## Custom Benchmark Creation <a id='custom-benchmarks'></a>

Let's create a custom benchmark for domain-specific evaluation.

In [None]:
# Custom benchmark questions, written as (question, expected answer, category)
# rows and expanded into the dict records the rest of the notebook consumes.
qa_rows = [
    ("What is the capital of France?", "Paris", "geography"),
    ("What is 15 + 27?", "42", "math"),
    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare", "literature"),
    ("What is the chemical symbol for water?", "H2O", "science"),
    ("In what year did World War II end?", "1945", "history"),
]

custom_benchmark = [
    {"question": question, "expected_answer": expected, "category": category}
    for question, expected, category in qa_rows
]

print(f"Created custom benchmark with {len(custom_benchmark)} questions")
print(f"Categories: {{q['category'] for q in custom_benchmark}}")

In [None]:
# Run custom benchmark
def run_custom_benchmark(provider, questions):
    """Ask the provider every question and grade each answer.

    An answer is graded correct when the expected answer occurs in the
    stripped response content as a case-insensitive substring. If anything
    raises while querying or grading a question, that question is still
    reported: marked incorrect, with the error text in the "actual" field.

    Args:
        provider: Object exposing ``generate(prompt=..., max_tokens=...,
            temperature=...)`` whose return value supports ``.get("content")``.
        questions: Iterable of dicts with "question", "expected_answer" and
            "category" keys.

    Returns:
        A list with one dict per question, holding the keys "question",
        "expected", "actual", "correct" and "category".
    """

    def _row(item, actual, correct):
        # Single place that fixes the shape of a result record.
        return {
            "question": item["question"],
            "expected": item["expected_answer"],
            "actual": actual,
            "correct": correct,
            "category": item["category"],
        }

    outcomes = []
    for item in questions:
        try:
            # Low temperature for factual answers
            reply = provider.generate(prompt=item["question"], max_tokens=50, temperature=0.1)
            text = reply.get("content", "").strip()
            hit = item["expected_answer"].lower() in text.lower()
            row = _row(item, text[:100], hit)  # keep at most 100 characters of the answer
        except Exception as err:
            row = _row(item, f"Error: {str(err)}", False)
        outcomes.append(row)

    return outcomes


# Run benchmark on available providers.
# `benchmark_results` is bound unconditionally so that the later cells which
# test `if benchmark_results:` see an empty dict instead of raising NameError
# when no API keys are configured. With no providers the loop simply does
# not execute.
benchmark_results = {}
for provider in providers:
    print(f"\nRunning benchmark on {provider.provider_name}...")
    results = run_custom_benchmark(provider, custom_benchmark)
    benchmark_results[provider.provider_name] = results

    # Calculate accuracy: booleans sum as 0/1, so this is the fraction correct
    accuracy = sum(r["correct"] for r in results) / len(results)
    print(f"Accuracy: {accuracy:.2%}")

    # Show results by category
    df = pd.DataFrame(results)
    category_accuracy = df.groupby("category")["correct"].mean()
    print("\nAccuracy by category:")
    print(category_accuracy.to_string())

## Results Analysis <a id='results-analysis'></a>

Let's analyze and visualize the benchmark results.

In [None]:
# Create detailed results DataFrame
if benchmark_results:
    # Flatten {provider: [result, ...]} into one row per (provider, question)
    all_results = []
    for provider_name, results in benchmark_results.items():
        all_results.extend({"provider": provider_name, **r} for r in results)

    df_results = pd.DataFrame(all_results)

    # Summary statistics: correct count, question count and accuracy per provider
    print("Overall Results Summary")
    print("=" * 50)
    summary = df_results.groupby("provider")["correct"].agg(["sum", "count", "mean"])
    summary.columns = ["Correct", "Total", "Accuracy"]
    summary["Accuracy"] = summary["Accuracy"].map("{:.2%}".format)
    print(summary)

    # Detailed category breakdown (rows: category, columns: provider)
    print("\nCategory Performance Matrix")
    print("=" * 50)
    pivot_table = pd.pivot_table(
        df_results, values="correct", index="category", columns="provider", aggfunc="mean"
    )
    # DataFrame.applymap is deprecated since pandas 2.1; formatting each column
    # with Series.map yields the same table on old and new pandas versions
    # without a FutureWarning. `pivot_table` itself stays numeric for reuse.
    print(pivot_table.apply(lambda col: col.map("{:.0%}".format)))

## Performance Comparison <a id='performance-comparison'></a>

Let's create visualizations to compare provider performance.

In [None]:
# Visualization of results: accuracy bar chart on the left, per-category
# heatmap on the right, followed by a short text dump of sample responses.
if benchmark_results:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    ax1, ax2 = axes

    # Overall accuracy comparison
    provider_accuracy = df_results.groupby("provider")["correct"].mean()
    bar_colors = sns.color_palette("husl", len(provider_accuracy))
    provider_accuracy.plot(kind="bar", ax=ax1, color=bar_colors)
    ax1.set_title("Overall Accuracy by Provider", fontsize=14, fontweight="bold")
    ax1.set_xlabel("Provider")
    ax1.set_ylabel("Accuracy")
    ax1.set_ylim([0, 1])
    # Render the 0-1 fractions on the y axis as whole percentages
    ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Category performance heatmap
    pivot_data = pd.pivot_table(
        df_results, values="correct", index="category", columns="provider", aggfunc="mean"
    )
    heatmap_options = {
        "annot": True,
        "fmt": ".0%",
        "cmap": "YlOrRd",
        "ax": ax2,
        "cbar_kws": {"label": "Accuracy"},
    }
    sns.heatmap(pivot_data, **heatmap_options)
    ax2.set_title("Performance Heatmap by Category", fontsize=14, fontweight="bold")
    ax2.set_xlabel("Provider")
    ax2.set_ylabel("Category")

    plt.tight_layout()
    plt.show()

    # Detailed results table: first three rows only
    print("\nSample Responses Comparison")
    print("=" * 80)
    sample_fields = (
        ("Question", "question"),
        ("Provider", "provider"),
        ("Expected", "expected"),
        ("Actual", "actual"),
    )
    for idx, row in df_results.head(3).iterrows():
        for label, column in sample_fields:
            print(f"{label}: {row[column]}")
        print(f"Correct: {'✓' if row['correct'] else '✗'}")
        print("-" * 80)

## Advanced Benchmarking Features

### Batch Processing (with Sequential Fallback)

In [None]:
# Example of batch processing for efficiency
def batch_benchmark(provider, questions, batch_size=5):
    """Run a benchmark in batches, recording one result per question.

    Uses ``provider.batch_generate`` when the provider has that attribute and
    otherwise calls ``provider.generate`` once per question. An answer counts
    as correct when the expected answer appears in the stripped response
    content as a case-insensitive substring.

    Failures are recorded instead of dropped: if a request raises, if grading
    a response raises, or if a batch call returns fewer responses than
    prompts, the affected questions are appended with ``"correct": False``.
    Accuracy computed from the output therefore keeps the full question count
    as its denominator, in line with how ``run_custom_benchmark`` reports
    errors.

    Args:
        provider: Object exposing ``generate(prompt, max_tokens=...)`` and,
            optionally, ``batch_generate(prompts, max_tokens=...)``; responses
            must support ``.get("content")``.
        questions: Sequence of dicts with "question" and "expected_answer" keys.
        batch_size: Number of questions per batch; must be at least 1.

    Returns:
        A list with one dict per question, in input order, holding the keys
        "question", "correct" and "batch_processed".

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently
        # return an empty result list.
        raise ValueError("batch_size must be at least 1")

    results = []
    supports_batch = hasattr(provider, "batch_generate")

    def _record(question, response):
        # Grade one response; ``None`` stands for "no response was obtained".
        is_correct = False
        if response is not None:
            try:
                answer = response.get("content", "").strip()
                is_correct = question["expected_answer"].lower() in answer.lower()
            except Exception as e:
                print(f"Batch processing error: {e}")
        results.append(
            {
                "question": question["question"],
                "correct": is_correct,
                "batch_processed": supports_batch,
            }
        )

    for start in range(0, len(questions), batch_size):
        batch = questions[start : start + batch_size]
        batch_prompts = [q["question"] for q in batch]

        if supports_batch:
            # Use batch generation if available
            try:
                responses = list(provider.batch_generate(batch_prompts, max_tokens=50))
            except Exception as e:
                print(f"Batch processing error: {e}")
                responses = []
            # Pad so zip() cannot silently drop questions that got no response
            responses.extend([None] * (len(batch) - len(responses)))
            for q, resp in zip(batch, responses):
                _record(q, resp)
        else:
            # Fall back to sequential processing; one failing question must
            # not discard the rest of its batch
            for q, prompt in zip(batch, batch_prompts):
                try:
                    resp = provider.generate(prompt, max_tokens=50)
                except Exception as e:
                    print(f"Batch processing error: {e}")
                    resp = None
                _record(q, resp)

    return results


print("Batch processing function defined for efficient benchmarking")

## Export Results

Save benchmark results for future analysis.

In [None]:
# Export results: a CSV with every graded answer plus a JSON summary, both
# written to <project_root>/results. Skipped when no results table exists.
if "df_results" in locals() and not df_results.empty:
    import json

    results_dir = project_root / "results"
    results_dir.mkdir(exist_ok=True)

    output_path = results_dir / "benchmark_results.csv"
    df_results.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

    # Create summary report
    accuracy_by_provider = df_results.groupby("provider")["correct"].mean()
    # The category matrix is only available if the analysis cell has been run
    category_matrix = pivot_table.to_dict() if "pivot_table" in locals() else {}
    summary_report = {
        "date": pd.Timestamp.now().isoformat(),
        "providers_tested": list(benchmark_results),
        "num_questions": len(custom_benchmark),
        "overall_accuracy": accuracy_by_provider.to_dict(),
        "category_performance": category_matrix,
    }

    summary_path = results_dir / "benchmark_summary.json"
    with open(summary_path, "w") as f:
        # default=str stringifies any value json cannot encode natively
        json.dump(summary_report, f, indent=2, default=str)
    print(f"Summary report saved to: {summary_path}")

## Conclusion

This notebook demonstrated:

1. Running standard benchmarks with LLM Lab
2. Creating custom benchmarks for domain-specific evaluation
3. Analyzing and visualizing benchmark results
4. Comparing performance across multiple providers
5. Batch processing for efficient benchmarking

### Next Steps

- Explore more advanced benchmarks (GSM8K, MMLU, etc.)
- Create domain-specific evaluation metrics
- Set up automated benchmarking pipelines
- Integrate with monitoring for continuous evaluation