# VL-Uncertainty-Benchmark Analysis

This notebook provides interactive analysis of benchmark results.

## Setup

In [None]:
import sys
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.analysis import (
    find_high_confidence_failures,
    cluster_failures_by_degradation,
    generate_failure_report,
    plot_pareto_frontier,
    plot_edge_vs_cloud_comparison,
    compute_scaling_efficiency,
)
from src.metrics import (
    expected_calibration_error,
    reliability_diagram,
    plot_calibration_comparison,
    compute_calibration_metrics,
)
from src.metrics.calibration import calibration_curve

# Apply a white-grid theme to every figure drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## Load Results

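Update the path below to point to your benchmark results directory. It must contain `summary.csv` (the per-model summary) and `all_results.csv` (the individual evaluations).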

In [None]:
# Point this at the directory of the benchmark run you want to analyse.
RESULTS_DIR = '../results/run_YYYYMMDD_HHMMSS'

# The summary table and the detailed evaluation table both live directly
# under RESULTS_DIR.
summary_path = f'{RESULTS_DIR}/summary.csv'
results_path = f'{RESULTS_DIR}/all_results.csv'
summary_df = pd.read_csv(summary_path)
results_df = pd.read_csv(results_path)

n_models, n_evaluations = len(summary_df), len(results_df)
print(f"Loaded {n_models} models, {n_evaluations} evaluations")
summary_df.head()

## 1. Overall Model Performance

In [None]:
# Rank models by uncertainty AUROC (higher is better).
summary_sorted = summary_df.sort_values('uncertainty_auroc', ascending=False)

# Edge models are drawn in one colour, every other tier in another.
EDGE_COLOR, OTHER_COLOR = 'green', 'blue'
colors = [EDGE_COLOR if t == 'edge' else OTHER_COLOR for t in summary_sorted['tier']]

# One panel per metric: (summary column, x-axis label, panel title).
panels = [
    ('uncertainty_auroc', 'Uncertainty AUROC', 'Can Uncertainty Predict Errors?'),
    ('calibrated_ece', 'ECE (lower is better)', 'Calibration Error'),
    ('accuracy', 'Accuracy', 'Overall Accuracy'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

for ax, (column, xlabel, title) in zip(axes, panels):
    ax.barh(summary_sorted['model'], summary_sorted[column], color=colors)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # barh places the first row at the bottom; flip the axis so the
    # best-ranked model (first row of summary_sorted) is shown on top.
    ax.invert_yaxis()

# AUROC of 0.5 is the chance-level reference.
random_line = axes[0].axvline(x=0.5, color='red', linestyle='--', label='Random')

# Draw a legend so both the reference line and the tier colours are explained.
tier_handles = [
    plt.Rectangle((0, 0), 1, 1, color=EDGE_COLOR),
    plt.Rectangle((0, 0), 1, 1, color=OTHER_COLOR),
]
axes[0].legend(
    [*tier_handles, random_line],
    ['Edge', 'Cloud / other', 'Random'],
)

plt.tight_layout()
plt.show()

## 2. Pareto Analysis: Compute vs Quality

In [None]:
# Pareto frontier of 'params' (x) against uncertainty AUROC (y).
pareto_options = {
    'x': 'params',
    'y': 'uncertainty_auroc',
    'model_col': 'model',
    'tier_col': 'tier',
    'title': 'Compute vs. Uncertainty Quality Tradeoff',
}
fig = plot_pareto_frontier(summary_df, **pareto_options)
plt.show()

In [None]:
# Relate 'params' to uncertainty AUROC and report the fitted trend.
scaling = compute_scaling_efficiency(summary_df, 'params', 'uncertainty_auroc')

scaling_lines = [
    "Scaling Analysis:",
    f"  Slope: {scaling['slope']:.4f}",
    f"  R²: {scaling['r_squared']:.3f}",
    f"  Interpretation: {scaling['interpretation']}",
]
print("\n".join(scaling_lines))

## 3. Edge vs Cloud Comparison

In [None]:
# Edge-versus-cloud comparison within model families, on uncertainty AUROC.
comparison_metric = 'uncertainty_auroc'
fig = plot_edge_vs_cloud_comparison(summary_df, metric_col=comparison_metric)
plt.show()

In [None]:
# Split the summary by tier; both frames are reused by the conclusions cell.
tier_column = summary_df['tier']
edge_models = summary_df[tier_column == 'edge']
cloud_models = summary_df[tier_column == 'cloud']

# (display name, summary column) for each metric that is averaged per tier.
reported_metrics = (
    ("AUROC", 'uncertainty_auroc'),
    ("ECE", 'calibrated_ece'),
    ("Accuracy", 'accuracy'),
)

print("Edge vs Cloud Statistics:")
for tier_label, tier_df in (("Edge", edge_models), ("Cloud", cloud_models)):
    print(f"\n{tier_label} Models (n={len(tier_df)}):")
    for metric_label, column in reported_metrics:
        print(f"  Mean {metric_label}: {tier_df[column].mean():.3f}")

## 4. Reliability Diagrams

In [None]:
# Compare the first four models listed in the summary, one panel each.
models_to_compare = summary_df['model'].head(4).tolist()

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

for ax, model_name in zip(axes, models_to_compare):
    model_df = results_df[results_df['model'] == model_name]

    # A model can appear in the summary without any detailed rows; label the
    # panel instead of passing empty arrays to calibration_curve.
    if model_df.empty:
        ax.set_title(f'{model_name} (no results)')
        ax.axis('off')
        continue

    # Binned confidence vs. observed accuracy; the third return value
    # (per-bin counts) is not used in this plot.
    confs, accs, _ = calibration_curve(
        model_df['calibrated_confidence'].values,
        model_df['is_correct'].values
    )

    # The diagonal is perfect calibration: confidence equals accuracy.
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect')
    ax.bar(confs, accs, width=0.05, alpha=0.7, label='Observed')
    ax.set_xlabel('Confidence')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'{model_name}')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(loc='upper left')

# Hide panels that have no model so fewer than four models does not leave
# empty frames in the grid.
for unused_ax in axes[len(models_to_compare):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. Performance by Degradation Type

In [None]:
# Mean correctness and mean calibrated confidence for every
# (model, degradation type) pair.
deg_performance = (
    results_df
    .groupby(['model', 'degradation_type'])
    .agg(
        is_correct=('is_correct', 'mean'),
        calibrated_confidence=('calibrated_confidence', 'mean'),
    )
    .reset_index()
)

# Models as rows, degradation types as columns, accuracy as the cell value.
pivot_acc = deg_performance.pivot(
    index='model',
    columns='degradation_type',
    values='is_correct',
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pivot_acc,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Model Accuracy by Degradation Type')
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Accuracy as a function of degradation severity, one line per model.
severity_perf = results_df.groupby(['model', 'severity']).agg({
    'is_correct': 'mean',
}).reset_index()

# Number of models to draw; more lines make the plot unreadable.
TOP_N_MODELS = 5

# "Top" means best uncertainty AUROC, the same ranking used in section 1.
# Taking the first rows of summary_df unsorted would depend on file order.
top_models = (
    summary_df
    .sort_values('uncertainty_auroc', ascending=False)['model']
    .drop_duplicates()
    .head(TOP_N_MODELS)
)

severity_fig, severity_ax = plt.subplots(figsize=(10, 6))
for model in top_models:
    model_data = severity_perf[severity_perf['model'] == model]
    severity_ax.plot(model_data['severity'], model_data['is_correct'], 'o-', label=model)

severity_ax.set_xlabel('Degradation Severity')
severity_ax.set_ylabel('Accuracy')
severity_ax.set_title('Accuracy vs. Degradation Severity')
# Place the legend outside the axes so it never covers the curves.
severity_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
severity_fig.tight_layout()
plt.show()

## 6. Failure Analysis

In [None]:
# Confidence cutoff used to call a wrong prediction a "high-confidence
# failure". It is defined once so the printed message always matches the
# value passed to the filter.
HIGH_CONFIDENCE_THRESHOLD = 0.9

failures = find_high_confidence_failures(
    results_df,
    confidence_threshold=HIGH_CONFIDENCE_THRESHOLD,
    confidence_col='calibrated_confidence'
)

print(
    f"Found {len(failures)} high-confidence failures "
    f"(>{HIGH_CONFIDENCE_THRESHOLD:.0%} confidence but wrong)"
)
# Guard the ratio: an empty results table would otherwise raise
# ZeroDivisionError.
if len(results_df) > 0:
    print(f"This is {len(failures)/len(results_df):.2%} of all predictions")
else:
    print("No predictions were loaded, so a failure rate cannot be computed")

In [None]:
# Break the high-confidence failures down by degradation type.
# Nothing is computed or printed when there are no failures.
if len(failures) > 0:
    clusters = cluster_failures_by_degradation(failures)

    print("\nFailures by Degradation Type:")
    for deg_type, cluster_stats in clusters.items():
        n_failures = cluster_stats['count']
        share = cluster_stats['percentage']
        print(f"  {deg_type}: {n_failures} failures ({share:.1%})")

In [None]:
# Build the full failure report; skipped entirely when there are no failures.
if len(failures) > 0:
    n_total = len(results_df)
    report = generate_failure_report(failures, total_samples=n_total)
    report_summary = report['summary']

    print("\nFailure Report Summary:")
    print(f"  Total failures: {report_summary['total_failures']}")
    print(f"  Failure rate: {report_summary['failure_rate']:.2%}")

    print("\nRecommendations:")
    for recommendation in report['recommendations']:
        print(f"  - {recommendation}")

## 7. Key Conclusions

### Research Question: Does scaling up models give meaningfully better uncertainty calibration?

Based on the analysis above:

In [None]:
# Summarize findings from the edge/cloud split computed in section 3.

# A cloud-over-edge gap smaller than these values is treated as negligible.
AUROC_GAP_THRESHOLD = 0.05
ECE_GAP_THRESHOLD = 0.02

print("="*60)
print("KEY FINDINGS")
print("="*60)

if len(edge_models) > 0 and len(cloud_models) > 0:
    edge_auroc = edge_models['uncertainty_auroc'].mean()
    cloud_auroc = cloud_models['uncertainty_auroc'].mean()
    edge_ece = edge_models['calibrated_ece'].mean()
    cloud_ece = cloud_models['calibrated_ece'].mean()

    # Both gaps are oriented so that a positive value favours cloud models:
    # AUROC is higher-is-better, ECE is lower-is-better.
    auroc_diff = cloud_auroc - edge_auroc
    ece_diff = edge_ece - cloud_ece

    print("\n1. Uncertainty Quality (AUROC):")
    print(f"   Edge avg: {edge_auroc:.3f}")
    print(f"   Cloud avg: {cloud_auroc:.3f}")
    print(f"   Difference (cloud - edge): {auroc_diff:.3f}")

    print("\n2. Calibration (ECE):")
    print(f"   Edge avg: {edge_ece:.3f}")
    print(f"   Cloud avg: {cloud_ece:.3f}")
    print(f"   Difference (edge - cloud): {ece_diff:.3f}")

    print("\n3. Recommendation for Robotics Middleware:")
    if pd.isna(auroc_diff) or pd.isna(ece_diff):
        # Any comparison with NaN is False, which would otherwise fall
        # through to the "cloud is better" branch without evidence.
        print("   Metrics are missing (NaN) for at least one tier;")
        print("   no recommendation can be made.")
    elif auroc_diff < AUROC_GAP_THRESHOLD and ece_diff < ECE_GAP_THRESHOLD:
        print("   Edge models provide comparable uncertainty quality.")
        print("   They are likely SUFFICIENT for reactive→deliberative switching.")
    else:
        print("   Cloud models provide meaningfully better uncertainty.")
        print("   Consider using cloud models for critical decisions.")
else:
    # Say why nothing is reported rather than printing only the banner.
    print("\nBoth edge and cloud models are needed for this comparison;")
    print(f"found {len(edge_models)} edge and {len(cloud_models)} cloud models.")