# Model Comparison Scorecard

**Purpose**: Transform experimental results into production decisions

**Models Compared**:
- LFM2.5-Audio-1.5B (LiquidAI)
- Whisper-Large-V3 (OpenAI)

**Metrics**: Accuracy, Speed, Memory, Production Readiness

---

This dashboard automatically loads results from all model tests and produces:
- Comparative scorecards
- Production recommendations
- Cost-performance analysis
- Visualization plots

In [None]:
# === COMPARISON SETUP ===

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Banner: announce the dashboard, stamp the run with the current local time
# (minute resolution), and confirm that setup finished.
print(
    "=== Model Comparison Dashboard ===",
    f"Analysis date: {datetime.now():%Y-%m-%d %H:%M}",
    "‚úÖ Comparison setup complete",
    sep="\n",
)

In [None]:
# === LOAD RESULTS ===


def load_results_from_directory(model_dir: Path) -> dict:
    """Load every JSON result file stored under a model's run directory.

    Each immediate subdirectory of ``model_dir`` is treated as one task
    (e.g. asr, tts, chat) and every ``*.json`` file directly inside it as one
    result. Files that cannot be read or parsed are reported on stdout and
    skipped, so one corrupt file does not abort the whole load.

    Args:
        model_dir: Directory holding one subdirectory per task.

    Returns:
        Mapping of task directory name to the list of decoded JSON payloads,
        ordered by file name. Tasks are visited in name order. The mapping is
        empty when ``model_dir`` is missing or is not a directory.
    """
    results = {}

    # is_dir() (rather than exists()) also rejects a plain file at this path,
    # which would otherwise make iterdir() raise.
    if not model_dir.is_dir():
        return results

    # iterdir() yields entries in arbitrary, filesystem-dependent order; sort
    # so the task order (and everything derived from it) is reproducible.
    for task_dir in sorted(model_dir.iterdir()):
        if not task_dir.is_dir():
            continue

        task_results = results[task_dir.name] = []

        # Sorted by file name, so callers taking the last element get the
        # lexicographically greatest file name.
        for json_file in sorted(task_dir.glob("*.json")):
            try:
                # JSON is UTF-8 by specification; do not depend on the
                # platform's default text encoding.
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"Error loading {json_file}: {e}")
            else:
                task_results.append(data)

    return results


# Load results from all models
runs_dir = Path.cwd().parent / "runs"

all_results = {}

# The known models come first so their order in tables and plots stays stable.
# Any other (non-hidden) directory under runs/ is appended, so a model whose
# results were written to runs/<name>/ is picked up without editing this cell.
models = ["lfm2_5_audio", "whisper"]
if runs_dir.is_dir():
    models += sorted(
        entry.name
        for entry in runs_dir.iterdir()
        if entry.is_dir() and not entry.name.startswith(".") and entry.name not in models
    )

for model in models:
    model_dir = runs_dir / model
    print(f"Loading results for {model}...")
    results = load_results_from_directory(model_dir)

    # Count individual result files across all task subdirectories.
    total_files = sum(len(files) for files in results.values())
    print(f"  ‚úì Loaded {total_files} result files")

    # Models without any task directory are left out of the comparison.
    if results:
        all_results[model] = results

print(f"\n‚úì Loaded results from {len(all_results)} models")

In [None]:
# === BUILD COMPARISON TABLE ===


def _metric_or_nan(run: dict, key: str):
    """Return ``run[key]``, or NaN when the key is absent or holds ``None``."""
    value = run.get(key)
    return float("nan") if value is None else value


def extract_key_metrics(results: dict, model_name: str) -> list:
    """Build one comparison-table row per task from a model's loaded results.

    Only the last run in each task's list is used. Tasks with no runs are
    skipped.

    A metric that is missing from the run, or stored as JSON ``null``, is
    reported as NaN rather than 0. A 0 would read as a perfect WER/RTF for a
    metric that was never measured, and ``None * 100`` would raise.

    Args:
        results: Mapping of task name to a list of result dicts.
        model_name: Raw model identifier; underscores become hyphens and the
            result is title-cased for display.

    Returns:
        List of row dicts with the keys "Model", "Test", "WER (%)", "CER (%)",
        "Latency (ms)", "RTF" and "Timestamp".
    """
    rows = []
    display_name = model_name.replace("_", "-").title()

    for task_name, test_runs in results.items():
        if not test_runs:
            continue

        # Use most recent test run (the last entry of the list).
        latest_run = test_runs[-1]

        rows.append(
            {
                "Model": display_name,
                "Test": task_name.upper(),
                # wer/cer are multiplied by 100 to be shown as percentages.
                "WER (%)": _metric_or_nan(latest_run, "wer") * 100,
                "CER (%)": _metric_or_nan(latest_run, "cer") * 100,
                "Latency (ms)": _metric_or_nan(latest_run, "latency_ms"),
                "RTF": _metric_or_nan(latest_run, "rtf"),
                "Timestamp": latest_run.get("timestamp", "Unknown"),
            }
        )

    return rows


# Build comparison dataframe: gather the per-task rows of every loaded model
# into one flat list, then hand that list to pandas.
comparison_data = []

for model_name, model_results in all_results.items():
    comparison_data += extract_key_metrics(model_results, model_name)

df_comparison = pd.DataFrame(comparison_data)

if df_comparison.empty:
    # Nothing was loaded: point the user at the notebooks that produce results.
    print("No comparison data available yet.")
    print("Run model test notebooks first to generate results.")
else:
    print("=== Model Comparison Scorecard ===")
    print(df_comparison.to_string(index=False))

In [None]:
# === PRODUCTION READINESS SCORE ===


def calculate_production_score(row: pd.Series) -> float:
    """Calculate production readiness score (0-100) for one comparison row.

    The score is the mean of the sub-scores that can be computed:

    * accuracy: ``100 - WER (%)``, floored at 0 (0% WER scores 100);
    * speed: ``100 - 50 * RTF``, floored at 0 (RTF 0 scores 100, RTF >= 2.0
      scores 0).

    A metric whose column is absent from ``row`` or whose value is NaN/None is
    left out of the mean.

    Args:
        row: Row that may carry "WER (%)" and "RTF" entries.

    Returns:
        Mean of the available sub-scores, or 0.0 when neither metric is
        available.
    """
    scores = []

    # Fetch each metric once with a NaN default. The previous pattern
    # (`row.get(col, 0)` followed by `row[col]`) passed the guard for a
    # missing column and then raised KeyError on the lookup.

    # Accuracy score (WER: lower is better)
    wer = row.get("WER (%)", np.nan)
    if not pd.isna(wer):
        scores.append(max(0, 100 - wer))  # 0% WER = 100 score

    # Speed score (RTF: lower is better, <1.0 = realtime)
    rtf = row.get("RTF", np.nan)
    if not pd.isna(rtf):
        scores.append(max(0, 100 - rtf * 50))  # RTF=0 = 100, RTF=2.0 = 0

    return np.mean(scores) if scores else 0.0


if not df_comparison.empty:
    # Score every row, then bucket the score into a letter grade:
    # A for >= 80, B for >= 60, C for everything else.
    df_comparison["Production Score"] = df_comparison.apply(calculate_production_score, axis=1)
    df_comparison["Grade"] = df_comparison["Production Score"].apply(
        lambda score: "C" if not score >= 60 else "B" if not score >= 80 else "A"
    )

    print("\n=== Production Readiness Scorecard ===")
    print(df_comparison[["Model", "Test", "Production Score", "Grade"]].to_string(index=False))

    # Overall recommendation: the model whose mean score across its tests is highest.
    best_scores = df_comparison.groupby("Model")["Production Score"].mean()
    best_score = best_scores.max()
    best_model = best_scores.idxmax()

    print("\n=== PRODUCTION RECOMMENDATION ===")
    print(f"üèÜ Recommended: {best_model}")
    print(f"   Overall Score: {best_score:.1f}/100")

    # The verdict uses the same 80 / 60 thresholds as the letter grades.
    print(
        "   ‚úÖ Ready for production deployment"
        if best_score >= 80
        else "   ‚ö†Ô∏è  Ready with monitoring required"
        if best_score >= 60
        else "   ‚ùå Not recommended for production"
    )

In [None]:
# === VISUALIZATION ===

if not df_comparison.empty:
    # One entry per panel, in row-major order of the 2x2 grid:
    # (dataframe column, y-axis label, panel title, reference lines),
    # where each reference line is (y value, color, legend label).
    panels = [
        ("WER (%)", "WER (%)", "Word Error Rate (lower is better)", []),
        (
            "RTF",
            "Real-Time Factor",
            "Processing Speed (lower is better)",
            [(1.0, "r", "Realtime threshold")],
        ),
        (
            "Production Score",
            "Score (0-100)",
            "Production Readiness Score",
            [(80, "g", "Production ready"), (60, "orange", "Monitor")],
        ),
        (
            "Latency (ms)",
            "Latency (ms)",
            "Processing Latency (lower is better)",
            [(500, "r", "500ms target")],
        ),
    ]

    model_names = list(df_comparison["Model"].unique())
    test_names = list(df_comparison["Test"].unique())

    # Grouped bars: every test gets one slot on the x axis and each model a
    # side-by-side bar inside it. Drawing all models at the same x position
    # would make later bars cover earlier ones.
    slots = np.arange(len(test_names))
    bar_width = 0.8 / len(model_names)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Model Comparison: Production Decision Dashboard", fontsize=14, fontweight="bold")

    for ax, (column, ylabel, title, reference_lines) in zip(axes.flat, panels):
        for position, model in enumerate(model_names):
            model_data = df_comparison[df_comparison["Model"] == model]
            value_by_test = dict(zip(model_data["Test"], model_data[column]))
            # NaN for tests this model did not run, so that slot stays empty.
            heights = [value_by_test.get(test, np.nan) for test in test_names]
            # Center the group of bars on the slot.
            offset = (position - (len(model_names) - 1) / 2) * bar_width
            ax.bar(slots + offset, heights, width=bar_width, label=model, alpha=0.7)

        for y_value, color, label in reference_lines:
            ax.axhline(y=y_value, color=color, linestyle="--", label=label)

        ax.set_xticks(slots)
        ax.set_xticklabels(test_names)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.tick_params(axis="x", rotation=45)

    plt.tight_layout()

    # Save figure (before plt.show(), as in the original cell order)
    plot_path = Path.cwd().parent / "runs" / "comparison_plots.png"
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=150, bbox_inches="tight")
    print(f"‚úì Comparison plots saved to {plot_path}")

    plt.show()

In [None]:
# === SAVE COMPARISON RESULTS ===

if not df_comparison.empty:
    # Save comparison table
    comparison_path = Path.cwd().parent / "runs" / "model_comparison.json"
    comparison_path.parent.mkdir(parents=True, exist_ok=True)

    # json.dump writes NaN floats as bare `NaN` tokens, which strict JSON
    # parsers reject; store them as null instead.
    comparison_table = [
        {
            column: None if isinstance(value, float) and pd.isna(value) else value
            for column, value in record.items()
        }
        for record in df_comparison.to_dict("records")
    ]

    # best_model / best_score only exist when the production-readiness cell
    # has been run in this session; fall back to None otherwise.
    recommended_model = locals().get("best_model")
    recommended_score = locals().get("best_score")
    if recommended_score is not None:
        recommended_score = None if pd.isna(recommended_score) else float(recommended_score)

    comparison_results = {
        "timestamp": datetime.now().isoformat(),
        "num_models": len(all_results),
        "models_compared": list(all_results.keys()),
        "comparison_table": comparison_table,
        "recommendation": {
            "best_model": recommended_model,
            "best_score": recommended_score,
        },
    }

    # Explicit encoding so the output does not depend on the platform default.
    with open(comparison_path, "w", encoding="utf-8") as f:
        json.dump(comparison_results, f, indent=2)

    print(f"‚úì Comparison results saved to {comparison_path}")
    print("\nüéâ Model comparison complete!")
    print("‚úÖ Production decision ready")

## 🎯 **Decision Framework**

### **Key Metrics**:
- **WER**: Word Error Rate (lower = better accuracy)
- **RTF**: Real-Time Factor (lower = faster, <1.0 = realtime)
- **Production Score**: Combined metric (0-100, higher = better)

### **Decision Matrix**:
- **Score ≥ 80**: ✅ Deploy with confidence
- **Score ≥ 60 and < 80**: ⚠️ Deploy with monitoring
- **Score < 60**: ❌ Not production-ready

### **Next Steps**:
1. Run all model test notebooks (00_smoke.ipynb, 10_asr.ipynb, etc.)
2. Re-run this notebook to see updated comparisons
3. Use production recommendation for deployment decisions

### **Adding New Models**:
1. Create folder under `models/<new_model>/`
2. Add `config.yaml` and notebooks
3. Run tests to generate results in `runs/<new_model>/`
4. This notebook will automatically include them