# Whisper ASR Evaluation

**Purpose**: Comprehensive ASR testing using Whisper baseline

**Tests**: Full-length audio transcription with metrics

---

This notebook evaluates Whisper's ASR performance on your test recordings.

In [None]:
# === SETUP ===

import sys
import os
import json
from pathlib import Path
from datetime import datetime

# Add harness to path.
# The insert is guarded so that re-running this cell in a live kernel does not
# keep stacking duplicate entries at the front of sys.path.
# NOTE(review): the directory is derived from the kernel's working directory,
# so this assumes the notebook is started from its own folder — confirm.
harness_path = Path.cwd().parent.parent / "harness"
if str(harness_path) not in sys.path:
    sys.path.insert(0, str(harness_path))

import torch
import whisper
import yaml

from harness import AudioLoader, ModelRegistry, PerformanceTimer
from harness.metrics_asr import ASRMetrics

# Announce the run and pick the compute backend: "mps" when torch reports the
# MPS backend as available, otherwise fall back to "cpu".
print("=== Whisper ASR Evaluation ===")
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device}")

In [None]:
# === LOAD MODEL ===

# The configuration is read from config.yaml one level above the working
# directory. The encoding is stated explicitly so that parsing does not depend
# on the platform's default text encoding.
config_path = Path.cwd().parent / "config.yaml"
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The registry result is indexed by key; the model object is stored under "model".
model_wrapper = ModelRegistry.load_model("whisper", config, device)
model = model_wrapper["model"]

print("✓ Whisper model loaded")

In [None]:
# === LOAD TEST DATA ===

from harness import GroundTruthLoader

# Both inputs sit under the "data" directory three levels above the working
# directory: the recording to transcribe and its reference transcript.
data_dir = Path.cwd().parent.parent.parent / "data"
test_audio_path = data_dir / "audio" / "PRIMARY" / "llm_recording_pranay.m4a"
ground_truth_path = data_dir / "text" / "PRIMARY" / "llm.txt"

# Fail fast with a readable message when an input is missing (for example when
# the notebook was started from an unexpected working directory) instead of
# surfacing an error from deep inside a loader.
for required_path in (test_audio_path, ground_truth_path):
    if not required_path.is_file():
        raise FileNotFoundError(f"Required test input not found: {required_path}")

print(f"Loading audio: {test_audio_path.name}")

# Load the recording; the loader is asked for 16 kHz audio.
loader = AudioLoader(target_sample_rate=16000)
audio, sr, audio_metadata = loader.load_audio(test_audio_path, "whisper")

# Load ground truth
ground_truth = GroundTruthLoader.load_text(ground_truth_path)

print(f"✓ Audio loaded: {audio_metadata['duration_seconds']:.1f}s")
print(f"✓ Ground truth: {len(ground_truth)} chars")

In [None]:
# === RUN TRANSCRIPTION ===

# Transcribe the whole recording in one call, timing only the transcribe step.
timer = PerformanceTimer()

print("Transcribing audio...")
with timer.time_operation("whisper_full_transcribe"):
    result = model.transcribe(audio, language="en")

# Read the text first and the timer second, in that order.
transcription = result["text"].strip()
latency_ms = timer.elapsed_time_ms

print("\n=== TRANSCRIPTION RESULTS ===")
print(f"Latency: {latency_ms:.1f}ms")

# Real-time factor: seconds of processing per second of audio.
audio_seconds = audio_metadata["duration_seconds"]
rtf = latency_ms / 1000 / audio_seconds
print(f"RTF: {rtf:.3f}x")

print(f"Transcription length: {len(transcription)} chars")
print("\nTranscription:")

# Show at most the first 200 characters, marking truncation with an ellipsis.
if len(transcription) > 200:
    preview = transcription[:200] + "..."
else:
    preview = transcription
print(preview)

In [None]:
# === CALCULATE METRICS ===

# Score the transcription against the reference text. Latency is handed over
# in seconds, hence the division by 1000.
# NOTE(review): the model label is a hard-coded string, while the model itself
# is loaded from config.yaml — confirm the two agree.
asr_result = ASRMetrics.evaluate(
    transcription=transcription,
    ground_truth=ground_truth,
    audio_duration_s=audio_metadata["duration_seconds"],
    latency_s=latency_ms / 1000,
    metadata=dict(
        model="whisper-large-v3",
        audio_file=str(test_audio_path.name),
        timestamp=datetime.now().isoformat(),
    ),
)

print("\n=== ASR METRICS ===")

# WER and CER share one format: the raw ratio plus the same value as a percentage.
for metric_name in ("wer", "cer"):
    error_rate = getattr(asr_result, metric_name)
    print(f"{metric_name.upper()}: {error_rate:.3f} ({error_rate * 100:.1f}%)")

print(f"Latency: {asr_result.latency_ms:.1f}ms")
# NOTE(review): the value labelled "RTF" is read from the attribute `rtv` —
# confirm both names denote the same quantity.
print(f"RTF: {asr_result.rtv:.3f}x")

In [None]:
# === SAVE RESULTS ===

# Create results directory (three levels above the working directory).
results_dir = Path.cwd().parent.parent.parent / "runs" / "whisper" / "asr"
results_dir.mkdir(parents=True, exist_ok=True)

# One JSON file per run, named after the wall-clock time of the save.
# The name has one-second resolution, so two saves within the same second
# would target the same file.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
result_file = results_dir / f"{timestamp}.json"

# Flat record of the run: identifying fields, input sizes, the metric values
# read from asr_result, and both full texts.
results = {
    "model": "whisper-large-v3",
    "test_type": "asr",
    "timestamp": timestamp,
    "audio_file": str(test_audio_path.name),
    "audio_duration_s": audio_metadata["duration_seconds"],
    "ground_truth_length": len(ground_truth),
    "transcription_length": len(transcription),
    "wer": asr_result.wer,
    "cer": asr_result.cer,
    "latency_ms": asr_result.latency_ms,
    "rtf": asr_result.rtv,
    "transcription": transcription,
    "ground_truth": ground_truth,
}

# State the encoding explicitly so the file is written the same way on every
# platform; json.dump's default ASCII escaping is left in place.
with open(result_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"✓ Results saved to: {result_file}")
print("\n🎉 Whisper ASR evaluation complete!")
print("✅ Ready for comparison in compare/00_scorecard.ipynb")