# 🎙️ ASR Evaluation - LLM Recording vs Wikipedia Text

**Complete ASR (Automatic Speech Recognition) Evaluation Pipeline**

## Test Data:
- **Input**: `llm_recording_pranay.m4a` (2-minute Wikipedia reading)
- **Ground Truth**: `llm.txt` (original Wikipedia text)
- **Goal**: Transcribe audio and compare accuracy with original text

## Evaluation Metrics:
- **WER**: Word Error Rate
- **CER**: Character Error Rate  
- **Timing**: Processing speed and real-time factor
- **Quality**: Transcription confidence and accuracy

In [None]:
# Cell 1: Setup and Configuration
import torch
import torchaudio
import numpy as np
from pathlib import Path
import time
import json
from datetime import datetime

# Banner for this evaluation run
print("üéôÔ∏è ASR Evaluation: LLM Recording Analysis")
print("=" * 60)

# Pick the compute device: the MPS backend when PyTorch reports it, CPU otherwise
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device.upper()}")

# Inputs: the recording to transcribe and the reference text it was read from
AUDIO_FILE = Path("data/audio/PRIMARY/llm_recording_pranay.m4a")
TEXT_FILE = Path("data/text/PRIMARY/llm.txt")

print(f"Audio: {AUDIO_FILE.name}")
print(f"Text: {TEXT_FILE.name}")
print("‚úÖ Setup complete")

In [None]:
# Cell 2: Load LFM Model
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState

print("üì¶ Loading LFM components...")

HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
load_start = time.time()

# Fetch the processor and the model from the same repo, calling .eval() on each
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
processor = processor.eval()
model = LFM2AudioModel.from_pretrained(HF_REPO)
model = model.eval()

# Only the model is moved, and only when a non-CPU device was selected
if device != "cpu":
    model = model.to(device)

# Elapsed time covers both from_pretrained calls and the device transfer
load_time = time.time() - load_start
print(f"‚úÖ Model loaded: {load_time:.2f}s")
param_total = sum(p.numel() for p in model.parameters())
print(f"   Parameters: {param_total:,}")

In [None]:
# Cell 3: Load and Prepare Test Files
def load_audio_for_lfm(audio_path, target_sr=24000):
    """Load an audio file as a single-channel waveform at ``target_sr`` Hz.

    Args:
        audio_path: Location of the audio file (``str`` or ``Path``); it is
            converted with ``str()`` before being handed to torchaudio.
        target_sr: Sample rate in Hz to resample to when the file's native
            rate differs. Defaults to 24000, the rate this notebook uses.

    Returns:
        tuple: ``(waveform, sample_rate)``. ``waveform`` is the tensor from
        ``torchaudio.load`` with multiple channels averaged into one
        (the channel dimension is kept), resampled if needed;
        ``sample_rate`` equals ``target_sr``.
    """
    # NOTE(review): decoding .m4a relies on the torchaudio backend that is
    # installed — confirm it is available on the target machine.
    waveform, sr = torchaudio.load(str(audio_path))

    # Downmix multi-channel audio by averaging across the channel dimension
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the native rate differs from the requested one
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        waveform = resampler(waveform)
        sr = target_sr

    return waveform, sr


# Load audio
print(f"üéµ Loading audio: {AUDIO_FILE.name}")
waveform, sr = load_audio_for_lfm(AUDIO_FILE)
print(f"‚úÖ Audio loaded: {waveform.shape}")
print(f"   Duration: {waveform.shape[1] / sr:.1f}s ({waveform.shape[1] / sr / 60:.1f} minutes)")

# Load ground truth text
print(f"üìù Loading text: {TEXT_FILE.name}")
# Decode explicitly as UTF-8 so non-ASCII characters in the reference text are
# read the same way on every platform instead of depending on the locale default.
with open(TEXT_FILE, "r", encoding="utf-8") as f:
    ground_truth = f.read().strip()
print(f"‚úÖ Text loaded: {len(ground_truth)} characters")
print(f"   Words: {len(ground_truth.split())}")

# Show the first 200 characters of the reference (with an ellipsis when truncated)
preview = ground_truth[:200] + "..." if len(ground_truth) > 200 else ground_truth
print(f'   Preview: "{preview}"')

In [None]:
# Cell 4: ASR Transcription
print("üéôÔ∏è  Starting ASR Transcription...")
print("This may take several minutes for a 2-minute recording...")
print()

# Build the conversation: a system turn with the ASR instruction, a user turn
# carrying the audio, then an open assistant turn for the model to fill in.
chat = ChatState(processor)

chat.new_turn("system")
chat.add_text("Perform ASR.")
chat.end_turn()

chat.new_turn("user")
chat.add_audio(waveform, sr)
chat.end_turn()

chat.new_turn("assistant")

# perf_counter is monotonic and high-resolution, so the measured latency cannot
# be distorted by system clock adjustments during a multi-minute run.
start_time = time.perf_counter()
text_pieces = []  # decoded fragments, joined once after generation
text_length = 0  # running character count for the progress line
token_count = 0

print("üîç Processing audio...")
for t in model.generate_sequential(**chat, max_new_tokens=2048):
    # Only single-element outputs are decoded as text tokens; anything else is skipped
    if t.numel() != 1:
        continue

    token_text = processor.text.decode(t)
    text_pieces.append(token_text)
    text_length += len(token_text)
    token_count += 1

    # Progress indicator every 50 text tokens
    if token_count % 50 == 0:
        print(f"   Tokens: {token_count}, Text length: {text_length}")

latency = time.perf_counter() - start_time

# Strip outer whitespace so the hypothesis is normalised the same way as the
# reference text (which is stripped on load); otherwise CER counts stray
# leading/trailing whitespace as errors.
transcribed_text = "".join(text_pieces).strip()

print("\n‚úÖ Transcription complete!")
print(f"   Processing time: {latency:.1f}s ({latency / 60:.1f} minutes)")
print(f"   Tokens generated: {token_count}")
print(f"   Characters: {len(transcribed_text)}")
print(f"   Words: {len(transcribed_text.split())}")
print(f"   Real-time factor: {latency / (waveform.shape[1] / sr):.2f}x")

In [None]:
# Cell 5: Display Transcription Results
print("üìù TRANSCRIPTION RESULT:")
divider = "=" * 60
print(divider)
print(transcribed_text)
print(divider)

# Size of reference and hypothesis, in characters and whitespace-separated words
ref_char_count = len(ground_truth)
ref_word_count = len(ground_truth.split())
hyp_char_count = len(transcribed_text)
hyp_word_count = len(transcribed_text.split())

print("\nüìä Basic Comparison:")
print(f"Ground truth: {ref_char_count} chars, {ref_word_count} words")
print(f"Transcription: {hyp_char_count} chars, {hyp_word_count} words")

# Signed differences (hypothesis minus reference), also shown relative to the reference
char_diff = hyp_char_count - ref_char_count
word_diff = hyp_word_count - ref_word_count

print(f"Character difference: {char_diff:+d} ({char_diff / ref_char_count * 100:+.1f}%)")
print(f"Word difference: {word_diff:+d} ({word_diff / ref_word_count * 100:+.1f}%)")

In [None]:
# Cell 6: Calculate Word Error Rate (WER)
def _edit_distance(ref_seq, hyp_seq):
    """Return the Levenshtein distance between two sequences.

    Only the previous and the current row of the dynamic-programming table
    are kept, so memory is proportional to ``len(hyp_seq)`` rather than to
    ``len(ref_seq) * len(hyp_seq)``.
    """
    # Row 0: turning an empty reference prefix into j hypothesis items costs j insertions
    previous = list(range(len(hyp_seq) + 1))

    for i, ref_item in enumerate(ref_seq, start=1):
        # Column 0: turning i reference items into nothing costs i deletions
        current = [i]
        for j, hyp_item in enumerate(hyp_seq, start=1):
            if ref_item == hyp_item:
                current.append(previous[j - 1])
            else:
                # deletion, insertion, substitution
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current

    return previous[-1]


def calculate_wer(reference, hypothesis):
    """Calculate Word Error Rate using Levenshtein distance.

    Both texts are lower-cased and split on whitespace; punctuation is left
    attached to the words.

    Args:
        reference: Ground-truth text.
        hypothesis: Transcribed text to score.

    Returns:
        float: Word-level edit distance divided by the number of reference
        words. An empty reference uses a divisor of 1, so the result is then
        the number of hypothesis words.
    """
    ref_words = reference.lower().split()
    hyp_words = hypothesis.lower().split()
    return _edit_distance(ref_words, hyp_words) / max(1, len(ref_words))


def calculate_cer(reference, hypothesis):
    """Calculate Character Error Rate using Levenshtein distance.

    Both texts are lower-cased and compared character by character,
    whitespace included.

    Args:
        reference: Ground-truth text.
        hypothesis: Transcribed text to score.

    Returns:
        float: Character-level edit distance divided by the number of
        reference characters. An empty reference uses a divisor of 1.
    """
    ref_chars = reference.lower()
    hyp_chars = hypothesis.lower()
    return _edit_distance(ref_chars, hyp_chars) / max(1, len(ref_chars))


# Score the transcription against the reference at word and character level
wer = calculate_wer(ground_truth, transcribed_text)
cer = calculate_cer(ground_truth, transcribed_text)

print("üìä ACCURACY METRICS:")
print("=" * 40)
print(f"Word Error Rate (WER): {wer:.4f} ({wer * 100:.2f}%)")
print(f"Character Error Rate (CER): {cer:.4f} ({cer * 100:.2f}%)")
print(f"Accuracy (1-WER): {(1 - wer) * 100:.1f}%")

# Quality bands, best first: (exclusive upper WER bound, message).
# The first band whose bound exceeds the WER is reported; anything at or
# above the last bound falls through to the POOR message.
quality_bands = (
    (0.05, "   ‚úÖ EXCELLENT - Near-perfect transcription"),
    (0.10, "   ‚úÖ VERY GOOD - Professional quality"),
    (0.15, "   ‚úÖ GOOD - Usable for most applications"),
    (0.25, "   ‚ö†Ô∏è  FAIR - May require post-processing"),
)

print("\nüéØ Quality Assessment:")
print(
    next(
        (message for wer_limit, message in quality_bands if wer < wer_limit),
        "   ‚ùå POOR - Significant errors",
    )
)

In [None]:
# Cell 7: Detailed Error Analysis
def analyze_errors(reference, hypothesis):
    """Count word-level substitutions, insertions and deletions.

    The two texts are lower-cased, split on whitespace and aligned with a
    Levenshtein (minimum edit distance) table; walking the table back from
    the end attributes every edit to one error type. Aligning first keeps a
    single inserted or dropped word from shifting every following word into
    a spurious "substitution", and makes ``total`` equal the word-level edit
    distance.

    Args:
        reference: Ground-truth text.
        hypothesis: Transcribed text to analyse.

    Returns:
        dict: Integer counts under the keys ``"substitutions"``,
        ``"insertions"``, ``"deletions"`` and ``"total"`` (the sum of the
        other three).
    """
    ref_words = reference.lower().split()
    hyp_words = hypothesis.lower().split()
    m, n = len(ref_words), len(hyp_words)

    # dp[i][j] = edit distance between the first i reference words and the
    # first j hypothesis words. The full table is kept for the backtrace.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if ref_words[i - 1] == hyp_words[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])

    errors = {"substitutions": 0, "insertions": 0, "deletions": 0, "total": dp[m][n]}

    # Walk one optimal alignment path back from (m, n) to (0, 0)
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and ref_words[i - 1] == hyp_words[j - 1]:
            # Matching words: no error, step diagonally
            i -= 1
            j -= 1
        elif i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + 1:
            errors["substitutions"] += 1
            i -= 1
            j -= 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            # Reference word with no counterpart in the hypothesis
            errors["deletions"] += 1
            i -= 1
        else:
            # Hypothesis word with no counterpart in the reference
            errors["insertions"] += 1
            j -= 1

    return errors


# Break the word-level errors down by type
error_analysis = analyze_errors(ground_truth, transcribed_text)

print("üîç DETAILED ERROR ANALYSIS:")
print("=" * 40)

# Absolute counts, one line per error type, followed by the total
error_kinds = ("substitutions", "insertions", "deletions")
for kind in error_kinds:
    print(f"{kind.capitalize()}: {error_analysis[kind]}")
print(f"Total errors: {error_analysis['total']}")

# Share of each error type; skipped when there are no errors to avoid dividing by zero
total_errors = error_analysis["total"]
if total_errors > 0:
    print("\nError breakdown:")
    for kind in error_kinds:
        print(f"   {kind.capitalize()}: {error_analysis[kind] / total_errors * 100:.1f}%")

In [None]:
# Cell 8: Performance Analysis
print("‚è±Ô∏è  PERFORMANCE ANALYSIS:")
print("=" * 40)

# Audio length in seconds, and how many seconds of audio are handled per second of compute
audio_duration = waveform.shape[1] / sr
processing_speed = audio_duration / latency

print(f"Audio duration: {audio_duration:.1f}s ({audio_duration / 60:.1f} minutes)")
print(f"Processing time: {latency:.1f}s ({latency / 60:.1f} minutes)")
print(f"Real-time factor: {latency / audio_duration:.2f}x")
print(f"Processing speed: {processing_speed:.2f}x real-time")

# Speed tiers, fastest first: (exclusive lower bound on processing_speed, message)
speed_tiers = [
    (1.0, "   ‚úÖ REAL-TIME - Processes faster than real-time"),
    (0.5, "   ‚úÖ NEAR REAL-TIME - Suitable for live applications"),
    (0.1, "   ‚ö†Ô∏è  OFFLINE PROCESSING - Suitable for batch processing"),
]

print("\nüéØ Performance Assessment:")
for speed_floor, tier_message in speed_tiers:
    if processing_speed > speed_floor:
        print(tier_message)
        break
else:
    # No tier matched: speed is at or below the lowest bound
    print("   ‚ùå SLOW - May not be suitable for production")

In [None]:
# Cell 9: Save Results
def save_asr_results(results_dict, filename="asr_evaluation_results.json"):
    """Write the evaluation results as indented JSON under ``results/``.

    Args:
        results_dict: JSON-serialisable mapping of results to store.
        filename: Name of the file to create inside the ``results``
            directory (relative to the current working directory).

    Returns:
        Path: Location of the file that was written.
    """
    out_dir = Path("results")
    # Create the directory on first use; an existing one is left untouched
    out_dir.mkdir(exist_ok=True)
    output_file = out_dir / filename

    with output_file.open("w") as handle:
        json.dump(results_dict, handle, indent=2)

    print(f"‚úÖ Results saved: {output_file}")
    return output_file


# Compile comprehensive results

# Stored quality label uses the same WER bands as the quality assessment
# printed with the accuracy metrics, so the JSON never contradicts the
# notebook output (e.g. a WER of 0.5 is recorded as POOR, not FAIR).
if wer < 0.05:
    overall_quality = "EXCELLENT"
elif wer < 0.10:
    overall_quality = "VERY_GOOD"
elif wer < 0.15:
    overall_quality = "GOOD"
elif wer < 0.25:
    overall_quality = "FAIR"
else:
    overall_quality = "POOR"

# Stored performance label uses the same speed tiers as the performance
# assessment, so slower-than-half-real-time runs are no longer recorded as
# NEAR_REAL_TIME.
if processing_speed > 1.0:
    performance_rating = "REAL_TIME"
elif processing_speed > 0.5:
    performance_rating = "NEAR_REAL_TIME"
elif processing_speed > 0.1:
    performance_rating = "OFFLINE_PROCESSING"
else:
    performance_rating = "SLOW"

results = {
    # What was run, where and when
    "test_info": {
        "model": HF_REPO,
        "device": device,
        "timestamp": datetime.now().isoformat(),
        "test_type": "ASR_Evaluation",
    },
    # Input recording
    "audio_info": {
        "file": str(AUDIO_FILE),
        "duration_minutes": audio_duration / 60,
        "sample_rate": sr,
    },
    # Sizes of the reference and the transcription
    "text_info": {
        "ground_truth_file": str(TEXT_FILE),
        "ground_truth_chars": len(ground_truth),
        "ground_truth_words": len(ground_truth.split()),
        "transcribed_chars": len(transcribed_text),
        "transcribed_words": len(transcribed_text.split()),
    },
    "accuracy_metrics": {
        "wer": wer,
        "cer": cer,
        "accuracy_percentage": (1 - wer) * 100,
        "error_analysis": error_analysis,
    },
    "performance_metrics": {
        "processing_time_seconds": latency,
        "real_time_factor": latency / audio_duration,
        "processing_speed_x": processing_speed,
    },
    "quality_assessment": {
        "overall_quality": overall_quality,
        "performance_rating": performance_rating,
    },
    # Full texts, kept so the metrics can be recomputed later
    "transcription_text": transcribed_text,
    "ground_truth_text": ground_truth,
}

# Save results
save_asr_results(results)

# Also save transcription separately
transcription_path = Path("results/asr_transcription.txt")
# Write explicitly as UTF-8 so a transcription containing non-ASCII characters
# is stored identically on every platform instead of depending on the locale
# default (which can raise UnicodeEncodeError).
with open(transcription_path, "w", encoding="utf-8") as f:
    f.write(transcribed_text)
print(f"‚úÖ Transcription saved: {transcription_path}")

In [None]:
# Cell 10: Summary and Recommendations
print("üéØ ASR EVALUATION SUMMARY")
print("=" * 50)
print()
print("‚úÖ COMPLETED ASR EVALUATION:")
print(f"   Model: {HF_REPO}")
print(f"   Device: {device.upper()}")
print("   Test: 2-minute LLM Wikipedia reading")
print()
print("üìä KEY RESULTS:")
print(f"   Word Error Rate: {wer * 100:.1f}%")
print(f"   Character Error Rate: {cer * 100:.1f}%")
print(f"   Processing Speed: {processing_speed:.2f}x real-time")
print()
print("üéØ QUALITY ASSESSMENT:")

# Use the same WER bands as the quality assessment printed with the accuracy
# metrics, so the summary cannot call a transcription FAIR when it was rated
# POOR (or EXCELLENT when it was rated VERY GOOD) earlier in the notebook.
if wer < 0.05:
    quality = "EXCELLENT"
elif wer < 0.10:
    quality = "VERY GOOD"
elif wer < 0.15:
    quality = "GOOD"
elif wer < 0.25:
    quality = "FAIR"
else:
    quality = "POOR"

# Same speed tiers as the performance assessment: anything at or below half
# real-time is no longer reported as NEAR REAL-TIME.
if processing_speed > 1.0:
    performance = "REAL-TIME"
elif processing_speed > 0.5:
    performance = "NEAR REAL-TIME"
elif processing_speed > 0.1:
    performance = "OFFLINE PROCESSING"
else:
    performance = "SLOW"

print(f"   Transcription Quality: {quality}")
print(f"   Performance: {performance}")
print()
print("üìã NEXT STEPS:")
print("   1. Run TTS evaluation: synthesize llm.txt")
print("   2. Compare synthesized audio with original recording")
print("   3. Test with NotebookLM conversation analysis")
print("   4. Compare with other models (Whisper, etc.)")
print()
print("‚úÖ ASR evaluation complete!")