# Evaluation Benchmarks for Indic TTS & STT Models

This notebook evaluates trained TTS and STT models against target metrics.

## Metrics:
- **TTS**: MOS (heuristic estimate from signal statistics, not a listening test), Speaker Similarity, Real-time Factor (RTF)
- **STT**: WER (Word Error Rate), CER (Character Error Rate)

In [None]:
# Install dependencies
# First line: metric and speaker-similarity tooling; second line: model libraries.
# NOTE(review): versions are unpinned, and soundfile / librosa (imported by the STT
# evaluation cell) are not installed here — presumably provided by the runtime or
# pulled in transitively; confirm before relying on a fresh environment.
!pip install -q jiwer evaluate resemblyzer speechmos
!pip install -q transformers TTS torch torchaudio

In [None]:
# Mount Google Drive so the data and model directories below are reachable.
# This import only exists inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import torch
import numpy as np
from pathlib import Path

# Root folders on the mounted Drive: evaluation data/manifests and the trained
# TTS / STT checkpoints.
DATA_DIR = '/content/drive/MyDrive/indic_speech_data'
TTS_MODEL_DIR = '/content/drive/MyDrive/indic_tts_models'
STT_MODEL_DIR = '/content/drive/MyDrive/indic_stt_models'

# Language code used to build model/manifest paths, to pick the TTS test
# sentences, and as the synthesis language passed to the TTS model.
LANGUAGE = 'hi'

## 1. STT Evaluation (WER/CER)

In [None]:
import evaluate
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
from tqdm import tqdm

# Word-error-rate and character-error-rate scorers, loaded in that order.
wer_metric, cer_metric = [evaluate.load(metric_name) for metric_name in ('wer', 'cer')]

def evaluate_stt_model(model_path, test_manifest, processor=None, device='cuda'):
    """Evaluate a Whisper STT checkpoint on a JSONL test manifest.

    Args:
        model_path: Location loadable by
            ``WhisperForConditionalGeneration.from_pretrained``.
        test_manifest: Path to a UTF-8 JSONL file; every non-blank line must be
            a JSON object with ``audio_filepath`` and ``text`` keys.
        processor: Optional pre-loaded ``WhisperProcessor``. Loaded from
            ``model_path`` when omitted.
        device: Torch device used for inference. Falls back to ``'cpu'`` when a
            CUDA device is requested but CUDA is not available.

    Returns:
        dict with:
            ``wer`` / ``cer``: error rates as fractions (``nan`` when no sample
                could be scored),
            ``num_samples``: number of utterances that were scored,
            ``num_skipped``: number of utterances dropped because of an error
                or an empty reference transcript.
    """
    # Requesting CUDA on a CPU-only runtime would otherwise fail at .to(device).
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA not available, running STT evaluation on CPU.")
        device = 'cpu'

    # Load model
    if processor is None:
        processor = WhisperProcessor.from_pretrained(model_path)
    model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)
    model.eval()

    # Load test data
    with open(test_manifest, 'r', encoding='utf-8') as f:
        test_samples = [json.loads(line) for line in f if line.strip()]

    predictions = []
    references = []
    num_skipped = 0

    for sample in tqdm(test_samples, desc="Evaluating STT"):
        audio_path = None
        try:
            audio_path = sample['audio_filepath']
            reference = sample['text']

            # An empty reference makes the error-rate computation fail for the
            # whole batch, so drop it here instead of after all inference is done.
            if not reference.strip():
                raise ValueError("empty reference text")

            # Load and process audio
            audio, sr = sf.read(audio_path, dtype='float32')

            # soundfile returns (frames, channels) for multichannel files; the
            # feature extractor and the resampler both expect a mono 1-D signal.
            if audio.ndim > 1:
                audio = audio.mean(axis=1)

            if sr != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

            # Transcribe
            input_features = processor(
                audio,
                sampling_rate=16000,
                return_tensors='pt',
            ).input_features.to(device)

            with torch.no_grad():
                predicted_ids = model.generate(input_features, max_length=225)

            prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

            predictions.append(prediction)
            references.append(reference)

        except Exception as e:
            # Best effort: one bad file must not abort the whole evaluation.
            num_skipped += 1
            print(f"Error processing {audio_path}: {e}")
            continue

    # With nothing to score the metrics are undefined; report that explicitly
    # instead of letting the metric computation fail on empty lists.
    if not predictions:
        print("No samples could be evaluated; WER/CER are undefined.")
        return {
            'wer': float('nan'),
            'cer': float('nan'),
            'num_samples': 0,
            'num_skipped': num_skipped,
        }

    # Calculate metrics
    wer = wer_metric.compute(predictions=predictions, references=references)
    cer = cer_metric.compute(predictions=predictions, references=references)

    return {
        'wer': wer,
        'cer': cer,
        'num_samples': len(predictions),
        'num_skipped': num_skipped,
    }

In [None]:
# Run STT evaluation
stt_model_path = f"{STT_MODEL_DIR}/whisper_{LANGUAGE}_merged"
test_manifest = f"{DATA_DIR}/manifests/stt_{LANGUAGE}_val.jsonl"

# Both the checkpoint and the manifest have to be on disk before evaluating.
if not (os.path.exists(stt_model_path) and os.path.exists(test_manifest)):
    print("STT model or test manifest not found.")
else:
    stt_results = evaluate_stt_model(stt_model_path, test_manifest)

    # Header block
    print("\n" + "="*50, "STT Evaluation Results", "="*50, sep="\n")

    # Raw metrics
    print(
        f"  WER: {stt_results['wer']:.2%}",
        f"  CER: {stt_results['cer']:.2%}",
        f"  Samples: {stt_results['num_samples']}",
        sep="\n",
    )

    # Check against targets
    print(
        "\nTarget Comparison:",
        f"  WER Target (Clean): < 15% | Actual: {stt_results['wer']:.1%} {'✓' if stt_results['wer'] < 0.15 else '✗'}",
        f"  CER Target (Clean): < 10% | Actual: {stt_results['cer']:.1%} {'✓' if stt_results['cer'] < 0.10 else '✗'}",
        sep="\n",
    )

## 2. TTS Evaluation (Heuristic MOS, Speaker Similarity, Real-time Factor)

In [None]:
import time

def estimate_mos_heuristic(audio, sr):
    """Estimate a rough MOS-like score from simple signal statistics.

    This is a sanity heuristic, not a perceptual MOS: it averages three
    sub-scores derived from the peak level, the RMS level and the duration.
    The level thresholds assume amplitudes normalised to roughly [-1, 1].

    Args:
        audio: Waveform samples as an array-like (NumPy array, list, ...).
            The first axis is treated as time.
        sr: Sample rate in Hz; must be positive.

    Returns:
        The mean of the three sub-scores, or 1.0 (bottom of the MOS scale)
        when ``audio`` contains no samples.

    Raises:
        ValueError: If ``sr`` is not positive.
    """
    if sr <= 0:
        raise ValueError(f"sr must be positive, got {sr}")

    # Accept lists and integer arrays as well; integer PCM would otherwise
    # overflow when squared for the RMS computation.
    samples = np.atleast_1d(np.asarray(audio))
    if not np.issubdtype(samples.dtype, np.floating):
        samples = samples.astype(np.float64)

    # No samples at all means synthesis failed completely: np.max would raise
    # on an empty array, so score it at the floor instead.
    if samples.size == 0:
        return 1.0

    scores = []

    # Clipping check: near full scale is penalised, very quiet is mildly penalised.
    peak = np.max(np.abs(samples))
    if peak > 0.99:
        scores.append(2.0)
    elif peak > 0.95:
        scores.append(3.0)
    elif peak > 0.1:
        scores.append(4.0)
    else:
        scores.append(3.0)

    # RMS check: near-silent output scores lowest.
    rms = np.sqrt(np.mean(samples ** 2))
    if rms < 0.01:
        scores.append(2.0)
    elif rms < 0.1:
        scores.append(3.5)
    else:
        scores.append(4.0)

    # Duration check: clips shorter than half a second are penalised.
    duration = samples.shape[0] / sr
    if duration < 0.5:
        scores.append(2.5)
    else:
        scores.append(4.0)

    return np.mean(scores)

def evaluate_tts_model(model, config, test_texts, reference_audio, device='cuda'):
    """Synthesize each test text and score heuristic MOS, RTF and speaker similarity.

    Args:
        model: TTS model exposing ``eval()`` and
            ``synthesize(text, config, speaker_wav=..., language=...)`` whose
            result has a ``'wav'`` entry.
        config: Model config forwarded to ``model.synthesize``. Its
            ``audio.output_sample_rate`` (or ``audio.sample_rate``) is used as
            the sample rate of the synthesized audio when present; otherwise
            22050 Hz is assumed.
        test_texts: Iterable of sentences to synthesize.
        reference_audio: Speaker reference, passed unchanged to
            ``model.synthesize(speaker_wav=...)`` and to resemblyzer's
            ``preprocess_wav``. NOTE(review): XTTS documents ``speaker_wav`` as
            audio file path(s) — confirm the caller passes a path rather than
            raw samples.
        device: Not used by this function; kept for interface compatibility.
            The model is expected to already live on its target device.

    Returns:
        dict with ``avg_mos``, ``avg_rtf`` and ``avg_speaker_sim`` (each 0 when
        no value was collected) and ``num_samples`` (utterances scored).
        Speaker similarity is the cosine similarity rescaled to [0, 1].
    """
    results = {
        'mos_scores': [],
        'rtf_values': [],
        'speaker_similarities': [],
    }

    # Sample rate of the synthesized audio. Needed for both the RTF and for
    # resampling before speaker embedding, so take it from the config instead
    # of assuming a fixed value.
    audio_cfg = getattr(config, 'audio', None)
    sample_rate = (
        getattr(audio_cfg, 'output_sample_rate', None)
        or getattr(audio_cfg, 'sample_rate', None)
        or 22050
    )

    # Try to load resemblyzer for speaker similarity. Keep "package missing"
    # apart from "reference could not be embedded" so the message is accurate.
    voice_encoder = None
    ref_embedding = None
    try:
        from resemblyzer import VoiceEncoder, preprocess_wav
    except ImportError:
        print("Resemblyzer not available, skipping speaker similarity.")
    else:
        try:
            voice_encoder = VoiceEncoder()
            ref_embedding = voice_encoder.embed_utterance(preprocess_wav(reference_audio))
        except Exception as e:
            print(f"Could not embed reference audio, skipping speaker similarity: {e}")
    has_resemblyzer = ref_embedding is not None

    model.eval()

    for text in tqdm(test_texts, desc="Evaluating TTS"):
        try:
            # Synthesize (perf_counter is monotonic, unlike wall-clock time).
            start_time = time.perf_counter()

            with torch.no_grad():
                outputs = model.synthesize(
                    text,
                    config,
                    speaker_wav=reference_audio,
                    language=LANGUAGE,
                )

            synthesis_time = time.perf_counter() - start_time

            # Normalise the waveform to a flat float32 NumPy array.
            audio = outputs['wav']
            if torch.is_tensor(audio):
                audio = audio.detach().cpu().numpy()
            audio = np.asarray(audio, dtype=np.float32).reshape(-1)
            if audio.size == 0:
                raise ValueError("model returned empty audio")

            # Compute both per-utterance metrics before recording either, so
            # the result lists always stay the same length.
            audio_duration = audio.size / sample_rate
            rtf = synthesis_time / audio_duration
            mos = estimate_mos_heuristic(audio, sample_rate)
            results['rtf_values'].append(rtf)
            results['mos_scores'].append(mos)

            # Speaker similarity
            if has_resemblyzer:
                # Tell resemblyzer the real sample rate so it resamples; without
                # source_sr it assumes the array is already at its own rate.
                syn_embedding = voice_encoder.embed_utterance(
                    preprocess_wav(audio, source_sr=sample_rate)
                )
                similarity = np.dot(ref_embedding, syn_embedding) / (
                    np.linalg.norm(ref_embedding) * np.linalg.norm(syn_embedding)
                )
                results['speaker_similarities'].append((similarity + 1) / 2)

        except Exception as e:
            # Best effort: a failing sentence is reported and skipped.
            print(f"Error: {e}")
            continue

    return {
        'avg_mos': np.mean(results['mos_scores']) if results['mos_scores'] else 0,
        'avg_rtf': np.mean(results['rtf_values']) if results['rtf_values'] else 0,
        'avg_speaker_sim': np.mean(results['speaker_similarities']) if results['speaker_similarities'] else 0,
        'num_samples': len(results['mos_scores']),
    }

In [None]:
# Test texts for evaluation, keyed by language code.
# Each language uses its own sentence-final punctuation: the danda (।) for
# Hindi, a full stop for Tamil and Telugu.
test_texts_by_lang = {
    'hi': [
        'नमस्ते, आप कैसे हैं?',
        'आज मौसम बहुत अच्छा है।',
        'मुझे हिंदी में बात करना पसंद है।',
        'भारत एक महान देश है।',
        'शिक्षा जीवन का सबसे महत्वपूर्ण हिस्सा है।',
    ],
    'ta': [
        'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?',
        'இன்று வானிலை மிகவும் நன்றாக உள்ளது.',
        'எனக்கு தமிழில் பேசுவது பிடிக்கும்.',
    ],
    'te': [
        'నమస్కారం, మీరు ఎలా ఉన్నారు?',
        'ఈరోజు వాతావరణం చాలా బాగుంది.',
        'నాకు తెలుగులో మాట్లాడటం ఇష్టం.',
    ],
}

test_texts = test_texts_by_lang.get(LANGUAGE)
if test_texts is None:
    # The Hindi fallback is kept, but made visible: these sentences will be
    # synthesized with a different language code, which skews the metrics.
    print(f"No test texts defined for '{LANGUAGE}', falling back to Hindi ('hi') sentences.")
    test_texts = test_texts_by_lang['hi']
print(f"Test texts for {LANGUAGE}: {len(test_texts)} samples")

In [None]:
# Run TTS evaluation (if model exists)
tts_model_path = f"{TTS_MODEL_DIR}/xtts_{LANGUAGE}_final"
train_manifest = f"{DATA_DIR}/manifests/tts_{LANGUAGE}_train.jsonl"

if not os.path.exists(tts_model_path):
    print(f"TTS model not found at {tts_model_path}")
elif not os.path.exists(train_manifest):
    # The speaker reference is taken from the training manifest, so it is required.
    print(f"TTS training manifest not found at {train_manifest}")
else:
    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import Xtts

    # Load model
    config = XttsConfig()
    config.load_json(f"{tts_model_path}/config.json")
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir=tts_model_path)
    if torch.cuda.is_available():
        model = model.cuda()
    else:
        print("CUDA not available, running TTS evaluation on CPU.")

    # Use the first non-blank training-manifest entry as the speaker reference.
    # Manifests contain Indic text, so read them as UTF-8 explicitly.
    with open(train_manifest, 'r', encoding='utf-8') as f:
        first_entry = next((line for line in f if line.strip()), None)
    if first_entry is None:
        raise ValueError(f"TTS training manifest is empty: {train_manifest}")
    sample = json.loads(first_entry)

    # Pass the file PATH, not decoded samples: XTTS loads speaker_wav from disk,
    # and resemblyzer can read (and correctly resample) a path as well.
    ref_audio_path = sample['audio_filepath']

    # Evaluate
    tts_results = evaluate_tts_model(model, config, test_texts, ref_audio_path)

    # With zero successful utterances the averages are placeholder zeros; an
    # RTF of 0 must not be reported as meeting the "< 0.5" target.
    tts_evaluated = tts_results['num_samples'] > 0
    if not tts_evaluated:
        print("\nNo utterance was synthesized successfully; all targets are counted as failed.")

    print("\n" + "="*50)
    print("TTS Evaluation Results")
    print("="*50)
    print(f"  MOS Score: {tts_results['avg_mos']:.2f}")
    print(f"  Real-time Factor: {tts_results['avg_rtf']:.2f}")
    print(f"  Speaker Similarity: {tts_results['avg_speaker_sim']:.2f}")
    print(f"  Samples: {tts_results['num_samples']}")

    # Check against targets
    print("\nTarget Comparison:")
    print(f"  MOS Target: > 3.5 | Actual: {tts_results['avg_mos']:.2f} {'✓' if tts_evaluated and tts_results['avg_mos'] > 3.5 else '✗'}")
    print(f"  RTF Target: < 0.5 | Actual: {tts_results['avg_rtf']:.2f} {'✓' if tts_evaluated and tts_results['avg_rtf'] < 0.5 else '✗'}")
    print(f"  Similarity Target: > 0.85 | Actual: {tts_results['avg_speaker_sim']:.2f} {'✓' if tts_evaluated and tts_results['avg_speaker_sim'] > 0.85 else '✗'}")

## 3. Summary Report

In [None]:
# Pick up whatever the evaluation cells produced. Either result dict may be
# absent (model or manifest missing, cell not run); those rows stay 'Pending'.
_stt_summary = globals().get('stt_results')
_tts_summary = globals().get('tts_results')


def _summary_status(results, key, passes, fmt):
    """Return '<value> ✓/✗' for a measured metric, or 'Pending' if not evaluated."""
    if not results or not results.get('num_samples'):
        return 'Pending'
    value = results[key]
    return f"{value:{fmt}} {'✓' if passes(value) else '✗'}"


print("\n" + "="*60)
print(f"EVALUATION SUMMARY - Language: {LANGUAGE}")
print("="*60)

print("\nSTT Metrics:")
print("-" * 40)
print(f"{'Metric':<20} {'Target':<15} {'Status'}")
print("-" * 40)
print(f"{'WER (Clean)':<20} {'< 15%':<15} {_summary_status(_stt_summary, 'wer', lambda v: v < 0.15, '.1%')}")
# No noisy-condition evaluation is run in this notebook, so these stay pending.
print(f"{'WER (Noisy)':<20} {'< 25%':<15} {'Pending'}")
print(f"{'CER (Clean)':<20} {'< 10%':<15} {_summary_status(_stt_summary, 'cer', lambda v: v < 0.10, '.1%')}")
print(f"{'CER (Noisy)':<20} {'< 15%':<15} {'Pending'}")

print("\nTTS Metrics:")
print("-" * 40)
print(f"{'Metric':<20} {'Target':<15} {'Status'}")
print("-" * 40)
print(f"{'MOS Score':<20} {'> 3.5':<15} {_summary_status(_tts_summary, 'avg_mos', lambda v: v > 3.5, '.2f')}")
print(f"{'Speaker Similarity':<20} {'> 0.85':<15} {_summary_status(_tts_summary, 'avg_speaker_sim', lambda v: v > 0.85, '.2f')}")
print(f"{'Real-time Factor':<20} {'< 0.5':<15} {_summary_status(_tts_summary, 'avg_rtf', lambda v: v < 0.5, '.2f')}")

print("\n" + "="*60)