# Wav2Vn Model Evaluation - Google Colab Standalone

**IMPORTANT NOTE**: Wav2Vn model is not publicly available. This notebook uses **mock transcription** for demonstration purposes.

## Purpose:
- Demonstrates the evaluation pipeline
- Creates placeholder results for cross-model comparison
- Can be updated when Wav2Vn becomes available

## Features:
- Standalone execution
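- Mock transcription with deterministic results (the output sentence is chosen by hashing the audio file path, so the same path always yields the same text)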
- Full dataset loading (respects existing splits)
- Exports CSV for notebook 05

**Runtime**: CPU is sufficient (no real model inference)

In [None]:
print('[SETUP] Installing packages...')
!pip install -q datasets soundfile jiwer torch torchcodec torchaudio librosa soundfile jiwer datasets accelerate pandas matplotlib seaborn scipy tqdm
print('[OK] Packages installed!')

print('\n[WARNING] This notebook uses MOCK TRANSCRIPTION')
print('[WARNING] Wav2Vn model is not publicly available')

In [None]:
# Embedded code (same as other notebooks)
import time, warnings, hashlib
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

from jiwer import wer, cer, mer, wil, wip, process_words
from datasets import load_dataset, Audio
import soundfile as sf
import tempfile

class ASRMetrics:
    """Corpus-level ASR error metrics computed with ``jiwer``."""

    @staticmethod
    def calculate_all_metrics(references: List[str], hypotheses: List[str]) -> Dict:
        """Compute word-, character- and sentence-level error metrics.

        All utterances are joined with single spaces and scored as one long
        string, so the word/character figures are corpus-level rather than
        an average of per-utterance scores.

        Args:
            references: Ground-truth transcripts, one per utterance.
            hypotheses: Model outputs, aligned index-by-index with
                ``references``.

        Returns:
            Dict with keys ``wer``, ``cer``, ``mer``, ``wil``, ``wip``,
            ``ser`` (fraction of utterances whose hypothesis is not exactly
            equal to its reference), ``insertions``, ``deletions`` and
            ``substitutions``.

        Raises:
            ValueError: If ``references`` is empty or the two lists have
                different lengths.
        """
        if not references:
            raise ValueError("references must contain at least one transcript")
        if len(references) != len(hypotheses):
            # zip() would silently drop the unmatched tail and skew SER.
            raise ValueError(
                f"references ({len(references)}) and hypotheses "
                f"({len(hypotheses)}) must have the same length"
            )

        ref_text = ' '.join(references)
        hyp_text = ' '.join(hypotheses)

        # A single alignment provides every word-level measure; calling
        # wer/mer/wil/wip separately would repeat the same alignment.
        output = process_words(ref_text, hyp_text)
        wrong_sentences = sum(1 for r, h in zip(references, hypotheses) if r != h)

        return {
            'wer': output.wer, 'cer': cer(ref_text, hyp_text),
            'mer': output.mer, 'wil': output.wil,
            'wip': output.wip,
            'ser': wrong_sentences / len(references),
            'insertions': output.insertions, 'deletions': output.deletions,
            'substitutions': output.substitutions
        }

class RTFTimer:
    """Context manager that records how long its ``with`` body takes.

    ``elapsed_time`` is ``None`` until the block exits; afterwards it holds
    the elapsed time in seconds. Exceptions raised inside the block are not
    suppressed.
    """

    def __init__(self):
        self.elapsed_time = None
        self.start = None

    def __enter__(self):
        # perf_counter is monotonic and high-resolution, so very short calls
        # are measured reliably and system clock adjustments cannot produce
        # negative or inflated durations. Its value is only meaningful as a
        # difference, not as a wall-clock timestamp.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.elapsed_time = time.perf_counter() - self.start

@dataclass
class AudioSample:
    """A single utterance: the location of its audio plus its reference text."""

    # Filesystem path of the audio file for this utterance.
    audio_path: str
    # Reference (ground-truth) transcript.
    transcription: str
    # Clip length in seconds; defaults to 0.0 when not supplied.
    duration: float = 0.0
    # Sampling rate in Hz.
    sample_rate: int = 16000
    # Name of the dataset the sample came from.
    dataset: str = ""
    # Name of the split the sample was read from.
    split: str = ""

def load_huggingface_dataset(dataset_name: str, max_samples: int = None) -> Dict:
    """Load a supported dataset and write its audio out as WAV files.

    Every configured split is loaded, resampled to 16 kHz, and each clip is
    written to ``<system temp dir>/asr_audio/<dataset_name>/<split>_<idx>.wav``.
    Loading is best-effort: a split that cannot be loaded, or a sample that
    cannot be decoded or written, is reported with a ``[WARNING]`` line and
    skipped instead of aborting the whole run.

    Args:
        dataset_name: One of ``'ViMD'``, ``'BUD500'``, ``'LSVSC'``,
            ``'VLSP2020'`` or ``'VietMed'``.
        max_samples: Optional cap applied to each split independently;
            ``None`` (or 0) loads the full split.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'``, each a list of
        ``AudioSample``. ``VLSP2020`` only ships a train split, so its
        samples are re-partitioned 70/15/15 with a fixed seed.

    Raises:
        KeyError: If ``dataset_name`` is not a supported dataset.
    """
    configs = {
        'ViMD': {'id': 'nguyendv02/ViMD_Dataset', 'splits': ['train', 'test', 'valid'],
                 'audio_col': 'audio', 'text_col': 'text'},
        'BUD500': {'id': 'linhtran92/viet_bud500', 'splits': ['train', 'validation', 'test'],
                   'audio_col': 'audio', 'text_col': 'transcription'},
        'LSVSC': {'id': 'doof-ferb/LSVSC', 'splits': ['train', 'validation', 'test'],
                  'audio_col': 'audio', 'text_col': 'transcription'},
        'VLSP2020': {'id': 'doof-ferb/vlsp2020_vinai_100h', 'splits': ['train'],
                     'audio_col': 'audio', 'text_col': 'transcription'},
        'VietMed': {'id': 'leduckhai/VietMed', 'splits': ['train', 'test', 'dev'],
                    'audio_col': 'audio', 'text_col': 'text'}
    }
    if dataset_name not in configs:
        raise KeyError(
            f"Unknown dataset {dataset_name!r}; expected one of {sorted(configs)}"
        )
    config = configs[dataset_name]
    audio_col, text_col = config['audio_col'], config['text_col']

    # Maps each split name used by the source datasets to its canonical bucket.
    split_aliases = {
        'train': 'train', 'training': 'train',
        'val': 'val', 'validation': 'val', 'dev': 'val', 'valid': 'val',
        'test': 'test', 'testing': 'test',
    }

    samples_by_split = {'train': [], 'val': [], 'test': []}
    temp_dir = Path(tempfile.gettempdir()) / 'asr_audio' / dataset_name
    temp_dir.mkdir(parents=True, exist_ok=True)
    all_samples = []

    for split in config['splits']:
        try:
            # NOTE(review): trust_remote_code=True allows the dataset
            # repository's own loading script to run; confirm each repository
            # listed above is trusted.
            dataset = load_dataset(config['id'], split=split, trust_remote_code=True)
            if audio_col in dataset.column_names:
                dataset = dataset.cast_column(audio_col, Audio(sampling_rate=16000))
            if max_samples and len(dataset) > max_samples:
                dataset = dataset.select(range(max_samples))

            samples = []
            skipped = 0
            for idx, item in enumerate(tqdm(dataset, desc=f"{split}", leave=False)):
                try:
                    audio_data = item[audio_col]
                    audio_path = str(temp_dir / f"{split}_{idx}.wav")
                    sf.write(audio_path, audio_data['array'], audio_data['sampling_rate'])
                    samples.append(AudioSample(
                        audio_path=audio_path,
                        transcription=str(item[text_col]).strip().lower(),
                        duration=len(audio_data['array']) / audio_data['sampling_rate'],
                        dataset=dataset_name, split=split
                    ))
                except Exception:
                    # One bad clip should not discard the rest of the split.
                    skipped += 1
            if skipped:
                print(f"[WARNING] {dataset_name}/{split}: skipped {skipped} unreadable sample(s)")
        except Exception as exc:
            # Best-effort: report the failure and move on to the next split.
            # Exception (not a bare except) lets Ctrl-C interrupt the load.
            print(f"[WARNING] {dataset_name}/{split}: could not load split ({exc})")
            continue

        bucket = split_aliases.get(split)
        if bucket is not None:
            samples_by_split[bucket].extend(samples)
        all_samples.extend(samples)

    if dataset_name == 'VLSP2020' and all_samples:
        # A private RandomState seeded with 42 yields the same permutation as
        # seeding the global generator, without altering global RNG state.
        rng = np.random.RandomState(42)
        indices = rng.permutation(len(all_samples))
        train_end, val_end = int(len(all_samples) * 0.7), int(len(all_samples) * 0.85)
        samples_by_split['train'] = [all_samples[i] for i in indices[:train_end]]
        samples_by_split['val'] = [all_samples[i] for i in indices[train_end:val_end]]
        samples_by_split['test'] = [all_samples[i] for i in indices[val_end:]]

    return samples_by_split

class Wav2VnModel:
    """Stand-in for the unreleased Wav2Vn model; it returns canned sentences."""

    def __init__(self):
        # Fixed pool of sentences that transcribe() picks from.
        self.mock_texts = [
            "xin chào tôi là người việt nam",
            "hôm nay thời tiết đẹp",
            "tôi yêu tiếng việt",
            "chúng tôi đang học máy học",
            "đây là bài kiểm tra"
        ]

    def load_model(self):
        """Print a warning that mock transcription is in use; loads nothing."""
        print("[WARNING] Using mock transcription (Wav2Vn not available)")

    def transcribe(self, audio_path: str) -> str:
        """Return a canned sentence selected from the MD5 of ``audio_path``.

        The audio file is never read: only the path string is hashed, so the
        same path always maps to the same sentence.
        """
        digest = hashlib.md5(audio_path.encode()).digest()
        # Big-endian interpretation of the digest bytes equals the integer
        # value of the hex digest.
        slot = int.from_bytes(digest, 'big') % len(self.mock_texts)
        return self.mock_texts[slot]

# Confirms the helper cell finished running; the model defined in this cell is the mock.
print('[OK] Helper functions loaded (with mock Wav2Vn model)')

In [None]:
from datetime import datetime
from pathlib import Path

# Datasets to evaluate; each name must be one that load_huggingface_dataset supports.
DATASETS_TO_TEST = ['ViMD']  # Add more as needed
# Cap applied to every split of every dataset.
MAX_SAMPLES_PER_SPLIT = None  # None = full dataset
TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")

# /content is Colab's working directory. Outside Colab it usually does not
# exist, so fall back to the current directory to keep the CSV export working.
_output_dir = Path("/content") if Path("/content").is_dir() else Path.cwd()
OUTPUT_CSV = str(_output_dir / f"wav2vn_results_{TIMESTAMP}.csv")

print('[CONFIG] Using MOCK transcription for Wav2Vn')
print(f'[CONFIG] Datasets: {DATASETS_TO_TEST}')
print(f'[CONFIG] Output: {OUTPUT_CSV}')

In [None]:
# Fetch every configured dataset; a failure is reported and the loop moves on.
datasets_loaded = {}
for dataset_name in DATASETS_TO_TEST:
    try:
        splits = load_huggingface_dataset(dataset_name, MAX_SAMPLES_PER_SPLIT)
        datasets_loaded[dataset_name] = splits
        # Per-split sample counts for the status line.
        n_train, n_val, n_test = (len(splits[key]) for key in ('train', 'val', 'test'))
        print(f"[OK] {dataset_name}: train={n_train}, val={n_val}, test={n_test}")
    except Exception as err:
        print(f"[ERROR] {dataset_name}: {err}")

In [None]:
# Score the mock model on the test split of every loaded dataset.
results = []
model = Wav2VnModel()
model.load_model()
metrics_calc = ASRMetrics()

for dataset_name, splits in datasets_loaded.items():
    test_samples = splits['test']
    if not test_samples:
        # Nothing to evaluate for this dataset.
        continue

    print(f"\n[INFO] Evaluating on {dataset_name} ({len(test_samples)} samples)")

    # One (reference, hypothesis, audio seconds, processing seconds) tuple per sample.
    records = []
    for sample in tqdm(test_samples, desc=dataset_name):
        with RTFTimer() as stopwatch:
            predicted = model.transcribe(sample.audio_path)
        records.append(
            (sample.transcription, predicted, sample.duration, stopwatch.elapsed_time)
        )

    if not records:
        continue
    refs, hyps, durations, times = (list(column) for column in zip(*records))

    metrics = metrics_calc.calculate_all_metrics(refs, hyps)
    audio_total = sum(durations)
    compute_total = sum(times)
    # Real-time factor: processing time per second of audio (0 if no audio).
    rtf = compute_total / audio_total if audio_total > 0 else 0

    # Key insertion order defines the column order of the exported CSV.
    row = {
        'model': 'wav2vn-mock',
        'dataset': dataset_name,
        'samples_processed': len(refs),
    }
    for key in ('wer', 'cer', 'mer', 'wil', 'wip', 'ser'):
        row[key.upper()] = metrics[key]
    row['RTF'] = rtf
    for key in ('insertions', 'deletions', 'substitutions'):
        row[key] = metrics[key]
    row['total_audio_duration'] = audio_total
    row['total_processing_time'] = compute_total
    results.append(row)

    print(f"  [OK] WER: {metrics['wer']:.4f} (MOCK RESULTS)")

print("\n[WARNING] Results are from MOCK transcription")

In [None]:
# Save results
results_df = pd.DataFrame(results)

if results_df.empty:
    # With no evaluated dataset the frame has no columns, so selecting
    # 'model', 'WER', ... below would raise KeyError. Report and stop here.
    print("[WARNING] No results to save (no test samples were evaluated)")
else:
    print(results_df[['model', 'dataset', 'WER', 'CER', 'RTF']].to_string(index=False))

    results_df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n[OK] Results saved: {OUTPUT_CSV}")
    print("[WARNING] These are MOCK results for demonstration only")

    try:
        from google.colab import files
    except ImportError:
        # Not running on Colab: the CSV simply stays on disk.
        print("[INFO] File saved locally")
    else:
        try:
            files.download(OUTPUT_CSV)
        except Exception as exc:
            # The browser download is a convenience; the file is already written.
            print(f"[INFO] File saved locally (download not started: {exc})")

## Summary

**IMPORTANT**: This notebook uses mock transcription because Wav2Vn is not publicly available.

### To use real Wav2Vn:
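1. Obtain the model from the authors
2. Update the `Wav2VnModel` class with real model loading and inference (`load_model` and `transcribe`)
3. Change the `'wav2vn-mock'` model label in the evaluation cell so real results are not mistaken for mock ones
4. Re-run this notebook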

### Current status:
- CSV file exported for cross-model comparison
- Results marked as 'wav2vn-mock'
- Can be filtered out or replaced later

---
**Vietnamese ASR Evaluation Framework - Wav2Vn Mock Edition**