# State-of-the-Art Speech Recognition Model Comparison

Comparing different state-of-the-art speech recognition models with a focus on Wav2Vec 2.0 as the baseline. We'll evaluate models on different aspects:

1. Accuracy: Word Error Rate (WER) and Character Error Rate (CER)
2. Processing Speed
3. Memory Usage
4. Language Support
5. Resource Requirements

Models to compare:
1. Wav2Vec 2.0 (Base)
2. Wav2Vec 2.0 (Large)
3. Whisper (Small)
4. XLSR-53 (Multilingual)

In [None]:
import torch
import torchaudio
import librosa
import soundfile as sf
import numpy as np
import pandas as pd
from transformers import (
    Wav2Vec2ForCTC, 
    Wav2Vec2Processor,
    WhisperForConditionalGeneration,
    WhisperProcessor
)
import transformers
import time
import psutil
import matplotlib.pyplot as plt
from pathlib import Path
from jiwer import wer
import warnings
import subprocess
warnings.filterwarnings('ignore')

def check_dependencies():
    """Verify that the packages and tools this notebook relies on are available.

    Prints the detected versions of protobuf, torch, torchaudio and
    transformers, then runs ``ffmpeg -version`` and prints the first line of
    its output.

    Raises:
        ImportError: if ``google.protobuf`` or ``sentencepiece`` cannot be
            imported. Install instructions are printed before re-raising.
        Exception: whatever ``subprocess.run`` raised when FFmpeg could not be
            launched. Install instructions are printed before re-raising.
    """
    try:
        import google.protobuf
        import sentencepiece  # noqa: F401 - imported only to prove it is installed
    except ImportError as e:
        print("Missing required package:", str(e))
        print("\nTo fix, run in your terminal:")
        # The requirement is quoted so the shell does not treat ">" as a redirect.
        print('pip install "protobuf>=3.20.0" sentencepiece librosa')
        print("\nThen restart the kernel.")
        raise

    print("Required packages verified:")
    print(f"protobuf {google.protobuf.__version__}")
    print(f"torch {torch.__version__}")
    print(f"torchaudio {torchaudio.__version__}")
    print(f"transformers {transformers.__version__}")

    # Check FFmpeg
    try:
        result = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
    except Exception as e:
        print(f"FFmpeg not found: {str(e)}")
        print("\nTo fix, run in your terminal:")
        print("winget install Gyan.FFmpeg")
        raise
    else:
        if result.returncode == 0:
            # The first line of `ffmpeg -version` carries the version banner.
            ffmpeg_ver = result.stdout.split('\n')[0]
            print(f"{ffmpeg_ver}")
        else:
            print("FFmpeg check failed - make sure it's in PATH")

# Memory optimization: cap PyTorch's CPU thread pool (laptop-friendly) and
# switch the cuDNN backend off.
CPU_THREAD_LIMIT = 4
torch.set_num_threads(CPU_THREAD_LIMIT)
torch.backends.cudnn.enabled = False

def print_system_info():
    """Report the PyTorch version, the available RAM in GB and the CPU thread count."""
    bytes_per_gb = 1024 ** 3
    available_gb = psutil.virtual_memory().available / bytes_per_gb
    thread_count = torch.get_num_threads()

    print(f"PyTorch Version: {torch.__version__}")
    print(f"Available Memory: {available_gb:.1f}GB")
    print(f"CPU Threads: {thread_count}")
    
# Environment report, run once when the cell executes: system details first,
# then the dependency check (which re-raises if a package or FFmpeg is missing).
print("\nSystem Information")
print_system_info()
print("\nPackage Check")
check_dependencies()

In [None]:
# Define models to compare.
# Each row is (registry key, Hugging Face checkpoint id, loader family, label).
# NOTE(review): confirm that every checkpoint id below exists on the Hugging Face Hub.
_MODEL_ROWS = (
    ("wav2vec2-small", "facebook/wav2vec2-base-100h", "wav2vec2",
     "Smaller Wav2Vec2 (trained on 100h, faster)"),
    ("wav2vec2-base", "facebook/wav2vec2-base-960h", "wav2vec2",
     "Base Wav2Vec2 (960h training)"),
    ("whisper-small", "openai/whisper-small", "whisper",
     "OpenAI Whisper (small, multilingual)"),
    ("xlsr-english", "facebook/wav2vec2-large-xlsr-53-english", "wav2vec2",
     "XLSR-53 fine-tuned for English"),
    ("xlsr-spanish", "facebook/wav2vec2-large-xlsr-53-spanish", "wav2vec2",
     "XLSR-53 fine-tuned for Spanish"),
    ("xlsr-french", "facebook/wav2vec2-large-xlsr-53-french", "wav2vec2",
     "XLSR-53 fine-tuned for French"),
    ("xlsr-hindi", "facebook/wav2vec2-large-xlsr-53-hindi", "wav2vec2",
     "XLSR-53 fine-tuned for Hindi"),
)

# Registry key -> {"name", "type", "description"}; insertion order follows the rows above.
MODELS = {
    key: {"name": checkpoint, "type": family, "description": label}
    for key, checkpoint, family, label in _MODEL_ROWS
}

print("Available models:")
for key, entry in MODELS.items():
    print(f"{key:<15} - {entry['description']}")

class ModelWrapper:
    """Uniform load / transcribe / unload interface over the compared checkpoints.

    ``model_config`` is a dict with at least ``"name"`` (passed to
    ``from_pretrained``) and ``"type"`` (``"wav2vec2"`` or ``"whisper"``),
    which selects the processor/model classes and the decoding path.
    """

    def __init__(self, model_config):
        self.config = model_config
        # Both stay None until load() succeeds and are reset to None by unload().
        self.model = None
        self.processor = None

    def load(self):
        """Load the processor and model with ``low_cpu_mem_usage=True``.

        Returns:
            ``self``, so the call can be chained: ``ModelWrapper(cfg).load()``.

        Raises:
            ValueError: if ``config["type"]`` is neither ``"wav2vec2"`` nor
                ``"whisper"``.
            Exception: any error raised by ``from_pretrained`` is printed and
                re-raised.
        """
        checkpoint = self.config["name"]
        family = self.config["type"]
        print(f"\nLoading {checkpoint}...")
        try:
            if family == "wav2vec2":
                self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
                self.model = Wav2Vec2ForCTC.from_pretrained(
                    checkpoint,
                    low_cpu_mem_usage=True
                )
            elif family == "whisper":
                self.processor = WhisperProcessor.from_pretrained(checkpoint)
                self.model = WhisperForConditionalGeneration.from_pretrained(
                    checkpoint,
                    low_cpu_mem_usage=True
                )
            else:
                # Fail loudly instead of silently treating a typo as Whisper.
                raise ValueError(f"Unsupported model type: {family!r}")
            self.model.eval()
            print(f"Loaded {checkpoint}")
            return self
        except Exception as e:
            print(f"Error loading {checkpoint}: {str(e)}")
            raise

    def transcribe(self, audio, sr):
        """Transcribe one audio clip and return the decoded text.

        Args:
            audio: array-like waveform. Multi-dimensional input is averaged
                over axis 1 before inference (assumes a (samples, channels)
                layout, as soundfile returns - TODO confirm for other loaders).
            sr: sampling rate of ``audio``, forwarded to the processor.

        Raises:
            RuntimeError: if called before ``load()`` or after ``unload()``.
        """
        if self.model is None or self.processor is None:
            raise RuntimeError("Model is not loaded; call load() before transcribe()")

        # Down-mix for both model families so neither receives multi-channel input.
        audio = np.asarray(audio)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        if self.config["type"] == "wav2vec2":
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = self.model(inputs.input_values).logits
            # Greedy CTC decoding: most likely token per frame, collapsed by batch_decode.
            ids = torch.argmax(logits, dim=-1)
            return self.processor.batch_decode(ids)[0]

        # whisper
        inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            generated_ids = self.model.generate(inputs.input_features)
        return self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    def unload(self):
        """Drop the model and processor references and collect garbage.

        Safe to call repeatedly: the attributes are reset to ``None`` rather
        than deleted, so the instance remains in its pre-``load()`` state.
        Errors are printed, not raised.
        """
        try:
            self.model = None
            self.processor = None
            import gc
            # Collect first so freed tensors can be returned by the CUDA cache flush.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("Unloaded model and freed memory")
        except Exception as e:
            print(f"Error during unload: {str(e)}")

In [None]:
# Load test datasets
def load_test_audio(audio_path, max_duration=30, target_sr=16000):
    """Load an audio file as mono at ``target_sr`` Hz, capped at ``max_duration`` seconds.

    librosa is tried first. If it raises, the file is read with soundfile and
    then truncated, down-mixed and resampled explicitly.

    Args:
        audio_path: path of the audio file.
        max_duration: maximum number of seconds to keep (int or float), or
            ``None`` to keep the whole file.
        target_sr: sampling rate of the returned audio.

    Returns:
        ``(audio, sample_rate)`` on success, ``(None, None)`` if both loaders fail.
    """
    try:
        audio, sr = librosa.load(audio_path, sr=target_sr, duration=max_duration)
    except Exception as e:
        print(f"Librosa load failed, trying soundfile: {str(e)}")
    else:
        print(f"Loaded {audio_path} with librosa")
        return audio, sr

    # Fallback to soundfile
    try:
        audio, sr = sf.read(audio_path)

        if max_duration is not None:
            # Slice bounds must be integers; int() keeps float durations usable.
            max_samples = int(sr * max_duration)
            if len(audio) > max_samples:
                audio = audio[:max_samples]
                print(f"Truncated {audio_path} to {max_duration}s")

        # Convert to mono if stereo
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)

        # Resample if the file is not already at the target rate
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            sr = target_sr
            print(f"Resampled to {sr}Hz")

        return audio, sr
    except Exception as e:
        print(f"Error loading {audio_path}: {str(e)}")
        return None, None

# Define test datasets
TEST_FILES = {
    "english": [
        "./data/test/eng1.wav",
        "./data/test/eng2.wav"
    ],
    "multilingual": [
        "./data/test/spanish1.wav",
        "./data/test/french1.wav",
        "./data/test/hindi1.wav"
    ]
}


def load_reference_texts(ref_path):
    """Parse a tab-separated "file name, transcript" file into a dict.

    Each non-empty line must contain a tab; the text before the first tab is
    the file name and the rest is the reference transcript. Both are stripped.
    Blank lines are ignored, and lines without a tab are reported and skipped.

    Args:
        ref_path: path of the references file.

    Returns:
        Dict mapping file name to reference transcript.
    """
    references = {}
    # utf-8-sig drops a leading byte-order mark, which would otherwise end up
    # inside the first file name and make its lookup fail.
    # errors='replace' keeps loading when the file contains invalid bytes.
    with open(ref_path, encoding='utf-8-sig', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if '\t' in line:  # expect tab-separated
                fname, text = line.split('\t', 1)
                references[fname.strip()] = text.strip()
                print(f"Loaded reference for {fname}:")
                print(f"Text: \"{text.strip()}\"")
            else:
                print(f"Skipping malformed line: {line}")
    return references


print("\n=== Loading Reference Texts ===")
# Load reference texts (if available)
REFERENCE_TEXTS = {}
try:
    ref_path = Path("./data/test/references.txt")
    if ref_path.exists():
        print(f"Found references file at: {ref_path.absolute()}")
        REFERENCE_TEXTS = load_reference_texts(ref_path)

        # Verify all test files have references
        print("\nChecking reference coverage:")
        for lang, files in TEST_FILES.items():
            for fpath in files:
                fname = Path(fpath).name
                if fname in REFERENCE_TEXTS:
                    print(f"{fname} has reference text")
                else:
                    print(f"Missing reference for {fname}")
    else:
        print("No references.txt found, WER will use placeholder text")
except Exception as e:
    print(f"Error loading references: {str(e)}")

def get_reference_text(file_path):
    """Look up the reference transcript for ``file_path`` by its base name.

    Returns the transcript from ``REFERENCE_TEXTS``, or the placeholder
    "Reference text not available" (after printing a notice) when there is none.
    """
    placeholder = "Reference text not available"
    base_name = Path(file_path).name
    found = REFERENCE_TEXTS.get(base_name, placeholder)
    if found != placeholder:
        return found
    print(f"No reference text found for {base_name}")
    return found

# Reference and hypothesis go through the same jiwer pipeline before scoring,
# so differences in case, punctuation or spacing are not counted as errors.
from jiwer import wer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip

# Normalisation stages, applied in list order.
_WER_NORMALIZATION_STEPS = [
    ToLowerCase(),
    RemovePunctuation(),
    RemoveMultipleSpaces(),
    Strip(),
]

TRANSFORM_FOR_WER = Compose(_WER_NORMALIZATION_STEPS)

def _wer_rating(error_rate):
    """Map a WER value to the notebook's quality label."""
    if error_rate <= 0.1:
        return 'Excellent'
    if error_rate <= 0.3:
        return 'Good'
    if error_rate <= 0.5:
        return 'Fair'
    return 'Poor'


def _score_wer(reference_text, transcription):
    """Normalize both texts, print the comparison and return the WER.

    Returns NaN instead of raising when the WER cannot be computed.
    """
    try:
        ref_transformed = TRANSFORM_FOR_WER(reference_text)
        hyp_transformed = TRANSFORM_FOR_WER(transcription)
        if not ref_transformed:
            # WER divides by the reference word count, so there is nothing to score.
            print("Reference text is empty after normalization; WER not computed")
            return float('nan')
        error_rate = wer(ref_transformed, hyp_transformed)
    except Exception as e:
        print(f"Error computing WER: {str(e)}")
        return float('nan')

    print(f"\nReference: \"{reference_text}\"")
    print(f"Generated: \"{transcription}\"")
    print(f"Computed WER: {error_rate:.3f}")
    print(f"WER Rating: {_wer_rating(error_rate)}")
    return error_rate


def evaluate_model(model_wrapper, audio, sr, reference_text):
    """Transcribe ``audio`` and measure time, memory growth and WER.

    Args:
        model_wrapper: object exposing ``transcribe(audio, sr)``.
        audio: waveform passed through to ``transcribe``.
        sr: sampling rate passed through to ``transcribe``.
        reference_text: expected transcript. An empty value or the placeholder
            "Reference text not available" disables WER scoring.

    Returns:
        Dict with keys ``"wer"`` (NaN when not scored), ``"time"`` (seconds),
        ``"memory"`` (process RSS change in MB, may be negative),
        ``"transcription"`` and ``"reference"``; or ``None`` if transcription
        itself raised.
    """
    mem_start = psutil.Process().memory_info().rss
    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start_time = time.perf_counter()

    try:
        transcription = model_wrapper.transcribe(audio, sr)
    except Exception as e:
        print(f"Error evaluating model: {str(e)}")
        return None

    process_time = time.perf_counter() - start_time
    mem_used = (psutil.Process().memory_info().rss - mem_start) / 1024 / 1024

    # Scoring is kept apart from transcription so that a WER failure does not
    # throw away the timing and memory measurements.
    if reference_text and reference_text != "Reference text not available":
        error_rate = _score_wer(reference_text, transcription)
    else:
        error_rate = float('nan')
        print("No reference text available for WER calculation")

    return {
        "wer": error_rate,
        "time": process_time,
        "memory": mem_used,
        "transcription": transcription,
        "reference": reference_text
    }

# Report, for every configured test clip, whether it is present on disk.
print("\nChecking test files...")
for lang, files in TEST_FILES.items():
    for fpath in files:
        is_present = Path(fpath).exists()
        print(f"Found {fpath}" if is_present else f"Missing {fpath}")


## Run Model Comparison

Now let's compare the models on:
1. Short audio clips (< 30s)
2. Different languages
3. Different speech patterns (fast/slow, accents)

In [None]:
# %pip install torchcodec


In [None]:
import pandas as pd
import numpy as np

# model name -> list of per-file result dicts
results = {}

# Decoded clips keyed by path: each file is read from disk once and reused for
# every model instead of being decoded again per model.
audio_cache = {}

# NOTE(review): every model is scored on every file, so the language-specific
# XLSR checkpoints are also run on clips in other languages - confirm this is
# intended before comparing per-model WER averages.
for model_name, config in MODELS.items():
    print(f"\nEvaluating {model_name}")

    try:
        model = ModelWrapper(config).load()
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        continue

    results[model_name] = []
    processed_files = set()  # avoiding duplicates

    try:
        for lang, files in TEST_FILES.items():
            for file_path in files:
                if file_path in processed_files:
                    continue
                processed_files.add(file_path)

                try:
                    if file_path not in audio_cache:
                        audio_cache[file_path] = load_test_audio(file_path)
                    audio, sr = audio_cache[file_path]
                    if audio is None:
                        print(f"Skipping {file_path}, audio not loaded")
                        continue

                    reference_text = get_reference_text(file_path)
                    print(f"\nProcessing {file_path}")

                    result = evaluate_model(model, audio, sr, reference_text)
                    if not result:
                        continue

                    # The RSS delta can come out negative; report it as 0 instead.
                    result["memory"] = max(result.get("memory", 0), 0)

                    result["file"] = file_path
                    result["language"] = lang
                    results[model_name].append(result)

                    print(f"Time: {result['time']:.2f}s")
                    print(f"Memory: {result['memory']:.1f}MB")
                    if not np.isnan(result["wer"]):
                        print(f"WER: {result['wer']:.3f}")
                    print(f"Output: {result['transcription'][:100]}...")
                    if reference_text != "Reference text not available":
                        print(f"Expected: {reference_text[:100]}...")

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
    finally:
        # Runs even if the evaluation is interrupted, so the model is always
        # released before the next one is loaded.
        print(f"\nUnloading {model_name}...")
        try:
            model.unload()
            print(f"Unloaded {model_name} and freed memory")
        except Exception as e:
            print(f"Error unloading {model_name}: {e}")


# Convert results to DataFrame safely.
# The column list is fixed up front so that an empty result set still yields a
# frame with every expected column; later lookups such as df["wer"] then work.
RESULT_COLUMNS = ["model", "language", "wer", "time", "memory", "file"]

df_results = [
    {
        "model": model_name,
        "language": r.get("language", "unknown"),
        "wer": r.get("wer", np.nan),
        "time": r.get("time", np.nan),
        "memory": r.get("memory", np.nan),
        "file": r.get("file", "unknown")
    }
    for model_name, model_results in results.items()
    for r in model_results
]

df = pd.DataFrame(df_results, columns=RESULT_COLUMNS)

print("\nResults DataFrame Preview")
print(df.head())

# Per-model mean and standard deviation of each metric; skipped when there are no rows.
if df.empty:
    print("No results to summarize — check if evaluation produced any output.")
else:
    print("\nResults Summary")
    spread = ["mean", "std"]
    summary = (
        df.groupby("model")
        .agg({metric: spread for metric in ("wer", "time", "memory")})
        .round(3)
    )

    print("\nMetrics by model (mean ± std):")
    for model, row in summary.iterrows():
        wer_mean, wer_std = row[("wer", "mean")], row[("wer", "std")]
        time_mean, time_std = row[("time", "mean")], row[("time", "std")]
        mem_mean, mem_std = row[("memory", "mean")], row[("memory", "std")]

        print(f"\n{model}:")
        # The WER line is left out for models that have no scored files.
        if not np.isnan(wer_mean):
            print(f"  • WER: {wer_mean:.3f} ± {wer_std:.3f}")
        print(f"  • Time: {time_mean:.2f}s ± {time_std:.2f}s")
        print(f"  • Memory: {mem_mean:.1f}MB ± {mem_std:.1f}MB")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# First, verify we have data to plot
print("DataFrame Info:")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line.
df.info()
print("\nDataFrame Head:")
print(df.head())

def _draw_box(ax, frame, group_col, value_col, title, ylabel):
    """Draw one box plot of ``value_col`` grouped by ``group_col`` on ``ax``."""
    sns.boxplot(x=group_col, y=value_col, data=frame, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)


fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle("Speech Recognition Model Comparison", fontsize=14, y=1.02)

# Both WER panels are dropped when no file could be scored.
has_wer = not df["wer"].isna().all()

# WER by model
if has_wer:
    _draw_box(axes[0,0], df, "model", "wer", "Word Error Rate by Model", "WER")
else:
    print("No WER values available for plotting")
    axes[0,0].remove()

# Processing time by model
_draw_box(axes[0,1], df, "model", "time", "Processing Time by Model", "Time (seconds)")

# Memory usage by model
_draw_box(axes[1,0], df, "model", "memory", "Memory Usage by Model", "Memory (MB)")

# WER by language
if has_wer:
    _draw_box(axes[1,1], df, "language", "wer", "WER by Language", "WER")
else:
    axes[1,1].remove()

plt.tight_layout()
plt.show()

# Mean and standard deviation of each metric per (model, language) pair, plus
# the number of scored files behind each WER figure.
print("\nDetailed Statistics")
stats = (
    df.groupby(["model", "language"])
    .agg({
        "wer": ["mean", "std", "count"],
        "time": ["mean", "std"],
        "memory": ["mean", "std"],
    })
    .round(3)
)

print("\nDetailed results by model and language:")
for (model, lang), row in stats.iterrows():
    wer_mean = row[("wer", "mean")]
    wer_std = row[("wer", "std")]
    time_mean, time_std = row[("time", "mean")], row[("time", "std")]
    mem_mean, mem_std = row[("memory", "mean")], row[("memory", "std")]

    print(f"\n{model} - {lang}:")
    # The WER line is left out for groups that have no scored files.
    if not np.isnan(wer_mean):
        scored = int(row[("wer", "count")])
        print(f"  • WER: {wer_mean:.3f} ± {wer_std:.3f} (n={scored})")
    print(f"  • Time: {time_mean:.2f}s ± {time_std:.2f}s")
    print(f"  • Memory: {mem_mean:.1f}MB ± {mem_std:.1f}MB")

# Write the results to CSV, then read the file back to confirm the row count.
results_file = Path("./data/test/results.csv")
try:
    # Make sure the target directory exists before writing.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving results to {results_file.absolute()}")
    df.to_csv(results_file, index=False)

    # Round-trip check: the file must contain as many rows as the frame.
    df_verify = pd.read_csv(results_file)
    if len(df_verify) != len(df):
        print(f"Data verification failed: Saved {len(df_verify)} rows but expected {len(df)}")
    else:
        print(f"Successfully saved {len(df)} rows of data")
        print(f"Columns saved: {', '.join(df_verify.columns)}")

except Exception as e:
    # Print the paths involved to help diagnose a failed save.
    print(f"Error saving results: {str(e)}")
    print(f"Current working directory: {Path.cwd()}")
    print(f"Target save path: {results_file.absolute()}")

In [None]:
import pandas as pd
import numpy as np

# Reload the results written by the previous cell.
df = pd.read_csv("./data/test/results.csv")

# The WER table is only built when at least one file was scored.
has_wer = not df['wer'].isnull().all()
if has_wer:
    # Mean WER per language (rows) and model (columns).
    wer_table = df.pivot_table(
        index="language",
        columns="model",
        values="wer",
        aggfunc="mean"
    ).round(3)

    print("=== WER Table ===")
    print(wer_table)
    for guide_line in (
        "\nWER Score Guide:",
        "0.0-0.1:   Excellent (near human-level)",
        "0.1-0.3:   Good (usable for most purposes)",
        "0.3-0.5:   Fair (may need manual correction)",
        ">0.5:      Poor (significant errors)",
    ):
        print(guide_line)
else:
    print("⚠️ No WER values found. Make sure reference texts are available for comparison.")

def _mean_by_language_and_model(frame, metric, decimals):
    """Pivot ``metric`` to its mean per language (rows) and model (columns), rounded."""
    pivoted = frame.pivot_table(
        index="language",
        columns="model",
        values=metric,
        aggfunc="mean"
    )
    return pivoted.round(decimals)


# Mean processing time, rounded to 2 decimals
time_table = _mean_by_language_and_model(df, "time", 2)

print("\nProcessing Time Table (s)")
print(time_table)

# Mean memory usage, rounded to 1 decimal
memory_table = _mean_by_language_and_model(df, "memory", 1)

print("\nMemory Usage Table (MB)")
print(memory_table)

def _describe_wer(values):
    """Format a model's mean WER together with its quality label.

    Returns "n/a (no reference text)" when the mean is NaN. Every threshold
    comparison against NaN is False, so a model without scored files would
    otherwise be labelled "Poor".
    """
    mean_wer = values.mean()
    if pd.isna(mean_wer):
        return "n/a (no reference text)"
    if mean_wer <= 0.1:
        label = 'Excellent'
    elif mean_wer <= 0.3:
        label = 'Good'
    elif mean_wer <= 0.5:
        label = 'Fair'
    else:
        label = 'Poor'
    return f"{mean_wer:.3f} ({label})"


# Add summary statistics with WER quality indicators.
# Each aggregator returns a ready-to-print string, so no rounding is applied afterwards.
print("\nSummary by Model")
model_stats = df.groupby('model').agg({
    'wer': _describe_wer,
    'time': lambda x: f"{x.mean():.2f}s",
    'memory': lambda x: f"{x.mean():.1f}MB"
})

print("\nModel Performance (WER | Processing Time | Memory):")
for model in model_stats.index:
    wer_stat = model_stats.loc[model, 'wer']
    time_stat = model_stats.loc[model, 'time']
    mem_stat = model_stats.loc[model, 'memory']
    print(f"{model}:")
    print(f"WER: {wer_stat}")
    print(f"Time: {time_stat}")
    print(f"Memory: {mem_stat}")

# Understanding WER (Word Error Rate)

WER measures the minimum number of word-level edits (substitutions, deletions and insertions) needed to turn the transcribed text into the reference text, divided by the number of words in the reference.

WER = (Substitutions + Deletions + Insertions) / Number of Reference Words

- **WER = 0.0**: Perfect match (best possible score)
- **WER = 0.2**: 20% error rate (good for conversational speech)
- **WER = 0.5**: 50% error rate (poor performance)
- **WER > 1.0**: Very poor performance (more errors than reference words)

Lower WER is always better. State-of-the-art models typically achieve:
- English: 0.02 - 0.15 (2-15% WER)
- Other languages: 0.05 - 0.30 (5-30% WER)
- Noisy/accented speech: Up to 0.50 (50% WER)

In [None]:
# Test WER calculation with sample files
print("Testing WER Calculation")

# Test with a single file first
test_file = "./data/test/eng1.wav"
audio, sr = load_test_audio(test_file)
if audio is not None:
    reference = get_reference_text(test_file)
    # Keep a handle on the wrapper so the model can be released afterwards
    # instead of staying in memory for the rest of the session.
    wrapper = ModelWrapper(MODELS['whisper-small']).load()
    try:
        result = evaluate_model(wrapper, audio, sr, reference)
    finally:
        wrapper.unload()
    print("\nDetailed WER Analysis:")
    if result and not np.isnan(result['wer']):
        print(f"Final WER: {result['wer']:.3f}")

In [None]:
# results = []

# for model_name, config in MODELS.items():
#     wrapper = ModelWrapper(config).load()
#     for lang, files in TEST_FILES.items():
#         for file_path in files:
#             audio, sr = load_test_audio(file_path)
#             if audio is None:
#                 continue
#             reference = get_reference_text(file_path)
#             metrics = evaluate_model(wrapper, audio, sr, reference)
#             if metrics:
#                 metrics.update({
#                     "model": model_name,
#                     "language": lang,
#                     "file": Path(file_path).name
#                 })
#                 results.append(metrics)
#     wrapper.unload()

# df_results = pd.DataFrame(results)
# df_results.to_csv("results_summary.csv", index=False)
# df_results.groupby("model")[["wer", "time", "memory"]].mean()
