# Task 1: Speech-to-Text Benchmarking using Word Error Rate (WER)

This notebook benchmarks three leading Speech-to-Text engines:
1. OpenAI Whisper  
2. faster-whisper  
3. Vosk  

We evaluate them using **Word Error Rate (WER)** on a small test set of audio files.

## What is Word Error Rate (WER)?

WER is a standard metric for evaluating speech recognition systems.

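$$\text{WER} = \frac{S + D + I}{N}$$

where $S$, $D$, and $I$ are the numbers of substituted, deleted, and inserted words in the predicted transcript, and $N$ is the number of words in the reference transcript.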

Lower WER = better performance.

In [None]:
%pip install openai-whisper faster-whisper vosk jiwer soundfile librosa

## Dataset

We use 5 short audio samples (25 - 40 sec) with known ground-truth transcripts.
Each audio file has a corresponding `.txt` file containing the correct transcription.

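Directory structure (each `.wav` file sits next to a `.txt` transcript with the same base name):

```
/dataset
├── audio-1.wav
├── audio-1.txt
├── audio-2.wav
├── audio-2.txt
├── audio-3.wav
├── audio-3.txt
├── audio-4.wav
├── audio-4.txt
├── audio-5.wav
└── audio-5.txt
```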

## Cell 2: Imports

In [None]:
import os
import whisper
from faster_whisper import WhisperModel
from vosk import Model, KaldiRecognizer
import json
import wave
import soundfile as sf
from jiwer import wer

## Cell 3: Loading Test Audio

In [None]:
# Folder scanned for .wav clips and their same-named .txt reference transcripts.
DATASET_PATH = "/dataset"

def load_test_data(dataset_path):
    """Collect (audio_path, reference_transcript) pairs from a dataset folder.

    Every ``.wav`` file directly inside ``dataset_path`` is paired with the
    ``.txt`` file that shares its base name. Audio files without a matching
    transcript are skipped.

    Args:
        dataset_path: Directory containing the ``.wav`` / ``.txt`` pairs.

    Returns:
        A list of ``(audio_path, reference)`` tuples sorted by file name,
        where ``reference`` is the transcript text with surrounding
        whitespace stripped.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # evaluation order (and the per-file log) identical on every run.
    for file in sorted(os.listdir(dataset_path)):
        if not file.endswith(".wav"):
            continue
        audio_path = os.path.join(dataset_path, file)
        # Swap only the extension. str.replace() would also rewrite ".wav"
        # appearing elsewhere in the path, e.g. inside a directory name.
        txt_path = os.path.splitext(audio_path)[0] + ".txt"
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8") as f:
                reference = f.read().strip()
            data.append((audio_path, reference))
    return data

# Build the (audio_path, reference) list that every evaluation below iterates over.
test_data = load_test_data(DATASET_PATH)

# A count of 0 means no .wav file with a matching .txt was found under DATASET_PATH.
print(f"Loaded {len(test_data)} audio samples")

## Model 1: OpenAI Whisper
High accuracy multilingual transformer-based model.

In [None]:
# Load the "base" Whisper checkpoint once; the same instance is passed to evaluate_whisper below.
whisper_model = whisper.load_model("base")

## Cell 4: Whisper Transcription + WER

In [None]:
def evaluate_whisper(model, test_data):
    """Transcribe every sample with OpenAI Whisper and return the mean WER.

    Reference and prediction are lower-cased before scoring, and the WER of
    each file is printed as it is computed.

    NOTE(review): only letter case is normalised here. If the references and
    the model output differ in punctuation, that presumably counts toward the
    WER -- confirm against jiwer's default transform.

    Args:
        model: Loaded Whisper model exposing ``transcribe(path)`` that returns
            a dict with a ``"text"`` entry.
        test_data: Sequence of ``(audio_path, reference)`` pairs.

    Returns:
        The arithmetic mean of the per-file WER values.

    Raises:
        ValueError: If ``test_data`` is empty, so no average can be computed.
    """
    # Fail with a clear message instead of a ZeroDivisionError at the end.
    if not test_data:
        raise ValueError("test_data is empty: no audio samples to evaluate")

    wers = []
    for audio_path, reference in test_data:
        result = model.transcribe(audio_path)
        prediction = result["text"].strip()
        error = wer(reference.lower(), prediction.lower())
        wers.append(error)
        print(f"Whisper WER for {os.path.basename(audio_path)}: {error}")
    return sum(wers) / len(wers)

# Mean WER over all loaded samples; reused in the final comparison table.
whisper_avg_wer = evaluate_whisper(whisper_model, test_data)
print(f"\nAverage Whisper WER: {whisper_avg_wer}")

## Model 2: faster-whisper
Optimized implementation of Whisper using CTranslate2 for faster inference.

In [None]:
# Same "base" model size as the OpenAI Whisper run above, here on CPU with int8 compute.
faster_model = WhisperModel("base", device="cpu", compute_type="int8")

## Cell 5: Faster-whisper Transcription + WER

In [None]:
def evaluate_faster_whisper(model, test_data):
    """Transcribe every sample with faster-whisper and return the mean WER.

    The text of all returned segments is joined with spaces to form the
    prediction. Reference and prediction are lower-cased before scoring, and
    the WER of each file is printed as it is computed.

    NOTE(review): only letter case is normalised here. If the references and
    the model output differ in punctuation, that presumably counts toward the
    WER -- confirm against jiwer's default transform.

    Args:
        model: Loaded faster-whisper model whose ``transcribe(path)`` returns
            ``(segments, info)``.
        test_data: Sequence of ``(audio_path, reference)`` pairs.

    Returns:
        The arithmetic mean of the per-file WER values.

    Raises:
        ValueError: If ``test_data`` is empty, so no average can be computed.
    """
    # Fail with a clear message instead of a ZeroDivisionError at the end.
    if not test_data:
        raise ValueError("test_data is empty: no audio samples to evaluate")

    wers = []
    for audio_path, reference in test_data:
        # The second return value (transcription info) is not needed for scoring.
        segments, _ = model.transcribe(audio_path)
        prediction = " ".join(segment.text for segment in segments).strip()
        error = wer(reference.lower(), prediction.lower())
        wers.append(error)
        print(f"faster-whisper WER for {os.path.basename(audio_path)}: {error}")
    return sum(wers) / len(wers)

# Mean WER over all loaded samples; reused in the final comparison table.
faster_whisper_avg_wer = evaluate_faster_whisper(faster_model, test_data)
print(f"\nAverage faster-whisper WER: {faster_whisper_avg_wer}")

## Model 3: Vosk
Lightweight offline speech recognition engine.

In [None]:
!wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip vosk-model-small-en-us-0.15.zip

In [None]:
# Load the Vosk model from the folder extracted by the download cell above.
vosk_model = Model("vosk-model-small-en-us-0.15")

## Cell 6: Vosk Transcription

In [None]:
def transcribe_vosk(audio_path, model):
    """Transcribe a WAV file with Vosk and return the recognised text.

    The file is streamed to the recognizer in blocks of 4000 frames. The text
    of every completed utterance, plus the final flush, is joined with single
    spaces.

    Args:
        audio_path: Path to a mono, 16-bit, uncompressed PCM WAV file.
        model: Loaded Vosk model handed to ``KaldiRecognizer``.

    Returns:
        The recognised text, or ``""`` when nothing was recognised.

    Raises:
        ValueError: If the file is not mono 16-bit uncompressed PCM. Feeding
            any other layout to the recognizer would yield a meaningless
            transcript (and WER) without any error.
    """
    # The context manager closes the file even when recognition raises.
    with wave.open(audio_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise ValueError(
                f"{audio_path}: Vosk needs mono 16-bit PCM WAV input "
                f"(got {wf.getnchannels()} channel(s), "
                f"{8 * wf.getsampwidth()}-bit, compression {wf.getcomptype()!r})"
            )

        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        pieces = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # AcceptWaveform() returning true marks a finished utterance whose
            # text can be collected from Result().
            if rec.AcceptWaveform(data):
                pieces.append(json.loads(rec.Result()).get("text", ""))

        # Flush whatever is still buffered after the last block.
        pieces.append(json.loads(rec.FinalResult()).get("text", ""))

    # Drop empty results so they do not leave repeated spaces in the output.
    return " ".join(piece for piece in pieces if piece).strip()

## Cell 7: Vosk Transcription + WER

In [None]:
def evaluate_vosk(model, test_data):
    """Transcribe every sample with Vosk and return the mean WER.

    Reference and prediction are lower-cased before scoring, and the WER of
    each file is printed as it is computed.

    Args:
        model: Loaded Vosk model, forwarded to ``transcribe_vosk``.
        test_data: Sequence of ``(audio_path, reference)`` pairs.

    Returns:
        The arithmetic mean of the per-file WER values.

    Raises:
        ValueError: If ``test_data`` is empty, so no average can be computed.
    """
    # Fail with a clear message instead of a ZeroDivisionError at the end.
    if not test_data:
        raise ValueError("test_data is empty: no audio samples to evaluate")

    wers = []
    for audio_path, reference in test_data:
        prediction = transcribe_vosk(audio_path, model)
        error = wer(reference.lower(), prediction.lower())
        wers.append(error)
        print(f"Vosk WER for {os.path.basename(audio_path)}: {error}")
    return sum(wers) / len(wers)

# Mean WER over all loaded samples; reused in the final comparison table.
vosk_avg_wer = evaluate_vosk(vosk_model, test_data)
print(f"\nAverage Vosk WER: {vosk_avg_wer}")

## Final WER Comparison
Lower is better.

In [None]:
import pandas as pd

# Column-oriented summary: one entry per engine, in the order they were evaluated.
results = dict(
    [
        ("Model", ["Whisper", "faster-whisper", "Vosk"]),
        ("Average WER", [whisper_avg_wer, faster_whisper_avg_wer, vosk_avg_wer]),
    ]
)

# The bare expression on the last line renders the table as the cell output.
df = pd.DataFrame(data=results)
df

# Task 2: Transcription using faster-whisper 

### Cell 1: Install Dependencies

In [None]:
%pip install faster-whisper soundfile

### Cell 2: Importing Required Libraries

In [None]:
import soundfile as sf
import math
from faster_whisper import WhisperModel

### Cell 3: Loading Podcast Audio File 

Here we load the MP3 audio file and convert it to a mono, 16 kHz WAV file so that it can be transcribed reliably.

In [None]:
# -y overwrites an existing output file, -ac 1 down-mixes to mono, -ar 16000 resamples to 16 kHz.
!ffmpeg -y -i /podcast-audio/807931c237e75122fd4f0bb4ec9f7d1b.mp3 -ac 1 -ar 16000 clean_audio.wav

### Cell 4: Loading the WAV file and checking its properties

In [None]:
import soundfile as sf

# Sanity-check the converted file before transcribing it.
audio, sr = sf.read("clean_audio.wav")
print("Sample rate:", sr)
print("Shape:", audio.shape)
# Duration = length along the first axis divided by the sample rate.
print("Duration (sec):", len(audio) / sr)

### Cell 5: Splitting Audio into 45-Second Chunks

In [None]:
import numpy as np
import math
import soundfile as sf

def split_audio_correct(audio_path, chunk_duration=45):
    """Read an audio file and cut it into consecutive fixed-length mono chunks.

    Args:
        audio_path: Path of the audio file, read with ``soundfile``.
        chunk_duration: Chunk length in seconds; must be positive. The last
            chunk holds whatever remains and may be shorter.

    Returns:
        A ``(chunks, sr)`` tuple: ``chunks`` is a list of float32 arrays in
        playback order and ``sr`` is the sample rate reported by
        ``soundfile``.

    Raises:
        ValueError: If ``chunk_duration`` is not positive, or is shorter than
            one sample at the file's sample rate.
    """
    # Reject bad durations before touching the file: zero would make range()
    # fail with an unrelated message, and a negative value would silently
    # produce no chunks at all.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    audio, sr = sf.read(audio_path)

    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # average the channels to force mono

    samples_per_chunk = int(chunk_duration * sr)
    if samples_per_chunk < 1:
        raise ValueError(
            f"chunk_duration={chunk_duration!r} is shorter than one sample at {sr} Hz"
        )

    # range() never yields a start index at or beyond len(audio), so every
    # slice holds at least one sample and no empty-chunk check is needed.
    chunks = [
        audio[start:start + samples_per_chunk].astype(np.float32)
        for start in range(0, len(audio), samples_per_chunk)
    ]

    print(f"Total chunks created: {len(chunks)}")
    return chunks, sr

### Cell 6: Loading faster-whisper Model

In [None]:
from faster_whisper import WhisperModel

# "base" model on CPU with int8 compute; this instance is passed to the long-audio transcription below.
model = WhisperModel(
    "base",
    device="cpu",
    compute_type="int8"
)

### Cell 7: Timeline formatting

In [None]:
def format_timestamp(seconds):
    """Render a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated rather than rounded, and the hour field
    is not wrapped at 24.
    """
    # Split off the whole hours first; the remainder is the position within the hour.
    whole_hours, within_hour = divmod(seconds, 3600)
    hrs = int(whole_hours)
    mins = int(within_hour // 60)
    secs = int(seconds % 60)
    return f"{hrs:02d}:{mins:02d}:{secs:02d}"

### Cell 8: Transcribing Long Audio using Chunking

In [None]:
def transcribe_long_audio_with_timestamps(model, audio_path, chunk_duration=45):
    """Transcribe a long recording chunk by chunk, printing progress as it goes.

    The audio is split with ``split_audio_correct``. Each chunk is transcribed
    on its own and printed under a ``[start - end]`` header, and the combined
    transcript is printed once all chunks are done.

    Args:
        model: faster-whisper model whose ``transcribe`` returns
            ``(segments, info)``.
        audio_path: Path of the audio file to transcribe.
        chunk_duration: Length of each chunk in seconds.

    Returns:
        A ``(full_text, final_transcript)`` tuple: the list of per-chunk
        texts, and those texts joined with single spaces.
    """
    chunks, sr = split_audio_correct(audio_path, chunk_duration)

    # Decoding options are the same for every chunk, so they are defined once.
    decode_options = {
        "language": "en",
        "beam_size": 5,
        "temperature": 0.0,
        "vad_filter": True,
        "vad_parameters": {"min_silence_duration_ms": 500},
    }

    print("\n================= STARTING TRANSCRIPTION =================\n")

    full_text = []
    for idx, chunk in enumerate(chunks):
        # Chunk boundaries expressed on the timeline of the whole recording.
        start_time = idx * chunk_duration
        end_time = start_time + (len(chunk) / sr)
        start_ts = format_timestamp(start_time)
        end_ts = format_timestamp(end_time)

        print(f"\n[{start_ts} - {end_ts}]")
        print("-" * 60)

        segments, _info = model.transcribe(chunk, **decode_options)

        # Segment texts are concatenated with a trailing space each, then trimmed.
        chunk_text = "".join(segment.text + " " for segment in segments).strip()

        print(chunk_text)
        full_text.append(chunk_text)

    print("\n================= TRANSCRIPTION COMPLETE =================\n")

    final_transcript = " ".join(full_text)

    print("\n============= FULL TRANSCRIPT (COMBINED) =============\n")
    print(final_transcript)

    return full_text, final_transcript

### Cell 9: Running Transcription on Podcast Audio

In [None]:
# Returns the per-chunk texts plus the combined transcript; both are written to disk in the next cell.
chunk_texts, final_transcript = transcribe_long_audio_with_timestamps(
    model,
    "clean_audio.wav",
    chunk_duration=45
)

### Cell 10: Saving Final Transcript

In [None]:
import os

OUTPUT_PATH = "/output/podcast_transcript_with_timestamps.txt"
# Seconds per chunk; must equal the chunk_duration passed to
# transcribe_long_audio_with_timestamps when chunk_texts was produced.
CHUNK_DURATION = 45

# Real length of the recording, from the `audio` / `sr` values loaded when the
# converted WAV file was inspected. It caps the end timestamp of the last
# (usually shorter) chunk.
total_duration = len(audio) / sr

# open() does not create missing parent folders.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for idx, text in enumerate(chunk_texts):
        start_time = idx * CHUNK_DURATION
        # Previously the end was always start + 45 s, which overshoots the end
        # of the audio on the final chunk and disagrees with the printed timeline.
        end_time = min(start_time + CHUNK_DURATION, total_duration)

        start_ts = format_timestamp(start_time)
        end_ts = format_timestamp(end_time)

        f.write(f"[{start_ts} - {end_ts}]\n")
        f.write(text + "\n\n")

    f.write("\n================ FULL TRANSCRIPT ================\n\n")
    f.write(final_transcript)

print("✅ Transcript saved to:", OUTPUT_PATH)