<a href="https://colab.research.google.com/github/sam-trg/BirlaTestII/blob/main/whisper_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📦 Install dependencies
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q faster-whisper
!apt-get -y install ffmpeg

# ⚙️ Setup paths
input_ogg = "files/test.ogg"
output_wav = "files/test_16k.wav"
output_mp3 = "files/test.mp3"

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.1 MB/s[0m eta [36m0:

In [2]:
# 🔧 Audio conversion functions
import time
import subprocess

def convert_ogg_to_wav(input_ogg, output_wav, sample_rate=16000):
    start = time.time()
    subprocess.run([
        "ffmpeg", "-i", input_ogg,
        "-ar", str(sample_rate),
        "-ac", "1",
        "-y", output_wav
    ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return time.time() - start

def convert_ogg_to_mp3(input_ogg, output_mp3):
    start = time.time()
    subprocess.run(["ffmpeg", "-i", input_ogg, "-acodec", "libmp3lame", "-q:a", "4", output_mp3], check=True)
    return time.time() - start

In [3]:
# 🔤 Transcription functions
import whisper
from faster_whisper import WhisperModel

def transcribe_with_whisper(input_audio, model_type="base"):
    model = whisper.load_model(model_type)
    start = time.time()
    result = model.transcribe(input_audio)
    return time.time() - start, result["text"]

def transcribe_with_faster_whisper(input_audio, model_type="base", device="cpu"):
    model = WhisperModel(model_type, device=device, compute_type="int8")
    start = time.time()
    segments, _ = model.transcribe(input_audio, beam_size=5)
    text = " ".join([segment.text for segment in segments])
    return time.time() - start, text

In [11]:
# 🧪 Benchmark 1: Direct OGG input
print("=== Direct OGG Input ===")
time_vanilla, text_vanilla = transcribe_with_whisper(input_ogg)
print(f"Vanilla Whisper: {time_vanilla:.2f}s\n{text_vanilla}\n")

time_faster, text_faster = transcribe_with_faster_whisper(input_ogg)
print(f"Faster Whisper: {time_faster:.2f}s\n{text_faster}")

=== Direct OGG Input ===




Vanilla Whisper: 6.27s
 مجھے دلی میں دکٹرس دکھایا

Faster Whisper: 5.20s
 مجھے دلی میں دکٹرس دیکھایا


In [12]:
# 📊 Compare result from Benchmark 1
similarity = 100 * sum(c1 == c2 for c1, c2 in zip(text_vanilla, text_faster)) / max(len(text_vanilla), len(text_faster))
print(f"Text similarity: {similarity:.1f}%")

Text similarity: 77.8%


In [13]:
# 🧪 Benchmark 2: Convert to MP3 first
convert_time = convert_ogg_to_mp3(input_ogg, output_mp3)
print(f"OGG → MP3: {convert_time:.2f}s")

time_vanilla, text_vanilla = transcribe_with_whisper(output_mp3)
print(f"Vanilla Whisper: {time_vanilla:.2f}s\n{text_vanilla}\n")

time_faster, text_faster = transcribe_with_faster_whisper(output_mp3)
print(f"Faster Whisper: {time_faster:.2f}s\n{text_faster}")

OGG → MP3: 0.20s




Vanilla Whisper: 6.14s
 مجھے دلی میں دکٹرس دکھایا

Faster Whisper: 4.98s
 مجھے دلی میں دکٹرس دیکھایا


In [14]:
# 📊 Compare result from Benchmark 2
similarity = 100 * sum(c1 == c2 for c1, c2 in zip(text_vanilla, text_faster)) / max(len(text_vanilla), len(text_faster))
print(f"Text similarity: {similarity:.1f}%")

Text similarity: 77.8%


In [15]:
# 🧪 Benchmark 3: Convert to 16kHz WAV
convert_time = convert_ogg_to_wav(input_ogg, output_wav)
print(f"OGG → 16kHz WAV: {convert_time:.2f}s")

time_vanilla, text_vanilla = transcribe_with_whisper(output_wav)
print(f"Vanilla Whisper: {time_vanilla:.2f}s\n{text_vanilla}\n")

time_faster, text_faster = transcribe_with_faster_whisper(output_wav)
print(f"Faster Whisper: {time_faster:.2f}s\n{text_faster}")

OGG → 16kHz WAV: 0.12s




Vanilla Whisper: 6.65s
 مجھے دلی میں دکٹرس دکھایا

Faster Whisper: 5.04s
 مجھے دلی میں دکٹرس دیکھایا


In [16]:
# 📊 Compare result from Benchmark 3
similarity = 100 * sum(c1 == c2 for c1, c2 in zip(text_vanilla, text_faster)) / max(len(text_vanilla), len(text_faster))
print(f"Text similarity: {similarity:.1f}%")

Text similarity: 77.8%
