<a href="https://colab.research.google.com/github/shehab0911/Real-Time-AI-based-Audio-Transcription-Between-Standard-Whisper-and-Faster-Whisper-Base-Model/blob/main/Final_Whisper_and_Fast_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install faster-whisper
! pip install gradio -q
! pip install git+https://github.com/openai/whisper.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [4]:
import whisper
import gradio as gr
import time
from faster_whisper import WhisperModel as FasterWhisperModel


model_whisper = whisper.load_model("base")
model_faster_whisper = FasterWhisperModel("base")

def transcribe(audio, use_faster=False):

    model = model_faster_whisper if use_faster else model_whisper


    audio = whisper.load_audio(audio)

    start_time = time.time()

    if use_faster:

        segments, _ = model.transcribe(audio, language="en")
        transcription = " ".join([segment.text for segment in segments]).strip()
    else:

        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
        transcription = result.text.strip()

    inference_time = time.time() - start_time

    return transcription, inference_time


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 121MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

In [5]:
interface = gr.Interface(
    title='Real Time AI based Audio Transcription With Faster Whisper Base Model',
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Use Faster-Whisper")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Number(label="Inference Time (seconds)")
    ],
    live=True
)

interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7f8dc0a7de751a4feb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [7]:
import whisper
import gradio as gr
import time
from faster_whisper import WhisperModel as FasterWhisperModel
from threading import Thread

model_whisper = whisper.load_model("base")
model_faster_whisper = FasterWhisperModel("base")

def transcribe(audio, use_faster=False):
    model = model_faster_whisper if use_faster else model_whisper

    try:
        audio = whisper.load_audio(audio)
    except Exception as e:
        return f"Error loading audio: {e}", None

    start_time = time.time()

    # Handle long audio by splitting into chunks
    chunk_length = 60 * 5
    transcription_parts = []

    # Split audio into chunks
    for start in range(0, len(audio), chunk_length * whisper.SAMPLE_RATE):
        end = min(start + chunk_length * whisper.SAMPLE_RATE, len(audio))
        chunk = audio[start:end]

        if use_faster:
            segments, _ = model.transcribe(chunk, language="en")
            transcription_parts.append(" ".join([segment.text for segment in segments]).strip())
        else:
            mel = whisper.log_mel_spectrogram(chunk).to(model.device)
            options = whisper.DecodingOptions()
            result = whisper.decode(model, mel, options)
            transcription_parts.append(result.text.strip())

    transcription = " ".join(transcription_parts).strip()
    inference_time = time.time() - start_time

    return transcription, inference_time


def analyze_performance():
    audio_file = "path_to_your_audio_file"

    # Measure performance of standard Whisper model
    start_time = time.time()
    transcribe(audio_file, use_faster=False)
    whisper_time = time.time() - start_time

    # Measure performance of Faster Whisper model
    start_time = time.time()
    transcribe(audio_file, use_faster=True)
    faster_whisper_time = time.time() - start_time

    print(f"Whisper model time: {whisper_time} seconds")
    print(f"Faster Whisper model time: {faster_whisper_time} seconds")

# Gradio Interface
interface = gr.Interface(
    title='Real Time AI based Audio Transcription With Faster Whisper Base Model',
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Use Faster-Whisper")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Number(label="Inference Time (seconds)")
    ],
    live=True
)

interface.launch()

analyze_performance()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9e132b367d92e84771.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Whisper model time: 0.11597347259521484 seconds
Faster Whisper model time: 0.10576033592224121 seconds
