In [1]:
!pip install faster-whisper gradio torchaudio soundfile resampy

Collecting faster-whisper
  Downloading faster_whisper-1.2.0-py3-none-any.whl.metadata (16 kB)
Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.2.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━

In [None]:
import gradio as gr
import numpy as np
import torch
import time
import soundfile as sf
import resampy
import subprocess
from faster_whisper import WhisperModel

# Choose the inference device and a matching numeric precision:
# half precision on a CUDA GPU, full precision on CPU.
if torch.cuda.is_available():
    device = "cuda"
    compute_type = "float16"
else:
    device = "cpu"
    compute_type = "float32"

# Whisper checkpoint to load; the model is created once at module level and
# reused by every transcription request.
model_size = "large-v2"
model = WhisperModel(model_size, device=device, compute_type=compute_type)

def vad_transcribe(audio_file, threshold = 0.5, min_speech_ms = 250, max_speech_s = 20, min_silence_ms = 200):
    """Transcribe an audio file with the module-level Whisper model and VAD filtering.

    The audio is read with ``soundfile``, averaged down to one channel if it
    has several, resampled to 16 kHz float32 and passed to
    ``model.transcribe`` with ``vad_filter=True``.

    Args:
        audio_file: Path of the audio file to transcribe, or ``None``.
        threshold: Forwarded as the VAD ``threshold`` option.
        min_speech_ms: Forwarded as ``min_speech_duration_ms``.
        max_speech_s: Forwarded as ``max_speech_duration_s``.
        min_silence_ms: Forwarded as ``min_silence_duration_ms``.

    Returns:
        A 2-tuple ``(transcript, latency)``. ``transcript`` has one
        ``[start - end]: text`` line per segment and ``latency`` is a string
        such as ``"1.23 seconds"``. On any failure, including a missing file,
        the tuple is ``("[Error] ...", None)``. Every path returns exactly two
        values, one for each output component bound to this callback.
    """
    try:
        if audio_file is None:
            return "[Error] No file provided.", None

        audio_data, sr = sf.read(audio_file)
        if audio_data.ndim > 1:
            # Multi-channel input: average the channels down to mono.
            audio_data = np.mean(audio_data, axis=1)
        audio_16k = resampy.resample(audio_data, sr, 16000).astype(np.float32)

        start_time = time.perf_counter()

        segments, _ = model.transcribe(
            audio_16k,
            vad_filter=True,
            vad_parameters={
                "threshold": threshold,
                "min_speech_duration_ms": min_speech_ms,
                "max_speech_duration_s": max_speech_s,
                "min_silence_duration_ms": min_silence_ms
            }
        )

        # `segments` is a lazy generator: decoding only happens while it is
        # iterated, so the loop must sit inside the timed region for the
        # latency figure to include the transcription itself.
        lines = [
            f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text.strip()}"
            for segment in segments
        ]

        latency = round(time.perf_counter() - start_time, 2)

        if device == "cuda":
            # Diagnostic only: prints the GPU status table to the notebook's stdout.
            subprocess.run(["nvidia-smi"])

        return (
            "\n".join(lines).strip(),
            f"{latency} seconds"
        )

    except Exception as e:
        # Report the failure in the transcript box instead of raising into the UI.
        return f"[Error] {str(e)}", None

# Gradio front end: an audio upload plus the four VAD settings on the input
# side, and the transcript and latency on the output side. The slider defaults
# mirror the default arguments of `vad_transcribe`.
with gr.Blocks() as demo:
    gr.Markdown("## 🎤 Voice Activity Detection + Transcription using Faster-Whisper + Gradio")
    gr.Markdown("Upload an audio file and apply configurable Voice Activity Detection.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio File", type="filepath")
            transcribe_btn = gr.Button("Transcribe with VAD")
        with gr.Column():
            # These controls make the VAD configurable, as the description promises.
            threshold_input = gr.Slider(minimum=0.05, maximum=0.95, value=0.5, step=0.05, label="VAD threshold")
            min_speech_input = gr.Slider(minimum=0, maximum=2000, value=250, step=10, label="Min speech duration (ms)")
            max_speech_input = gr.Slider(minimum=1, maximum=60, value=20, step=1, label="Max speech duration (s)")
            min_silence_input = gr.Slider(minimum=0, maximum=2000, value=200, step=10, label="Min silence duration (ms)")

    with gr.Row():
        transcript_output = gr.Textbox(label="📋 Transcription with Timestamps", lines=12)

    with gr.Row():
        latency_output = gr.Textbox(label="⏱️ Latency")

    # Inputs are passed positionally, so their order must match the
    # parameter order of `vad_transcribe`.
    transcribe_btn.click(
        fn=vad_transcribe,
        inputs=[audio_input, threshold_input, min_speech_input, max_speech_input, min_silence_input],
        outputs=[transcript_output, latency_output]
    )

demo.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7683747339404ec3dc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


