In [1]:
!pip install faster-whisper gradio torchaudio soundfile resampy

Collecting faster-whisper
  Downloading faster_whisper-1.2.0-py3-none-any.whl.metadata (16 kB)
Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.2.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━

In [6]:
import gradio as gr
import numpy as np
import torch
import time
import soundfile as sf
import resampy
import subprocess
from faster_whisper import WhisperModel

# Choose the execution device once; half precision is only requested when CUDA is present.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "float32"

# Checkpoint name handed to faster-whisper; the model object is shared by the callback below.
model_size = "large-v2"
model = WhisperModel(model_size, device=device, compute_type=compute_type)

def vad_transcribe(audio_file, threshold = 0.5, min_speech_ms = 250, max_speech_s = 20, min_silence_ms = 200):
    """Transcribe an audio file with the module-level faster-whisper model and its VAD filter.

    Args:
        audio_file: Path of the audio file to read, or None when nothing was uploaded.
        threshold: Value forwarded as the VAD ``threshold`` option.
        min_speech_ms: Value forwarded as ``min_speech_duration_ms``.
        max_speech_s: Value forwarded as ``max_speech_duration_s``.
        min_silence_ms: Value forwarded as ``min_silence_duration_ms``.

    Returns:
        A 2-tuple ``(transcript, latency)`` matching the two Gradio outputs bound to
        this callback. ``transcript`` has one ``[start - end]: text`` line per segment
        and ``latency`` is a string such as ``"1.23 seconds"``. On failure the tuple is
        ``("[Error] <message>", None)``.
    """
    try:
        if audio_file is None:
            # Must return exactly as many values as the click handler has outputs.
            return "[Error] No file provided.", None

        audio_data, sr = sf.read(audio_file)
        if audio_data.ndim > 1:
            # Down-mix multi-channel audio to mono by averaging the channels.
            audio_data = np.mean(audio_data, axis=1)
        if sr != 16000:
            audio_data = resampy.resample(audio_data, sr, 16000)
        audio_16k = audio_data.astype(np.float32)

        start_time = time.perf_counter()

        segments, _ = model.transcribe(
            audio_16k,
            vad_filter=True,
            vad_parameters={
                "threshold": threshold,
                "min_speech_duration_ms": min_speech_ms,
                "max_speech_duration_s": max_speech_s,
                "min_silence_duration_ms": min_silence_ms
            }
        )

        # `segments` is a lazy generator: decoding only happens while it is iterated,
        # so the timer must stop *after* the segments have been consumed.
        lines = [
            f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text.strip()}"
            for segment in segments
        ]
        latency = round(time.perf_counter() - start_time, 2)

        if device == "cuda":
            # Prints GPU utilisation to the server's stdout (not shown in the UI).
            subprocess.run(["nvidia-smi"])

        return "\n".join(lines).strip(), f"{latency} seconds"

    except Exception as e:
        return f"[Error] {str(e)}", None

# Gradio UI for the VAD transcription demo. The VAD controls below are passed
# positionally to vad_transcribe(audio_file, threshold, min_speech_ms, max_speech_s, min_silence_ms).
with gr.Blocks() as demo:
    gr.Markdown("## 🎤 Voice Activity Detection + Transcription using Faster-Whisper + Gradio")
    gr.Markdown("Upload an audio file and apply configurable Voice Activity Detection.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio File", type="filepath")
            # Defaults mirror the defaults declared on vad_transcribe.
            threshold_input = gr.Slider(minimum=0.05, maximum=0.95, value=0.5, step=0.05, label="VAD Threshold")
            min_speech_input = gr.Number(value=250, precision=0, label="Min Speech Duration (ms)")
            max_speech_input = gr.Number(value=20, label="Max Speech Duration (s)")
            min_silence_input = gr.Number(value=200, precision=0, label="Min Silence Duration (ms)")
            transcribe_btn = gr.Button("Transcribe with VAD")

    with gr.Row():
        transcript_output = gr.Textbox(label="📋 Transcription with Timestamps", lines=12)

    with gr.Row():
        latency_output = gr.Textbox(label="⏱️ Latency")

    transcribe_btn.click(
        fn=vad_transcribe,
        inputs=[audio_input, threshold_input, min_speech_input, max_speech_input, min_silence_input],
        outputs=[transcript_output, latency_output]
    )

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f7b82c46f5259acb3d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import os
import time
import requests
import torch
import subprocess
import gradio as gr

# 🔐 AssemblyAI API key: read from the environment so the secret never lives in the notebook.
# NOTE(review): a literal key used to be committed on this line; treat it as leaked and revoke it.
ASSEMBLYAI_API_KEY = os.environ.get("ASSEMBLYAI_API_KEY", "")
if not ASSEMBLYAI_API_KEY:
    print("[WARN] ASSEMBLYAI_API_KEY is not set; export it before launching the app.")

# 🔁 Upload file to AssemblyAI
def upload_to_assemblyai(file_path):
    """Upload a local audio file to AssemblyAI and return the URL it was stored under.

    Args:
        file_path: Path of the audio file to upload.

    Returns:
        The ``upload_url`` value from the JSON response.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the upload endpoint answers with a 4xx/5xx status.
    """
    headers = {'authorization': ASSEMBLYAI_API_KEY}
    with open(file_path, 'rb') as f:
        # The upload endpoint takes the raw file bytes as the request body, so the
        # file is streamed with `data=` instead of being wrapped in a multipart form.
        response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f,
            timeout=300,
        )
    # Surface auth/quota failures as an HTTP error rather than a KeyError below.
    response.raise_for_status()
    return response.json()['upload_url']

# 🧠 Transcribe and diarize
def transcribe_and_diarize(audio_path):
    """Upload audio to AssemblyAI, wait for the diarized transcript, and format it.

    Args:
        audio_path: Local path of the audio file; falsy when nothing was uploaded.

    Returns:
        A 4-tuple ``(speaker_transcript, full_text, latency, device_info)``.
        On any failure the first element is an ``"[ERROR] ..."`` message and the
        other three are empty strings.
    """
    max_wait_s = 3600  # stop polling after an hour instead of looping forever
    try:
        if not audio_path:
            return "[ERROR] No audio provided.", "", "", ""

        start_time = time.time()

        # Upload
        upload_url = upload_to_assemblyai(audio_path)

        # Request transcription with speaker diarization
        transcript_request = {
            "audio_url": upload_url,
            "speaker_labels": True,
        }

        headers = {
            "authorization": ASSEMBLYAI_API_KEY,
            "content-type": "application/json"
        }

        response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            json=transcript_request,
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            return f"[ERROR] Transcript request failed: {response.text}", "", "", ""

        transcript_id = response.json()['id']

        # Polling for completion
        polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
        while True:
            polling_response = requests.get(polling_url, headers=headers, timeout=30)
            if polling_response.status_code != 200:
                return f"[ERROR] Polling failed: {polling_response.text}", "", "", ""
            results = polling_response.json()
            if results['status'] == 'completed':
                break
            if results['status'] == 'error':
                raise Exception(results['error'])
            if time.time() - start_time > max_wait_s:
                raise TimeoutError(f"Transcription did not finish within {max_wait_s} seconds")
            time.sleep(2)

        full_text = results['text'] or ""
        # `utterances` may be missing or null; treat that as "no utterances".
        utterances = results.get('utterances') or []

        # Format speaker-labeled transcript. Timestamps arrive in milliseconds, so
        # use true division: floor division would discard the fractional seconds.
        speaker_transcript = "\n".join(
            f"[{utt['start'] / 1000:.2f}s - {utt['end'] / 1000:.2f}s] Speaker {utt['speaker']} : {utt['text']}"
            for utt in utterances
        )

        latency = f"{round(time.time() - start_time, 2)} sec"
        device_info = f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"

        # Optional GPU usage info (printed to the server's stdout, not the UI)
        if torch.cuda.is_available():
            subprocess.run(["nvidia-smi"])

        return speaker_transcript, full_text, latency, device_info

    except Exception as e:
        return f"[ERROR] {str(e)}", "", "", ""

# 🎛️ Gradio Interface
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("## 🎙️ Speaker Diarization using Whisper (AssemblyAI Backend)")
    gr.Markdown("Upload audio file, transcribe and identify speakers using AssemblyAI.")

    # Input widgets.
    audio_input = gr.Audio(label="🎧 Upload Audio File", type="filepath")
    run_button = gr.Button(value="Run")

    # Result widgets, created in the same order the callback returns its values.
    diarized_output = gr.Textbox(lines=10, label="🗣️ Speaker-Labeled Transcript")
    full_transcript = gr.Textbox(lines=6, label="📝 Full Transcript")
    latency_info = gr.Textbox(label="⏱️ Latency")
    model_info = gr.Textbox(label="⚙️ Device Info")

    # Clicking "Run" sends the uploaded file path to the callback and fans the
    # four returned values out to the four textboxes.
    run_button.click(
        transcribe_and_diarize,
        [audio_input],
        [diarized_output, full_transcript, latency_info, model_info],
    )

# Start serving the app.
demo.launch()


* Running on local URL:  http://127.0.0.1:7865
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://3140e57b45a1f5a64a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Fri Jun  6 10:13:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
import os
import time
import requests
import torch
import gradio as gr

# Holds the AssemblyAI key once the user has submitted one through the UI.
ASSEMBLYAI_API_KEY = None

def save_api_key(user_key):
    """Store the submitted AssemblyAI key in the module-level global.

    Args:
        user_key: Text typed into the API-key box; may be None or blank.

    Returns:
        Two ``gr.update`` objects: the first sets the status message, the second
        sets the visibility flag (False for a blank key, True once a key is stored).

    NOTE(review): the key lives in one process-wide global, so every visitor of a
    shared link would appear to reuse the most recently submitted key; confirm
    whether per-session state is wanted.
    """
    global ASSEMBLYAI_API_KEY
    cleaned_key = user_key.strip() if user_key else ""
    if cleaned_key == "":
        return gr.update(value="❌ Please enter a valid API key."), gr.update(visible=False)
    ASSEMBLYAI_API_KEY = cleaned_key
    return gr.update(value="✅ API key saved successfully! Upload audio to continue."), gr.update(visible=True)

# Upload audio to AssemblyAI
def upload_to_assemblyai(file_path):
    """Upload a local audio file to AssemblyAI and return the URL it was stored under.

    Args:
        file_path: Path of the audio file to upload.

    Returns:
        The ``upload_url`` value from the JSON response.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the upload endpoint answers with a 4xx/5xx status.
    """
    headers = {'authorization': ASSEMBLYAI_API_KEY}
    with open(file_path, 'rb') as f:
        # The upload endpoint takes the raw file bytes as the request body, so the
        # file is streamed with `data=` instead of being wrapped in a multipart form.
        response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f,
            timeout=300,
        )
    # Surface auth/quota failures as an HTTP error rather than a KeyError below.
    response.raise_for_status()
    return response.json()['upload_url']

# Transcription with diarization
def transcribe_and_diarize(audio_path):
    """Upload audio to AssemblyAI, wait for the diarized transcript, and format it.

    Args:
        audio_path: Local path of the audio file; falsy when nothing was uploaded.

    Returns:
        A 5-tuple ``(speaker_transcript, full_text, latency, device, quality)``.
        On any failure the first element is an ``"[ERROR] ..."`` message and the
        other four are empty strings.
    """
    max_wait_s = 3600  # stop polling after an hour instead of looping forever
    try:
        if not ASSEMBLYAI_API_KEY:
            return "[ERROR] API key not set.", "", "", "", ""

        if not audio_path:
            return "[ERROR] No audio file provided.", "", "", "", ""

        start_time = time.time()
        upload_url = upload_to_assemblyai(audio_path)

        headers = {
            "authorization": ASSEMBLYAI_API_KEY,
            "content-type": "application/json"
        }

        request_data = {
            "audio_url": upload_url,
            "speaker_labels": True
        }

        response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            json=request_data,
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            return f"[ERROR] Transcript request failed: {response.text}", "", "", "", ""

        transcript_id = response.json()['id']
        polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"

        while True:
            polling_response = requests.get(polling_url, headers=headers, timeout=30)
            if polling_response.status_code != 200:
                return f"[ERROR] Polling failed: {polling_response.text}", "", "", "", ""
            results = polling_response.json()
            if results['status'] == 'completed':
                break
            if results['status'] == 'error':
                raise Exception(results['error'])
            if time.time() - start_time > max_wait_s:
                raise TimeoutError(f"Transcription did not finish within {max_wait_s} seconds")
            time.sleep(2)

        full_text = results['text'] or ""
        # `utterances` may be missing or null; treat that as "no utterances".
        utterances = results.get('utterances') or []

        # Timestamps arrive in milliseconds, so use true division: floor division
        # would discard the fractional seconds before the `.2f` formatting.
        speaker_transcript = "\n".join(
            f"[{utt['start'] / 1000:.2f}s - {utt['end'] / 1000:.2f}s] Speaker {utt['speaker']} : {utt['text']}"
            for utt in utterances
        )

        latency = f"{round(time.time() - start_time, 2)} sec"
        device = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
        # Static note shown in the UI; it is not derived from the API response.
        quality = "✅ Diarization Quality: Good\n- Multi-speaker support\n- Word-level accuracy"

        return speaker_transcript, full_text, latency, device, quality

    except Exception as e:
        return f"[ERROR] {str(e)}", "", "", "", ""

# Interface
with gr.Blocks() as demo:
    gr.Markdown("## 🔐 AssemblyAI Speaker Diarization App")

    # Step 1: key entry. The status box reports whether the key was accepted.
    with gr.Row():
        api_key_input = gr.Textbox(type="password", label="Enter AssemblyAI API Key")
        submit_key_btn = gr.Button(value="Submit API Key")
    key_status = gr.Textbox(interactive=False, label="Status")

    # Step 2: the transcription UI, created hidden; save_api_key's second return
    # value toggles this column's visibility.
    with gr.Column(visible=False) as main_app:
        with gr.Row():
            audio_input = gr.Audio(label="🎧 Upload Audio File", type="filepath")
            run_button = gr.Button(value="Transcribe + Diarize")

        with gr.Row():
            with gr.Column(scale=2):
                diarized_output = gr.Textbox(lines=12, label="🗣️ Speaker-Labeled Transcript")
            with gr.Column(scale=2):
                full_transcript = gr.Textbox(lines=12, label="📄 Full Transcript")
            with gr.Column(scale=1):
                latency_info = gr.Textbox(interactive=False, label="⏱️ Latency")
                model_info = gr.Textbox(interactive=False, label="🖥️ GPU Info")
                quality_info = gr.Textbox(interactive=False, label="📌 Quality Notes")

    # Submitting the key updates the status text and the main column.
    submit_key_btn.click(save_api_key, [api_key_input], [key_status, main_app])

    # Running transcription fills the five result boxes in callback-return order.
    run_button.click(
        transcribe_and_diarize,
        [audio_input],
        [diarized_output, full_transcript, latency_info, model_info, quality_info],
    )

# Start serving the app.
demo.launch()


* Running on local URL:  http://127.0.0.1:7864
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://ddebe5fae88ba32946.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install -U openai-whisper gradio

Collecting gradio
  Downloading gradio-5.33.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Co

In [None]:
import os
import time
import requests
import torch
import gradio as gr
import whisper

# AssemblyAI key; stays None until save_api_key stores one.
ASSEMBLYAI_API_KEY = None

# Load the local Whisper "small" checkpoint, on the GPU when CUDA is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
whisper_model = whisper.load_model("small", device=device)

# Save API Key
def save_api_key(user_key):
    """Store the submitted AssemblyAI key in the module-level global.

    Args:
        user_key: Text typed into the API-key box; may be None or blank.

    Returns:
        Two ``gr.update`` objects: the first sets the status message, the second
        sets the visibility flag (False for a blank key, True once a key is stored).

    NOTE(review): the key lives in one process-wide global, so every visitor of a
    shared link would appear to reuse the most recently submitted key; confirm
    whether per-session state is wanted.
    """
    global ASSEMBLYAI_API_KEY
    cleaned_key = user_key.strip() if user_key else ""
    if cleaned_key == "":
        return gr.update(value="❌ Please enter a valid API key."), gr.update(visible=False)
    ASSEMBLYAI_API_KEY = cleaned_key
    return gr.update(value="✅ API key saved. Now upload your audio."), gr.update(visible=True)

# Upload audio to AssemblyAI
def upload_to_assemblyai(file_path):
    """Upload a local audio file to AssemblyAI and return the URL it was stored under.

    Args:
        file_path: Path of the audio file to upload.

    Returns:
        The ``upload_url`` value from the JSON response.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the upload endpoint answers with a 4xx/5xx status.
    """
    headers = {'authorization': ASSEMBLYAI_API_KEY}
    with open(file_path, 'rb') as f:
        # The upload endpoint takes the raw file bytes as the request body, so the
        # file is streamed with `data=` instead of being wrapped in a multipart form.
        response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f,
            timeout=300,
        )
    # Surface auth/quota failures as an HTTP error rather than a KeyError below.
    response.raise_for_status()
    return response.json()['upload_url']

# Transcription + Diarization
def transcribe_and_diarize(audio_path, speakers_expected=2):
    """Transcribe locally with Whisper and fetch a diarized transcript from AssemblyAI.

    Args:
        audio_path: Local path of the audio file; falsy when nothing was uploaded.
        speakers_expected: Number of speakers sent to AssemblyAI in the
            ``speakers_expected`` request field. Defaults to 2.

    Returns:
        A 5-tuple ``(speaker_output, whisper_text, latency, device_info, quality_info)``.
        ``speaker_output`` comes from AssemblyAI utterances and ``whisper_text`` from
        the local Whisper model. On any failure the first element is an
        ``"[ERROR] ..."`` message and the other four are empty strings.
    """
    max_wait_s = 3600  # stop polling after an hour instead of looping forever
    try:
        if not ASSEMBLYAI_API_KEY:
            return "[ERROR] API key not set.", "", "", "", ""

        if not audio_path:
            return "[ERROR] No audio provided.", "", "", "", ""

        start_time = time.time()

        # Transcribe using Whisper locally
        whisper_result = whisper_model.transcribe(audio_path, verbose=False)
        whisper_text = whisper_result['text']

        # Upload to AssemblyAI
        upload_url = upload_to_assemblyai(audio_path)

        headers = {
            "authorization": ASSEMBLYAI_API_KEY,
            "content-type": "application/json"
        }

        request_data = {
            "audio_url": upload_url,
            "speaker_labels": True,
            # AssemblyAI's speaker-count hint is named `speakers_expected`;
            # the previous `speaker_count` key is not a documented request field.
            "speakers_expected": speakers_expected,
        }

        response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            json=request_data,
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            return f"[ERROR] Transcript request failed: {response.text}", "", "", "", ""

        transcript_id = response.json()['id']
        polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"

        while True:
            polling_response = requests.get(polling_url, headers=headers, timeout=30)
            if polling_response.status_code != 200:
                return f"[ERROR] Polling failed: {polling_response.text}", "", "", "", ""
            results = polling_response.json()
            if results['status'] == 'completed':
                break
            if results['status'] == 'error':
                raise Exception(results['error'])
            if time.time() - start_time > max_wait_s:
                raise TimeoutError(f"Transcription did not finish within {max_wait_s} seconds")
            time.sleep(2)

        # `utterances` may be missing or null; treat that as "no utterances".
        utterances = results.get('utterances') or []

        # Timestamps arrive in milliseconds, so use true division: floor division
        # would discard the fractional seconds before the `.2f` formatting.
        speaker_output = "\n".join(
            f"[{utt['start'] / 1000:.2f}s - {utt['end'] / 1000:.2f}s] Speaker {utt['speaker']} : {utt['text']}"
            for utt in utterances
        )

        latency = f"{round(time.time() - start_time, 2)} sec"
        device_info = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
        # Static note shown in the UI; it is not derived from the API response.
        quality_info = "✅ Diarization Quality: Good\n- Powered by AssemblyAI\n- Whisper transcription also included"

        return speaker_output, whisper_text, latency, device_info, quality_info

    except Exception as e:
        return f"[ERROR] {str(e)}", "", "", "", ""

# Gradio Interface
with gr.Blocks() as demo:
    # The header names the checkpoint this cell actually loads ("small"),
    # rather than advertising a larger model than the one producing the transcript.
    gr.Markdown("## 🧠 Whisper (small) + 🗣️ AssemblyAI Speaker Diarization")
    gr.Markdown("Enter API key, upload audio, and view diarized output using AssemblyAI and Whisper transcription.")

    # Step 1: key entry. The status box reports whether the key was accepted.
    with gr.Row():
        api_key_input = gr.Textbox(label="🔐 Enter AssemblyAI API Key", type="password")
        submit_key_btn = gr.Button("✅ Submit API Key")
    key_status = gr.Textbox(label="Status", interactive=False)

    # Step 2: the transcription UI, created hidden; save_api_key's second return
    # value toggles this column's visibility.
    with gr.Column(visible=False) as main_app:
        audio_input = gr.Audio(type="filepath", label="🎧 Upload Audio File")
        run_button = gr.Button("🚀 Transcribe + Diarize")

        with gr.Row():
            diarized_output = gr.Textbox(label="🗣️ Speaker-Labeled Transcript (AssemblyAI)", lines=12)
            full_transcript = gr.Textbox(label="📝 Whisper Full Transcript (Local)", lines=12)

        with gr.Row():
            latency_info = gr.Textbox(label="⏱️ Latency", interactive=False)
            model_info = gr.Textbox(label="🖥️ GPU Info", interactive=False)
            quality_info = gr.Textbox(label="📌 Quality Notes", interactive=False)

    # Save API key
    submit_key_btn.click(
        fn=save_api_key,
        inputs=[api_key_input],
        outputs=[key_status, main_app]
    )

    # Run transcription + diarization
    run_button.click(
        fn=transcribe_and_diarize,
        inputs=[audio_input],
        outputs=[diarized_output, full_transcript, latency_info, model_info, quality_info]
    )

demo.launch()


* Running on local URL:  http://127.0.0.1:7861
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://141249567aa2272e05.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Detected language: English


100%|██████████| 6566/6566 [00:03<00:00, 1990.13frames/s]
