In [1]:
# Cell 1: Import dependencies and load the Whisper model
import whisper
import numpy as np
import sounddevice as sd
from IPython.display import display, clear_output
import ipywidgets as widgets
import threading
import queue
import time

# Load the Whisper checkpoint used for transcription.
# The plain "base" checkpoint is multilingual (not limited to EN/ES/FR);
# the ".en" variants are English-only. The weights are downloaded on first use.
WHISPER_MODEL_NAME = "base"

print("Loading Whisper model...")
whisper_model = whisper.load_model(WHISPER_MODEL_NAME)
print("✅ Whisper model loaded!")


Loading Whisper model...


100%|███████████████████████████████████████| 139M/139M [00:59<00:00, 2.44MiB/s]


✅ Whisper model loaded!


In [3]:
# Cell 2: Real-time Transcription Class
from ipywidgets import VBox, HBox, Button, HTML, Output

import sounddevice as sd
import whisper
from ipywidgets import VBox, HBox, Button, HTML, Output

class RealTimeTranscriber:
    """Microphone-to-text widget that transcribes live audio chunk by chunk.

    Audio is captured by a ``sounddevice.InputStream`` whose callback only
    copies each block into a queue. A background thread takes blocks off that
    queue, runs ``model.transcribe`` on each one and renders the running
    transcript into an ``ipywidgets.Output`` area. Display ``self.ui`` to get
    the Start/Stop buttons and the transcript.

    Args:
        model: Object exposing ``transcribe(audio, fp16=..., language=...)``
            that returns a dict with ``"text"`` and ``"language"`` keys
            (for example the result of ``whisper.load_model``).
        sample_rate: Capture rate in Hz.
            NOTE(review): Whisper presumably treats raw arrays as 16 kHz
            audio, so other rates would need resampling first — confirm
            before changing the default.
        chunk_duration: Length, in seconds, of each block handed to the model.
    """

    # Upper bound (seconds) that stop_recording waits for the in-flight chunk.
    _JOIN_TIMEOUT = 5.0

    def __init__(self, model, sample_rate=16000, chunk_duration=1.0):
        self.model = model
        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        # Frames per audio block; also the minimum length sent to the model.
        self.chunk_samples = max(1, int(round(sample_rate * chunk_duration)))
        # Hand-off between the audio callback (producer) and transcribe_loop.
        self.audio_queue = queue.Queue()
        self.stream = None
        self.transcription_thread = None
        self.is_recording = False
        self.transcript_output = Output()
        # UI elements; Stop stays disabled until a recording is running.
        self.start_btn = Button(description="Start")
        self.stop_btn = Button(description="Stop", disabled=True)
        # Create UI
        self.create_ui()

    @property
    def recording(self):
        """Backward-compatible alias of ``is_recording``."""
        return self.is_recording

    @recording.setter
    def recording(self, value):
        self.is_recording = bool(value)

    def create_ui(self):
        """Wire the button handlers and assemble the widget tree in ``self.ui``."""
        self.start_btn.on_click(self.start_recording)
        self.stop_btn.on_click(self.stop_recording)
        self.ui = VBox([
            HBox([self.start_btn, self.stop_btn]),
            HTML("<h3>Live Transcript:</h3>"),
            self.transcript_output
        ])

    def _show(self, text, replace=False):
        """Write ``text`` to the transcript area without the context manager.

        The ``with output:`` form is avoided because this is also called from
        the background transcription thread; the widget's own append/trait
        API is used instead.

        Args:
            text: Text to display (include the trailing newline yourself).
            replace: When True, discard what is currently shown first.
        """
        if replace:
            self.transcript_output.outputs = (
                {"output_type": "stream", "name": "stdout", "text": text},
            )
        else:
            self.transcript_output.append_stdout(text)

    def audio_callback(self, indata, frames, time, status):
        """Input-stream callback: queue a copy of the first channel.

        No transcription or printing happens here; the block is only copied
        and queued so the audio thread is never held up.

        Args:
            indata: 2-D block of samples indexed as ``[frame, channel]``.
            frames: Number of frames in ``indata`` (unused).
            time: Timing information supplied by the stream (unused).
            status: Status flags supplied by the stream (currently ignored).
        """
        if not self.is_recording:
            return
        # Copy: the caller may reuse the buffer behind ``indata`` after return.
        self.audio_queue.put(indata[:, 0].copy())

    def start_recording(self, btn):
        """Start real-time recording and transcription.

        Does nothing if a recording is already running. If the input stream
        cannot be opened or started, the UI is reset and the error is shown
        in the transcript area instead of leaving the buttons stuck.

        Args:
            btn: The clicked button (unused; required by ``on_click``).
        """
        if self.is_recording:
            return

        # Fresh queue so audio left over from an earlier session is dropped.
        self.audio_queue = queue.Queue()
        self.is_recording = True
        self.start_btn.disabled = True
        self.stop_btn.disabled = False
        self._show("🎤 Recording started... Speak into your microphone!\n", replace=True)

        stream = None
        try:
            stream = sd.InputStream(
                callback=self.audio_callback,
                channels=1,
                samplerate=self.sample_rate,
                blocksize=self.chunk_samples,
                dtype="float32",
            )
            stream.start()
        except Exception as e:
            if stream is not None:
                stream.close()
            self.is_recording = False
            self.start_btn.disabled = False
            self.stop_btn.disabled = True
            self._show(f"❌ Could not open microphone: {e}\n")
            return
        self.stream = stream

        # Daemon thread: it must never keep the kernel alive on shutdown.
        self.transcription_thread = threading.Thread(
            target=self.transcribe_loop, daemon=True
        )
        self.transcription_thread.start()

    def stop_recording(self, btn):
        """Stop recording and transcription.

        Does nothing if no recording is running, so calling it before Start
        is harmless.

        Args:
            btn: The clicked button (unused; required by ``on_click``).
        """
        if not self.is_recording:
            return

        self.is_recording = False
        self.start_btn.disabled = False
        self.stop_btn.disabled = True

        if self.stream is not None:
            self.stream.stop()
            self.stream.close()
            self.stream = None

        # Let the worker finish the chunk it is on so its transcript update
        # lands before the "stopped" line; bounded so the UI cannot hang.
        worker = self.transcription_thread
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=self._JOIN_TIMEOUT)

        self._show("\n⏹️ Recording stopped.\n")

    def transcribe_loop(self):
        """Continuously transcribe queued audio chunks until recording stops.

        Each chunk is transcribed on its own with automatic language
        detection; non-empty results are appended to a running transcript
        that replaces the contents of the transcript area. A failure on one
        chunk is reported and the loop carries on with the next chunk.
        """
        # Session token: if start_recording installs a newer thread while this
        # one is still busy, this loop exits instead of running alongside it.
        session = self.transcription_thread
        full_transcript = ""

        while self.is_recording and self.transcription_thread is session:
            try:
                # Blocking get with a short timeout replaces empty()+sleep polling.
                audio_chunk = self.audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            # Skip blocks shorter than one full chunk.
            if len(audio_chunk) < self.chunk_samples:
                continue

            try:
                result = self.model.transcribe(
                    audio_chunk,
                    fp16=False,
                    language=None  # Auto-detect language
                )
                text = result["text"].strip()
                language = result["language"]
            except Exception as e:
                self._show(f"❌ Transcription error: {e}\n")
                continue

            if self.transcription_thread is not session:
                break
            if not text:  # Only update if we got text
                continue

            full_transcript += f"[{language.upper()}] {text} "
            separator = "-" * 50
            self._show(
                "\n".join([
                    "📝 Live Transcript:",
                    separator,
                    full_transcript,
                    separator,
                    f"Detected Language: {language.upper()}",
                ]) + "\n",
                replace=True,
            )

# Build the transcriber around the already-loaded model, then render its
# Start/Stop controls and transcript area in this cell's output.
transcriber = RealTimeTranscriber(model=whisper_model)
display(transcriber.ui)


VBox(children=(HBox(children=(Button(description='Start', style=ButtonStyle()), Button(description='Stop', sty…