# 🎤 LiveKit Voice Agent v4

**Whisper STT → Gemini Flash → Soprano TTS**

Pre-loads models to avoid subprocess timeouts.

In [None]:
# Install the LiveKit Agents framework (releases compatible with 1.3, with the
# Google and Silero plugin extras) plus faster-whisper and soprano-tts.
# -q keeps pip's output quiet.
!pip install -q "livekit-agents[google,silero]~=1.3" faster-whisper soprano-tts

In [None]:
# Print GPU status; the Whisper and Soprano models in this notebook are
# loaded with device="cuda".
!nvidia-smi

In [None]:
# === PRE-LOAD ALL MODELS (before LiveKit starts) ===
# Instantiates Whisper, Soprano and the Silero VAD once inside the notebook
# kernel. The worker script re-creates its own instances in prewarm(), so the
# globals defined here are not shared with it.
# NOTE(review): presumably the benefit is that model files are already in the
# HF_HOME cache when the worker loads them — confirm.
import os

# Set the cache location before the model libraries are imported below.
os.environ["HF_HOME"] = "/content/hf_cache"

print("Loading Faster Whisper...")
from faster_whisper import WhisperModel

WHISPER = WhisperModel("tiny", device="cuda", compute_type="float16")
print("✅ Whisper")

print("Loading Soprano TTS...")
from soprano import SopranoTTS

SOPRANO = SopranoTTS(device="cuda")
print("✅ Soprano")

print("Loading Silero VAD...")
from livekit.plugins import silero

# Same VAD settings as the worker's prewarm(); force_cpu=True keeps VAD off the GPU.
VAD = silero.VAD.load(
    min_speech_duration=0.05,
    min_silence_duration=0.35,
    force_cpu=True,
)
print("✅ VAD")

print("\n🎉 All models pre-loaded! Run next cell.")

In [None]:
import os

# ========== REPLACE WITH YOUR CREDENTIALS ==========
# Placeholder values: replace every one before running the agent. They are
# written into this process's environment (overwriting any existing values).
os.environ["LIVEKIT_URL"] = "wss://YOUR-PROJECT.livekit.cloud"
os.environ["LIVEKIT_API_KEY"] = "YOUR_API_KEY"
os.environ["LIVEKIT_API_SECRET"] = "YOUR_API_SECRET"
os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_API_KEY"
# =====================================================

print("✅ Credentials set")

In [None]:
%%writefile livekit_agent_v4.py
"""LiveKit Voice Agent v4: Whisper → Gemini Flash → Soprano"""

import asyncio
import os
import re
from typing import AsyncIterable, Optional

import numpy as np

from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, stt, cli
from livekit.agents.job import JobExecutorType
from livekit.agents.worker import WorkerOptions
from livekit.plugins import google, silero

# Model classes only: the worker does not share memory with the notebook, so
# prewarm() creates its own instances.
from faster_whisper import WhisperModel
from soprano import SopranoTTS


class VoiceAgent(Agent):
    """Agent whose STT and TTS pipeline nodes run on locally loaded models.

    ``stt_node`` transcribes incoming audio with the supplied Whisper model and
    ``tts_node`` synthesizes the LLM's text with the supplied Soprano model.

    Args:
        whisper: Object exposing ``transcribe(audio, beam_size=..., language=...)``;
            the worker passes a ``faster_whisper.WhisperModel``.
        soprano: Object exposing ``infer_stream(text, chunk_size=...)``;
            the worker passes a ``soprano.SopranoTTS``.
        instructions: Instructions string forwarded to ``Agent``.
    """

    # faster-whisper treats a raw waveform array as 16 kHz mono float audio.
    _WHISPER_RATE = 16000
    # Format stamped on every synthesized frame: 20 ms frames at 32 kHz.
    _TTS_RATE = 32000
    _TTS_FRAME_SAMPLES = 640
    # Language used for both decoding and the emitted transcript metadata.
    _LANGUAGE = "en"

    def __init__(self, whisper, soprano, instructions: str):
        super().__init__(instructions=instructions)
        self._whisper = whisper
        self._soprano = soprano
        # A "sentence" is the shortest prefix ending in ., ! or ? followed by
        # whitespace, or the shortest prefix ending in one or more newlines.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)

    @staticmethod
    def _frame_to_mono(frame) -> np.ndarray:
        """Convert one int16 audio frame to mono float32 samples in [-1, 1)."""
        samples = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
        if frame.num_channels > 1:
            # assumes channels are interleaved sample-by-sample — TODO confirm
            samples = samples.reshape(-1, frame.num_channels).mean(axis=1)
        return samples

    @staticmethod
    def _resample(samples: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
        """Resample a mono waveform by linear interpolation.

        No low-pass filter is applied; this is a lightweight conversion meant
        for speech recognition input, not a high-fidelity resampler.
        """
        if source_rate == target_rate or len(samples) == 0:
            return samples
        target_len = int(round(len(samples) * target_rate / source_rate))
        if target_len == 0:
            return samples[:0]
        positions = np.arange(target_len) * (source_rate / target_rate)
        # np.interp clamps positions past the last input sample.
        resampled = np.interp(positions, np.arange(len(samples)), samples)
        return resampled.astype(np.float32)

    @staticmethod
    def _chunk_to_int16(chunk) -> np.ndarray:
        """Convert one synthesized chunk to a flat int16 PCM array."""
        if hasattr(chunk, "detach"):
            # presumably a torch tensor (possibly on the GPU); move it to host
            # memory before handing it to NumPy — verify against Soprano's API
            chunk = chunk.detach().cpu().float().numpy()
        pcm = np.asarray(chunk, dtype=np.float32).reshape(-1)
        return (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)

    async def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings):
        """Custom STT using Faster Whisper.

        Returns an async generator that buffers every frame from ``audio``,
        transcribes the buffered waveform once, and yields a FINAL_TRANSCRIPT
        event (only when text was recognized) followed by END_OF_SPEECH.

        NOTE(review): nothing is transcribed until ``audio`` is exhausted. If
        the session keeps this stream open across turns, per-utterance
        segmentation (e.g. driven by VAD) is needed here — confirm.
        """

        async def _transcribe():
            chunks = []
            source_rate = self._WHISPER_RATE
            async for frame in audio:
                source_rate = frame.sample_rate
                chunks.append(self._frame_to_mono(frame))
            if not chunks:
                yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
                return

            # Whisper interprets a raw array as 16 kHz audio, so convert from
            # whatever rate the frames were delivered at.
            waveform = self._resample(np.concatenate(chunks), source_rate, self._WHISPER_RATE)

            def _run() -> str:
                # The segments are consumed here so that the whole decode
                # happens inside the worker thread.
                segments, _ = self._whisper.transcribe(
                    waveform, beam_size=1, language=self._LANGUAGE
                )
                return " ".join(s.text for s in segments).strip()

            # Keep the event loop free while the model runs.
            text = await asyncio.to_thread(_run)
            print(f"🎤 User: {text}")
            if text:
                yield stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    # SpeechData requires the language alongside the text.
                    alternatives=[stt.SpeechData(language=self._LANGUAGE, text=text)],
                )
            yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])

        return _transcribe()

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Custom TTS using Soprano infer_stream.

        Accumulates the incoming text deltas, cuts them into sentences with
        ``self._sent_re`` and yields 20 ms, 32 kHz mono ``rtc.AudioFrame``
        objects for each sentence as Soprano streams its audio. Any text left
        over when the input ends is spoken as a final sentence.

        NOTE(review): Soprano's generator is iterated synchronously, so the
        event loop is blocked while each chunk is synthesized.
        """
        sr = self._TTS_RATE
        spf = self._TTS_FRAME_SAMPLES

        def make_frame(samples: np.ndarray):
            return rtc.AudioFrame(
                data=samples.tobytes(),
                sample_rate=sr,
                num_channels=1,
                samples_per_channel=spf,
            )

        async def speak(sentence: str):
            spoken = sentence.strip()
            if not spoken:
                # e.g. "\n\n" matches the sentence pattern but has nothing to say
                return
            print(f"🔊 Agent: {spoken}")
            # Samples that do not yet fill a whole frame are carried over to
            # the next chunk, so silence is only padded once, at the very end
            # of the sentence, instead of after every streamed chunk.
            carry = np.zeros(0, dtype=np.int16)
            for chunk in self._soprano.infer_stream(sentence, chunk_size=1):
                carry = np.concatenate((carry, self._chunk_to_int16(chunk)))
                whole = len(carry) - len(carry) % spf
                for start in range(0, whole, spf):
                    yield make_frame(carry[start:start + spf])
                carry = carry[whole:]
            if len(carry):
                yield make_frame(np.pad(carry, (0, spf - len(carry))))

        buffer = ""
        async for delta in text:
            buffer += delta
            while (m := self._sent_re.match(buffer)):
                sentence = m.group(1)
                buffer = buffer[len(sentence):]
                async for frame in speak(sentence):
                    yield frame
        if buffer.strip():
            async for frame in speak(buffer):
                yield frame


def prewarm(proc: agents.JobProcess):
    """Load the models for this worker and stash them in ``proc.userdata``.

    Fresh instances are created here rather than reusing the notebook's
    globals, because the worker does not share memory with the notebook.
    Stores three entries: ``"whisper"``, ``"soprano"`` and ``"vad"``.

    Args:
        proc: Job process whose ``userdata`` mapping receives the models.
    """
    print("Loading models...")
    proc.userdata["whisper"] = WhisperModel("tiny", device="cuda", compute_type="float16")
    proc.userdata["soprano"] = SopranoTTS(device="cuda")
    # VAD is forced onto the CPU; Whisper and Soprano use the GPU.
    proc.userdata["vad"] = silero.VAD.load(
        min_speech_duration=0.05, min_silence_duration=0.35, force_cpu=True
    )
    print("✅ Models loaded")


async def entrypoint(ctx: agents.JobContext):
    """Start a voice session in the job's room using the prewarmed models.

    Reads the ``"whisper"``, ``"soprano"`` and ``"vad"`` entries that
    ``prewarm`` stored in ``ctx.proc.userdata``, builds a ``VoiceAgent`` around
    the first two, and starts an ``AgentSession`` that uses VAD-based turn
    detection and Gemini Flash as the LLM.

    Args:
        ctx: Job context providing the room and the process userdata.
    """
    instructions = "You are a helpful voice assistant. Keep responses concise, 1-2 sentences."

    agent = VoiceAgent(
        whisper=ctx.proc.userdata["whisper"],
        soprano=ctx.proc.userdata["soprano"],
        instructions=instructions,
    )

    # No stt/tts plugins are passed to the session: speech-to-text and
    # text-to-speech are handled by VoiceAgent's stt_node/tts_node overrides.
    session = AgentSession(
        turn_detection="vad",
        vad=ctx.proc.userdata["vad"],
        llm=google.LLM(model="gemini-2.0-flash"),  # Text-to-text API
    )

    await session.start(agent=agent, room=ctx.room)
    print("🎤 Listening...")


if __name__ == "__main__":
    # Build the worker configuration first, then hand it to the LiveKit CLI.
    worker_options = WorkerOptions(
        entrypoint_fnc=entrypoint,
        prewarm_fnc=prewarm,
        # Allow up to two minutes for prewarm's model loading.
        initialize_process_timeout=120.0,
        # Keep no idle processes warmed in advance (single process, so models
        # are not loaded in parallel).
        num_idle_processes=0,
        # Thread-based job executor; the original author found it works
        # better on Colab.
        job_executor_type=JobExecutorType.THREAD,
    )
    cli.run_app(worker_options)

In [None]:
# Run the worker script written by the %%writefile cell above, using the
# agent CLI's `start` subcommand.
!python livekit_agent_v4.py start

---

## 🧪 Test

1. Go to [LiveKit Playground](https://agents-playground.livekit.io/)
2. Enter your LiveKit credentials
3. Join a room and speak!

## 🔧 Troubleshooting

| Issue | Fix |
|-------|-----|
| Still timing out | Increase `initialize_process_timeout` to 300.0 |
| Model not found | Make sure cell 3 (the model pre-load cell) ran successfully |
| Gemini error | Check your `GOOGLE_API_KEY` is valid |