# 🎤 LiveKit Voice Agent: Faster Whisper + Gemini + Soprano

Complete working pipeline for Google Colab T4 GPU with all timeout fixes.

| Component | Model |
|-----------|-------|
| **STT** | Faster Whisper (tiny) |
| **LLM** | Gemini 2.5 Flash |
| **TTS** | Soprano |
| **VAD** | Silero |

## 1. Install Dependencies

In [None]:
!pip install -q livekit-agents[google,silero]~=1.3
!pip install -q faster-whisper
!pip install -q soprano-tts

## 2. Check GPU

In [None]:
!nvidia-smi

import torch
print(f"\nPyTorch CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 3. Pre-Download Models (Run Once)

This caches the models so the agent doesn't time out during startup.

In [None]:
import gc
import os

# Point the Hugging Face cache at a fixed directory before the model
# libraries are imported, so the downloads below land where the agent
# process (which sets the same HF_HOME) will look for them.
os.environ["HF_HOME"] = "/content/hf_cache"

# Instantiating each model once forces its weights to be downloaded.
print("Downloading Faster Whisper...")
from faster_whisper import WhisperModel
whisper = WhisperModel("tiny", device="cuda", compute_type="float16")
del whisper
print("✅ Whisper cached")

print("Downloading Soprano TTS...")
from soprano import SopranoTTS
soprano = SopranoTTS(device="cuda")
del soprano
print("✅ Soprano cached")

# `del` only drops the Python references.  Collect garbage and hand PyTorch's
# cached blocks back to the driver so this notebook kernel does not keep GPU
# memory reserved while the agent process loads the same models again.
gc.collect()
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n🎉 All models ready!")

## 4. Set Credentials

Get credentials from:
- **LiveKit**: https://cloud.livekit.io → Project → API Keys
- **Google**: https://aistudio.google.com/apikey

In [None]:
import os

# ========== REPLACE THESE ==========
credentials = {
    "LIVEKIT_URL": "wss://YOUR-PROJECT.livekit.cloud",
    "LIVEKIT_API_KEY": "YOUR_API_KEY",
    "LIVEKIT_API_SECRET": "YOUR_API_SECRET",
    "GOOGLE_API_KEY": "YOUR_GOOGLE_API_KEY",
}
# ====================================

# Non-secret settings: the agent script reads GEMINI_MODEL and SYSTEM_PROMPT,
# and HF_HOME matches the cache directory used by the pre-download cell.
settings = {
    "GEMINI_MODEL": "gemini-2.5-flash",
    "SYSTEM_PROMPT": "You are a helpful voice assistant. Be concise.",
    "HF_HOME": "/content/hf_cache",
}

# Export everything so processes launched from the notebook inherit it.
os.environ.update(credentials)
os.environ.update(settings)

# Detect template values that were never edited, so the cell does not report
# success when the agent would be started with unusable credentials.
placeholder_keys = [
    name
    for name, value in credentials.items()
    if value.startswith("YOUR_") or "YOUR-PROJECT" in value
]

if placeholder_keys:
    print(f"⚠️ Placeholder values still set for: {', '.join(placeholder_keys)}")
else:
    print("✅ Credentials set")

## 5. Create Agent Script

In [None]:
%%writefile livekit_agent.py
"""LiveKit Voice Agent: Faster Whisper STT → Gemini → Soprano TTS.

This cell is written to ``livekit_agent.py`` by the ``%%writefile`` magic.
The script reads ``SYSTEM_PROMPT`` and ``GEMINI_MODEL`` from the environment
and, when run directly, starts a LiveKit worker via ``cli.run_app``.
"""

import asyncio
import os
import re
from concurrent.futures import ThreadPoolExecutor
from typing import AsyncIterable, Optional, List

import numpy as np
from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, stt, cli
from livekit.agents.worker import WorkerOptions
from livekit.agents.job import JobExecutorType
from livekit.plugins import google, silero
from google.genai.types import Modality

from faster_whisper import WhisperModel
from soprano import SopranoTTS


class VoiceAgent(Agent):
    """Agent whose STT and TTS nodes run on local Faster Whisper / Soprano models.

    ``stt_node`` transcribes the incoming audio with Faster Whisper and
    ``tts_node`` synthesizes the reply text with Soprano, one sentence at a
    time.  Model inference is blocking, so it is executed on a private
    single-thread executor rather than on the asyncio event loop.

    Args:
        whisper: Loaded Faster Whisper model used for transcription.
        soprano: Loaded Soprano model used for synthesis.
        instructions: System prompt forwarded to the parent ``Agent``.
    """

    # Sample rate faster-whisper expects when it is handed a raw waveform.
    _STT_SAMPLE_RATE = 16000
    # Format of the audio frames emitted by ``tts_node``.
    _TTS_SAMPLE_RATE = 32000
    _TTS_FRAME_MS = 20

    def __init__(self, whisper: WhisperModel, soprano: SopranoTTS, instructions: str):
        super().__init__(instructions=instructions)
        self._whisper = whisper
        self._soprano = soprano
        # A "sentence" ends at ., ! or ? followed by whitespace, or at a newline run.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)
        # One worker only: every step of a streaming synthesis runs on the same
        # thread, and STT/TTS never hit the GPU concurrently from this agent.
        self._infer_pool = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="voice-agent-infer"
        )

    async def _run_blocking(self, fn, *args):
        """Run ``fn(*args)`` on the inference thread and await its result."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._infer_pool, fn, *args)

    @staticmethod
    def _frame_to_mono(frame: rtc.AudioFrame) -> np.ndarray:
        """Convert an int16 audio frame to a mono float32 waveform in [-1, 1)."""
        pcm = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
        if frame.num_channels > 1:
            # Samples are assumed to be interleaved per channel; average them.
            pcm = pcm.reshape(-1, frame.num_channels).mean(axis=1)
        return pcm

    @staticmethod
    def _resample(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
        """Resample ``pcm`` from ``src_rate`` to ``dst_rate`` by linear interpolation.

        Linear interpolation applies no anti-aliasing filter; that is a
        deliberate simplicity trade-off for speech input.
        """
        if src_rate == dst_rate or pcm.size == 0:
            return pcm
        n_out = max(1, int(round(pcm.size * dst_rate / src_rate)))
        positions = np.arange(n_out, dtype=np.float64) * (src_rate / dst_rate)
        return np.interp(positions, np.arange(pcm.size), pcm).astype(np.float32)

    def _decode(self, audio_data: np.ndarray) -> str:
        """Transcribe a waveform; blocking, meant for the inference thread."""
        segments, _ = self._whisper.transcribe(audio_data, beam_size=1, language="en")
        # ``segments`` is consumed here so the decoding work stays off the event loop.
        return " ".join(s.text for s in segments).strip()

    def _open_tts_stream(self, sentence: str):
        """Start a streaming synthesis and return an iterator over its chunks."""
        return iter(self._soprano.infer_stream(sentence, chunk_size=1))

    @staticmethod
    def _next_pcm(stream) -> Optional[np.ndarray]:
        """Return the next synthesized chunk as int16 PCM, or ``None`` when done."""
        chunk = next(stream, None)
        if chunk is None:
            return None
        # Handled defensively: the chunk may be a tensor (possibly on the GPU)
        # rather than an array — TODO confirm against the Soprano API.
        if hasattr(chunk, "detach"):
            chunk = chunk.detach().float().cpu().numpy()
        pcm = np.asarray(chunk, dtype=np.float32).reshape(-1)
        return (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)

    async def stt_node(
        self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
    ) -> Optional[AsyncIterable[stt.SpeechEvent]]:
        """Transcribe the audio stream with Faster Whisper.

        Args:
            audio: Incoming audio frames (int16 PCM).
            model_settings: Unused; accepted for interface compatibility.

        Returns:
            An async iterable yielding a ``FINAL_TRANSCRIPT`` event when any
            text was recognized, always followed by ``END_OF_SPEECH``.
        """
        async def _transcribe():
            # NOTE(review): transcription starts only once `audio` is exhausted.
            # If the framework delivers one continuous stream per session rather
            # than one per utterance, this needs VAD-based segmentation — confirm.
            chunks = []
            sample_rate = None
            async for frame in audio:
                if sample_rate is None:
                    # Assumes every frame of the stream shares this rate.
                    sample_rate = frame.sample_rate
                chunks.append(self._frame_to_mono(frame))

            if not chunks:
                yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
                return

            audio_data = self._resample(
                np.concatenate(chunks), sample_rate, self._STT_SAMPLE_RATE
            )
            text = await self._run_blocking(self._decode, audio_data)

            if text:
                yield stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    alternatives=[stt.SpeechData(text=text)],
                )
            yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
        return _transcribe()

    async def tts_node(
        self, text: AsyncIterable[str], model_settings: ModelSettings
    ) -> AsyncIterable[rtc.AudioFrame]:
        """Synthesize streamed text with Soprano, sentence by sentence.

        Args:
            text: Incremental text deltas of the reply.
            model_settings: Unused; accepted for interface compatibility.

        Yields:
            Mono int16 audio frames of 20 ms at 32 kHz.
        """
        buffer = ""
        sr = self._TTS_SAMPLE_RATE
        spf = sr * self._TTS_FRAME_MS // 1000  # 640 samples per 20 ms frame

        def make_frame(samples: np.ndarray):
            return rtc.AudioFrame(
                data=samples.tobytes(),
                sample_rate=sr,
                num_channels=1,
                samples_per_channel=spf,
            )

        async def speak(sentence: str):
            stream = await self._run_blocking(self._open_tts_stream, sentence)
            # Samples that do not fill a whole frame are carried into the next
            # chunk so no silence is inserted in the middle of a sentence.
            pending = np.zeros(0, dtype=np.int16)
            while True:
                pcm = await self._run_blocking(self._next_pcm, stream)
                if pcm is None:
                    break
                pending = np.concatenate((pending, pcm))
                usable = len(pending) - len(pending) % spf
                for start in range(0, usable, spf):
                    yield make_frame(pending[start:start + spf])
                pending = pending[usable:]
            if len(pending):
                # Only the very end of the sentence is padded to a full frame.
                yield make_frame(np.pad(pending, (0, spf - len(pending))))

        async for delta in text:
            buffer += delta
            while (m := self._sent_re.match(buffer)):
                sentence = m.group(1)
                buffer = buffer[len(sentence):]
                if not sentence.strip():
                    # Blank lines match the pattern but contain nothing to say.
                    continue
                async for frame in speak(sentence):
                    yield frame
        if buffer.strip():
            async for frame in speak(buffer):
                yield frame


def prewarm(proc: agents.JobProcess):
    """Load the VAD, STT and TTS models and stash them on the job process.

    The loaded objects are stored in ``proc.userdata`` under the keys
    ``"vad"``, ``"whisper"`` and ``"soprano"``, where ``entrypoint`` looks
    them up.

    Args:
        proc: Job process whose ``userdata`` mapping receives the models.
    """
    print("Loading VAD...")
    # The VAD is kept on the CPU; Whisper and Soprano are placed on the GPU.
    proc.userdata["vad"] = silero.VAD.load(
        min_speech_duration=0.05, min_silence_duration=0.35, force_cpu=True
    )
    print("Loading Whisper...")
    proc.userdata["whisper"] = WhisperModel("tiny", device="cuda", compute_type="float16")
    print("Loading Soprano...")
    proc.userdata["soprano"] = SopranoTTS(device="cuda")
    # Status message repaired: the check-mark emoji was mis-encoded.
    print("✅ All models loaded")


async def entrypoint(ctx: agents.JobContext):
    """Start a voice session in the job's room using the prewarmed models.

    The system prompt comes from ``SYSTEM_PROMPT`` and the Gemini model name
    from ``GEMINI_MODEL``; both fall back to built-in defaults.

    Args:
        ctx: Job context providing the room and the process ``userdata``
            populated by ``prewarm``.
    """
    system_prompt = os.getenv(
        "SYSTEM_PROMPT", "You are a helpful voice assistant. Be concise."
    )
    models = ctx.proc.userdata

    voice_agent = VoiceAgent(
        whisper=models["whisper"],
        soprano=models["soprano"],
        instructions=system_prompt,
    )

    vad = models["vad"]
    # NOTE(review): `RealtimeModel` is the plugin's realtime entry point —
    # confirm the configured model name is one that API accepts.
    # NOTE(review): no `stt=` is passed to the session — verify the framework
    # still routes audio through `VoiceAgent.stt_node`.
    gemini = google.realtime.RealtimeModel(
        model=os.getenv("GEMINI_MODEL", "gemini-2.5-flash"),
        modalities=[Modality.TEXT],  # text out; speech is produced by tts_node
        instructions=system_prompt,
        temperature=0.3,
    )
    session = AgentSession(
        turn_detection="vad",
        vad=vad,
        llm=gemini,
        preemptive_generation=True,
    )
    await session.start(agent=voice_agent, room=ctx.room)


if __name__ == "__main__":
    # Worker configuration for running inside a Colab notebook.
    worker_options = WorkerOptions(
        entrypoint_fnc=entrypoint,
        prewarm_fnc=prewarm,
        # Give process initialization five minutes, to cover model downloads.
        initialize_process_timeout=300.0,
        # Keep no idle processes, so models are not downloaded in parallel.
        num_idle_processes=0,
        # Run jobs in threads; the original author notes this suits Colab better.
        job_executor_type=JobExecutorType.THREAD,
    )
    cli.run_app(worker_options)

## 6. Run Agent

In [None]:
# Launch the agent script written by the %%writefile cell, using its `start` subcommand.
!python livekit_agent.py start

---

## 🧪 Test Your Agent

1. Go to [LiveKit Playground](https://agents-playground.livekit.io/)
2. Enter your LiveKit URL and credentials
3. Join a room and start talking!

## 🔧 Troubleshooting

| Issue | Fix |
|-------|-----|
| Timeout errors | Run cell 3 first to pre-cache models |
| CUDA OOM | Restart runtime, run cells in order |
| Soprano backend error | `!pip install soprano-tts[transformers]` |