# 🎤 LiveKit Voice Agent v10

**Self-Hosted Whisper API → Gemini Flash → Soprano TTS**

Architecture:
- Cell 1: Start Whisper API server (separate process)
- Cell 2: Run Agent (only loads Soprano TTS)

This reduces memory usage by keeping models in separate processes!

In [None]:
!pip install -q "livekit-agents[google,silero]~=1.3" soprano-tts faster-whisper fastapi uvicorn httpx

In [None]:
import os
import sys
from getpass import getpass

# Non-secret settings. LIVEKIT_URL keeps a default but an already-exported
# value wins, so the notebook can be pointed at another project without edits.
os.environ.setdefault("LIVEKIT_URL", "wss://test-jllkasbg.livekit.cloud")
os.environ["HF_HOME"] = "/content/hf_cache"

# Secrets must never be written into the notebook source: notebooks get
# shared and committed. They are taken from the environment, and only when a
# value is missing is the user asked for it through a hidden prompt.
_SECRET_NAMES = (
    "LIVEKIT_API_KEY",     # LiveKit credentials
    "LIVEKIT_API_SECRET",
    "GOOGLE_API_KEY",      # Gemini for LLM
)

# Prompt only when someone can answer: inside a notebook kernel or a terminal.
# In any other context (batch run, test runner) missing values are reported
# instead of blocking on input.
_can_prompt = "ipykernel" in sys.modules or (
    sys.stdin is not None and sys.stdin.isatty()
)

_missing = []
for _name in _SECRET_NAMES:
    if not os.environ.get(_name) and _can_prompt:
        _value = getpass(f"{_name}: ").strip()
        if _value:
            os.environ[_name] = _value
    if not os.environ.get(_name):
        _missing.append(_name)

if _missing:
    print("Missing credentials: " + ", ".join(_missing))
else:
    print("‚úÖ Credentials set")

## Step 1: Create & Start Whisper API Server

In [None]:
%%writefile whisper_server.py
"""Whisper STT API Server - runs on port 8000"""

import io
import numpy as np
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
import uvicorn

app = FastAPI()

# Load Whisper model at startup
WHISPER = None

@app.on_event("startup")
async def load_model():
    """Create the Faster Whisper model when the FastAPI app starts.

    The model is stored in the module-level ``WHISPER`` global, which the
    ``/transcribe`` handler reads.
    """
    global WHISPER
    print("Loading Faster Whisper...")

    # Imported here rather than at module top, so the import itself happens
    # during startup (after the log line above).
    import faster_whisper

    WHISPER = faster_whisper.WhisperModel(
        "tiny",
        device="cuda",
        compute_type="float16",
    )
    print("‚úÖ Whisper ready on port 8000")

def _pcm16_to_whisper_input(audio_bytes, sample_rate, target_rate=16000):
    """Decode mono int16 PCM bytes into float32 samples at ``target_rate`` Hz.

    Args:
        audio_bytes: Raw little-endian int16 PCM, single channel.
        sample_rate: Sample rate of ``audio_bytes`` in Hz.
        target_rate: Rate the samples are converted to (16 kHz for Whisper).

    Returns:
        1-D float32 array scaled to [-1.0, 1.0); empty when there is no audio.

    Raises:
        ValueError: If ``sample_rate`` is not positive.
    """
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    # np.frombuffer rejects buffers that are not a whole number of int16
    # samples, so a dangling trailing byte is dropped.
    usable = len(audio_bytes) - (len(audio_bytes) % 2)
    samples = np.frombuffer(audio_bytes[:usable], dtype=np.int16).astype(np.float32) / 32768.0

    if sample_rate == target_rate or samples.size == 0:
        return samples

    # Linear interpolation: simple and dependency-free. It has no anti-alias
    # filter, which is acceptable for speech but not high fidelity.
    out_len = max(1, round(samples.size * target_rate / sample_rate))
    src_times = np.arange(samples.size, dtype=np.float64) / sample_rate
    dst_times = np.arange(out_len, dtype=np.float64) / target_rate
    return np.interp(dst_times, src_times, samples).astype(np.float32)


def _run_whisper(samples):
    """Run the blocking Whisper inference and return the joined segment text.

    ``segments`` is consumed here, so all of the work stays in the calling
    (worker) thread.
    """
    if WHISPER is None:
        raise RuntimeError("Whisper model is not loaded yet")
    segments, _ = WHISPER.transcribe(samples, beam_size=1, language="en")
    return " ".join(s.text for s in segments).strip()


@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), sample_rate: int = 16000):
    """Transcribe audio bytes (int16 PCM) to text.

    Args:
        audio: Uploaded file containing raw mono int16 PCM.
        sample_rate: Sample rate of the upload in Hz. Audio at any other rate
            than 16 kHz is resampled before it is handed to Whisper.

    Returns:
        ``{"text": ..., "success": True}`` on success, or
        ``{"text": "", "success": False, "error": ...}`` on failure.
    """
    import asyncio

    try:
        audio_bytes = await audio.read()
        audio_float = _pcm16_to_whisper_input(audio_bytes, sample_rate)

        # Nothing to transcribe: answer directly instead of invoking the model.
        if audio_float.size == 0:
            return {"text": "", "success": True}

        # Inference is blocking; run it in a worker thread so the event loop
        # (and /health) stays responsive while a request is being processed.
        text = await asyncio.to_thread(_run_whisper, audio_float)

        return {"text": text, "success": True}
    except Exception as e:
        # Endpoint boundary: report the failure to the caller in the payload.
        return {"text": "", "success": False, "error": str(e)}

@app.get("/health")
async def health():
    """Health probe: return a fixed status payload naming the model."""
    payload = dict(status="ok", model="whisper-tiny")
    return payload

if __name__ == "__main__":
    # Run as a script: serve `app` with uvicorn on every interface, port 8000.
    bind = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind)

In [None]:
# Start Whisper server in background
import subprocess
import sys
import time

import httpx

WHISPER_HEALTH_URL = "http://localhost:8000/health"
WHISPER_LOG_PATH = "whisper_server.log"
WHISPER_STARTUP_TIMEOUT_S = 60

print("üöÄ Starting Whisper API server...")
# Server output goes to a log file. With stdout=subprocess.PIPE and nobody
# reading the pipe, the child blocks on write once the pipe buffer is full.
# The child keeps its own copy of the file descriptor, so the parent's handle
# can be closed as soon as the process has been spawned.
with open(WHISPER_LOG_PATH, "ab") as _whisper_log:
    whisper_process = subprocess.Popen(
        # sys.executable: same interpreter (and installed packages) as this kernel.
        [sys.executable, "whisper_server.py"],
        stdout=_whisper_log,
        stderr=subprocess.STDOUT,
    )

# Wait for server to start: poll /health until it answers, the process dies,
# or the deadline passes.
_deadline = time.monotonic() + WHISPER_STARTUP_TIMEOUT_S
_server_info = None
while time.monotonic() < _deadline and whisper_process.poll() is None:
    try:
        r = httpx.get(WHISPER_HEALTH_URL, timeout=5)
        r.raise_for_status()
        _server_info = r.json()
        break
    except (httpx.HTTPError, ValueError):
        # Not accepting connections yet (or not answering JSON); retry shortly.
        time.sleep(1)

if _server_info is not None:
    print(f"‚úÖ Whisper server running: {_server_info}")
elif whisper_process.poll() is not None:
    print(
        f"Whisper server exited with code {whisper_process.returncode}; "
        f"see {WHISPER_LOG_PATH}"
    )
else:
    print("‚è≥ Still loading... wait a few more seconds")

## Step 2: Create & Run Agent with Custom Whisper STT

In [None]:
%%writefile agent_v10.py
"""LiveKit Voice Agent v10: Self-hosted Whisper API ‚Üí Gemini ‚Üí Soprano TTS"""

import asyncio
import os
import re
import io
import numpy as np
import httpx
from typing import AsyncIterable

from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, cli, stt
from livekit.plugins import google, silero

WHISPER_API = "http://localhost:8000"
SOPRANO = None


def load_soprano():
    """Fill the module-level ``SOPRANO`` with a Soprano TTS engine.

    Does nothing when ``SOPRANO`` is already set, so repeated calls construct
    the engine at most once.
    """
    global SOPRANO
    if SOPRANO is not None:
        return

    print("Loading Soprano TTS...")
    # Imported on first use, right before the engine is constructed.
    import soprano

    SOPRANO = soprano.SopranoTTS(device="cuda")
    print("‚úÖ Soprano TTS ready")


class VoiceAgent(Agent):
    """Voice assistant whose STT and TTS stages are custom pipeline nodes.

    * ``stt_node`` posts each utterance to the self-hosted Whisper API.
    * ``tts_node`` speaks the LLM reply sentence by sentence with the
      module-level ``SOPRANO`` engine (populated by ``load_soprano``).
    """

    # Format sent to the Whisper server: 16 kHz mono int16 PCM.
    _STT_SAMPLE_RATE = 16000
    # Format of the frames produced by tts_node: 32 kHz mono,
    # 640 samples (20 ms) per frame.
    _TTS_SAMPLE_RATE = 32000
    _TTS_SAMPLES_PER_FRAME = 640

    def __init__(self):
        super().__init__(instructions="You are a helpful voice assistant. Keep responses short, 1-2 sentences.")
        # A sentence is the shortest prefix that ends in . ! or ? followed by
        # whitespace, or that ends in one or more newlines.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)
        self._http_client = httpx.AsyncClient(timeout=30)

    # ------------------------------------------------------------------ STT

    async def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings):
        """Custom STT using self-hosted Whisper API.

        Args:
            audio: Incoming audio frames.
            model_settings: Unused; accepted to match the node signature.

        Returns:
            An async iterator of ``stt.SpeechEvent``. Every utterance produces
            a ``FINAL_TRANSCRIPT`` (only when Whisper returned text) followed
            by ``END_OF_SPEECH``. A stream without any utterance produces a
            single ``END_OF_SPEECH``.
        """
        return self._transcribe_utterances(audio)

    async def _transcribe_utterances(self, audio):
        """Yield the speech events for every utterance found in ``audio``."""
        saw_utterance = False
        utterances = self._split_utterances(audio)
        try:
            async for frames in utterances:
                saw_utterance = True
                try:
                    text = await self._request_transcript(self._to_whisper_pcm(frames))
                    print(f"üé§ User: {text}")
                except Exception as e:
                    # Best effort: a failed request drops this utterance but
                    # keeps the transcription stream alive.
                    print(f"STT error: {e}")
                    text = ""

                if text:
                    yield stt.SpeechEvent(
                        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                        # SpeechData needs a language; the Whisper server
                        # transcribes with language="en".
                        alternatives=[stt.SpeechData(language="en", text=text)],
                    )
                yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
        finally:
            await utterances.aclose()

        if not saw_utterance:
            yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])

    async def _split_utterances(self, audio):
        """Yield one list of audio frames per utterance.

        The frames are segmented with the session's VAD, so an utterance is
        available as soon as the speaker stops instead of only once the
        ``audio`` iterator is exhausted. Without a VAD the whole stream is
        yielded as a single utterance.
        """
        try:
            detector = self.session.vad
        except RuntimeError:
            # presumably raised when the agent is not attached to a running
            # session — TODO confirm against the installed livekit-agents.
            detector = None

        if detector is None:
            frames = [frame async for frame in audio]
            if frames:
                yield frames
            return

        # Local import: keeps the file's import block untouched.
        from livekit.agents import vad as vad_api

        # NOTE(review): this is a second VAD stream next to the one the
        # session runs for turn detection; it exists only to cut the audio.
        vad_stream = detector.stream()

        async def feed():
            async for frame in audio:
                vad_stream.push_frame(frame)
            vad_stream.end_input()

        feeder = asyncio.create_task(feed())
        try:
            async for event in vad_stream:
                # END_OF_SPEECH events are expected to carry the frames of the
                # finished utterance.
                if event.type == vad_api.VADEventType.END_OF_SPEECH and event.frames:
                    yield list(event.frames)
        finally:
            feeder.cancel()
            (outcome,) = await asyncio.gather(feeder, return_exceptions=True)
            if isinstance(outcome, Exception):
                print(f"STT error: {outcome}")
            await vad_stream.aclose()

    def _to_whisper_pcm(self, frames) -> bytes:
        """Convert a non-empty list of frames to 16 kHz mono int16 PCM bytes.

        The sample rate and channel count are read from the first frame; all
        frames of one utterance are assumed to share them.
        """
        first = frames[0]
        samples = np.concatenate(
            [np.frombuffer(frame.data, dtype=np.int16) for frame in frames]
        ).astype(np.float32)

        channels = first.num_channels
        if channels > 1:
            # assumes interleaved channel layout — TODO confirm. Averaged to mono.
            usable = samples.size - samples.size % channels
            samples = samples[:usable].reshape(-1, channels).mean(axis=1)

        src_rate = first.sample_rate
        if src_rate != self._STT_SAMPLE_RATE and samples.size:
            # Linear interpolation: dependency-free, adequate for speech.
            out_len = max(1, round(samples.size * self._STT_SAMPLE_RATE / src_rate))
            samples = np.interp(
                np.arange(out_len) / self._STT_SAMPLE_RATE,
                np.arange(samples.size) / src_rate,
                samples,
            )

        return np.clip(np.rint(samples), -32768, 32767).astype(np.int16).tobytes()

    async def _request_transcript(self, pcm: bytes) -> str:
        """POST PCM audio to the Whisper API and return the stripped text.

        Raises:
            httpx.HTTPError: On transport errors or a non-2xx response.
            RuntimeError: When the server reports ``success: false``.
        """
        files = {"audio": ("audio.raw", pcm, "application/octet-stream")}
        response = await self._http_client.post(
            f"{WHISPER_API}/transcribe",
            params={"sample_rate": self._STT_SAMPLE_RATE},
            files=files,
        )
        response.raise_for_status()
        result = response.json()
        if not result.get("success", True):
            raise RuntimeError(result.get("error") or "transcription failed")
        return (result.get("text") or "").strip()

    # ------------------------------------------------------------------ TTS

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Custom TTS using Soprano.

        Text deltas are buffered and cut into sentences with ``_sent_re``;
        each complete sentence is synthesised as soon as it is available and
        whatever remains at the end of the stream is spoken last.

        Args:
            text: Stream of text fragments from the LLM.
            model_settings: Unused; accepted to match the node signature.

        Yields:
            ``rtc.AudioFrame`` objects: 32 kHz, mono, 640 samples each.
        """
        buffer = ""
        async for delta in text:
            buffer += delta
            while (m := self._sent_re.match(buffer)):
                sentence = m.group(1)
                # Group 1 spans the whole match, which starts at index 0.
                buffer = buffer[m.end():]
                async for frame in self._speak(sentence):
                    yield frame
        if buffer.strip():
            async for frame in self._speak(buffer):
                yield frame

    async def _speak(self, sentence: str):
        """Synthesise one sentence and yield it as fixed-size audio frames.

        Samples that do not fill a whole frame are carried over to the next
        synthesised chunk; zero padding is applied once, to the final frame of
        the sentence, so no silence is inserted between chunks.
        """
        sentence = sentence.strip()
        if not sentence:
            return
        print(f"üîä Speaking: {sentence}")

        spf = self._TTS_SAMPLES_PER_FRAME
        pending = np.zeros(0, dtype=np.int16)
        try:
            # NOTE(review): infer_stream is iterated synchronously, so
            # synthesis runs on the event loop between yields.
            for chunk in SOPRANO.infer_stream(sentence, chunk_size=1):
                pending = np.concatenate((pending, self._to_int16(chunk)))
                whole = len(pending) - len(pending) % spf
                for start in range(0, whole, spf):
                    yield self._make_frame(pending[start:start + spf])
                pending = pending[whole:]
        except Exception as e:
            # Best effort: report and fall through so already synthesised
            # audio is still flushed below.
            print(f"TTS error: {e}")

        if len(pending):
            yield self._make_frame(np.pad(pending, (0, spf - len(pending))))

    @staticmethod
    def _to_int16(chunk) -> np.ndarray:
        """Convert one synthesised chunk to a flat int16 sample array.

        Values are clipped to [-1.0, 1.0] before scaling.
        """
        if hasattr(chunk, "detach"):
            # presumably a torch tensor (possibly on the GPU) — move it to
            # host memory before handing it to NumPy.
            chunk = chunk.detach().float().cpu().numpy()
        pcm = np.asarray(chunk, dtype=np.float32).reshape(-1)
        return (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)

    def _make_frame(self, samples: np.ndarray):
        """Wrap mono int16 samples in an ``rtc.AudioFrame`` at the TTS rate."""
        return rtc.AudioFrame(
            data=samples.tobytes(),
            sample_rate=self._TTS_SAMPLE_RATE,
            num_channels=1,
            samples_per_channel=len(samples),
        )


async def entrypoint(ctx: agents.JobContext):
    """Job entrypoint: verify Whisper, load models, run the voice session.

    Returns early (without connecting) when the Whisper API health check
    fails; otherwise returns once the room has disconnected.

    Args:
        ctx: Job context used to connect to the room.
    """
    # Check Whisper API is running
    try:
        async with httpx.AsyncClient() as client:
            r = await client.get(f"{WHISPER_API}/health")
            r.raise_for_status()
            print(f"‚úÖ Whisper API: {r.json()}")
    except (httpx.HTTPError, ValueError):
        # Transport failure, error status, or a non-JSON body. Cancellation
        # and KeyboardInterrupt are deliberately not caught here.
        print("‚ùå Whisper API not running! Start it first.")
        return

    # Load Soprano TTS (only model in this process)
    load_soprano()

    # Connect to room
    await ctx.connect()
    print(f"‚úÖ Connected to room: {ctx.room.name}")

    # Register the disconnect handler before any further setup, so a
    # disconnect that happens while the session is starting is not missed.
    disconnect_event = asyncio.Event()

    @ctx.room.on("disconnected")
    def on_disconnect():
        print("Room disconnected")
        disconnect_event.set()

    # Load VAD
    print("Loading Silero VAD...")
    vad = silero.VAD.load(min_speech_duration=0.05, min_silence_duration=0.4)
    print("‚úÖ VAD ready")

    agent = VoiceAgent()

    # NOTE(review): no `stt=` / `tts=` is configured here; confirm that the
    # installed livekit-agents version still routes audio through the
    # overridden stt_node / tts_node in that case.
    session = AgentSession(
        turn_detection="vad",
        vad=vad,
        llm=google.LLM(model="gemini-2.0-flash"),
        # STT and TTS handled by custom nodes in VoiceAgent
    )

    await session.start(agent=agent, room=ctx.room)
    print("\n" + "="*50)
    print("üé§ LISTENING... Speak now!")
    print("="*50 + "\n")

    # Keep alive until the room disconnects
    await disconnect_event.wait()


if __name__ == "__main__":
    # Run as a script: hand the job entrypoint to the LiveKit agents CLI.
    worker_options = agents.WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)

In [None]:
!python agent_v10.py start

---

## 🔧 Architecture
```
┌─────────────────────────────┐
│  Whisper API Server (8000)  │  ← Separate process
│  - Faster Whisper (GPU)     │
└─────────────────────────────┘
            ↑ HTTP
┌─────────────────────────────┐
│     LiveKit Agent           │
│  - Soprano TTS (GPU)        │  ← Only loads TTS
│  - Gemini LLM (API)         │
│  - Silero VAD (CPU)         │
└─────────────────────────────┘
```

## Benefits
- ✅ No external API keys needed for STT
- ✅ Models in separate processes (better memory)
- ✅ Full control over Whisper model
- ✅ Can swap Whisper models (tiny/base/small)

In [None]:
# To stop the Whisper server when done:
# whisper_process.terminate()