# 🎤 LiveKit Voice Agent v6

**Whisper STT → Gemini Flash → Soprano TTS**

Fixed: Agent now stays alive and processes audio properly.

In [None]:
!pip install -q "livekit-agents[google,silero]~=1.3" faster-whisper soprano-tts

In [None]:
!nvidia-smi

In [None]:
import os
from getpass import getpass

# Non-secret settings can live in the notebook.
os.environ["LIVEKIT_URL"] = "wss://test-jllkasbg.livekit.cloud"
os.environ["HF_HOME"] = "/content/hf_cache"

# Secrets must never be hard-coded in a notebook: anyone who can read the file
# (or its history) can use them. Each one is taken from the environment when it
# is already set there, otherwise it is prompted for without echoing the input.
# NOTE: keys that were previously committed in this cell should be rotated.
for _secret_name in ("LIVEKIT_API_KEY", "LIVEKIT_API_SECRET", "GOOGLE_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

print("âœ… Credentials set")

In [None]:
%%writefile agent_v6.py
"""LiveKit Voice Agent v6: Whisper â†’ Gemini Flash â†’ Soprano"""

import asyncio
import os
import re
import numpy as np
from typing import AsyncIterable

from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, stt, cli
from livekit.plugins import google, silero

# Process-wide model singletons. All start out unset and are filled in lazily
# by load_models().
WHISPER = SOPRANO = VAD = None


def load_models():
    """Fill in the module-level WHISPER, SOPRANO and VAD singletons.

    A model is only created while its global is still ``None``, so calling
    this function again reuses whatever was loaded before. The Whisper and
    Soprano packages are imported inside the function, i.e. only when the
    corresponding model actually has to be built. Progress is reported on
    stdout.
    """
    global WHISPER, SOPRANO, VAD

    if WHISPER is None:
        print("Loading Faster Whisper...")
        import faster_whisper
        WHISPER = faster_whisper.WhisperModel(
            "tiny",
            device="cuda",
            compute_type="float16",
        )
        print("âœ… Whisper")

    if SOPRANO is None:
        print("Loading Soprano TTS...")
        import soprano
        SOPRANO = soprano.SopranoTTS(device="cuda")
        print("âœ… Soprano")

    if VAD is None:
        print("Loading Silero VAD...")
        vad_options = {
            "min_speech_duration": 0.05,
            "min_silence_duration": 0.35,
            "force_cpu": True,
        }
        VAD = silero.VAD.load(**vad_options)
        print("âœ… VAD")

    print("ðŸŽ‰ All models ready!")


class VoiceAgent(Agent):
    """Voice assistant whose STT and TTS pipeline nodes run local models.

    Speech recognition uses the module-level ``WHISPER`` model, utterance
    boundaries for recognition come from the module-level ``VAD``, and speech
    synthesis uses the module-level ``SOPRANO`` model. All three globals must
    be initialised (``load_models``) before the nodes are driven.
    """

    # Sample rate of the mono float32 waveform handed to Whisper.
    _STT_SAMPLE_RATE = 16000
    # Format of the synthesised audio frames: 640 samples at 32 kHz (20 ms).
    _TTS_SAMPLE_RATE = 32000
    _TTS_SAMPLES_PER_FRAME = 640
    # Sentinel returned by next() once the blocking TTS chunk iterator is done.
    _STREAM_END = object()

    def __init__(self):
        super().__init__(instructions="You are a helpful voice assistant. Be concise.")
        # A "sentence" is text up to ., ! or ? followed by whitespace, or text
        # up to a run of newlines; matched from the start of the buffer.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)

    @classmethod
    def _frames_to_waveform(cls, frames) -> np.ndarray:
        """Return *frames* as one mono float32 waveform at ``_STT_SAMPLE_RATE``.

        Samples are read as int16 and scaled to [-1, 1). Multi-channel audio
        is averaged down to mono, and audio at another sample rate is brought
        to the target rate by linear interpolation.
        """
        if not frames:
            return np.zeros(0, dtype=np.float32)

        # NOTE(review): assumes every frame of one utterance shares the first
        # frame's sample rate and channel count.
        sample_rate = frames[0].sample_rate
        num_channels = frames[0].num_channels

        pcm = np.concatenate(
            [np.frombuffer(frame.data, dtype=np.int16) for frame in frames]
        ).astype(np.float32) / 32768.0

        if num_channels > 1:
            # Samples are interleaved per channel; drop any ragged tail first.
            usable = pcm.size - pcm.size % num_channels
            pcm = pcm[:usable].reshape(-1, num_channels).mean(axis=1)

        if pcm.size and sample_rate != cls._STT_SAMPLE_RATE:
            n_out = max(1, round(pcm.size * cls._STT_SAMPLE_RATE / sample_rate))
            src_t = np.arange(pcm.size) / sample_rate
            dst_t = np.arange(n_out) / cls._STT_SAMPLE_RATE
            pcm = np.interp(dst_t, src_t, pcm).astype(np.float32)

        return pcm

    @staticmethod
    def _whisper_transcribe(waveform: np.ndarray) -> str:
        """Blocking helper: run Whisper on *waveform* and return the joined text."""
        segments, _ = WHISPER.transcribe(waveform, beam_size=1, language="en")
        # `segments` is consumed here so that the decoding work happens in the
        # caller's (worker) thread rather than later on the event loop.
        return " ".join(s.text for s in segments).strip()

    @classmethod
    def _make_tts_frame(cls, samples: np.ndarray) -> rtc.AudioFrame:
        """Wrap exactly ``_TTS_SAMPLES_PER_FRAME`` int16 samples in a mono frame."""
        return rtc.AudioFrame(
            data=samples.tobytes(),
            sample_rate=cls._TTS_SAMPLE_RATE,
            num_channels=1,
            samples_per_channel=cls._TTS_SAMPLES_PER_FRAME,
        )

    async def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings):
        """Transcribe the incoming audio stream one utterance at a time.

        The audio stream is not split per utterance, so waiting for it to end
        before transcribing would delay every transcript until the stream
        closes. Instead the frames are pushed through a VAD stream and each
        detected utterance is transcribed as soon as the VAD reports its end.

        Args:
            audio: Audio frames to recognise.
            model_settings: Unused; part of the node signature.

        Returns:
            An async iterator of ``stt.SpeechEvent``: for every utterance a
            ``FINAL_TRANSCRIPT`` (only when text was recognised) followed by
            an ``END_OF_SPEECH`` event.
        """
        from livekit.agents import vad as lk_vad

        async def _transcribe():
            vad_stream = VAD.stream()

            async def _forward_audio():
                try:
                    async for frame in audio:
                        vad_stream.push_frame(frame)
                finally:
                    # Always end the VAD input so the consumer loop below
                    # terminates even if reading the audio failed.
                    vad_stream.end_input()

            forward_task = asyncio.create_task(_forward_audio())
            try:
                async for event in vad_stream:
                    if event.type != lk_vad.VADEventType.END_OF_SPEECH:
                        continue
                    waveform = self._frames_to_waveform(event.frames)
                    text = ""
                    if waveform.size:
                        # Inference is blocking; keep it off the event loop so
                        # audio keeps flowing while Whisper runs.
                        text = await asyncio.to_thread(self._whisper_transcribe, waveform)
                    print(f"ðŸŽ¤ User: {text}")
                    if text:
                        yield stt.SpeechEvent(
                            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                            alternatives=[stt.SpeechData(text=text)],
                        )
                    yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
                # Surface any error raised while reading the audio stream.
                await forward_task
            finally:
                forward_task.cancel()
                await asyncio.gather(forward_task, return_exceptions=True)
                await vad_stream.aclose()

        return _transcribe()

    async def _synthesize_frames(self, sentence: str):
        """Yield 20 ms audio frames for *sentence* as Soprano streams it.

        Samples that do not fill a whole frame are carried over to the next
        streamed chunk; only the very last frame of the sentence is padded
        with zeros. Padding every chunk would insert short silences in the
        middle of the sentence.
        """
        print(f"ðŸ”Š Agent: {sentence.strip()}")
        spf = self._TTS_SAMPLES_PER_FRAME
        chunks = iter(SOPRANO.infer_stream(sentence, chunk_size=1))
        carry = np.zeros(0, dtype=np.int16)

        while True:
            # Each next() runs model inference; do it in a worker thread so the
            # event loop stays responsive. The sentinel default keeps
            # StopIteration from crossing the thread boundary.
            chunk = await asyncio.to_thread(next, chunks, self._STREAM_END)
            if chunk is self._STREAM_END:
                break
            # Flatten so the chunk can be joined with the carried samples.
            pcm = np.clip(np.asarray(chunk, dtype=np.float32).reshape(-1), -1.0, 1.0)
            samples = np.concatenate((carry, (pcm * 32767).astype(np.int16)))
            usable = samples.size - samples.size % spf
            for start in range(0, usable, spf):
                yield self._make_tts_frame(samples[start:start + spf])
            carry = samples[usable:]

        if carry.size:
            yield self._make_tts_frame(np.pad(carry, (0, spf - carry.size)))

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Synthesise streamed LLM text sentence by sentence.

        Text deltas are accumulated until ``_sent_re`` finds a complete
        sentence at the start of the buffer; each sentence is synthesised as
        soon as it is complete, and whatever remains when the text stream ends
        is synthesised last. Whitespace-only pieces are skipped.

        Args:
            text: Stream of text deltas to speak.
            model_settings: Unused; part of the node signature.

        Yields:
            ``rtc.AudioFrame`` objects (mono, 32 kHz, 640 samples each).
        """
        pending = ""

        async for delta in text:
            pending += delta
            while (m := self._sent_re.match(pending)):
                sentence = m.group(1)
                pending = pending[len(sentence):]
                if not sentence.strip():
                    continue
                async for frame in self._synthesize_frames(sentence):
                    yield frame

        if pending.strip():
            async for frame in self._synthesize_frames(pending):
                yield frame


async def entrypoint(ctx: agents.JobContext):
    """Job entrypoint: load the models, join the room and run the voice session.

    Steps, in order: ensure the global models are loaded, connect to the
    room, create an ``AgentSession`` that uses VAD-based turn detection with
    the shared ``VAD`` and the Gemini Flash LLM, start it with a
    ``VoiceAgent``, then wait on the session.

    NOTE(review): confirm that ``AgentSession`` exposes ``wait()`` in the
    installed livekit-agents release.
    """
    load_models()

    # The room connection is established before the session is created.
    await ctx.connect()
    print(f"âœ… Connected to room: {ctx.room.name}")

    llm = google.LLM(model="gemini-2.0-flash")
    voice_session = AgentSession(llm=llm, vad=VAD, turn_detection="vad")

    await voice_session.start(room=ctx.room, agent=VoiceAgent())
    print("ðŸŽ¤ Listening... (speak now)")

    # Block here so the entrypoint does not return while the session runs.
    await voice_session.wait()


if __name__ == "__main__":
    # Hand the job entrypoint to the LiveKit worker CLI.
    worker_options = agents.WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)

In [None]:
# Run with 'start' mode instead of 'dev' to avoid hot-reload issues
!python agent_v6.py start

## Changes in v6:

1. **`await ctx.connect()`** - Wait for room connection before starting session
2. **`await session.wait()`** - Keep agent alive until session ends
3. **Using `start` instead of `dev`** - Avoids hot-reload killing processes

## Test:
1. Wait for "🎤 Listening..."
2. Join http://localhost:3000
3. Speak and wait for response