# 🎤 LiveKit Voice Agent v3

**Faster Whisper STT → Gemini LLM → Soprano TTS**

Single-process version for Colab.

In [None]:
!pip install -q livekit-agents[google,silero]~=1.3
!pip install -q faster-whisper soprano-tts

In [None]:
# Load every model once, up front, so the later cells can reuse the
# module-level WHISPER, SOPRANO and VAD instances.
import os

# Point the Hugging Face cache at /content/hf_cache. This is done before the
# model libraries below are imported (presumably so they pick the setting up
# when they initialise - confirm if the order is ever changed).
os.environ.update(HF_HOME="/content/hf_cache")

# --- Speech-to-text: the "tiny" Whisper checkpoint on the GPU in float16.
print("Loading Whisper...")
from faster_whisper import WhisperModel

WHISPER = WhisperModel(
    "tiny",
    device="cuda",
    compute_type="float16",
)
print("âœ… Whisper")

# --- Text-to-speech: Soprano on the GPU.
print("Loading Soprano...")
from soprano import SopranoTTS

SOPRANO = SopranoTTS(device="cuda")
print("âœ… Soprano")

# --- Voice activity detection: Silero, forced onto the CPU, with the
# minimum speech / silence durations given below.
print("Loading VAD...")
from livekit.plugins import silero

VAD = silero.VAD.load(
    min_speech_duration=0.05,
    min_silence_duration=0.35,
    force_cpu=True,
)
print("âœ… VAD")

print("\nðŸŽ‰ All models loaded!")

In [None]:
import os
from getpass import getpass

# ========== SUPPLY YOUR CREDENTIALS ==========
# Credentials are never stored in the notebook. Each value is taken from the
# environment when it is already set; otherwise it is requested interactively
# (secrets through getpass, so they are not echoed or saved in cell output).
# NOTE: the keys that used to be hard-coded in this cell were exposed in
# source and should be rotated.
for _name, _ask in (
    ("LIVEKIT_URL", input),
    ("LIVEKIT_API_KEY", getpass),
    ("LIVEKIT_API_SECRET", getpass),
    ("GOOGLE_API_KEY", getpass),
):
    if not os.environ.get(_name):
        os.environ[_name] = _ask(f"{_name}: ").strip()
    # Fail here rather than later with an opaque authentication error.
    if not os.environ[_name]:
        raise ValueError(f"{_name} must not be empty")
# =====================================================

print("âœ… Credentials set")

In [None]:
import asyncio
import re
import numpy as np
from typing import AsyncIterable

from livekit import agents, rtc, api
from livekit.agents import Agent, AgentSession, ModelSettings, stt
from livekit.plugins import google


class VoiceAgent(Agent):
    """Voice assistant whose speech-to-text and text-to-speech steps are custom.

    Speech recognition uses the module-level ``WHISPER`` model and speech
    synthesis uses the module-level ``SOPRANO`` model; both must already be
    loaded when the agent starts handling audio.
    """

    def __init__(self):
        super().__init__(instructions="You are a helpful voice assistant. Be concise.")
        # A "sentence" is the shortest prefix that ends in '.', '!' or '?'
        # followed by whitespace, or that ends in one or more newlines.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)

    async def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings):
        """Transcribe the incoming audio with Faster Whisper.

        Args:
            audio: Audio frames holding 16-bit integer PCM samples.
            model_settings: Unused by this implementation.

        Returns:
            An async generator of ``stt.SpeechEvent``: a ``FINAL_TRANSCRIPT``
            event when any text was recognised, always followed by an
            ``END_OF_SPEECH`` event.
        """

        def _run_whisper(samples):
            # Runs in a worker thread. faster-whisper decodes lazily while the
            # returned segments are iterated (per its documentation), so the
            # join has to happen here as well, not back on the event loop.
            segments, _ = WHISPER.transcribe(samples, beam_size=1, language="en")
            return " ".join(s.text for s in segments).strip()

        async def _transcribe():
            # NOTE(review): every frame is buffered until `audio` is
            # exhausted, so a transcript is only produced once the stream
            # ends - confirm the framework ends the stream per utterance.
            chunks = []
            async for frame in audio:
                # int16 PCM -> float32 in [-1.0, 1.0).
                samples = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
                chunks.append(samples)
            if not chunks:
                yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])
                return
            # NOTE(review): no resampling or channel mixing is done; this
            # assumes the frames are mono at the rate WHISPER expects
            # (16 kHz for Whisper models) - TODO confirm.
            audio_data = np.concatenate(chunks)
            # Inference is blocking; run it off the event loop so audio I/O
            # and the rest of the session keep being serviced meanwhile.
            text = await asyncio.to_thread(_run_whisper, audio_data)
            print(f"ðŸŽ¤ User: {text}")
            if text:
                yield stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[stt.SpeechData(text=text)])
            yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])

        return _transcribe()

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Synthesise the LLM's text stream with Soprano, sentence by sentence.

        Text deltas are accumulated and each complete sentence (as matched by
        ``self._sent_re``) is synthesised as soon as it is available; whatever
        remains when the text stream ends is synthesised last.

        Args:
            text: Incremental text produced by the LLM.
            model_settings: Unused by this implementation.

        Yields:
            ``rtc.AudioFrame`` objects of mono 16-bit PCM, at most ``spf``
            samples each.
        """
        buffer = ""
        # 640 samples at 32 kHz is 20 ms per frame.
        # Assumes SOPRANO produces 32 kHz mono audio - TODO confirm.
        sr, spf = 32000, 640

        def to_frames(pcm):
            # float PCM in [-1, 1] -> int16, split into frames of <= spf samples.
            pcm_i16 = (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)
            for start in range(0, len(pcm_i16), spf):
                chunk = pcm_i16[start:start + spf]
                # The tail of a chunk is emitted at its real length. Zero-
                # padding it to `spf` would insert a short gap of silence
                # after every streamed chunk, i.e. in the middle of speech.
                yield rtc.AudioFrame(
                    data=chunk.tobytes(),
                    sample_rate=sr,
                    num_channels=1,
                    samples_per_channel=len(chunk),
                )

        async def speak(sentence):
            print(f"ðŸ”Š Agent: {sentence.strip()}")
            # NOTE(review): infer_stream is iterated synchronously, so each
            # synthesis step runs on the event loop thread.
            for chunk in SOPRANO.infer_stream(sentence, chunk_size=1):
                pcm = np.asarray(chunk, dtype=np.float32)
                for frame in to_frames(pcm):
                    yield frame

        async for delta in text:
            buffer += delta
            # Drain every complete sentence currently in the buffer.
            while (m := self._sent_re.match(buffer)):
                sentence = m.group(1)
                buffer = buffer[len(sentence):]
                async for frame in speak(sentence):
                    yield frame
        # Speak any trailing text that never reached a sentence boundary.
        if buffer.strip():
            async for frame in speak(buffer):
                yield frame


async def run_agent():
    """Create the room, join it as the agent, and run until interrupted.

    Reads ``LIVEKIT_URL``, ``LIVEKIT_API_KEY`` and ``LIVEKIT_API_SECRET`` from
    the environment, creates the ``soprano-agent`` room, prints a playground
    link, connects, and starts an ``AgentSession`` driving ``VoiceAgent``.
    The coroutine then idles until it is cancelled or interrupted, after
    which the session is closed and the room is disconnected.

    Raises:
        KeyError: If one of the required environment variables is missing.
    """
    url = os.environ["LIVEKIT_URL"]
    api_key = os.environ["LIVEKIT_API_KEY"]
    api_secret = os.environ["LIVEKIT_API_SECRET"]

    room_name = "soprano-agent"

    # The API client is only needed to create the room; close it right away
    # so its underlying HTTP session is not leaked.
    lk_api = api.LiveKitAPI(url, api_key, api_secret)
    try:
        await lk_api.room.create_room(api.CreateRoomRequest(name=room_name))
    finally:
        await lk_api.aclose()

    token = api.AccessToken(api_key, api_secret)
    token.with_identity("agent").with_grants(api.VideoGrants(room_join=True, room=room_name))

    print(f"\nðŸ”— Join: https://agents-playground.livekit.io/#room={room_name}&url={url}")

    room = rtc.Room()
    await room.connect(url, token.to_jwt())
    try:
        print("âœ… Connected")

        agent = VoiceAgent()
        session = AgentSession(
            turn_detection="vad",
            vad=VAD,
            # Use regular Gemini LLM (not Realtime API)
            llm=google.LLM(model="gemini-2.0-flash"),
        )
        try:
            await session.start(agent=agent, room=room)
            print("ðŸŽ¤ Listening... (interrupt kernel to stop)\n")

            try:
                # Keep the coroutine alive; the session does its work in the
                # background until the kernel is interrupted.
                while True:
                    await asyncio.sleep(1)
            except (KeyboardInterrupt, asyncio.CancelledError):
                # Interrupting the kernel is the intended way to stop.
                pass
        finally:
            # Close the session before the room goes away.
            await session.aclose()
    finally:
        # Runs even when starting the session fails, so the connection is
        # never left open.
        await room.disconnect()
        print("\nðŸ‘‹ Disconnected")


# Top-level `await` is not valid in a plain Python script; this line relies on
# the notebook kernel executing cells inside a running event loop.
await run_agent()