# 🎤 LiveKit Voice Agent v13

**Whisper + Gemini + Soprano - CLEAN VERSION**

In [None]:
!pip install -q "livekit-agents[google,silero]~=1.3" soprano-tts faster-whisper

In [None]:
import os
from getpass import getpass

# SECURITY: the LiveKit and Google credentials used to be hard-coded in this
# cell. Anything that was ever committed or shared that way must be treated as
# leaked and rotated in the corresponding consoles. Values are now taken from
# the existing environment and only prompted for when they are missing, so no
# secret is stored in the notebook itself.
_REQUIRED_SETTINGS = (
    # (environment variable, prompt without echo?)
    ("LIVEKIT_URL", False),
    ("LIVEKIT_API_KEY", True),
    ("LIVEKIT_API_SECRET", True),
    ("GOOGLE_API_KEY", True),
)

for _name, _is_secret in _REQUIRED_SETTINGS:
    if os.environ.get(_name):
        # Already configured (e.g. exported before the kernel was started).
        continue
    _ask = getpass if _is_secret else input
    _value = _ask(f"{_name}: ").strip()
    if not _value:
        raise RuntimeError(f"{_name} is required but was not provided")
    os.environ[_name] = _value

# Cache location for Hugging Face downloads.
os.environ["HF_HOME"] = "/content/hf_cache"
print("✅ Ready")

In [None]:
%%writefile agent_v13.py
import asyncio
import re
import numpy as np
from typing import AsyncIterable

from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, cli, stt
from livekit.plugins import google, silero

# Process-wide model singletons; both stay None until load_models() fills them.
SOPRANO = None
WHISPER = None


def _build_whisper():
    """Import faster-whisper and construct the "tiny" float16 model on CUDA."""
    print("Loading Whisper...")
    from faster_whisper import WhisperModel
    model = WhisperModel("tiny", device="cuda", compute_type="float16")
    print("âœ… Whisper ready")
    return model


def _build_soprano():
    """Import soprano and construct the TTS model on CUDA."""
    print("Loading Soprano...")
    from soprano import SopranoTTS
    model = SopranoTTS(device="cuda")
    print("âœ… Soprano ready")
    return model


def load_models():
    """Populate the WHISPER and SOPRANO globals, skipping any already set.

    Whisper is handled first, then Soprano. A model that is already loaded
    (global is not None) is left untouched, so repeated calls are cheap.
    """
    global SOPRANO, WHISPER
    if WHISPER is None:
        WHISPER = _build_whisper()
    if SOPRANO is None:
        SOPRANO = _build_soprano()


class VoiceAgent(Agent):
    """Voice assistant whose STT and TTS nodes use the local Whisper/Soprano models.

    ``stt_node`` turns incoming audio frames into transcripts with the
    module-level ``WHISPER`` model and ``tts_node`` turns the LLM text stream
    into audio frames with the module-level ``SOPRANO`` model. Both models must
    have been loaded before the nodes run.
    """

    # Rate of the mono float32 waveform handed to Whisper.
    # NOTE(review): faster-whisper documents 16 kHz for ndarray input - confirm.
    _WHISPER_RATE = 16000
    # Format of the audio frames emitted by tts_node: 32 kHz mono,
    # 640 samples (20 ms) per frame.
    _TTS_RATE = 32000
    _TTS_FRAME_SAMPLES = 640

    def __init__(self):
        super().__init__(instructions="You are a helpful voice assistant. Keep responses short.")
        # A sentence ends at ., ! or ? followed by whitespace, or at a newline run.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)

    # ------------------------------------------------------------------ STT

    async def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings):
        """Transcribe incoming audio with Whisper, one utterance at a time.

        The previous implementation drained ``audio`` completely before
        transcribing, so on a continuous audio stream no transcript was
        produced until the stream closed. Utterances are now cut out with the
        session's VAD and transcribed as soon as each one ends.

        Args:
            audio: Stream of audio frames to transcribe.
            model_settings: Unused; kept for the node signature.

        Yields:
            For every utterance: a ``FINAL_TRANSCRIPT`` event when Whisper
            returned text, followed by an ``END_OF_SPEECH`` event.
        """
        vad = self._session_vad()
        if vad is None:
            # No VAD available: keep the old behaviour and treat the whole
            # stream as a single utterance.
            frames = [frame async for frame in audio]
            async for event in self._utterance_events(frames):
                yield event
            return

        vad_stream = vad.stream()

        async def feed_vad():
            try:
                async for frame in audio:
                    vad_stream.push_frame(frame)
            finally:
                # Lets the VAD stream finish so the consumer loop below ends.
                vad_stream.end_input()

        feeder = asyncio.create_task(feed_vad())
        try:
            async for vad_event in vad_stream:
                if vad_event.type != agents.vad.VADEventType.END_OF_SPEECH:
                    continue
                # NOTE(review): relies on END_OF_SPEECH events carrying the
                # frames of the finished utterance - confirm for this version.
                async for event in self._utterance_events(vad_event.frames):
                    yield event
        finally:
            feeder.cancel()
            (outcome,) = await asyncio.gather(feeder, return_exceptions=True)
            if isinstance(outcome, Exception):
                print(f"STT error: {outcome}")
            await vad_stream.aclose()

    def _session_vad(self):
        """Return the session's VAD, or None when it cannot be resolved."""
        # NOTE(review): assumes Agent.session / AgentSession.vad as exposed by
        # livekit-agents 1.x - confirm against the installed version.
        try:
            vad = getattr(self.session, "vad", None)
        except (RuntimeError, AttributeError):
            return None
        return vad if callable(getattr(vad, "stream", None)) else None

    async def _utterance_events(self, frames):
        """Yield the speech events for one utterance made of ``frames``."""
        waveform = self._to_whisper_waveform(frames)
        text = ""
        if waveform.size:
            try:
                # Whisper decoding is blocking; keep it off the event loop.
                text = await asyncio.to_thread(self._whisper_text, waveform)
                print(f"🎤 User: {text}")
            except Exception as e:
                # Best effort: a failed transcription must not end the stream.
                print(f"STT error: {e}")
        if text:
            yield stt.SpeechEvent(
                type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                alternatives=[stt.SpeechData(text=text, language="en")],
            )
        yield stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH, alternatives=[])

    @staticmethod
    def _whisper_text(waveform):
        """Run Whisper on ``waveform`` and return the joined, stripped text."""
        segments, _ = WHISPER.transcribe(waveform, beam_size=1, language="en")
        # ``segments`` is consumed here, inside the worker thread.
        return " ".join(s.text for s in segments).strip()

    def _to_whisper_waveform(self, frames):
        """Convert int16 audio frames into a mono float32 waveform for Whisper."""
        parts = []
        source_rate = self._WHISPER_RATE
        for frame in frames:
            pcm = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
            if frame.num_channels > 1:
                # Samples are assumed interleaved; average channels to mono.
                pcm = pcm.reshape(-1, frame.num_channels).mean(axis=1)
            parts.append(pcm)
            # Assumes every frame of one utterance shares the same rate.
            source_rate = frame.sample_rate
        if not parts:
            return np.zeros(0, dtype=np.float32)
        return self._resample_linear(np.concatenate(parts), source_rate, self._WHISPER_RATE)

    @staticmethod
    def _resample_linear(pcm, src_rate, dst_rate):
        """Resample a mono waveform by linear interpolation.

        There is no anti-aliasing filter; this is a simple approximation.
        """
        if src_rate == dst_rate or pcm.size == 0:
            return pcm
        out_len = max(1, int(round(pcm.size * dst_rate / src_rate)))
        # Output sample positions expressed in source-sample units.
        positions = np.arange(out_len, dtype=np.float64) * (src_rate / dst_rate)
        return np.interp(positions, np.arange(pcm.size), pcm).astype(np.float32)

    # ------------------------------------------------------------------ TTS

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Speak the incoming text stream sentence by sentence with Soprano.

        Text deltas are buffered until a full sentence is available; each
        sentence is synthesized and streamed out as fixed-size audio frames.
        Whatever is left once the text stream ends is spoken last.

        Args:
            text: Stream of text fragments to speak.
            model_settings: Unused; kept for the node signature.

        Yields:
            ``rtc.AudioFrame`` objects: 32 kHz, mono, 640 samples each.
        """
        pending = ""
        async for delta in text:
            pending += delta
            while (m := self._sent_re.match(pending)):
                sentence = m.group(1)
                pending = pending[len(sentence):]
                async for frame in self._speak(sentence):
                    yield frame
        if pending.strip():
            async for frame in self._speak(pending):
                yield frame

    async def _speak(self, sentence):
        """Synthesize one sentence and yield it as fixed-size audio frames."""
        sentence = sentence.strip()
        if not sentence:
            return
        print(f"🔊 Speaking: {sentence}")
        frame_len = self._TTS_FRAME_SAMPLES
        # Samples that did not fill a whole frame yet. They are carried into
        # the next chunk instead of being zero-padded, which would otherwise
        # insert silence between chunks of the same sentence.
        leftover = np.zeros(0, dtype=np.int16)
        try:
            async for chunk in self._soprano_chunks(sentence):
                # ravel(): presumably chunks are 1-D already; flatten to be safe.
                pcm = np.clip(np.asarray(chunk, dtype=np.float32).ravel(), -1.0, 1.0)
                samples = np.concatenate((leftover, (pcm * 32767).astype(np.int16)))
                usable = len(samples) - len(samples) % frame_len
                for start in range(0, usable, frame_len):
                    yield self._pcm_frame(samples[start:start + frame_len])
                leftover = samples[usable:]
        except Exception as e:
            # Best effort: log and continue with the next sentence.
            print(f"TTS error: {e}")
        if len(leftover):
            # Only the very last frame of a sentence is padded with silence.
            yield self._pcm_frame(np.pad(leftover, (0, frame_len - len(leftover))))

    def _pcm_frame(self, samples):
        """Wrap int16 mono samples in an ``rtc.AudioFrame`` at the TTS rate."""
        return rtc.AudioFrame(
            data=samples.tobytes(),
            sample_rate=self._TTS_RATE,
            num_channels=1,
            samples_per_channel=len(samples),
        )

    async def _soprano_chunks(self, sentence):
        """Yield Soprano's streamed chunks without blocking the event loop.

        The synchronous ``SOPRANO.infer_stream`` generator is driven from a
        single worker thread; chunks are handed back through a queue.
        """
        import threading

        loop = asyncio.get_running_loop()
        queue = asyncio.Queue()
        stop = threading.Event()
        finished = object()

        def produce():
            try:
                for chunk in SOPRANO.infer_stream(sentence, chunk_size=1):
                    if stop.is_set():
                        break
                    loop.call_soon_threadsafe(queue.put_nowait, chunk)
            except Exception as exc:
                # Forward the failure so the consumer can report it.
                loop.call_soon_threadsafe(queue.put_nowait, exc)
            finally:
                loop.call_soon_threadsafe(queue.put_nowait, finished)

        loop.run_in_executor(None, produce)
        try:
            while True:
                item = await queue.get()
                if item is finished:
                    break
                if isinstance(item, Exception):
                    raise item
                yield item
        finally:
            # Ask the worker to stop when the consumer goes away early.
            stop.set()


async def entrypoint(ctx: agents.JobContext):
    """Job entrypoint: load models, join the room and run the voice session.

    Args:
        ctx: Job context; provides ``connect()`` and the ``room`` to join.

    The coroutine returns once the room emits ``disconnected``.
    """
    # load_models() is synchronous and constructs both models on the GPU; run
    # it in a worker thread so the event loop is not blocked meanwhile.
    await asyncio.to_thread(load_models)
    await ctx.connect()
    print(f"✅ Connected: {ctx.room.name}")

    # Register before starting the session so a disconnect that happens during
    # start-up is not missed. ``*_args`` accepts whatever the emitter passes.
    done = asyncio.Event()
    ctx.room.on("disconnected")(lambda *_args: done.set())

    vad = silero.VAD.load(min_speech_duration=0.05, min_silence_duration=0.4)
    print("✅ VAD ready")

    # NOTE(review): no ``stt=`` / ``tts=`` is passed here; VoiceAgent overrides
    # stt_node/tts_node instead. Confirm that this livekit-agents version
    # invokes the overridden nodes when the session has no STT/TTS configured.
    session = AgentSession(
        turn_detection="vad",
        vad=vad,
        llm=google.LLM(model="gemini-2.0-flash"),
    )

    await session.start(agent=VoiceAgent(), room=ctx.room)
    print("\n🎤 LISTENING...\n")

    await done.wait()


# Script entry: hand the job entrypoint to the LiveKit agents CLI.
if __name__ == "__main__":
    worker_options = agents.WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)

In [None]:
!python agent_v13.py start