# 🎤 LiveKit Voice Agent v9

**Deepgram STT → Gemini Flash → Soprano TTS**

This is the recommended version! Deepgram uses a simple API key (no service account needed).

In [None]:
# Install LiveKit Agents (with the Google, Silero and Deepgram plugin extras) and Soprano TTS.
!pip install -q "livekit-agents[google,silero,deepgram]~=1.3" soprano-tts

In [None]:
# Show the attached GPU; the agent script below loads Soprano TTS with device="cuda".
!nvidia-smi

In [None]:
import os
from getpass import getpass

# Secrets must not be stored in the notebook: anyone who can read the file can
# use them. Each secret is taken from the environment when it is already set
# and is otherwise prompted for without echoing the input.
# NOTE(review): any key that was ever committed in this cell should be treated
# as leaked and rotated in the provider's console.

# LiveKit server URL (not a secret).
os.environ["LIVEKIT_URL"] = "wss://test-jllkasbg.livekit.cloud"

# LiveKit credentials, Gemini key for the LLM, Deepgram key for STT.
SECRET_NAMES = (
    "LIVEKIT_API_KEY",
    "LIVEKIT_API_SECRET",
    "GOOGLE_API_KEY",
    "DEEPGRAM_API_KEY",
)

# Placeholder text that must never be exported as if it were a real key.
PLACEHOLDER = "YOUR-DEEPGRAM-API-KEY"

for secret_name in SECRET_NAMES:
    secret_value = os.environ.get(secret_name, "").strip()
    if not secret_value or secret_value == PLACEHOLDER:
        secret_value = getpass(f"{secret_name}: ").strip()
    if not secret_value or secret_value == PLACEHOLDER:
        # Fail here rather than with an authentication error once the agent runs.
        raise ValueError(f"{secret_name} is required")
    os.environ[secret_name] = secret_value

os.environ["HF_HOME"] = "/content/hf_cache"

print("✅ Credentials set")

In [None]:
%%writefile agent_v9.py
"""LiveKit Voice Agent v9: Deepgram STT ‚Üí Gemini ‚Üí Soprano TTS"""

import asyncio
import os
import re
import numpy as np
from typing import AsyncIterable

from livekit import agents, rtc
from livekit.agents import Agent, AgentSession, ModelSettings, cli
from livekit.plugins import google, silero, deepgram

# Process-wide Soprano TTS instance; stays None until load_soprano() runs.
SOPRANO = None


def load_soprano(device: str = "cuda"):
    """Load the Soprano TTS model once and cache it in the ``SOPRANO`` global.

    Args:
        device: Device string handed to ``SopranoTTS``. Defaults to ``"cuda"``.
            Ignored when a model has already been loaded.

    Returns:
        The cached Soprano TTS instance.
    """
    global SOPRANO
    if SOPRANO is None:
        print("Loading Soprano TTS...")
        # Imported lazily so the module can be imported without soprano installed.
        from soprano import SopranoTTS
        SOPRANO = SopranoTTS(device=device)
        print("✅ Soprano TTS ready")
    return SOPRANO


class VoiceAgent(Agent):
    """Voice assistant whose replies are synthesized by the global ``SOPRANO`` model."""

    def __init__(self):
        super().__init__(instructions="You are a helpful voice assistant. Keep responses short, 1-2 sentences.")
        # A "sentence" is the shortest run ending in '.', '!' or '?' followed by
        # whitespace, or the shortest run ending in one or more newlines.
        self._sent_re = re.compile(r"(.+?[.!?]\s+|.+?\n+)", re.DOTALL)

    async def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings):
        """Turn streamed LLM text into audio frames using Soprano.

        Text deltas are buffered and cut into sentences. Each complete sentence
        is synthesized as soon as it is available, and whatever text remains
        when the stream ends is synthesized last.

        Args:
            text: Async stream of text deltas.
            model_settings: Unused here; kept to match the overridden signature.

        Yields:
            ``rtc.AudioFrame`` objects holding 640 mono int16 samples each,
            tagged as 32 kHz (assumes Soprano emits 32 kHz audio; confirm).
        """
        buffer = ""
        sr, spf = 32000, 640  # 20ms frames at 32kHz

        def to_int16(chunk) -> np.ndarray:
            # Flatten so a (1, N) mono chunk is handled like an (N,) one, then
            # clip to [-1, 1] before scaling to the int16 range.
            pcm = np.asarray(chunk, dtype=np.float32).reshape(-1)
            return (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)

        def make_frame(samples: np.ndarray):
            return rtc.AudioFrame(
                data=samples.tobytes(),
                sample_rate=sr,
                num_channels=1,
                samples_per_channel=spf,
            )

        async def speak(sentence: str):
            sentence = sentence.strip()
            if not sentence:
                return
            print(f"🔊 Speaking: {sentence}")
            # Samples that did not fill a whole frame yet. They are carried into
            # the next streamed chunk instead of being zero-padded immediately,
            # which would insert silence in the middle of the sentence.
            leftover = np.empty(0, dtype=np.int16)
            try:
                for chunk in SOPRANO.infer_stream(sentence, chunk_size=1):
                    samples = to_int16(chunk)
                    if len(leftover):
                        samples = np.concatenate((leftover, samples))
                    usable = len(samples) - len(samples) % spf
                    for start in range(0, usable, spf):
                        yield make_frame(samples[start:start + spf])
                    leftover = samples[usable:]
            except Exception as e:
                # Best effort: a synthesis failure is reported and skipped.
                print(f"❌ TTS error: {e}")
            if len(leftover):
                # Pad only once, at the very end of the sentence.
                yield make_frame(np.pad(leftover, (0, spf - len(leftover))))

        async for delta in text:
            buffer += delta
            while (m := self._sent_re.match(buffer)):
                sentence = m.group(1)
                buffer = buffer[len(sentence):]
                async for frame in speak(sentence):
                    yield frame
        # Flush trailing text that never received a sentence terminator.
        if buffer.strip():
            async for frame in speak(buffer):
                yield frame


async def entrypoint(ctx: agents.JobContext):
    """Run one voice-agent job: join the room, start the session, wait for disconnect.

    Args:
        ctx: Job context supplied by the LiveKit worker; provides the room.
    """
    # Load Soprano TTS (does nothing if it is already loaded in this process)
    load_soprano()

    # Connect to room
    await ctx.connect()
    print(f"✅ Connected to room: {ctx.room.name}")

    # Register the disconnect handler before the session starts so a
    # disconnect that happens during startup is not missed.
    disconnect_event = asyncio.Event()

    @ctx.room.on("disconnected")
    def on_disconnect(*_args):
        # Accept and ignore any arguments the emitter passes (e.g. a reason).
        print("Room disconnected")
        disconnect_event.set()

    # Load VAD
    print("Loading Silero VAD...")
    vad = silero.VAD.load(min_speech_duration=0.05, min_silence_duration=0.4)
    print("✅ VAD ready")

    agent = VoiceAgent()

    # Deepgram STT (uses simple API key - no service account needed!)
    session = AgentSession(
        turn_detection="vad",
        vad=vad,
        stt=deepgram.STT(model="nova-2"),  # Deepgram Nova-2
        llm=google.LLM(model="gemini-2.0-flash"),
        # TTS handled by custom tts_node
    )

    @session.on("user_input_transcribed")
    def on_transcription(event):
        # The handler receives an event object, not a plain string. Print only
        # final transcripts so interim results do not flood the log.
        if event.is_final:
            print(f"🎤 User said: {event.transcript}")

    await session.start(agent=agent, room=ctx.room)
    print("\n" + "="*50)
    print("🎤 LISTENING... Speak now!")
    print("="*50 + "\n")

    # Keep alive until disconnected
    await disconnect_event.wait()


if __name__ == "__main__":
    # Hand the job entrypoint to the LiveKit agents CLI runner.
    worker_options = agents.WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)

In [None]:
# Launch the worker script written by the previous cell with the CLI "start" sub-command.
!python agent_v9.py start

---

## 🔧 Pipeline
```
🎙️ Your Voice
    ↓
Deepgram Nova-2 STT (simple API key)
    ↓
Gemini 2.0 Flash (LLM)
    ↓
Soprano TTS (custom voice)
    ↓
🔊 Agent Speaks
```

## Why Deepgram?
- ✅ Simple API key (no Google Cloud service account needed)
- ✅ Fast, low-latency streaming
- ✅ Excellent accuracy with Nova-2 model
- ✅ Free tier available

## Get Deepgram API Key
1. Go to https://console.deepgram.com/
2. Sign up (free tier available)
3. Create an API key
4. Provide it as `DEEPGRAM_API_KEY` in the credentials cell above