# 🗣️ Real-Time Voice Agent (Pipecat + Soprano)

Continuous conversation - just talk!

In [None]:
!pip install -q "pipecat-ai[google,silero,daily]" faster-whisper soprano-tts aiohttp

In [None]:
# Load the speech models once at module level; WHISPER and SOPRANO are used
# by the pipeline processors defined later in this file.
import os
# Set before the model libraries below are imported. Presumably this points the
# Hugging Face cache at /content/hf_cache — TODO confirm against library docs.
os.environ["HF_HOME"] = "/content/hf_cache"

print("Loading models...")
from faster_whisper import WhisperModel
from soprano import SopranoTTS

# Speech-to-text: the "tiny" Whisper model on the CUDA device in float16.
WHISPER = WhisperModel("tiny", device="cuda", compute_type="float16")
print("‚úÖ Whisper")

# Text-to-speech: Soprano, also placed on the CUDA device.
SOPRANO = SopranoTTS(device="cuda")
print("‚úÖ Soprano\nüéâ Models ready!")

In [None]:
import os

# Credentials for the two external services the agent talks to.
# Daily.co API keys are issued at: https://dashboard.daily.co/developers
os.environ.update(
    {
        "DAILY_API_KEY": "YOUR_DAILY_API_KEY",  # Replace!
        "GOOGLE_API_KEY": "YOUR_GOOGLE_API_KEY",  # Replace!
    }
)

print("‚úÖ Credentials set")

In [None]:
import asyncio
import aiohttp
import numpy as np
import time

from pipecat.frames.frames import AudioRawFrame, TextFrame, TTSAudioRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.services.google.llm import GoogleLLMService
from pipecat.transports.daily.transport import DailyTransport, DailyParams
from pipecat.audio.vad.silero import SileroVADAnalyzer


async def create_daily_room():
    """Create a Daily room and return its URL.

    Reads the API key from the ``DAILY_API_KEY`` environment variable and
    POSTs to the Daily REST API, requesting an ``exp`` value one hour ahead
    of the current time.

    Returns:
        The ``url`` field of the JSON object returned by the API.

    Raises:
        KeyError: If ``DAILY_API_KEY`` is not set in the environment.
        RuntimeError: If the response is not a JSON object containing a
            ``url`` key. The message includes the HTTP status and the
            response body to make bad keys / quota errors diagnosable.
    """
    api_key = os.environ["DAILY_API_KEY"]
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.daily.co/v1/rooms",
            headers={"Authorization": f"Bearer {api_key}"},
            # "exp" is a Unix timestamp: now + 3600 seconds.
            json={"properties": {"exp": int(time.time()) + 3600}},
        ) as resp:
            try:
                # content_type=None skips aiohttp's Content-Type check, so an
                # error reply that is not labelled application/json does not
                # raise ContentTypeError and hide the real failure.
                data = await resp.json(content_type=None)
            except ValueError:
                # Body is not valid JSON; fall back to the raw text so it can
                # still be reported in the error below.
                data = await resp.text(errors="replace")
            # An empty body decodes to None and an error page to a str, so
            # check the shape before testing for the key.
            if not isinstance(data, dict) or "url" not in data:
                raise RuntimeError(
                    f"Failed to create room (HTTP {resp.status}): {data}"
                )
            return data["url"]


class WhisperSTT(FrameProcessor):
    """Speech-to-text stage backed by the module-level ``WHISPER`` model.

    Audio frames are converted to float32 samples and buffered; they are not
    forwarded. When any non-audio frame arrives and the buffer is non-empty,
    the buffered audio is transcribed and, if the transcript is non-empty, a
    ``TextFrame`` is pushed downstream before the triggering frame is
    forwarded.

    NOTE(review): assumes ``frame.audio`` is 16-bit PCM at the sample rate
    Whisper expects — confirm against the transport's input settings.
    """

    def __init__(self):
        super().__init__()
        # Float32 sample chunks collected since the last flush.
        self._buffer = []

    @staticmethod
    def _transcribe(audio):
        """Run Whisper over *audio* and return the stripped transcript."""
        segments, _ = WHISPER.transcribe(audio, beam_size=1, language="en")
        # Consume the segments here too, so any lazy decoding also happens
        # in the worker thread rather than on the event loop.
        return " ".join(s.text for s in segments).strip()

    async def process_frame(self, frame, direction):
        """Buffer audio frames; on any other frame, flush and forward it.

        Args:
            frame: The incoming pipeline frame.
            direction: The direction the frame is travelling in; non-audio
                frames are forwarded in this same direction.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            # int16 PCM -> float32 scaled by 1/32768.
            samples = np.frombuffer(frame.audio, dtype=np.int16).astype(np.float32) / 32768.0
            self._buffer.append(samples)
            return

        if self._buffer:
            audio = np.concatenate(self._buffer)
            # Reset before awaiting so audio arriving meanwhile starts a
            # fresh buffer instead of being lost or transcribed twice.
            self._buffer = []
            # Transcription is a blocking call; run it in the default
            # executor so the event loop keeps servicing the pipeline.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(None, self._transcribe, audio)
            if text:
                print(f"üé§ You: {text}")
                await self.push_frame(TextFrame(text=text))

        # Forward in the direction the frame arrived; omitting the direction
        # would send upstream frames back downstream.
        await self.push_frame(frame, direction)


class SopranoTTSService(FrameProcessor):
    """Text-to-speech stage backed by the module-level ``SOPRANO`` model.

    Each ``TextFrame`` with non-empty text is synthesized and replaced by a
    ``TTSAudioRawFrame`` (16-bit PCM, labelled 32000 Hz mono) pushed
    downstream. Every other frame is forwarded unchanged in the direction it
    arrived.

    NOTE(review): assumes ``SOPRANO.infer`` returns float samples in
    [-1, 1] at 32 kHz — confirm against the Soprano documentation.
    """

    @staticmethod
    def _synthesize(text):
        """Synthesize *text* and return the audio as int16 PCM bytes."""
        audio = SOPRANO.infer(text)
        # Objects exposing .cpu() are moved to host memory first; anything
        # else is converted with np.array directly.
        audio_np = audio.cpu().numpy() if hasattr(audio, "cpu") else np.array(audio)
        # Clip to [-1, 1] before scaling so out-of-range samples saturate
        # instead of wrapping around in int16.
        pcm = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
        return pcm.tobytes()

    async def process_frame(self, frame, direction):
        """Turn text frames into audio frames; pass everything else through.

        Args:
            frame: The incoming pipeline frame.
            direction: The direction the frame is travelling in; pass-through
                frames are forwarded in this same direction.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame) and frame.text:
            print(f"üîä Agent: {frame.text}")
            # Synthesis is a blocking call; run it in the default executor
            # so the event loop keeps servicing the pipeline.
            loop = asyncio.get_running_loop()
            pcm_bytes = await loop.run_in_executor(None, self._synthesize, frame.text)
            await self.push_frame(TTSAudioRawFrame(
                audio=pcm_bytes, sample_rate=32000, num_channels=1
            ))
        else:
            # Forward in the direction the frame arrived; omitting the
            # direction would send upstream frames back downstream.
            await self.push_frame(frame, direction)


async def main():
    """Create a Daily room, assemble the voice pipeline, and run it.

    The pipeline stages, in order, are: transport input, ``WhisperSTT``,
    the Google LLM service, ``SopranoTTSService``, and transport output.
    A greeting ``TextFrame`` is queued on the task when the first
    participant joins. The coroutine then awaits the pipeline runner.
    """
    print("Creating Daily room...")
    room_url = await create_daily_room()
    print(f"\nüîó JOIN HERE: {room_url}\n")

    # Audio in and out are both enabled, with a Silero VAD analyzer attached.
    daily_params = DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    )
    transport = DailyTransport(
        room_url=room_url,
        token=None,
        bot_name="Soprano",
        params=daily_params,
    )

    gemini = GoogleLLMService(
        model="gemini-2.0-flash",
        api_key=os.environ["GOOGLE_API_KEY"],
        system_instruction="You are a helpful voice assistant. Keep responses to 1-2 sentences.",
    )

    stages = [
        transport.input(),
        WhisperSTT(),
        gemini,
        SopranoTTSService(),
        transport.output(),
    ]
    task = PipelineTask(Pipeline(stages))

    @transport.event_handler("on_first_participant_joined")
    async def on_join(transport, participant):
        # Greet the first participant by queueing a text frame on the task.
        print("üëã User joined!")
        greeting = TextFrame(text="Hello! I'm your voice assistant.")
        await task.queue_frame(greeting)

    runner = PipelineRunner()
    print("üé§ Waiting for you to join...")
    await runner.run(task)


# Top-level await: valid in an IPython/Jupyter cell. A plain Python script
# would need asyncio.run(main()) instead.
await main()