# 🤖 Soprano Real-Time Voice Agent (Colab Version)

This notebook runs a full-duplex conversational agent using **Soprano TTS** for output and **Gradio** for streaming audio I/O.

**Features:**
- **Real-time Streaming:** Snappy, low-latency responses.
- **Barge-in Support:** The agent listens while speaking. If you interrupt, it stops talking.
- **Mock Brain:** Uses a simulated STT/LLM for demonstration. You can plug in OpenAI/Groq easily.

**Note:** If you see NumPy errors, restart the runtime after installation.

In [None]:
#@title 1. Install Dependencies
# Setup cell: fetches the Soprano sources and installs everything the agent
# cell below imports. The lines starting with `!` are shell commands and the
# line starting with `%` is an IPython magic, so this cell only runs inside a
# notebook environment.

# Clone the Soprano repository into the current working directory.
!git clone https://github.com/ekwek1/soprano.git
# Make the cloned checkout the notebook's working directory so that the
# relative `.` in the next command refers to it.
%cd soprano
# Editable install of the checkout together with its `lmdeploy` extra.
!pip install -e .[lmdeploy] --quiet
# Numpy 2.0 can cause issues, so we pin it to <2.0
# The same command also installs/upgrades the UI and audio helper packages.
!pip install "numpy<2.0" gradio sounddevice scipy --upgrade --quiet
print("Done!")

In [None]:
#@title 2. Define the Agent
import gradio as gr
import numpy as np
import time
import queue
import threading
import random
from soprano import SopranoTTS

class MockBrain:
    """Stand-in for the intelligence layer (STT -> LLM).

    Instead of transcribing audio and querying a model, it answers with one
    of a fixed set of sentences so the rest of the pipeline can be exercised.
    """

    def generate(self, audio_data):
        """Return a randomly chosen canned reply.

        ``audio_data`` is accepted so the signature matches what a real
        implementation would need, but it is not read here. A real app would
        run speech-to-text on it and forward the transcript to an LLM.
        """
        canned_replies = (
            "I hear you loud and clear. Soprano is fast!",
            "That is very interesting. Please tell me more.",
            "I stopped speaking because I heard you interrupt. This is full duplex.",
            "The weather is nice in the cloud today.",
            "Voice interfaces are the future of AI.",
        )
        return random.choice(canned_replies)

class ColabAgent:
    """Voice agent that listens to microphone chunks and answers with TTS audio.

    The agent runs a small energy-based voice-activity detector (VAD) on every
    incoming microphone chunk. Once the user has spoken and then stayed quiet
    for ``SILENCE_DURATION`` seconds, a reply is synthesised chunk by chunk.
    If the user starts speaking while a reply is being produced, the
    ``interrupted`` flag is raised and the reply loop stops at its next chunk
    ("barge-in").
    """

    def __init__(self):
        """Load the TTS engine (CUDA first, CPU as fallback) and reset state."""
        print("Initializing Soprano (CUDA)...")
        try:
            self.tts = SopranoTTS(backend='lmdeploy', device='cuda', cache_size_mb=100)
        except Exception as e:
            # Deliberate best-effort: whatever prevented the CUDA backend from
            # loading, report it and retry with the CPU backend instead.
            print(f"Warning: Could not load CUDA backend ({e}). Falling back to CPU/Transformers.")
            self.tts = SopranoTTS(backend='transformers', device='cpu')

        self.brain = MockBrain()

        # State
        self.audio_buffer = []         # Not read or written by the methods of this class
        self.silence_start = None      # Monotonic timestamp of the first quiet chunk after speech
        self.is_speaking = False       # True while trigger_response() is producing audio
        self.user_is_speaking = False  # True from the first loud chunk until the silence timeout
        self.interrupted = False       # Set on barge-in; polled by trigger_response()

        # Constants
        # Rate attached to every output chunk.
        # NOTE(review): presumably the native rate of the TTS output - confirm.
        self.SAMPLE_RATE = 32000
        self.VAD_THRESHOLD = 0.05  # Energy threshold
        self.SILENCE_DURATION = 0.8 # Seconds of silence to trigger response

    def process_audio_stream(self, audio_chunk, state=None):
        """Run VAD on one microphone chunk and start a reply when appropriate.

        Gradio calls this roughly every 0.1s with new microphone audio.

        Args:
            audio_chunk: ``(sample_rate, samples)`` tuple or ``None``. Samples
                may be a signed-integer or float array, mono or multi-channel
                with channels on axis 1.
            state: Unused. Optional so that the method can be wired to an
                event that supplies only the audio input.

        Returns:
            ``None`` while listening, or the generator created by
            ``trigger_response()`` once the user has finished speaking.
        """
        if audio_chunk is None:
            return None

        _fs, data = audio_chunk
        data = np.asarray(data)

        # Convert to standard format (float32, mono). Integer PCM is scaled by
        # the full-scale value of its own dtype (32768 for int16).
        if np.issubdtype(data.dtype, np.signedinteger):
            full_scale = float(np.iinfo(data.dtype).max) + 1.0
            data = data.astype(np.float32) / full_scale
        if data.ndim > 1:
            data = np.mean(data, axis=1)

        # An empty chunk carries no energy; treating it as silence avoids the
        # NaN (and RuntimeWarning) that the mean of an empty array produces.
        rms = float(np.sqrt(np.mean(data ** 2))) if data.size else 0.0

        # --- VAD & State Logic ---
        if rms > self.VAD_THRESHOLD:
            if not self.user_is_speaking:
                self.user_is_speaking = True
                # Barge-in Trigger!
                if self.is_speaking:
                    self.interrupted = True
                    print("[Barge-in detected] Stopping agent...")
            # Any loud chunk restarts the silence timer.
            self.silence_start = None
        elif self.user_is_speaking:
            # Silence after speech: start the timer on the first quiet chunk.
            # A monotonic clock keeps the measured duration immune to
            # wall-clock adjustments.
            now = time.monotonic()
            if self.silence_start is None:
                self.silence_start = now

            if now - self.silence_start > self.SILENCE_DURATION:
                # User finished speaking -> Trigger Response
                self.user_is_speaking = False
                self.silence_start = None
                # NOTE(review): this hands a generator object back from a
                # non-generator function; confirm that the Gradio version in
                # use streams it rather than treating it as a single value.
                return self.trigger_response()

        return None

    def trigger_response(self):
        """Yield the agent's reply as ``(SAMPLE_RATE, int16 samples)`` chunks.

        The reply text comes from ``self.brain`` and is synthesised with
        ``self.tts.infer_stream``. The loop stops early once ``interrupted``
        becomes true. ``is_speaking`` is cleared when the generator finishes
        for any reason, including being closed by the consumer or an error
        raised during synthesis.
        """
        # Reset interruption flag
        self.interrupted = False
        self.is_speaking = True
        try:
            text = self.brain.generate(None)
            print(f"Agent replying: {text}")

            # Streaming Inference
            for chunk in self.tts.infer_stream(text):
                # Check interruption
                if self.interrupted:
                    print("Agent halted.")
                    break

                # Assumes each chunk is a tensor-like object exposing
                # .cpu().numpy() with float samples nominally in [-1, 1] -
                # TODO confirm against the TTS API.
                chunk_np = chunk.cpu().numpy()
                # Clip before scaling so out-of-range samples saturate instead
                # of wrapping around in int16.
                chunk_int16 = (np.clip(chunk_np, -1.0, 1.0) * 32767).astype(np.int16)
                yield (self.SAMPLE_RATE, chunk_int16)
        finally:
            # Without this, an abandoned or failed reply would leave the agent
            # permanently marked as speaking and cause spurious barge-ins.
            self.is_speaking = False

# Initialize
# Constructing the agent loads the TTS engine, so this runs once per cell run.
agent = ColabAgent()

# Gradio Interface
with gr.Blocks(title="Real-Time Soprano Agent") as demo:
    gr.Markdown("## 🎙️ Speak to Interrupt!")
    gr.Markdown("This agent uses VAD to detect when you speak. If you speak while it's talking, it stops.")

    with gr.Row():
        # Microphone input, delivered to the callback in streamed chunks.
        mic = gr.Audio(sources=["microphone"], streaming=True, show_label=False)
        # Read-only player that starts playing the agent's audio automatically.
        speaker = gr.Audio(label="Agent Output", streaming=True, autoplay=True, interactive=False)

    # Connect streaming input to processing function
    # Only the microphone value is passed in, so the callback is invoked with
    # a single positional argument.
    mic.stream(agent.process_audio_stream, inputs=[mic], outputs=[speaker], time_limit=60)

# share=True requests a public share link in addition to the local server.
demo.launch(share=True)