# 🎙️ Soprano Production-Grade Full-Duplex Agent

This notebook implements a high-performance **Speech-to-Speech** agent using:
- **Server**: FastAPI + WebSockets (Full Duplex)
- **STT**: Faster-Whisper (GPU optimized)
- **TTS**: Soprano (Streaming, Low Latency)
- **VAD**: Silero VAD (Fast CPU)
- **Transport**: Binary PCM over WebSockets
- **Client**: WebRTC-style microphone capture with **Echo Cancellation** (AEC)

### Architecture
1. **Browser** captures audio -> sends PCM to Server.
2. **Server** detects speech (VAD) -> **Interruption** (barge-in: any TTS in progress is stopped) -> STT -> LLM (in this notebook, a placeholder that echoes the transcript) -> TTS Stream.
3. **Browser** receives PCM -> Queues for playback -> Flushes queue on interruption signal.


In [None]:
#@title 1. 📦 Setup & Installation
#@markdown **Run this cell first, then RESTART RUNTIME.**

# Installs the Soprano TTS repository (editable), the server/STT dependencies
# and the Cloudflare tunnel helper. Lines starting with "!" run as shell
# commands and lines starting with "%" are IPython magics.

print("🔧 Installing dependencies...")

# 0. LOCK NUMPY VERSION (Disabled)
# print("📌 Locking NumPy to 1.26.4...")
# !pip uninstall numpy -y --quiet 2>/dev/null
# !pip install "numpy==1.26.4" --quiet

# 1. Clone Soprano
# git's error output is discarded; if the clone command fails for any reason
# the fallback message "Soprano already cloned" is printed instead.
!git clone https://github.com/ekwek1/soprano.git 2>/dev/null || echo "Soprano already cloned"
# The relative path means this depends on the directory the cell is run from.
%cd soprano

# 2. Install Soprano & Core Deps
# "-e ." installs the package found in the current directory in editable mode.
!pip install -e . --quiet
!pip install transformers huggingface_hub scipy unidecode --quiet

# 3. Install Server & Agent Deps
# NOTE(review): "uvicorn[standard]" is unquoted; some shells treat "[...]" as
# a glob pattern - confirm the shell used here passes it through literally.
!pip install fastapi uvicorn[standard] websockets soundfile --quiet
!pip install faster-whisper --quiet

# 4. Install Cloudflared for Tunneling
!pip install pycloudflared --quiet

# 5. Force Reinstall NumPy safely at the end (Disabled)
# !pip install "numpy==1.26.4" --force-reinstall --quiet

print("✅ Installation complete!")
print("⚠️  IMPORTANT: Go to 'Runtime -> Restart runtime' now!")

In [None]:
# Switch the notebook's working directory to the cloned repository.
# NOTE(review): presumably needed because the runtime restart requested by the
# setup cell resets the working directory - confirm.
%cd /content/soprano

In [None]:
#@title 2. 📝 Create Server Code (app.py)
#@markdown Writes the FastAPI server code to `app.py`.

# Work from the cloned repository when it exists so that app.py is written
# there; otherwise stay in the current directory.
import os

repo_dir = '/content/soprano'
if os.path.isdir(repo_dir):
    os.chdir(repo_dir)
    print("📂 Changed directory to /content/soprano")

code = '''
import asyncio
import json
import time
from dataclasses import dataclass
from typing import Optional, Deque
from collections import deque
import os
import sys

import numpy as np
import torch
from fastapi import FastAPI, WebSocket
from fastapi.responses import HTMLResponse
from faster_whisper import WhisperModel
from starlette.websockets import WebSocketDisconnect
from contextlib import asynccontextmanager

# ---- Soprano ----
try:
    from soprano import SopranoTTS
except ImportError:
    sys.path.append(os.getcwd())
    from soprano import SopranoTTS

# ---- Audio config ----
# Sample rate (Hz) handed to the VAD iterator for incoming microphone audio.
IN_SR = 16000
# Sample rate (Hz) announced to the client in the "tts_start" message.
OUT_SR = 32000

# ---- VAD (Silero) ----
# NOTE(review): presumably caps torch's CPU threads so VAD stays lightweight - confirm.
torch.set_num_threads(1)
# Loaded once at import time via torch.hub; each session's VADIterator is
# constructed around this same model object.
vad_model, vad_utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True)
# Only VADIterator is referenced by the code below; the other helpers are
# unpacked because vad_utils is a 5-tuple.
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = vad_utils

# ---- Preload models at startup ----
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load both models, then run the server.

    Everything before the ``yield`` runs at startup and stores the Soprano
    TTS engine on ``app.state.tts`` and the Whisper model on
    ``app.state.whisper``. The code after the ``yield`` runs at shutdown.
    """
    # --- startup: TTS first ---
    print("🔄 Preloading Soprano TTS...")
    tts_engine = SopranoTTS(device="cuda", backend="auto")
    app.state.tts = tts_engine
    print("✅ Soprano loaded!")

    # --- startup: then STT ---
    print("🔄 Preloading Whisper STT...")
    stt_model = WhisperModel("base", device="cuda", compute_type="float16")
    app.state.whisper = stt_model
    print("✅ Whisper loaded!")

    print("🟢 Server ready to accept connections!")
    yield

    # --- shutdown: nothing to release explicitly ---
    print("Server shutting down...")


app = FastAPI(lifespan=lifespan)

# Mutable per-connection state shared between the WebSocket receive loop and
# the TTS streaming task.
@dataclass
class SessionState:
    # Audio chunks (float32 arrays) collected while the user is speaking.
    speech_buf: Deque[np.ndarray]
    # Streaming voice-activity detector for this connection.
    vad: VADIterator
    # True between a VAD "start" event and the matching "end" event.
    in_speech: bool
    # The currently running TTS streaming task, or None when nothing is playing.
    tts_task: Optional[asyncio.Task]
    # Set to ask the TTS streaming loop to stop; cleared before a new stream starts.
    tts_cancel: asyncio.Event

def pcm16_to_float32(pcm: bytes) -> np.ndarray:
    """Decode raw signed 16-bit PCM bytes into float32 samples.

    Args:
        pcm: Native-endian int16 samples, e.g. the payload of one binary
            WebSocket frame.

    Returns:
        A 1-D float32 array in [-1.0, 1.0) with one element per complete
        sample. Empty input yields an empty array.
    """
    # An odd byte count would make np.frombuffer raise ValueError; drop the
    # dangling byte instead of failing on a malformed frame.
    if len(pcm) % 2:
        pcm = pcm[:-1]
    samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32)
    # int16 / 32768 already lies within [-1.0, 1.0), so no clipping is needed.
    return samples / 32768.0

def float32_to_pcm16(x: np.ndarray) -> bytes:
    """Encode float samples as raw signed 16-bit PCM bytes.

    Args:
        x: Audio samples, nominally in [-1.0, 1.0]. Values outside that range
            are clipped; NaN is replaced by silence (0).

    Returns:
        The samples scaled by 32767, truncated to native-endian int16 and
        serialised with ``tobytes()``.
    """
    # np.clip leaves NaN untouched and casting NaN to int16 is undefined, so
    # sanitise non-finite values before scaling.
    x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
    x = np.clip(x, -1.0, 1.0)
    i16 = (x * 32767.0).astype(np.int16)
    return i16.tobytes()

def generate_response_text(user_text: str) -> str:
    """Build the agent's reply: a fixed sentence that echoes *user_text*."""
    template = "I heard you say: {}. This is a full duplex test."
    return template.format(user_text)

async def safe_send_text(ws: WebSocket, payload: dict) -> bool:
    """Send *payload* as a JSON text frame without raising on a failed send.

    Args:
        ws: The WebSocket to write to.
        payload: JSON-serialisable message.

    Returns:
        True if the send completed, False if it raised WebSocketDisconnect
        or RuntimeError.

    Raises:
        asyncio.CancelledError: Propagated unchanged. Swallowing it here
            would let a cancelled task keep running past this call.
    """
    try:
        await ws.send_text(json.dumps(payload))
    except (WebSocketDisconnect, RuntimeError):
        return False
    return True

async def safe_send_bytes(ws: WebSocket, data: bytes) -> bool:
    """Send *data* as a binary frame without raising on a failed send.

    Args:
        ws: The WebSocket to write to.
        data: Raw bytes to transmit.

    Returns:
        True if the send completed, False if it raised WebSocketDisconnect
        or RuntimeError.

    Raises:
        asyncio.CancelledError: Propagated unchanged. Swallowing it here
            would let a cancelled task keep running past this call.
    """
    try:
        await ws.send_bytes(data)
    except (WebSocketDisconnect, RuntimeError):
        return False
    return True

async def stream_soprano_tts(ws: WebSocket, tts: SopranoTTS, text: str, st: SessionState):
    """Synthesise *text* and push it to the client as PCM16 frames.

    Protocol: a "tts_start" text message carrying the output sample rate, then
    one binary frame per synthesised chunk, then a "tts_end" text message.
    Streaming stops early when ``st.tts_cancel`` is set, when a send fails,
    or when the task itself is cancelled (barge-in).
    """
    try:
        announced = await safe_send_text(ws, {"type": "tts_start", "sample_rate": OUT_SR})
        if not announced:
            return

        # NOTE(review): infer_stream is iterated synchronously, so each chunk
        # is produced on the event loop; the sleep(0) below is the only point
        # where other tasks get to run between chunks.
        for piece in tts.infer_stream(text, chunk_size=1, temperature=0.0):
            if st.tts_cancel.is_set():
                break

            # Objects exposing .detach() are moved to CPU and converted to
            # NumPy; anything else is handed to np.asarray as-is.
            samples = piece.detach().cpu().numpy() if hasattr(piece, "detach") else piece
            samples = np.asarray(samples, dtype=np.float32).reshape(-1)

            delivered = await safe_send_bytes(ws, float32_to_pcm16(samples))
            if not delivered:
                break
            await asyncio.sleep(0)

        await safe_send_text(ws, {"type": "tts_end"})
    except asyncio.CancelledError:
        # Cancellation is the normal barge-in path; finish quietly.
        pass

async def cancel_tts(ws: WebSocket, st: SessionState):
    """Stop the session's TTS stream and tell the client to drop queued audio.

    Sets the cancel event, cancels and awaits the streaming task if one is
    still running, clears ``st.tts_task`` and finally sends a best-effort
    "stop_audio" message.
    """
    st.tts_cancel.set()

    running = st.tts_task
    if running is not None and not running.done():
        running.cancel()
        try:
            # Wait until the task has actually finished before continuing.
            await running
        except (asyncio.CancelledError, Exception):
            pass

    st.tts_task = None
    await safe_send_text(ws, {"type": "stop_audio"})

@app.get("/")
async def get():
    """Serve the single-page client read from index.html in the working directory."""
    # Decode explicitly as UTF-8 instead of relying on the locale-dependent
    # default of open(), so non-ASCII characters in the page survive intact.
    with open("index.html", "r", encoding="utf-8") as f:
        return HTMLResponse(f.read())

def _transcribe_utterance(whisper, audio):
    """Run Whisper on one utterance and return the joined, stripped transcript (blocking)."""
    segments, _ = whisper.transcribe(audio, beam_size=1, vad_filter=False)
    return " ".join([s.text.strip() for s in segments]).strip()


@app.websocket("/ws")
async def ws_endpoint(ws: WebSocket):
    """Full-duplex session: receive microphone PCM, reply with streamed TTS.

    Per connection:
      1. Accept the socket and start speaking a greeting.
      2. Feed every incoming binary frame to the VAD.
      3. On a VAD "start" event, cancel any running TTS (barge-in) and begin
         buffering audio.
      4. On a VAD "end" event, transcribe the buffered audio, send the
         "user_text" and "agent_text" messages and start streaming the reply.
      5. On disconnect or cancellation, stop the TTS task.
    """
    await ws.accept()
    print("🔗 Client connected!")

    # Models are already loaded at startup
    tts = app.state.tts
    whisper = app.state.whisper
    loop = asyncio.get_running_loop()

    st = SessionState(
        speech_buf=deque(),
        vad=VADIterator(vad_model, sampling_rate=IN_SR),
        in_speech=False,
        tts_task=None,
        tts_cancel=asyncio.Event(),
    )

    # Hello message
    hello = "I'm online. You can speak to me, and interrupt me, at any time."
    st.tts_cancel.clear()
    st.tts_task = asyncio.create_task(stream_soprano_tts(ws, tts, hello, st))

    try:
        async for pcm in ws.iter_bytes():
            x = pcm16_to_float32(pcm)
            if x.size == 0:
                # Nothing to analyse; do not hand an empty tensor to the VAD.
                continue

            speech_event = st.vad(torch.from_numpy(x))

            if speech_event:
                # Test for key presence rather than truthiness: the value
                # appears to be a sample offset (TODO confirm against Silero
                # VADIterator), and an offset of 0 would otherwise be ignored.
                if "start" in speech_event and not st.in_speech:
                    st.in_speech = True
                    print("[VAD] Speech Start -> Interrupting")
                    await cancel_tts(ws, st)

                if "end" in speech_event and st.in_speech:
                    st.in_speech = False
                    print("[VAD] Speech End -> Transcribing")

                    if st.speech_buf:
                        utter = np.concatenate(list(st.speech_buf))
                        st.speech_buf.clear()

                        # Transcription is blocking; run it in the default
                        # executor so the event loop stays free for other
                        # tasks and connections.
                        # NOTE(review): assumes WhisperModel.transcribe may be
                        # called from a worker thread - confirm.
                        user_text = await loop.run_in_executor(
                            None, _transcribe_utterance, whisper, utter
                        )

                        if user_text:
                            await safe_send_text(ws, {"type": "user_text", "text": user_text})

                            resp = generate_response_text(user_text)
                            await safe_send_text(ws, {"type": "agent_text", "text": resp})

                            st.tts_cancel.clear()
                            st.tts_task = asyncio.create_task(stream_soprano_tts(ws, tts, resp, st))

            # Buffer audio while speech is in progress. The frame that
            # triggered "start" is included; the one that triggered "end" is not.
            if st.in_speech:
                st.speech_buf.append(x)

    except WebSocketDisconnect:
        print("👋 Client disconnected (normal)")
    except asyncio.CancelledError:
        print("⚠️ Connection cancelled")
    finally:
        # Stop any TTS still streaming for this connection.
        st.tts_cancel.set()
        if st.tts_task and not st.tts_task.done():
            st.tts_task.cancel()
            try:
                await st.tts_task
            except (asyncio.CancelledError, Exception):
                pass
        print("🧹 Session cleanup complete")
'''

# The server source contains non-ASCII characters (emoji in its log messages).
# Write it as UTF-8 explicitly rather than relying on the platform default
# encoding, which can fail to encode them on some systems.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(code)

print("✅ app.py created (with preloaded models + disconnect-safe handling)")

In [None]:
#@title 3. 📝 Create Client Code (index.html)
#@markdown Writes the WebRTC client code.

html_code = """
<!doctype html>
<html>
<head>
  <meta charset=\"utf-8\"/>
  <title>Soprano Full Duplex</title>
  <style>
    body { font-family: system-ui; max-width: 800px; margin: 2rem auto; padding: 0 1rem; background: #f4f4f4; }
    .container { background: white; padding: 2rem; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
    button { font-size: 1.2rem; padding: 10px 20px; cursor: pointer; background: #007bff; color: white; border: none; border-radius: 5px; }
    button:disabled { background: #ccc; }
    #status { margin-top: 1rem; font-weight: bold; color: #555; }
    #log { background: #1e1e1e; color: #0f0; padding: 1rem; border-radius: 5px; height: 300px; overflow-y: auto; font-family: monospace; margin-top: 1rem; }
  </style>
</head>
<body>
<div class=\"container\">
  <h2>🎙️ Soprano Full Duplex Agent</h2>
  <p>Speak naturally. Interrupt anytime.</p>
  <button id=\"go\">Start Conversation</button>
  <div id=\"status\">Disconnected</div>
  <div id=\"log\"></div>
</div>
<script>
// Detect environment to set WS URL automatically
// Uses wss when the page was loaded over https and ws otherwise, and always
// targets the /ws path on the same host that served the page.
const proto = window.location.protocol === 'https:' ? 'wss' : 'ws';
const WS_URL = `${proto}://${window.location.host}/ws`;

// Append one line to the on-page log panel and keep it scrolled to the bottom.
const logEl = document.getElementById(\"log\");
const log = (s) => {
  logEl.textContent += `> ${s}\n`;
  logEl.scrollTop = logEl.scrollHeight;
};

// ---- Playback state ----
let audioCtx, source, processor;
let ttsQueue = [];        // ArrayBuffers of PCM16 audio waiting to be played
let playing = false;      // true while a pumpPlayback loop is active
let outSR = 32000;        // sample rate of incoming TTS audio (updated by tts_start)
let currentSrc = null;    // buffer source node that is audible right now, if any
let playbackEpoch = 0;    // incremented on every interrupt to invalidate the running loop

// Interrupt playback: drop everything queued and silence the chunk that is
// currently playing.
function clearPlayback() {
  ttsQueue = [];
  playbackEpoch++;
  if (currentSrc) {
    // stop() fires the node's onended handler, which wakes the playback loop;
    // the loop then sees the changed epoch, exits and resets playing itself.
    // (Resetting playing here would allow a second loop to start while the
    // first one is still waiting, producing overlapping audio.)
    try { currentSrc.stop(); } catch (e) { /* node already finished */ }
  }
}

// Play queued chunks one after another. Only one loop runs at a time.
async function pumpPlayback() {
  if (playing) return;
  playing = true;
  const epoch = playbackEpoch;

  try {
    while (ttsQueue.length > 0 && epoch === playbackEpoch) {
      const pcm16 = ttsQueue.shift();

      // Ignore a trailing odd byte and skip empty chunks; Int16Array and
      // createBuffer would throw on them.
      const count = Math.floor(pcm16.byteLength / 2);
      if (count === 0) continue;

      const i16 = new Int16Array(pcm16, 0, count);
      const f32 = new Float32Array(count);
      for (let i = 0; i < count; i++) f32[i] = i16[i] / 32768.0;

      const buf = audioCtx.createBuffer(1, count, outSR);
      buf.getChannelData(0).set(f32);

      const src = audioCtx.createBufferSource();
      src.buffer = buf;
      src.connect(audioCtx.destination);

      const ended = new Promise(res => src.onended = res);
      currentSrc = src;
      src.start();
      await ended;
      if (currentSrc === src) currentSrc = null;
    }
  } finally {
    // Always release the flag, even if a chunk could not be decoded,
    // so later audio is not blocked forever.
    playing = false;
  }

  // Chunks that arrived while an interrupted loop was winding down were
  // rejected by the guard at the top; pick them up now.
  if (ttsQueue.length > 0) pumpPlayback();
}

// Click handler: open the WebSocket, start microphone capture and wire up
// playback of the audio the server sends back.
document.getElementById('go').onclick = async () => {
  const statusEl = document.getElementById('status');
  const goBtn = document.getElementById('go');
  const IN_SR = 16000;   // sample rate the microphone audio is sent at
  const FRAME = 512;     // samples per binary WebSocket frame

  // 1. Audio Context
  audioCtx = new (window.AudioContext || window.webkitAudioContext)();

  // 2. WebSocket
  statusEl.textContent = 'Connecting...';
  const ws = new WebSocket(WS_URL);
  ws.binaryType = 'arraybuffer';

  ws.onopen = async () => {
    statusEl.textContent = 'Connected - Listening';
    goBtn.disabled = true;
    log('Ws Connected. Requesting Mic...');

    // 3. Mic Capture with AEC (Critical for full duplex)
    let stream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          sampleRate: 16000,
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true
        }
      });
    } catch (err) {
      // Permission denied or no input device: report it and end the session
      // instead of leaving an unhandled rejection and an idle socket.
      log('Mic error: ' + (err && err.message ? err.message : err));
      goBtn.disabled = false;
      ws.close();
      return;
    }

    // 4. Processor (Worklet is better, but ScriptProcessor is easier for single-file demo)
    source = audioCtx.createMediaStreamSource(stream);
    processor = audioCtx.createScriptProcessor(512, 1, 1);

    // The processor delivers samples at audioCtx.sampleRate (commonly 44100
    // or 48000), regardless of the sampleRate hint given to getUserMedia.
    // Downsample to IN_SR by averaging each run of input samples that maps
    // to one output sample, then send fixed-size frames.
    const ratio = audioCtx.sampleRate / IN_SR;
    if (ratio < 1) {
      log('Warning: audio context rate ' + audioCtx.sampleRate + ' Hz is below ' + IN_SR + ' Hz');
    }
    const frame = new Int16Array(FRAME);
    let fill = 0;    // samples written into frame so far
    let phase = 0;   // input samples consumed toward the next output sample
    let acc = 0;     // running sum for the current output sample
    let accN = 0;    // number of input samples in acc

    processor.onaudioprocess = (e) => {
      if (ws.readyState !== WebSocket.OPEN) return;

      const input = e.inputBuffer.getChannelData(0);
      for (let i = 0; i < input.length; i++) {
        acc += input[i];
        accN++;
        phase += 1;
        if (phase < ratio) continue;
        phase -= ratio;

        // Float32 -> Int16
        const s = Math.max(-1, Math.min(1, acc / accN));
        acc = 0;
        accN = 0;
        frame[fill++] = s < 0 ? s * 32768 : s * 32767;

        if (fill === FRAME) {
          // Send a copy so the frame buffer can be reused immediately.
          ws.send(frame.slice().buffer);
          fill = 0;
        }
      }
    };

    source.connect(processor);
    processor.connect(audioCtx.destination);
    log('Mic active. Speak anytime!');
  };

  ws.onmessage = (ev) => {
    if (typeof ev.data === 'string') {
      // Text frames carry JSON control messages.
      const msg = JSON.parse(ev.data);

      if (msg.type === 'tts_start') outSR = msg.sample_rate;
      if (msg.type === 'stop_audio') {
        log('🛑 INTERRUPT Detected -> Clearing playback');
        clearPlayback();
      }
      if (msg.type === 'user_text') log('User: ' + msg.text);
      if (msg.type === 'agent_text') log('Agent: ' + msg.text);
      return;
    }

    // Binary Audio
    ttsQueue.push(ev.data);
    pumpPlayback();
  };

  ws.onclose = () => {
    statusEl.textContent = 'Disconnected';
    log('Disconnected');
  };
};
</script>
</body>
</html>
"""

# The page contains non-ASCII characters (emoji) and declares charset utf-8,
# so write it as UTF-8 explicitly rather than relying on the platform default.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(html_code)

print("✅ index.html created")

In [None]:
#@title 4. 🚀 Launch Server
#@markdown Starts the FastAPI server and exposes it via Cloudflare Tunnel.
#@markdown Click the **trycloudflare** link that appears to access the UI.

# NOTE(review): os, threading and time are imported but not used in this cell.
import os
import threading
import time
from pycloudflared import try_cloudflare

# 1. Start Tunnel
# The tunnel targets port 8000, the same port uvicorn is bound to below.
public_url = try_cloudflare(port=8000)
print(f"\n🌍 Public URL: {public_url}\n")

# 2. Run Uvicorn (Blocking)
# Runs as a shell command; "app:app" refers to the "app" object inside app.py
# in the current working directory.
print("🚀 Starting Uvicorn Server... (This buffer will capture logs)")
!uvicorn app:app --host 0.0.0.0 --port 8000