# Soprano TTS - WebUI on GPU

This notebook runs the Soprano TTS WebUI using the GPU acceleration provided by Google Colab.
It installs the necessary dependencies, loads the model with the fast `lmdeploy` backend, and creates a public link to the WebUI.

In [None]:
#@title 1. Setup & Installation
#@markdown This cell clones the repository and installs all necessary dependencies including `lmdeploy` for CUDA support.

!git clone https://github.com/ekwek1/soprano.git
%cd soprano

# Install the package with lmdeploy support for CUDA acceleration
print("Installing dependencies... This may take a minute.")
!pip install -e .[lmdeploy] --quiet

# Ensure compatibility in Colab environment
# We install a specific version of Gradio that is known to support audio streaming well, or just upgrade
!pip install gradio sounddevice scipy --upgrade --quiet

print("Installation complete!")

In [None]:
#@title 2. Load Model & Run WebUI
#@markdown This cell loads the model onto the GPU and launches the Gradio WebUI. Click the public URL (e.g. `https://xxxx.gradio.live`) to access it.

import time
import gradio as gr
import numpy as np
import torch
from soprano import SopranoTTS

# Initialize model with GPU acceleration
print("Loading Soprano TTS model (Backend: LMDeploy, Device: CUDA)...")
try:
    # Fail fast with a readable cause when the runtime has no GPU, instead of
    # surfacing whatever low-level error the backend would raise later.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available in this runtime.")

    model = SopranoTTS(
        backend='lmdeploy',
        device='cuda',
        cache_size_mb=100,
        decoder_batch_size=1
    )
    # Keep the resolved settings around; the WebUI header displays them.
    device = model.device
    backend = model.backend
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Fallback: Make sure you have selected a GPU Runtime (Runtime > Change runtime type > T4 GPU).")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

# Sample rate (Hz) reported to Gradio alongside every audio array.
# NOTE(review): presumably matches the model's native output rate -- confirm.
SAMPLE_RATE = 32000

def _to_int16_pcm(waveform) -> np.ndarray:
    """Convert a float waveform into 16-bit PCM samples.

    `waveform` must expose `.cpu().numpy()` (as the model outputs do). Samples
    are clipped to [-1.0, 1.0] before scaling so that out-of-range values
    saturate instead of overflowing when cast to int16.
    """
    samples = waveform.cpu().numpy()
    return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)


def generate_speech(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
    chunk_size: int = 1,
    streaming: bool = False,
):
    """Synthesize `text` with the global `model` and yield Gradio updates.

    This is a generator used as a Gradio event handler. Every yielded item is a
    pair `(audio, status)` where `audio` is either `None` or a tuple
    `(SAMPLE_RATE, int16_samples)` and `status` is a human-readable message.

    Args:
        text: Text to synthesize; blank input yields a prompt message only.
        temperature: Sampling temperature forwarded to the model.
        top_p: Nucleus-sampling threshold forwarded to the model.
        repetition_penalty: Repetition penalty forwarded to the model.
        chunk_size: Forwarded to `model.infer_stream` (streaming mode only).
        streaming: When true, yield one audio chunk per model chunk instead of
            a single complete clip.

    Yields:
        `(audio, status)` pairs as described above. Any exception raised while
        generating is caught and reported as an error status with no audio.
    """
    if not text.strip():
        yield None, "Please enter some text to generate speech."
        return

    try:
        if streaming:
            stream = model.infer_stream(
                text,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                # UI sliders may deliver whole numbers as floats.
                chunk_size=int(chunk_size),
            )
            yield None, "\u23f3 Streaming..."

            # perf_counter is monotonic and matches the non-streaming timing below.
            start_time = time.perf_counter()

            # Yield only the NEW chunk each time; the streaming audio output
            # component is responsible for appending it to what is already playing.
            for i, chunk in enumerate(stream):
                yield (SAMPLE_RATE, _to_int16_pcm(chunk)), f"Streaming chunk {i+1}..."

            latency = time.perf_counter() - start_time
            # Final status-only update to signal completion.
            yield None, f"\u2713 Streaming complete | {latency*1000:.2f} ms total time"
            return

        # Non-streaming mode (standard)
        start_time = time.perf_counter()

        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )

        gen_time = time.perf_counter() - start_time

        audio_int16 = _to_int16_pcm(audio)

        audio_seconds = len(audio_int16) / SAMPLE_RATE
        # Real-time factor: seconds of audio produced per second of compute.
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")

        status = (
            f"\u2713 Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )

        yield (SAMPLE_RATE, audio_int16), status
        return

    except Exception as e:
        # Report the failure in the status box rather than crashing the handler.
        yield None, f"\u2717 Error: {e}"


# Create Gradio interface
with gr.Blocks(title="Soprano TTS") as demo:
    # Header. The emoji is written as escapes so it cannot be mis-encoded again,
    # and `str(device)` keeps `.upper()` valid even if the model reports its
    # device as a non-string object.
    gr.Markdown(
        f"""# \U0001F5E3\uFE0F Soprano TTS (GPU Accelerated)

        Running on **{str(device).upper()}** using **{backend}** backend.
        """
    )
    with gr.Row():
        # Left column: text input and generation controls.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )

            streaming = gr.Checkbox(
                label="Stream Audio",
                value=False,
                info="Enable streaming generation (updates audio player in real-time)"
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Temperature")
                top_p = gr.Slider(minimum=0.5, maximum=1.0, value=0.95, step=0.05, label="Top P")
                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
                chunk_size = gr.Slider(minimum=1, maximum=10, value=1, step=1, precision=0, label="Chunk Size (Streaming only)")

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        # Right column: audio player and status readout.
        with gr.Column(scale=1):
            # streaming=True lets the handler push audio chunk by chunk
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
                streaming=True
            )
            status_output = gr.Textbox(label="Status", interactive=False, lines=3)

    # The input order must match the positional parameters of generate_speech.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty, chunk_size, streaming],
        outputs=[audio_output, status_output],
    )

print("Starting Gradio interface... Please click the public link below.")
# share=True requests a public link so the UI is reachable from outside Colab.
demo.launch(share=True)
