# 🎙️ Qwen3-TTS: Advanced Text-to-Speech AI

<div align="center">

[![Support](https://img.shields.io/badge/Buy%20Me%20a%20Coffee-Support-FFDD00?style=for-the-badge&logo=buymeacoffee&logoColor=black)](https://buymeacoffee.com/lynettethecat)
[![License](https://img.shields.io/badge/License-MIT-blue?style=for-the-badge)](https://opensource.org/licenses/MIT)

</div>

---

## 📌 What is This Notebook?

This notebook provides a free, easy-to-use interface for Qwen3-TTS — one of the most advanced open-source text-to-speech models.

✅ Voice Cloning (3+ seconds reference audio)  
✅ Preset Character Voices  
✅ AI Voice Design from text description  

---

## ⚙️ Technical Details

- Model: Qwen3-TTS 1.7B (Base, CustomVoice, VoiceDesign)
- Hardware: Google Colab T4 GPU
- Optimizations: FP16 precision, SDPA attention, TF32
- Expected Speed: RTF 3.5–5x

---

**⬇️ Run the cells below to get started! ⬇️**


In [None]:
# Environment setup cell: swap the runtime's PyTorch stack for pinned builds,
# then install the TTS package and the UI dependencies.

# Remove whatever torch/torchvision/torchaudio builds are already installed so
# the pinned versions below do not mix with them.
!pip uninstall -y torch torchvision torchaudio
# Install a matching torch 2.4.1 / torchvision 0.19.1 / torchaudio 2.4.1 set
# built for CUDA 12.1, from the PyTorch wheel index.
!pip install torch==2.4.1+cu121 torchvision==0.19.1+cu121 torchaudio==2.4.1+cu121 --index-url https://download.pytorch.org/whl/cu121
# ninja is installed before flash-attn, presumably as the build tool for the
# flash-attn build below (which runs with --no-build-isolation).
!pip install ninja
# NOTE(review): the model loader in this notebook requests
# attn_implementation="sdpa", so flash-attn does not appear to be used by the
# notebook's own code — confirm whether qwen-tts needs it before keeping this
# (potentially slow) build step.
!pip install flash-attn --no-build-isolation
# NOTE(review): -U upgrades these packages to their latest releases; verify
# that this does not pull in a different torch version than the one pinned above.
!pip install -U qwen-tts gradio huggingface_hub

In [None]:
import gradio as gr
from qwen_tts import Qwen3TTSModel
import torch, soundfile as sf, tempfile, gc, time

# Single-slot model cache: the model object currently held in memory and the
# key it was loaded under. load_model() reads and rebinds both names.
current_model = current_model_type = None

# Backend switches: enable cuDNN benchmark mode and allow TF32 in CUDA matmuls
# and cuDNN operations.
# NOTE(review): TF32 is generally an Ampere-or-newer feature; on the T4 named
# in the notebook header these two flags are probably no-ops — confirm.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

# Print the name of CUDA device 0. Presumably this also makes the cell fail
# early when the runtime has no GPU attached.
print(f"GPU: {torch.cuda.get_device_name(0)}")

def load_model(model_type):
    """Return the Qwen3-TTS model for *model_type*, loading it if necessary.

    Only one model is kept at a time: asking for a different type drops the
    cached model and frees cached CUDA memory before the new checkpoint is
    loaded.

    Args:
        model_type: ``"base"`` selects the Base checkpoint and ``"custom"``
            the CustomVoice checkpoint; any other value selects the
            VoiceDesign checkpoint.

    Returns:
        The loaded ``Qwen3TTSModel`` instance, which is also stored in the
        module-level ``current_model``.

    Raises:
        Whatever ``Qwen3TTSModel.from_pretrained`` raises. In that case the
        cache is left empty (both globals are ``None``), so a later call
        simply retries the load.
    """
    global current_model, current_model_type

    # Cache hit: the requested model is already loaded.
    if current_model is not None and current_model_type == model_type:
        return current_model

    if current_model is not None:
        # Rebind to None rather than `del`: `del` would unbind the global, so
        # if the load below failed, every later call would raise NameError on
        # `current_model` while `current_model_type` still named the old model.
        current_model = None
        current_model_type = None
        gc.collect()
        torch.cuda.empty_cache()

    repo_by_type = {
        "base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        "custom": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    }
    # Unknown keys fall back to the VoiceDesign checkpoint.
    model_name = repo_by_type.get(
        model_type, "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
    )

    model = Qwen3TTSModel.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="cuda:0",
        attn_implementation="sdpa",
    )

    # Publish the new cache entry only after the load has succeeded.
    current_model = model
    current_model_type = model_type
    return current_model

def generate(model_type, **kwargs):
    """Run one synthesis request and return the path of the resulting WAV file.

    Args:
        model_type: Model key forwarded to ``load_model``.
        **kwargs: Must contain ``fn``, a callable that receives the loaded
            model and returns a ``(wavs, sr)`` pair. Only ``wavs[0]`` is
            written out; other keyword arguments are ignored.

    Returns:
        Path of a newly created temporary ``.wav`` file. The file is created
        with ``delete=False`` and is not removed by this function.

    Raises:
        KeyError: If ``fn`` is not supplied.
    """
    # Look up the callback first so a missing `fn` fails before a model load.
    synthesize = kwargs["fn"]

    model = load_model(model_type)
    with torch.inference_mode():
        wavs, sr = synthesize(model)

    # Reserve a unique path and close the handle right away: soundfile reopens
    # the file by name, and leaving the original handle open leaked one file
    # descriptor per request.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    sf.write(wav_path, wavs[0], sr)
    return wav_path

# Gradio click handlers, one per tab. Each wraps the matching Qwen3-TTS call
# in a callback and hands it to generate(), which loads the right model and
# returns the path of the written WAV file.

def _clone_voice(text, ref_audio_path):
    """Voice Clone tab: synthesize *text* using the uploaded reference clip."""
    # The Audio input yields None when nothing was uploaded; fail with a
    # readable message before any model is loaded.
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio clip first.")
    return generate(
        "base",
        fn=lambda m: m.generate_voice_clone(
            text=text,
            voice_clone_prompt=m.create_voice_clone_prompt(
                ref_audio=ref_audio_path, x_vector_only_mode=True
            ),
        ),
    )


def _custom_voice(text, speaker):
    """Custom Voice tab: synthesize *text* with the named preset speaker."""
    return generate(
        "custom",
        fn=lambda m: m.generate_custom_voice(text=text, speaker=speaker),
    )


def _design_voice(text, description):
    """Voice Design tab: synthesize *text* with a voice described in words."""
    return generate(
        "design",
        fn=lambda m: m.generate_voice_design(text=text, instruct=description),
    )


with gr.Blocks(title="Qwen3-TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Qwen3-TTS Interface")

    with gr.Tab("Voice Clone"):
        text = gr.Textbox(label="Text", lines=3)
        # type="filepath" makes the handler receive a path string.
        ref_audio = gr.Audio(type="filepath")
        btn = gr.Button("Generate")
        out = gr.Audio()

        btn.click(_clone_voice, inputs=[text, ref_audio], outputs=out)

    with gr.Tab("Custom Voice"):
        text2 = gr.Textbox(label="Text", lines=3)
        voice = gr.Textbox(label="Voice Name", value="serena")
        btn2 = gr.Button("Generate")
        out2 = gr.Audio()

        btn2.click(_custom_voice, inputs=[text2, voice], outputs=out2)

    with gr.Tab("Voice Design"):
        text3 = gr.Textbox(label="Text", lines=3)
        desc = gr.Textbox(label="Voice Description")
        btn3 = gr.Button("Generate")
        out3 = gr.Audio()

        btn3.click(_design_voice, inputs=[text3, desc], outputs=out3)

# Announce that the UI has been built. No model has been loaded at this point:
# load_model() only runs from generate(), i.e. on the first "Generate" click.
print("Qwen3-TTS ready.")
# Start the Gradio server for the Blocks app defined above.
# NOTE(review): per the Gradio docs, share=True requests a public share link
# and debug=True keeps the cell running while printing errors — confirm both
# are still wanted outside Colab.
demo.launch(share=True, debug=True)
