# 🎙️ Qwen3-TTS API Server v3

Run Qwen3-TTS with **Voice Cloning** on a Colab GPU.

**Note:** The model is downloaded again every session (~3 GB). To cache it persistently, manually copy `/root/.cache/huggingface` to Google Drive after the first run.

In [None]:
# 1️⃣ Install Dependencies
!pip install -q qwen-tts flask flask-cors pyngrok soundfile numpy

In [None]:
# 2️⃣ Load Model (NO Drive cache - downloads fresh each time)
import torch
from qwen_tts import Qwen3TTSModel

# Report the runtime hardware, then load the Qwen3-TTS base checkpoint.
print("üîÑ Loading Qwen3-TTS model...")
has_gpu = torch.cuda.is_available()
print(f"   GPU: {torch.cuda.get_device_name(0) if has_gpu else 'CPU'}")
if not has_gpu:
    # The model below is pinned to "cuda:0", so stop here with an actionable
    # message instead of crashing inside the VRAM query or the model load.
    raise RuntimeError(
        "No CUDA GPU detected. In Colab, choose Runtime > Change runtime type "
        "> GPU, then re-run this cell."
    )
print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
print("   Downloading ~3GB (no Drive cache)...")

# Load the checkpoint in bfloat16 onto the first CUDA device; `model` is the
# module-level handle the API handlers in the server cell call into.
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

print("‚úÖ Qwen3-TTS model loaded!")

In [None]:
# 3️⃣ Enter ngrok token (get one for free at https://dashboard.ngrok.com/get-started/your-authtoken)
NGROK_TOKEN = ""  # @param {type:"string"}

from pyngrok import ngrok

# A token pasted into the form may carry stray spaces or a trailing newline.
# Strip them so a whitespace-only value counts as "no token" and a padded
# token is not registered verbatim.
_ngrok_token = NGROK_TOKEN.strip()
if _ngrok_token:
    ngrok.set_auth_token(_ngrok_token)
    print("‚úÖ ngrok token set!")
else:
    print("‚ö†Ô∏è No token - get one at https://dashboard.ngrok.com/get-started/your-authtoken")

In [None]:
# 4️⃣ Start API Server
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
import soundfile as sf
import io, base64, tempfile, os

# Flask application object that the route handlers below are registered on.
app = Flask(__name__)
# Apply flask-cors to the whole app with its default settings.
# NOTE(review): the defaults presumably allow any origin on every route —
# confirm that is intended, since the server is exposed through a public tunnel.
CORS(app)

@app.route('/health', methods=['GET'])
def health():
    """Report that the server is up, which model it serves, and the device.

    Returns:
        A JSON response with ``status``, ``model`` and ``gpu`` keys. ``gpu`` is
        the name of CUDA device 0, or ``"CPU"`` when CUDA is not available.
    """
    # Guard the CUDA query so the health check does not raise (and turn into
    # a 500) on a runtime without a GPU.
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
    return jsonify({"status": "ok", "model": "Qwen3-TTS-1.7B-Base", "gpu": gpu_name})

# Reference clip passed to the model when the request carries no `ref_audio`.
_DEFAULT_REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone.wav"


def _clone_from_upload(text, language, audio_bytes, ref_text):
    """Run voice cloning against uploaded reference audio via a temp file.

    The bytes are written to a temporary ``.wav`` file whose path is handed to
    ``model.generate_voice_clone``; the file is removed afterwards whether the
    write or the generation succeeds or fails.

    Returns:
        Whatever ``model.generate_voice_clone`` returns (unpacked by the
        caller as ``wavs, sr``).
    """
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            # Record the path before writing so a failed write is still
            # cleaned up by the ``finally`` below.
            temp_path = f.name
            f.write(audio_bytes)
        return model.generate_voice_clone(text=text, language=language, ref_audio=temp_path, ref_text=ref_text or None)
    finally:
        if temp_path is not None:
            os.unlink(temp_path)


@app.route('/api/tts', methods=['POST'])
def generate_tts():
    """Synthesize speech for the posted text and return it as a WAV download.

    Expects a JSON object body with these optional keys:
        text: string to speak (defaults to ``'Hello'``); must not be blank.
        language: language name passed to the model (defaults to ``'English'``).
        ref_audio: base64-encoded reference audio; when present the voice is
            cloned from it, otherwise the default reference clip is used with
            ``x_vector_only_mode=True``.
        ref_text: text accompanying ``ref_audio``; an empty value is sent to
            the model as ``None``.

    Returns:
        On success, an ``audio/wav`` attachment named ``output.wav``.
        On a malformed request, ``({"error": ...}, 400)``.
        On any failure during generation, ``({"error": ...}, 500)``.
    """
    # silent=True yields None instead of raising for a missing, non-JSON or
    # unparsable body, so client mistakes are reported as 400 rather than 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', 'Hello')
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "'text' must be a non-empty string"}), 400
    language = data.get('language', 'English')
    ref_audio_b64 = data.get('ref_audio')
    ref_text = data.get('ref_text', '')

    audio_bytes = None
    if ref_audio_b64:
        try:
            audio_bytes = base64.b64decode(ref_audio_b64)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError subclass; TypeError covers a
            # ref_audio value that is not a string at all.
            return jsonify({"error": "'ref_audio' must be valid base64"}), 400

    try:
        print(f"üéôÔ∏è Gen: lang={language}, chars={len(text)}, clone={bool(ref_audio_b64)}")

        if audio_bytes is not None:
            wavs, sr = _clone_from_upload(text, language, audio_bytes, ref_text)
        else:
            wavs, sr = model.generate_voice_clone(text=text, language=language, ref_audio=_DEFAULT_REF_AUDIO_URL, x_vector_only_mode=True)

        # Encode the first generated waveform as WAV into an in-memory buffer.
        buffer = io.BytesIO()
        sf.write(buffer, wavs[0], sr, format='WAV')
        buffer.seek(0)
        print(f"‚úÖ Generated {len(wavs[0])/sr:.2f}s")
        return send_file(buffer, mimetype='audio/wav', as_attachment=True, download_name='output.wav')
    except Exception as e:
        # Top-level boundary for the handler: log the traceback to the cell
        # output and report the failure to the client as JSON.
        import traceback; traceback.print_exc()
        return jsonify({"error": str(e)}), 500

@app.route('/api/clone', methods=['POST'])
def voice_clone():
    """Alias endpoint: handle POST /api/clone exactly like POST /api/tts.

    Delegates to ``generate_tts()``, which reads the current request itself,
    and returns whatever that handler returns.
    """
    return generate_tts()

# Single port shared by the ngrok tunnel and the Flask server so the two
# cannot drift apart.
PORT = 5000

# Open the public tunnel before starting Flask: app.run() below serves until
# interrupted, so nothing after it would execute while the server is up.
public_url = ngrok.connect(PORT)
# Current pyngrok releases return a tunnel object rather than a plain string;
# use its ``public_url`` attribute when present so the banner shows a URL that
# can be copied as-is, and fall back to the value itself otherwise.
display_url = getattr(public_url, "public_url", public_url)
print("\n" + "="*50)
print("üöÄ QWEN3-TTS RUNNING!")
print("="*50)
print(f"\nüìã URL: {display_url}\n")
print("="*50 + "\n")

app.run(port=PORT)