# 🎙️ Qwen3-TTS API Server

Run Qwen3-TTS with **Voice Cloning** on Colab GPU.

**Features:**
- Voice cloning from 3-30s audio
- 10 languages supported

**How to use:**
1. Go to Runtime → Change runtime type → Select **T4 GPU**
2. Run all cells (Ctrl+F9)
3. Copy the ngrok URL
4. Paste it into your Qwen3 TTS page

## 1️⃣ Install Dependencies

In [None]:
!pip install -q qwen-tts flask flask-cors pyngrok soundfile numpy

## 2️⃣ Mount Google Drive (for caching)

In [None]:
from google.colab import drive
import os
import shutil

# Mount Google Drive so the model cache can live outside the Colab VM's
# temporary disk.
drive.mount('/content/drive')

# Cache directory in Google Drive
CACHE_DIR = '/content/drive/MyDrive/qwen3_tts_cache'

# Wiping the cache forces everything stored in it to be downloaded again, so
# it is opt-in. Set this to True for a single run only when an interrupted
# download has left the cache corrupted, then set it back to False.
CLEAR_CACHE = False

if CLEAR_CACHE and os.path.exists(CACHE_DIR):
    print("‚ö†Ô∏è Found existing cache, clearing it to fix corruption...")
    shutil.rmtree(CACHE_DIR)
    print("‚úÖ Cache cleared!")

os.makedirs(CACHE_DIR, exist_ok=True)

# Point the Hugging Face cache environment variables at the Drive directory.
# NOTE(review): these are presumably read when the Hugging Face libraries are
# first imported, so this cell must run before any cell that imports them.
os.environ['HF_HOME'] = CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
os.environ['HF_HUB_CACHE'] = os.path.join(CACHE_DIR, 'hub')

print(f"‚úÖ Cache directory: {CACHE_DIR}")

## 3️⃣ Load Qwen3-TTS Model

In [None]:
import torch
from qwen_tts import Qwen3TTSModel

# Fail fast with an actionable message: the model is placed on "cuda:0"
# below, and querying device properties without a GPU raises an opaque error.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab, open Runtime -> Change runtime type, "
        "select a GPU (e.g. T4), then re-run all cells."
    )

print("üîÑ Loading Qwen3-TTS model...")
print(f"   GPU: {torch.cuda.get_device_name(0)}")
print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
print("   This will download ~3GB on first run (cached in Drive after)")

# Load the Base model (supports voice cloning).
# NOTE(review): the download location is presumably governed by the Hugging
# Face cache environment variables (HF_HOME / HF_HUB_CACHE) -- confirm.
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

print("‚úÖ Qwen3-TTS model loaded!")

## 4️⃣ Setup ngrok

In [None]:
# @title Enter your ngrok authtoken (get free at https://ngrok.com)
NGROK_TOKEN = ""  # @param {type:"string"}

from pyngrok import ngrok

# Register the token with pyngrok when one was entered in the form above;
# otherwise only tell the user where a token can be obtained.
if not NGROK_TOKEN:
    print("‚ö†Ô∏è No ngrok token - using free tier")
    print("   Get a free token at: https://dashboard.ngrok.com/get-started/your-authtoken")
else:
    ngrok.set_auth_token(NGROK_TOKEN)
    print("‚úÖ ngrok token set!")

## 5️⃣ Start API Server

In [None]:
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import soundfile as sf
import numpy as np
import io
import base64
import tempfile
import os

# Flask application; CORS(app) applies flask_cors with its default settings.
app = Flask(__name__)
CORS(app)

# Language names for this server. Nothing in the visible code reads this list.
LANGUAGES = [
    "English",
    "Chinese",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
]

@app.route('/health', methods=['GET'])
def health():
    """Return a JSON status document: status, model name, GPU name and features."""
    # Report the CUDA device name when a GPU is visible, otherwise "CPU".
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
    else:
        gpu_name = "CPU"

    payload = {
        "status": "ok",
        "model": "Qwen3-TTS-1.7B-Base",
        "gpu": gpu_name,
        "features": ["tts", "voice_clone"],
    }
    return jsonify(payload)

@app.route('/api/tts', methods=['POST'])
def generate_tts():
    """Generate speech for the posted text, optionally cloning a reference voice.

    Expects a JSON object body. All keys are optional:
        text: Text to synthesize; defaults to 'Hello world'.
        language: Language name passed to the model; defaults to 'English'.
        ref_audio: Base64-encoded reference audio. When present, the voice is
            cloned from it; otherwise a default reference clip is used in
            x-vector-only mode.
        ref_text: Transcript of ``ref_audio``. When omitted, cloning falls
            back to x-vector-only mode.

    Returns:
        On success, a WAV file sent as the attachment ``output.wav``.
        On an invalid request, ``({"error": ...}, 400)``.
        On a failure during generation, ``({"error": ...}, 500)``.
    """
    # silent=True yields None for a missing, malformed or non-JSON body
    # instead of raising, so client mistakes can be reported as 400 rather
    # than being caught below and reported as a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', 'Hello world')
    language = data.get('language', 'English')
    ref_audio_b64 = data.get('ref_audio')  # base64 encoded audio
    ref_text = data.get('ref_text', '')

    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "'text' must be a non-empty string"}), 400

    audio_bytes = None
    if ref_audio_b64:
        try:
            audio_bytes = base64.b64decode(ref_audio_b64)
        except (ValueError, TypeError):
            # binascii.Error (bad padding) is a ValueError; TypeError covers
            # non-string JSON values such as numbers or lists.
            audio_bytes = b''
        if not audio_bytes:
            return jsonify({"error": "'ref_audio' must be valid base64-encoded audio"}), 400

    try:
        print(f"üéôÔ∏è Generating: lang={language}, chars={len(text)}, clone={bool(ref_audio_b64)}")

        if audio_bytes is not None:
            # Voice cloning mode: the model is given a file path, so the
            # decoded audio is spooled to a temporary file first.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_bytes)
                temp_path = f.name

            # Without a transcript, use x-vector-only mode (the same mode the
            # default-voice branch uses) instead of passing ref_text=None.
            if ref_text:
                clone_kwargs = {"ref_text": ref_text}
            else:
                clone_kwargs = {"x_vector_only_mode": True}

            try:
                wavs, sr = model.generate_voice_clone(
                    text=text,
                    language=language,
                    ref_audio=temp_path,
                    **clone_kwargs,
                )
            finally:
                # delete=False above means the file must be removed manually.
                os.unlink(temp_path)
        else:
            # Default voice using x-vector mode
            wavs, sr = model.generate_voice_clone(
                text=text,
                language=language,
                ref_audio="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone.wav",
                x_vector_only_mode=True,
            )

        # Encode the first returned waveform as WAV entirely in memory.
        buffer = io.BytesIO()
        sf.write(buffer, wavs[0], sr, format='WAV')
        buffer.seek(0)

        print(f"‚úÖ Generated {len(wavs[0])/sr:.2f}s of audio")

        return send_file(buffer, mimetype='audio/wav', as_attachment=True, download_name='output.wav')

    except Exception as e:
        # Top-level request boundary: log the full traceback to the notebook
        # output and report the failure to the client as JSON.
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500

@app.route('/api/clone', methods=['POST'])
def voice_clone():
    """Serve /api/clone by delegating to the /api/tts handler."""
    response = generate_tts()
    return response

# Port shared by the ngrok tunnel and the Flask server.
PORT = 5000

# Start ngrok
public_url = ngrok.connect(PORT)

# Current pyngrok returns an NgrokTunnel object whose printed form is not a
# bare URL; show its public_url attribute so the line can be copied as-is.
# The getattr fallback keeps this working if a plain string is returned.
tunnel_url = getattr(public_url, "public_url", public_url)

banner = "=" * 60
print("\n" + banner)
print("üöÄ QWEN3-TTS API SERVER IS RUNNING!")
print(banner)
print("\nüìã Copy this URL to your Qwen3 TTS page:\n")
print(f"   {tunnel_url}")
print("\n" + banner)
print("\nüé≠ Voice Cloning: Upload 10-30s audio")
print("üåç Languages: English, Chinese, Japanese + 7 more\n")

# Blocks this cell while the Flask development server handles requests.
app.run(port=PORT)