# 🎙️ Qwen3-TTS API Server v7

Voice Cloning (2K char limit, no chunking)

In [None]:
!pip install -q qwen-tts flask flask-cors pyngrok soundfile numpy

In [None]:
import torch
from qwen_tts import Qwen3TTSModel

# Report which GPU (CUDA device index 0) the model is about to be loaded on.
# NOTE(review): presumably raises when no CUDA device is visible — confirm.
print("üîÑ Loading Qwen3-TTS model...")
print(f"   GPU: {torch.cuda.get_device_name(0)}")

# Load the 1.7B base checkpoint onto cuda:0 in bfloat16. `model` is the
# module-level handle the /api/tts route calls for voice cloning.
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

print("‚úÖ Model loaded!")

In [None]:
# Enter ngrok token: https://dashboard.ngrok.com/get-started/your-authtoken
# The trailing "@param" comment is a notebook form annotation for this value.
NGROK_TOKEN = ""  # @param {type:"string"}

from pyngrok import ngrok
# Register the token with pyngrok only when one was supplied; with the default
# empty string set_auth_token() is never called.
if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)
    print("‚úÖ ngrok token set!")

In [None]:
from flask import Flask, request, send_file, jsonify, Response
from flask_cors import CORS
import soundfile as sf
import io, base64, tempfile, os, json

# Module-level Flask application that the route decorators below register on.
app = Flask(__name__)
# Enable flask-cors on the whole app with its default settings.
# NOTE(review): the defaults presumably allow any origin — confirm this is intended.
CORS(app)

@app.route('/health', methods=['GET'])
def health():
    """Return a JSON status document for GET /health.

    The reply carries a fixed status/model/version triple plus the name of
    the GPU at CUDA index 0, looked up on every request.
    """
    gpu_name = torch.cuda.get_device_name(0)
    payload = {
        "status": "ok",
        "model": "Qwen3-TTS-1.7B",
        "version": "v7",
        "gpu": gpu_name,
    }
    return jsonify(payload)

def _sse_event(payload):
    """Serialize ``payload`` as one Server-Sent Events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


def _progress_event(current, percent, status):
    """Build the SSE frame for a progress update (``total`` is always 1)."""
    return _sse_event({'type': 'progress', 'current': current, 'total': 1, 'percent': percent, 'status': status})


def _save_reference_audio(ref_audio_b64):
    """Decode base64 reference audio into a temporary ``.wav`` file.

    Returns:
        Path of the written file. The caller owns it and must delete it.

    Raises:
        ValueError: if ``ref_audio_b64`` cannot be base64-decoded.
    """
    try:
        audio_bytes = base64.b64decode(ref_audio_b64)
    except (ValueError, TypeError) as exc:
        # binascii.Error (bad padding etc.) is a ValueError subclass; TypeError
        # covers non-string JSON values such as numbers or lists.
        raise ValueError(f"ref_audio is not valid base64: {exc}") from exc
    # delete=False: the model is handed a path, so the file must outlive this handle.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        f.write(audio_bytes)
    return f.name


def _clone_voice(text, language, ref_path, ref_text):
    """Run the model's voice cloning and return its ``(wavs, sr)`` result.

    A non-blank ``ref_text`` is forwarded as the reference transcript;
    otherwise the model is called with ``x_vector_only_mode=True``.
    """
    if ref_text and ref_text.strip():
        return model.generate_voice_clone(text=text, language=language, ref_audio=ref_path, ref_text=ref_text)
    return model.generate_voice_clone(text=text, language=language, ref_audio=ref_path, x_vector_only_mode=True)


def _encode_wav(wav, sr):
    """Write ``wav`` at sample rate ``sr`` into an in-memory WAV buffer rewound to 0."""
    buffer = io.BytesIO()
    sf.write(buffer, wav, sr, format='WAV')
    buffer.seek(0)
    return buffer


@app.route('/api/tts', methods=['POST'])
def generate_tts():
    """Clone a voice from reference audio and synthesize ``text`` with it.

    Expects a JSON object body with:
        text      -- text to speak (default ``'Hello'``)
        language  -- language name passed to the model (default ``'English'``)
        ref_audio -- base64-encoded reference audio (required)
        ref_text  -- transcript of the reference audio (optional; when blank
                     the model runs with ``x_vector_only_mode=True``)
        stream    -- truthy to receive Server-Sent Events instead of a file

    Returns:
        * stream truthy: a ``text/event-stream`` response emitting ``progress``
          events, then one ``complete`` event (base64 WAV + duration in
          seconds) or one ``error`` event.
        * otherwise: the WAV as an ``output.wav`` attachment, or a JSON
          ``{"error": ...}`` with status 400 (bad request data) or 500
          (generation failure).
    """
    # silent=True turns a missing/invalid JSON body into None instead of an
    # exception raised outside any handler; non-object JSON is rejected too.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', 'Hello')
    language = data.get('language', 'English')
    ref_audio_b64 = data.get('ref_audio')
    ref_text = data.get('ref_text', '')
    stream = data.get('stream', False)

    def generate_with_progress():
        try:
            print(f"üéôÔ∏è Gen: lang={language}, chars={len(text)}")

            yield _progress_event(0, 10, 'Loading reference audio...')

            # Reference audio is REQUIRED for cloning.
            if not ref_audio_b64:
                yield _sse_event({'type': 'error', 'message': 'Reference audio is required'})
                return

            temp_path = _save_reference_audio(ref_audio_b64)
            try:
                # The yield sits inside try/finally so the temp file is removed
                # even if the client disconnects while this event is pending.
                yield _progress_event(0, 30, 'Generating speech...')
                wavs, sr = _clone_voice(text, language, temp_path, ref_text)
            finally:
                os.unlink(temp_path)

            yield _progress_event(1, 90, 'Encoding audio...')

            audio_b64 = base64.b64encode(_encode_wav(wavs[0], sr).read()).decode('utf-8')

            duration = len(wavs[0]) / sr
            print(f"‚úÖ Generated {duration:.2f}s of audio")

            yield _sse_event({'type': 'complete', 'audio': audio_b64, 'duration': round(duration, 1)})

        except Exception as e:
            # Top-level boundary of the stream: log, then report to the client
            # as an SSE error event because the HTTP status is already sent.
            import traceback; traceback.print_exc()
            yield _sse_event({'type': 'error', 'message': str(e)})

    if stream:
        return Response(generate_with_progress(), mimetype='text/event-stream', headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Accel-Buffering': 'no'})

    # Non-streaming fallback: reply with the WAV file itself.
    if not ref_audio_b64:
        return jsonify({"error": "Reference audio is required"}), 400

    try:
        try:
            temp_path = _save_reference_audio(ref_audio_b64)
        except ValueError as e:
            # Undecodable ref_audio is a client error, not a server failure.
            return jsonify({"error": str(e)}), 400

        try:
            wavs, sr = _clone_voice(text, language, temp_path, ref_text)
        finally:
            os.unlink(temp_path)

        return send_file(_encode_wav(wavs[0], sr), mimetype='audio/wav', as_attachment=True, download_name='output.wav')
    except Exception as e:
        import traceback; traceback.print_exc()
        return jsonify({"error": str(e)}), 500

@app.route('/api/clone', methods=['POST'])
def voice_clone():
    """Alias endpoint: handle POST /api/clone exactly like POST /api/tts.

    Delegates to generate_tts(), which reads the current Flask request itself,
    so both routes accept the same payload and return the same response.
    """
    return generate_tts()

# Open an ngrok tunnel to local port 5000 and print the resulting public URL
# between two 50-character separator lines.
public_url = ngrok.connect(5000)
print("\n" + "="*50)
print(f"üöÄ URL: {public_url}")
print("="*50 + "\n")

# Serve the Flask app on port 5000, the same port the tunnel targets.
app.run(port=5000)