# 🎙️ Qwen3 TTS Server for Social Video Engine

This notebook runs Qwen3-TTS as an API server on Google Colab (free GPU).

**How it works:**
1. Loads the Qwen3-TTS model on the Colab T4 GPU
2. Exposes an HTTP API via ngrok
3. Your video pipeline sends text → gets back audio
4. **Cost: $0.00**

**Setup:**
1. Make sure GPU runtime is enabled: Runtime → Change runtime type → T4 GPU
2. Run all cells
3. Copy the ngrok URL and set it in your render pipeline

In [None]:
# Cell 1: Install dependencies
# Quietly (-q) install the packages the later cells import: the qwen-tts model
# wrapper, Flask (HTTP API), pyngrok (public tunnel), soundfile/numpy (audio
# encoding) and torch.
!pip install -q qwen-tts flask pyngrok soundfile numpy torch
# Optional extra: try to install flash-attn. Its stderr is discarded and, if the
# install fails, only a notice is echoed so the remaining cells can still run.
!pip install -q flash-attn --no-build-isolation 2>/dev/null || echo 'Flash attention install failed, continuing without it'

In [None]:
# Cell 2: Check GPU
import torch
# Fail early with an actionable message instead of an opaque CUDA error when
# the notebook is running on a CPU-only runtime.
if not torch.cuda.is_available():
    raise RuntimeError(
        'No CUDA GPU detected. Enable one via Runtime → Change runtime type → '
        'T4 GPU, then run this cell again.'
    )

print(f'GPU: {torch.cuda.get_device_name(0)}')
# The device-properties attribute is `total_memory` (bytes); `total_mem` does
# not exist and raised AttributeError. Convert bytes to GiB for display.
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
print(f'PyTorch: {torch.__version__}')

In [None]:
# Cell 3: Load model (downloads ~1.5GB on first run)
from qwen_tts import Qwen3TTSModel
import torch

# Use 0.6B for free Colab (less VRAM), 1.7B for paid/Pro
MODEL_NAME = "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice"
# MODEL_NAME = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"  # Uncomment for better quality

# Arguments shared by both load attempts, so the two calls cannot drift apart.
_load_kwargs = {"device_map": "cuda:0", "dtype": torch.bfloat16}

try:
    # Preferred path: request the FlashAttention 2 implementation.
    model = Qwen3TTSModel.from_pretrained(
        MODEL_NAME,
        attn_implementation="flash_attention_2",
        **_load_kwargs,
    )
except Exception as exc:
    # Best-effort fallback: retry with the default attention implementation.
    # The underlying error is printed so that a failure unrelated to flash
    # attention is not silently mislabeled; if the retry also fails, its
    # exception propagates as before.
    print(
        "Flash attention not available, using default "
        f"({type(exc).__name__}: {exc})"
    )
    model = Qwen3TTSModel.from_pretrained(MODEL_NAME, **_load_kwargs)

print(f'✅ Model loaded: {MODEL_NAME}')
print(f'Speakers: {model.get_supported_speakers()}')
print(f'Languages: {model.get_supported_languages()}')

In [None]:
# Cell 4: Quick test — generate a sample
import soundfile as sf
from IPython.display import Audio

# Synthesize one short sample to confirm the loaded model works end to end.
wavs, sr = model.generate_custom_voice(
    text="AI automation saves businesses over twenty hours every single week. And the best part? It costs almost nothing to get started.",
    language="English",
    speaker="Ryan",
    instruct="Speak with confidence and enthusiasm, like a tech YouTuber.",
)

# Keep a copy on disk, then report the clip length
# (number of samples in the first waveform divided by the rate passed to sf.write).
sf.write("/tmp/test_qwen_tts.wav", wavs[0], sr)
print(f'✅ Generated {len(wavs[0])/sr:.1f}s of audio')
# Must stay the last expression in the cell so the notebook displays the
# Audio object.
Audio(wavs[0], rate=sr)

In [None]:
# Cell 5: Start the API server
import os
import io
import json
import base64
import threading
import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file

# Flask application object; the @app.route handlers in this cell register on it.
app = Flask(__name__)

@app.route('/health', methods=['GET'])
def health():
    """Health probe: respond with a fixed 'ok' status and the configured model name."""
    payload = {'status': 'ok', 'model': MODEL_NAME}
    return jsonify(payload)

@app.route('/speakers', methods=['GET'])
def speakers():
    """Return the speakers and languages the model reports as supported."""
    supported_speakers = model.get_supported_speakers()
    supported_languages = model.get_supported_languages()
    return jsonify(
        speakers=supported_speakers,
        languages=supported_languages,
    )

@app.route('/tts', methods=['POST'])
def tts():
    """
    Generate TTS audio for a single piece of text.

    Body JSON:
    {
        "text": "Text to speak",   // Required, non-empty string
        "speaker": "Ryan",        // Ryan, Aiden, Vivian, etc.
        "language": "English",     // English, Chinese, Arabic, etc.
        "instruct": "Speak with confidence",  // Optional style instruction
        "format": "wav"           // wav (default) or base64
    }

    Responses:
        200 - WAV bytes (audio/wav), or for format == "base64" a JSON object
              with audio_base64, sample_rate, duration and format.
        400 - JSON error when the body is not a JSON object or "text" is
              missing, empty, or not a string.
        500 - JSON error carrying the exception message when generation or
              encoding fails.
    """
    # silent=True makes a missing/malformed body or wrong Content-Type yield
    # None, so the client always receives a JSON error from this handler.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'request body must be a JSON object'}), 400

    text = data.get('text', '')
    speaker = data.get('speaker', 'Ryan')
    language = data.get('language', 'English')
    instruct = data.get('instruct', '')
    output_format = data.get('format', 'wav')

    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': 'text is required'}), 400

    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
            # An empty instruction is passed as None rather than ''.
            instruct=instruct or None,
        )

        audio_data = wavs[0]

        # Encode to an in-memory WAV once; both output formats share it.
        buf = io.BytesIO()
        sf.write(buf, audio_data, sr, format='WAV')
        buf.seek(0)

        if output_format == 'base64':
            return jsonify({
                'audio_base64': base64.b64encode(buf.read()).decode('utf-8'),
                'sample_rate': sr,
                'duration': len(audio_data) / sr,
                'format': 'wav'
            })

        # Any other format value falls through to a raw WAV response.
        return send_file(buf, mimetype='audio/wav',
                         download_name='tts_output.wav')
    except Exception as e:
        # Request boundary: report the failure as JSON instead of letting
        # Flask render its own error page.
        return jsonify({'error': str(e)}), 500

@app.route('/tts/batch', methods=['POST'])
def tts_batch():
    """
    Batch TTS for multiple scenes.

    Body JSON:
    {
        "scenes": [
            {"text": "...", "speaker": "Ryan", "instruct": "..."},
            {"text": "...", "speaker": "Ryan", "instruct": "..."}
        ],
        "language": "English",
        "format": "base64"
    }

    Note: "format" is not read by this handler; audio is always returned
    base64-encoded.

    Responses:
        200 - {"results": [...], "sample_rate": <rate or null>}. Each result
              is either {"audio_base64", "duration", "success": true} or
              {"error", "success": false}; one failing scene does not abort
              the others. "sample_rate" is the rate of the last successfully
              generated scene, or null when none succeeded.
        400 - JSON error when the body is not a JSON object or "scenes" is
              not a list.
    """
    # silent=True: a missing/malformed body yields None instead of a Flask
    # error page, so validation errors are always reported as JSON.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'request body must be a JSON object'}), 400

    scenes = data.get('scenes', [])
    if not isinstance(scenes, list):
        return jsonify({'error': 'scenes must be a list'}), 400
    language = data.get('language', 'English')

    results = []
    # Initialised up front: previously the name was only bound inside the
    # loop, so an empty batch (or one where every scene failed) crashed the
    # final jsonify with UnboundLocalError.
    sample_rate = None
    for scene in scenes:
        try:
            wavs, sr = model.generate_custom_voice(
                text=scene['text'],
                language=language,
                speaker=scene.get('speaker', 'Ryan'),
                instruct=scene.get('instruct', '') or None,
            )
            buf = io.BytesIO()
            sf.write(buf, wavs[0], sr, format='WAV')
            buf.seek(0)
            b64 = base64.b64encode(buf.read()).decode('utf-8')
            results.append({
                'audio_base64': b64,
                'duration': len(wavs[0]) / sr,
                'success': True
            })
            sample_rate = sr
        except Exception as e:
            # Deliberately broad: malformed scenes (missing "text", wrong
            # type) and generation errors are recorded per scene.
            results.append({'error': str(e), 'success': False})

    return jsonify({'results': results, 'sample_rate': sample_rate})

# Run Flask in a daemon thread: app.run() blocks, so starting it on a
# background thread lets this cell finish while the server keeps serving.
threading.Thread(
    target=app.run,
    kwargs={'port': 5000, 'debug': False},
    daemon=True,
).start()
print('✅ Flask server running on port 5000')

In [None]:
# Cell 6: Expose via ngrok (free)
# Get your free ngrok token at https://ngrok.com (sign up, copy authtoken)
NGROK_TOKEN = ""  # Paste your ngrok token here

from pyngrok import ngrok

# Only set the authtoken when one was pasted above.
if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)

# Tunnel to the Flask server's port (5000). Current pyngrok returns an
# NgrokTunnel object from connect(); formatting that object into the messages
# below would print the tunnel description rather than a usable URL, so take
# its public_url. The getattr fallback keeps this working if connect() returns
# a plain string (as older pyngrok releases did).
_tunnel = ngrok.connect(5000)
public_url = getattr(_tunnel, 'public_url', _tunnel)

print(f'\n🌐 TTS API is live at: {public_url}')
print('\nTest it:')
print(f'  curl -X POST {public_url}/tts \\')
print('    -H "Content-Type: application/json" \\')
print('    -d \'{"text": "Hello world", "speaker": "Ryan"}\' \\')
print('    --output test.wav')
print('\n📋 Set this URL in your video pipeline config.')
print('\n⚠️  Keep this notebook running! Close it = API goes down.')

In [None]:
# Cell 7: Keep alive — run this to prevent Colab from timing out
import time
# Seconds to sleep between heartbeat messages.
HEARTBEAT_SECONDS = 60

print('Keeping session alive... (Ctrl+C to stop)')
# Runs until the cell is interrupted: sleep, then print a timestamped status
# line that includes the public URL.
while True:
    time.sleep(HEARTBEAT_SECONDS)
    print(f'[{time.strftime("%H:%M")}] Server alive at {public_url}')