# 🎙️ Qwen3-TTS for Kaggle (with Chunking)

**Supports text of any length** - it is split automatically into chunks of up to 2,000 characters!

**Before running:**
1. Settings → Accelerator → **GPU T4 x2**
2. Settings → Internet → **ON**
3. Run cells with **Shift+Enter** (one by one!)

In [None]:
!pip install -q qwen-tts flask flask-cors pyngrok soundfile numpy

In [None]:
import torch
from qwen_tts import Qwen3TTSModel

# Announce the load and report which device was detected before the
# checkpoint is fetched and placed on the first CUDA device.
print("üîÑ Loading Qwen3-TTS model...")
device_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'
print(f"   GPU: {device_label}")

# Base Qwen3-TTS checkpoint, loaded in bfloat16 on cuda:0.
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

print("‚úÖ Model loaded!")

In [None]:
# Paste your ngrok auth token between the quotes below.
NGROK_TOKEN = ""  # <-- Get from https://dashboard.ngrok.com/get-started/your-authtoken

from pyngrok import ngrok

# Register the token with pyngrok only when one was actually provided;
# otherwise just remind the user to fill it in.
if not NGROK_TOKEN:
    print("‚ö†Ô∏è Enter your ngrok token above!")
else:
    ngrok.set_auth_token(NGROK_TOKEN)
    print("‚úÖ ngrok token set!")

In [None]:
from flask import Flask, request, send_file, jsonify, Response
from flask_cors import CORS
import soundfile as sf
import numpy as np
import io, base64, tempfile, os, json, re

# Flask application object that the route decorators in this cell register on.
app = Flask(__name__)
# Enable CORS on every route using flask-cors' default settings
# (presumably so a browser front end hosted on another origin can call the
# API -- confirm whether the allowed origins should be restricted).
CORS(app)

CHUNK_SIZE = 2000  # Characters per chunk


def split_text(text, max_chars=CHUNK_SIZE):
    """Split *text* into chunks of at most *max_chars* characters.

    Splitting prefers sentence boundaries: whitespace that follows ``.``,
    ``!``, ``?`` or the Devanagari danda (U+0964). Consecutive sentences are
    packed into one chunk, joined by a single space, while they fit. A
    sentence that is longer than *max_chars* on its own is split at word
    boundaries, and a single whitespace-free token that is still too long
    (a URL, or text in a script written without spaces) is cut into
    fixed-size slices so that no chunk ever exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of stripped chunks. If nothing was produced (for example the
        input is empty), the list contains the original *text* unchanged.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    # The danda is written as an escape so the pattern cannot be damaged by
    # a file-encoding round trip.
    sentences = re.split(r'(?<=[.!?\u0964])\s+', text)

    chunks = []
    current = ""
    for sentence in sentences:
        if len(sentence) > max_chars:
            # An oversized sentence always starts a fresh chunk and is then
            # packed word by word.
            if current:
                chunks.append(current.strip())
                current = ""
            pieces = sentence.split()
        else:
            pieces = [sentence]

        for piece in pieces:
            # Count the joining space, otherwise a chunk can end up one
            # character over the limit.
            separator = 1 if current else 0
            if len(current) + separator + len(piece) <= max_chars:
                current = f"{current} {piece}" if current else piece
                continue

            if current:
                chunks.append(current.strip())
            # Only a whitespace-free token can still be too long here;
            # slice it so the size limit holds unconditionally.
            while len(piece) > max_chars:
                chunks.append(piece[:max_chars])
                piece = piece[max_chars:]
            current = piece

    if current:
        chunks.append(current.strip())
    return chunks if chunks else [text]

@app.route('/health', methods=['GET'])
def health():
    """Report liveness and basic configuration of the TTS service.

    Returns:
        A JSON response with the keys ``status``, ``model``, ``gpu`` and
        ``chunk_size``. ``gpu`` is the name of CUDA device 0, or ``'CPU'``
        when CUDA is not available.
    """
    # get_device_name(0) raises without a CUDA device; a health probe should
    # answer instead of failing, so fall back to a plain label.
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'
    return jsonify({"status": "ok", "model": "Qwen3-TTS-1.7B", "gpu": gpu_name, "chunk_size": CHUNK_SIZE})

def _sse_event(payload):
    """Serialize *payload* as one Server-Sent Events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


def _save_ref_audio(ref_audio_b64):
    """Decode the base64 reference clip into a temporary ``.wav`` file.

    Returns the path of the file. It is created with ``delete=False``, so the
    caller is responsible for removing it.
    """
    audio_bytes = base64.b64decode(ref_audio_b64)
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        f.write(audio_bytes)
        return f.name


def _remove_ref_audio(ref_path):
    """Best-effort removal of the temporary reference clip (no-op for None)."""
    if ref_path is None:
        return
    try:
        os.unlink(ref_path)
    except OSError:
        # Cleanup must never replace the real outcome of the request.
        pass


def _clone_chunk(chunk, language, ref_path, ref_text):
    """Synthesize one text chunk in the voice of the reference clip.

    When a non-blank transcript of the reference clip is given it is passed
    to the model as ``ref_text``; otherwise the model is called with
    ``x_vector_only_mode=True``.

    Returns:
        ``(waveform, sample_rate)`` where *waveform* is the first item of the
        batch returned by the model.
    """
    if ref_text and ref_text.strip():
        wavs, sr = model.generate_voice_clone(text=chunk, language=language, ref_audio=ref_path, ref_text=ref_text)
    else:
        wavs, sr = model.generate_voice_clone(text=chunk, language=language, ref_audio=ref_path, x_vector_only_mode=True)
    return wavs[0], sr


def _wav_buffer(audio, sample_rate):
    """Encode *audio* as WAV into an in-memory buffer rewound to offset 0."""
    buffer = io.BytesIO()
    sf.write(buffer, audio, sample_rate, format='WAV')
    buffer.seek(0)
    return buffer


@app.route('/api/tts', methods=['POST'])
def generate_tts():
    """Clone a voice from a reference clip and speak the requested text.

    Expects a JSON object with these keys:
        text: Text to synthesize (default ``'Hello'``). It is split with
            ``split_text`` into chunks of at most ``CHUNK_SIZE`` characters
            and the generated audio of all chunks is concatenated.
        language: Language name passed to the model (default ``'English'``).
        ref_audio: Base64-encoded reference audio (required).
        ref_text: Optional transcript of the reference audio.
        stream: When truthy, respond with a Server-Sent Events stream.

    Returns:
        * ``stream`` truthy: a ``text/event-stream`` response whose events
          are JSON objects of type ``progress``, ``complete`` (carrying the
          base64 WAV, its duration and the chunk count) or ``error``.
        * otherwise: the WAV file, or a JSON ``{"error": ...}`` body with
          status 400 (bad request body / missing reference audio) or 500.

    The temporary file holding the reference audio is removed on every exit
    path, including failures and a client that disconnects mid-stream.
    """
    import traceback

    # silent=True yields None instead of raising on a missing or invalid JSON
    # body; anything that is not an object cannot be read with .get() below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', 'Hello')
    language = data.get('language', 'English')
    ref_audio_b64 = data.get('ref_audio')
    ref_text = data.get('ref_text', '')
    stream = data.get('stream', False)

    def generate_with_progress():
        ref_path = None
        try:
            if not ref_audio_b64:
                yield _sse_event({'type': 'error', 'message': 'Reference audio is required'})
                return

            # Split text into chunks
            chunks = split_text(text, CHUNK_SIZE)
            total_chunks = len(chunks)

            print(f"üéôÔ∏è Generating: {len(text)} chars ‚Üí {total_chunks} chunks")

            yield _sse_event({'type': 'progress', 'current': 0, 'total': total_chunks, 'percent': 0, 'status': f'Preparing {total_chunks} chunks...'})

            # Decode reference audio
            ref_path = _save_ref_audio(ref_audio_b64)

            yield _sse_event({'type': 'progress', 'current': 0, 'total': total_chunks, 'percent': 5, 'status': 'Reference audio loaded'})

            all_audio = []
            sample_rate = None

            for i, chunk in enumerate(chunks):
                # Progress runs from 5% to 95% across the chunks.
                pct = int((i / total_chunks) * 90) + 5
                yield _sse_event({'type': 'progress', 'current': i, 'total': total_chunks, 'percent': pct, 'status': f'Generating chunk {i+1}/{total_chunks}...'})

                try:
                    wav, sr = _clone_chunk(chunk, language, ref_path, ref_text)
                except Exception as chunk_err:
                    print(f"‚ùå Chunk {i+1} failed: {chunk_err}")
                    yield _sse_event({'type': 'error', 'message': f'Chunk {i+1} failed: {str(chunk_err)}'})
                    return

                all_audio.append(wav)
                if sample_rate is None:
                    sample_rate = sr

                pct = int(((i + 1) / total_chunks) * 90) + 5
                yield _sse_event({'type': 'progress', 'current': i + 1, 'total': total_chunks, 'percent': pct, 'status': f'Chunk {i+1}/{total_chunks} done'})

            yield _sse_event({'type': 'progress', 'current': total_chunks, 'total': total_chunks, 'percent': 95, 'status': 'Concatenating audio...'})

            # Concatenate all audio chunks
            final_audio = np.concatenate(all_audio)
            audio_b64 = base64.b64encode(_wav_buffer(final_audio, sample_rate).read()).decode('utf-8')

            duration = len(final_audio) / sample_rate
            print(f"‚úÖ Generated {duration:.1f}s from {total_chunks} chunks")

            yield _sse_event({'type': 'complete', 'audio': audio_b64, 'duration': round(duration, 1), 'chunks': total_chunks})

        except Exception as e:
            traceback.print_exc()
            yield _sse_event({'type': 'error', 'message': str(e)})
        finally:
            # Also runs when the client disconnects and the generator is
            # closed, so the temp file never outlives the request.
            _remove_ref_audio(ref_path)

    if stream:
        return Response(generate_with_progress(), mimetype='text/event-stream', headers={'Cache-Control': 'no-cache', 'X-Accel-Buffering': 'no'})

    # Non-streaming fallback
    ref_path = None
    try:
        if not ref_audio_b64:
            return jsonify({"error": "Reference audio is required"}), 400

        ref_path = _save_ref_audio(ref_audio_b64)

        all_audio = []
        sample_rate = None
        for chunk in split_text(text, CHUNK_SIZE):
            wav, sr = _clone_chunk(chunk, language, ref_path, ref_text)
            all_audio.append(wav)
            if sample_rate is None:
                sample_rate = sr

        final_audio = np.concatenate(all_audio)
        return send_file(_wav_buffer(final_audio, sample_rate), mimetype='audio/wav')
    except Exception as e:
        # Log like the streaming path does, so failures are visible in the
        # notebook output and not only in the HTTP response.
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500
    finally:
        _remove_ref_audio(ref_path)

@app.route('/api/clone', methods=['POST'])
def voice_clone():
    """Alias endpoint: serve POST /api/clone exactly like POST /api/tts.

    Takes no arguments; it delegates to generate_tts(), which reads the
    current Flask request itself, and returns that function's response
    unchanged.
    """
    return generate_tts()

# Open the ngrok tunnel and print its address before the Flask server is
# started on the same local port.
SERVER_PORT = 5000
public_url = ngrok.connect(SERVER_PORT)

divider = "=" * 50
print("\n" + divider)
print(f"üöÄ QWEN3-TTS URL: {public_url}")
print(f"   Chunk size: {CHUNK_SIZE} chars")
print(divider + "\n")

app.run(port=SERVER_PORT)