# 🎙️ XTTSv2 API Server v3

**Fast Voice Cloning** - 2x realtime on T4 GPU!

**Features:**
- Voice cloning from 6-10s audio
- 17 languages supported
- Much faster than Qwen3-TTS

**How to use:**
1. Runtime → Change runtime type → **T4 GPU**
2. Run all cells (Ctrl+F9)
3. Copy ngrok URL to your XTTSv2 page

In [None]:
# Install the runtime dependencies for this notebook. The leading `!` runs
# each line as a shell command from the notebook cell.
#   1. torch + torchaudio from the PyTorch CUDA 12.1 wheel index.
#   2. coqui-tts (with its "codec" extra) plus the web stack used further
#      down: flask, flask-cors and pyngrok.
# NOTE(review): the original comment says the codec extra is for
# PyTorch 2.9+, but the cu121 index may not carry 2.9+ builds — confirm
# which torch version actually gets installed.
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q "coqui-tts[codec]" flask flask-cors pyngrok

In [None]:
from google.colab import drive
import os

# Mount Google Drive so the cache directory below lives on Drive rather
# than on the Colab VM's own disk.
drive.mount('/content/drive')

# Cache directory on Drive; created on first run, reused afterwards.
CACHE_DIR = '/content/drive/MyDrive/xtts_cache'
os.makedirs(CACHE_DIR, exist_ok=True)
# Presumably pre-accepts the Coqui model terms so the model download does
# not stop at an interactive prompt — confirm against the coqui-tts docs.
os.environ['COQUI_TOS_AGREED'] = '1'
# Point the Hugging Face cache at the Drive folder.
# NOTE(review): whether coqui-tts stores the XTTS weights under HF_HOME
# (as opposed to its own data directory) is not visible here — confirm.
os.environ['HF_HOME'] = CACHE_DIR

print(f"‚úÖ Cache: {CACHE_DIR}")

In [None]:
from TTS.api import TTS
import torch

print("üîÑ Loading XTTSv2 model...")
print(f"   GPU: {torch.cuda.get_device_name(0)}")

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")

print("‚úÖ XTTSv2 loaded!")
print(f"   Languages: {tts.languages}")

In [None]:
# Enter ngrok token: https://dashboard.ngrok.com/get-started/your-authtoken
# The trailing `# @param` annotation below is a Colab form marker and must
# stay on the same line as the assignment.
NGROK_TOKEN = ""  # @param {type:"string"}

from pyngrok import ngrok
# Only register the token when one was provided; with an empty token the
# later ngrok.connect() call runs unauthenticated.
# NOTE(review): ngrok may reject unauthenticated tunnels — confirm.
if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)
    print("‚úÖ ngrok token set!")

In [None]:
from flask import Flask, request, send_file, jsonify, Response
from flask_cors import CORS
import soundfile as sf
import numpy as np
import io, base64, tempfile, os, json, re

# Flask application that exposes the TTS endpoints defined below.
app = Flask(__name__)
# Enable CORS on the app so a web page served from another origin can call
# these endpoints from the browser.
CORS(app)

# Maps the human-readable language name sent by the client to the language
# code passed to the model as `language=`. Names not present here fall back
# to 'en' in the request handler.
LANGUAGES = {
    "English": "en", "Chinese": "zh-cn", "Japanese": "ja", "Korean": "ko",
    "German": "de", "French": "fr", "Spanish": "es", "Italian": "it",
    "Portuguese": "pt", "Russian": "ru", "Arabic": "ar", "Hindi": "hi",
    "Turkish": "tr", "Polish": "pl", "Dutch": "nl", "Czech": "cs", "Hungarian": "hu"
}

def split_text(text, max_chars=250):
    """Split *text* into chunks of at most *max_chars* characters.

    Text is first split into sentences at whitespace that follows ``.``,
    ``!`` or ``?``. Consecutive sentences are packed greedily into a chunk
    as long as the chunk, including the single joining space, stays within
    *max_chars*. A sentence that is itself longer than *max_chars* is
    wrapped on whitespace; a single word longer than *max_chars* is cut
    into fixed-width slices as a last resort.

    Args:
        text: The input string.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A non-empty list of chunks. If *text* contains no non-whitespace
        content, ``[text]`` is returned unchanged.

    Raises:
        ValueError: If *max_chars* is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    def _wrap_long_sentence(sentence):
        # Break one over-long sentence into pieces that each fit max_chars.
        pieces = []
        line = ""
        for word in sentence.split():
            # A word that cannot fit on any line is sliced by width.
            while len(word) > max_chars:
                if line:
                    pieces.append(line)
                    line = ""
                pieces.append(word[:max_chars])
                word = word[max_chars:]
            if not word:
                continue
            if not line:
                line = word
            elif len(line) + 1 + len(word) <= max_chars:
                line = f"{line} {word}"
            else:
                pieces.append(line)
                line = word
        if line:
            pieces.append(line)
        return pieces

    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > max_chars:
            # Flush what we have, then wrap the oversize sentence. Its last
            # piece stays open so following sentences can still join it.
            if current:
                chunks.append(current)
            pieces = _wrap_long_sentence(sentence)
            chunks.extend(pieces[:-1])
            current = pieces[-1]
            continue

        # Count the joining space so a chunk never exceeds max_chars.
        joined_len = len(current) + len(sentence) + (1 if current else 0)
        if joined_len <= max_chars:
            current = f"{current} {sentence}" if current else sentence
        else:
            chunks.append(current)
            current = sentence

    if current:
        chunks.append(current)
    return chunks if chunks else [text]

@app.route('/health', methods=['GET'])
def health():
    """Return a JSON status document for the running server.

    The payload carries a fixed status/model/version triple, the name of
    CUDA device 0, and the list of language names the API accepts.
    """
    payload = {
        "status": "ok",
        "model": "XTTSv2",
        "version": "v3",
        "gpu": torch.cuda.get_device_name(0),
        "languages": list(LANGUAGES),
    }
    return jsonify(payload)

# Speaker clip downloaded when the request carries no reference audio.
_DEFAULT_SPEAKER_URL = "https://github.com/coqui-ai/TTS/raw/dev/tests/data/ljspeech/wavs/LJ001-0001.wav"


def _remove_quietly(path):
    """Delete *path* if it is set; cleanup must never mask the real error."""
    if not path:
        return
    try:
        os.unlink(path)
    except OSError:
        # Best-effort cleanup: the file may already be gone.
        pass


def _new_temp_wav():
    """Create an empty, uniquely named .wav temp file and return its path."""
    fd, path = tempfile.mkstemp(prefix='xtts_', suffix='.wav')
    os.close(fd)
    return path


def _save_reference_audio(ref_audio_b64):
    """Write the speaker reference to a temp .wav and return its path.

    Uses the base64 payload when given, otherwise downloads the default
    speaker clip. The caller owns the returned file and must delete it.
    """
    ref_path = _new_temp_wav()
    try:
        if ref_audio_b64:
            with open(ref_path, 'wb') as f:
                f.write(base64.b64decode(ref_audio_b64))
        else:
            import urllib.request
            urllib.request.urlretrieve(_DEFAULT_SPEAKER_URL, ref_path)
    except Exception:
        _remove_quietly(ref_path)
        raise
    return ref_path


def _sse(payload):
    """Format *payload* as one server-sent-events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


@app.route('/api/tts', methods=['POST'])
def generate_tts():
    """Synthesize speech for a JSON request, optionally cloning a voice.

    Request JSON fields:
        text: Text to speak (default ``'Hello'``).
        language: A key of ``LANGUAGES`` (default ``'English'``); unknown
            names fall back to ``'en'``.
        ref_audio: Optional base64-encoded reference audio for the voice.
        stream: When true, respond with a ``text/event-stream`` of progress
            events ending in a ``complete`` event that carries the
            base64-encoded WAV; otherwise respond with the WAV file.

    Returns:
        A WAV attachment, an SSE ``Response``, or a JSON error with status
        400 (malformed request) or 500 (synthesis failure).
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so malformed requests get a JSON 400 rather than a stack trace.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', 'Hello')
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "'text' must be a non-empty string"}), 400
    language = data.get('language', 'English')
    ref_audio_b64 = data.get('ref_audio')
    stream = data.get('stream', False)

    lang_code = LANGUAGES.get(language, 'en')

    if not stream:
        ref_path = None
        out_path = None
        try:
            print(f"üéôÔ∏è Gen: lang={language}, chars={len(text)}")

            ref_path = _save_reference_audio(ref_audio_b64)

            # Per-request output path: a fixed filename would let
            # concurrent requests overwrite each other's audio.
            out_path = _new_temp_wav()
            tts.tts_to_file(text=text, speaker_wav=ref_path, language=lang_code, file_path=out_path)

            # Load the bytes into memory so the temp file can be removed
            # before the response is sent.
            with open(out_path, 'rb') as f:
                wav_bytes = io.BytesIO(f.read())

            return send_file(wav_bytes, mimetype='audio/wav', as_attachment=True, download_name='output.wav')
        except Exception as e:
            import traceback; traceback.print_exc()
            return jsonify({"error": str(e)}), 500
        finally:
            _remove_quietly(ref_path)
            _remove_quietly(out_path)

    # Streaming with progress
    def generate_with_progress():
        ref_path = None
        try:
            print(f"üéôÔ∏è [Stream] lang={language}, chars={len(text)}")

            chunks = split_text(text, max_chars=250)
            total = len(chunks)
            print(f"   Split into {total} chunks")

            yield _sse({'type': 'progress', 'current': 0, 'total': total, 'percent': 0, 'status': 'Preparing...'})

            ref_path = _save_reference_audio(ref_audio_b64)

            yield _sse({'type': 'progress', 'current': 0, 'total': total, 'percent': 5, 'status': 'Reference loaded'})

            all_audio = []
            sr = None

            for i, chunk in enumerate(chunks):
                # Progress runs from 5% to 95% across the chunks.
                pct = int((i / total) * 90) + 5
                yield _sse({'type': 'progress', 'current': i, 'total': total, 'percent': pct, 'status': f'Chunk {i+1}/{total}...'})

                print(f"   Chunk {i+1}/{total}")

                out_path = _new_temp_wav()
                try:
                    tts.tts_to_file(text=chunk, speaker_wav=ref_path, language=lang_code, file_path=out_path)
                    audio, sr = sf.read(out_path)
                finally:
                    _remove_quietly(out_path)
                all_audio.append(audio)

                pct = int(((i + 1) / total) * 90) + 5
                yield _sse({'type': 'progress', 'current': i + 1, 'total': total, 'percent': pct, 'status': f'Chunk {i+1}/{total} done'})

            yield _sse({'type': 'progress', 'current': total, 'total': total, 'percent': 95, 'status': 'Combining...'})

            # Join the per-chunk sample arrays and re-encode as one WAV.
            final = np.concatenate(all_audio)
            buffer = io.BytesIO()
            sf.write(buffer, final, sr, format='WAV')
            buffer.seek(0)
            audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')

            duration = len(final) / sr
            print(f"‚úÖ Generated {duration:.1f}s from {total} chunks")

            yield _sse({'type': 'complete', 'audio': audio_b64, 'duration': round(duration, 1), 'chunks': total})

        except Exception as e:
            import traceback; traceback.print_exc()
            yield _sse({'type': 'error', 'message': str(e)})
        finally:
            # Also runs when the client disconnects and the generator is
            # closed early, so the reference file is never left behind.
            _remove_quietly(ref_path)

    return Response(generate_with_progress(), mimetype='text/event-stream', headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Accel-Buffering': 'no'})

@app.route('/api/clone', methods=['POST'])
def voice_clone():
    """Alias endpoint: handle POST /api/clone exactly like /api/tts."""
    response = generate_tts()
    return response

# Open an ngrok tunnel to local port 5000 and print it so it can be copied
# into the client page. The port here must match the one given to
# app.run() below.
public_url = ngrok.connect(5000)
print("\n" + "="*50)
print(f"üöÄ XTTSv2 URL: {public_url}")
print("="*50 + "\n")

# Start the Flask server on the tunneled port. This call blocks, so the
# notebook cell keeps running until the server is stopped.
app.run(port=5000)