# TTS/STT Services - Complete Deployment & Testing Notebook

This notebook deploys and tests the TTS and STT ML services on Google Colab with GPU acceleration.

## What This Notebook Does
1. **Deploy Services** - Install dependencies and start STT/TTS services
2. **Setup ngrok** - Create public URLs for the services
3. **Health checks** - Verify both services are running
4. **STT testing** - Speech-to-Text transcription
5. **TTS testing** - Text-to-Speech synthesis
6. **Multi-language testing**
7. **Performance measurement**

## Prerequisites
- Google Colab with GPU runtime (Runtime → Change runtime type → T4 GPU)
- ngrok account and auth token (free at https://ngrok.com)

## Step 1: Install Dependencies & Setup

In [None]:
# @title 1.1 Install All Dependencies
# Install core dependencies
!pip install -q requests pyngrok fastapi uvicorn python-multipart pydantic pydantic-settings loguru nest_asyncio

# Install ML dependencies
!pip install -q faster-whisper TTS numpy

# Check GPU availability
import json

import requests
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No CUDA device: tell the user how to switch the Colab runtime.
    print("‚ö†Ô∏è No GPU detected! Performance will be slow.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí Select T4 GPU")
else:
    # Report the first CUDA device and its total memory in GiB.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Memory: {_gpu_props.total_memory / 1024**3:.1f} GB")

print("\n‚úÖ Dependencies installed!")

In [None]:
# @title 1.2 Setup ngrok Authentication
# @markdown Get your auth token from https://dashboard.ngrok.com/get-started/your-authtoken

# Install pyngrok if needed
import os
import subprocess
import sys
try:
    from pyngrok import ngrok, conf
except ImportError:
    print("Installing pyngrok...")
    # Use the kernel's own interpreter so pip targets the running environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pyngrok"])
    from pyngrok import ngrok, conf

NGROK_AUTH_TOKEN = ""  # @param {type:"string"}

# Prefer the form value; otherwise fall back to the NGROK_AUTH_TOKEN
# environment variable so the secret does not have to be saved in the
# notebook. Strip whitespace because a pasted token often carries a trailing
# newline or space, which would make authentication fail later.
NGROK_AUTH_TOKEN = (NGROK_AUTH_TOKEN or os.environ.get("NGROK_AUTH_TOKEN", "")).strip()

if not NGROK_AUTH_TOKEN:
    print("‚ö†Ô∏è Please enter your ngrok auth token above!")
    print("   Get it from: https://dashboard.ngrok.com/get-started/your-authtoken")
else:
    # Stored on pyngrok's default config; it is used when a tunnel is opened.
    conf.get_default().auth_token = NGROK_AUTH_TOKEN
    print("‚úÖ ngrok authenticated successfully!")

## Step 2: Deploy STT Service (Speech-to-Text)

In [None]:
# @title 2.1 Create STT Service with Faster-Whisper

# Ensure dependencies are installed
import subprocess
import sys

def install_if_missing(package, import_name=None):
    """Pip-install ``package`` when its module cannot be imported.

    Args:
        package: Distribution name passed to ``pip install``.
        import_name: Module name used for the import probe; falls back to
            ``package`` when omitted or empty.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    module_name = import_name if import_name else package
    try:
        __import__(module_name)
    except ImportError:
        needs_install = True
    else:
        needs_install = False

    if needs_install:
        print(f"Installing {package}...")
        # Run pip through the current interpreter so the running environment is targeted.
        pip_command = [sys.executable, "-m", "pip", "install", "-q", package]
        subprocess.check_call(pip_command)

# Probe each dependency by its importable module name, which can differ from
# the pip distribution name, and install only what is missing.
install_if_missing("faster-whisper", "faster_whisper")
install_if_missing("fastapi")
install_if_missing("uvicorn")
# The hyphenated distribution name can never be imported, so probing it would
# re-run pip on every execution. Recent python-multipart releases install the
# module `python_multipart`; on an older release the probe fails and pip simply
# reports the requirement as already satisfied.
install_if_missing("python-multipart", "python_multipart")
install_if_missing("nest_asyncio")

import io
import numpy as np
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import uvicorn
import threading
import torch

# Load the large-v3 checkpoint through faster-whisper
print("Loading Faster-Whisper model (large-v3)... This may take a few minutes.")
from faster_whisper import WhisperModel

# float16 when a CUDA device is present, int8 on CPU.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"

stt_model = WhisperModel("large-v3", compute_type=compute_type, device=device)
print(f"‚úÖ STT Model loaded on {device.upper()}")

# ASGI application the STT route decorators attach to
stt_app = FastAPI(title="STT Service")

class TimestampSegment(BaseModel):
    """Pydantic schema for one timed span: start, end, optional word and confidence.

    NOTE(review): no code visible in this notebook constructs this model;
    ``SttResponse.timestamps`` is typed as a list of plain dicts instead.
    """
    start: float  # presumably seconds from the start of the audio — confirm
    end: float  # presumably seconds from the start of the audio — confirm
    word: Optional[str] = None  # text of the span, when available
    confidence: Optional[float] = None  # score for the span, when available

class SttResponse(BaseModel):
    """Response body of ``POST /ml/stt/transcribe``."""
    text: str  # all segment texts joined with single spaces
    language: str  # language reported by the model, else the caller's hint, else "en"
    confidence: float  # mean of the per-segment scores, each clamped to [0, 1]
    timestamps: List[Dict[str, Any]]  # one dict per word: start, end, word, confidence
    meta: Dict[str, Any]  # duration_seconds, file_name, file_size
    modelUsed: str  # identifier of the model/backend that produced the text
    status: str = "success"  # the endpoint leaves this at its default on success

@stt_app.get("/ml/stt/health")
async def stt_health():
    """Return a fixed "ok" payload describing the STT service.

    The payload is static: no check of the loaded model is performed here.
    """
    whisper_entry = {"name": "whisper_large-v3", "status": "ready", "type": "stt"}
    payload = {
        "status": "ok",
        "detail": "stt-service healthy",
        "models": [whisper_entry],
    }
    return payload

@stt_app.post("/ml/stt/transcribe", response_model=SttResponse)
async def transcribe(
    file: UploadFile = File(...),
    language_hint: Optional[str] = Form(default=None)
):
    """Transcribe an uploaded audio file with the module-level ``stt_model``.

    Args:
        file: Uploaded audio. Its bytes are written to a temporary file whose
            path is handed to ``stt_model.transcribe``.
        language_hint: Optional language code forwarded to the model; a missing
            or empty value is passed as ``None``.

    Returns:
        An ``SttResponse`` on success, or a ``JSONResponse`` with HTTP 500 and
        ``{"error": ..., "status": "failed"}`` when anything raises.

    NOTE(review): the model call and the segment loop run synchronously inside
    this coroutine, so the server's event loop is busy for the whole
    transcription.
    """
    import os
    import tempfile

    tmp_path = None
    try:
        # Read audio file
        audio_bytes = await file.read()

        # Save to temp file for faster-whisper. A client may omit the
        # filename, in which case the temp file simply gets no suffix.
        suffix = os.path.splitext(file.filename or "")[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        # Transcribe with Faster-Whisper
        segments, info = stt_model.transcribe(
            tmp_path,
            language=language_hint if language_hint else None,
            word_timestamps=True,
            vad_filter=True,
            beam_size=5
        )

        # Process results. The temp file is kept until this loop has finished.
        full_text_parts = []
        all_timestamps = []
        total_confidence = 0.0
        segment_count = 0

        for segment in segments:
            full_text_parts.append(segment.text.strip())
            segment_count += 1

            # Heuristic: map avg_logprob onto [0, 1] (0 -> 1.0, -5 or lower -> 0.0)
            segment_confidence = min(1.0, max(0.0, 1.0 + (segment.avg_logprob / 5.0)))
            total_confidence += segment_confidence

            # Extract word timestamps; each word carries its segment's score
            if segment.words:
                for word in segment.words:
                    all_timestamps.append({
                        "start": round(word.start, 3),
                        "end": round(word.end, 3),
                        "word": word.word.strip(),
                        "confidence": round(segment_confidence, 3)
                    })

        full_text = " ".join(full_text_parts).strip()
        # max(..., 1) keeps the average defined when no segment was produced
        avg_confidence = total_confidence / max(segment_count, 1)

        return SttResponse(
            text=full_text,
            language=info.language or language_hint or "en",
            confidence=round(avg_confidence, 3),
            timestamps=all_timestamps,
            meta={
                "duration_seconds": round(info.duration, 2),
                "file_name": file.filename,
                "file_size": len(audio_bytes)
            },
            modelUsed="whisper_large-v3:faster-whisper"
        )

    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": str(e), "status": "failed"}
        )

    finally:
        # Remove the temp file on both the success and the failure path so
        # failed requests do not leave uploads behind on disk.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the response.
                pass

print("‚úÖ STT Service created")

In [None]:
# @title 2.2 Start STT Service & Create ngrok Tunnel
import socket
import time

import nest_asyncio
nest_asyncio.apply()

# Start STT server in background thread
STT_PORT = 8002

def run_stt_server():
    """Serve ``stt_app`` with uvicorn on ``STT_PORT``; blocks until the server exits."""
    uvicorn.run(stt_app, host="0.0.0.0", port=STT_PORT, log_level="warning")

stt_thread = threading.Thread(target=run_stt_server, daemon=True)
stt_thread.start()

# Wait until the port actually accepts connections instead of sleeping a fixed
# time, and fail loudly if the server thread died (for example because the
# port is already taken) or never came up.
STT_STARTUP_TIMEOUT = 30  # seconds
_stt_deadline = time.monotonic() + STT_STARTUP_TIMEOUT
while True:
    try:
        socket.create_connection(("127.0.0.1", STT_PORT), timeout=1).close()
        break
    except OSError:
        if not stt_thread.is_alive() or time.monotonic() >= _stt_deadline:
            raise RuntimeError(
                f"STT server is not accepting connections on port {STT_PORT}"
            )
        time.sleep(0.25)

# Create ngrok tunnel for STT
from pyngrok import ngrok

stt_tunnel = ngrok.connect(STT_PORT, "http")
STT_URL = stt_tunnel.public_url

print("="*60)
print("‚úÖ STT SERVICE DEPLOYED!")
print("="*60)
print(f"üåê Public URL: {STT_URL}")
print(f"üè• Health Check: {STT_URL}/ml/stt/health")
print(f"üé§ Transcribe: {STT_URL}/ml/stt/transcribe")
print("="*60)

## Step 3: Deploy TTS Service (Text-to-Speech)

In [None]:
# @title 3.1 Create TTS Service with Coqui XTTS

# Ensure TTS is installed
import subprocess
import sys

# Import the Coqui TTS API; if the import fails, install the `TTS`
# distribution and retry the import once.
try:
    from TTS.api import TTS
except ImportError:
    print("Installing TTS (Coqui)... This may take a minute.")
    # Run pip through the current interpreter so the running environment is targeted.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "TTS"])
    from TTS.api import TTS

import os
import uuid
import base64
import torch

# Load the XTTS v2 checkpoint and move it to the selected device
print("Loading Coqui XTTS v2 model... This may take a few minutes.")

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts_model = tts_model.to(device)
print(f"‚úÖ TTS Model loaded on {device.upper()}")

# Directory that receives every synthesized WAV file
os.makedirs("/content/tts_output", exist_ok=True)

# ASGI application the TTS route decorators attach to
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import Dict, Any, Optional

tts_app = FastAPI(title="TTS Service")

class TtsRequest(BaseModel):
    """JSON request body of ``POST /ml/tts/predict``."""
    text: str  # text to synthesize
    language: str = "en"  # language code handed to the TTS model
    speed: float = 1.0  # speed value handed to the TTS model
    # Optional speaker reference; presumably a path to a WAV file readable by
    # the server — NOTE(review): confirm how the endpoint consumes it.
    speaker_wav: Optional[str] = None

class TtsResponse(BaseModel):
    """Response body of ``POST /ml/tts/predict``."""
    audio_path: str  # server-side path of the generated WAV file
    audio_base64: Optional[str] = None  # base64 text of the WAV file's bytes
    duration: float  # audio length in seconds (frame count / frame rate), 2 decimals
    status: str  # "success" on the success path
    meta: Dict[str, Any]  # language, speed, model, text_length

@tts_app.get("/ml/tts/health")
async def tts_health():
    """Return a fixed "ok" payload describing the TTS service.

    The payload is static: no check of the loaded model is performed here.
    """
    xtts_entry = {"name": "xtts_v2", "status": "ready", "type": "tts"}
    payload = {
        "status": "ok",
        "detail": "tts-service healthy",
        "models": [xtts_entry],
    }
    return payload

@tts_app.post("/ml/tts/predict", response_model=TtsResponse)
async def synthesize(request: TtsRequest):
    """Synthesize ``request.text`` to a WAV file and return it inline.

    The audio is written under ``/content/tts_output`` with a random file name
    and returned both as a server path and as base64 text.

    Voice selection: ``request.speaker_wav`` is forwarded to the model when
    given. Otherwise the first entry of ``tts_model.speakers`` is used, if the
    model exposes any. When neither is available the call is made without a
    speaker.

    Returns:
        A ``TtsResponse`` on success, or a ``JSONResponse`` with HTTP 500 and
        ``{"error": ..., "status": "failed"}`` when anything raises.

    NOTE(review): ``speaker_wav`` is a caller-supplied path opened on the
    server, and this service is exposed through a public tunnel — consider
    restricting it to a known directory.
    """
    import wave

    try:
        # Generate unique filename
        output_filename = f"/content/tts_output/{uuid.uuid4().hex}.wav"

        synth_kwargs = {
            "text": request.text,
            "language": request.language,
            "file_path": output_filename,
            "speed": request.speed,
        }

        # XTTS v2 is a multi-speaker model: Coqui's API rejects a call that
        # names neither `speaker` nor `speaker_wav`, so always supply one.
        if request.speaker_wav:
            synth_kwargs["speaker_wav"] = request.speaker_wav
        else:
            builtin_speakers = getattr(tts_model, "speakers", None) or []
            default_speaker = next(iter(builtin_speakers), None)
            if default_speaker is not None:
                synth_kwargs["speaker"] = default_speaker

        # Synthesize speech
        tts_model.tts_to_file(**synth_kwargs)

        # Get audio duration from the WAV header
        with wave.open(output_filename, 'rb') as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
            duration = frames / float(rate)

        # Read audio as base64 for response
        with open(output_filename, 'rb') as f:
            audio_base64 = base64.b64encode(f.read()).decode('utf-8')

        return TtsResponse(
            audio_path=output_filename,
            audio_base64=audio_base64,
            duration=round(duration, 2),
            status="success",
            meta={
                "language": request.language,
                "speed": request.speed,
                "model": "xtts_v2",
                "text_length": len(request.text)
            }
        )

    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": str(e), "status": "failed"}
        )

print("‚úÖ TTS Service created")

In [None]:
# @title 3.2 Start TTS Service & Create ngrok Tunnel
import socket

# Start TTS server in background thread
TTS_PORT = 8001

def run_tts_server():
    """Serve ``tts_app`` with uvicorn on ``TTS_PORT``; blocks until the server exits."""
    uvicorn.run(tts_app, host="0.0.0.0", port=TTS_PORT, log_level="warning")

tts_thread = threading.Thread(target=run_tts_server, daemon=True)
tts_thread.start()

# Wait until the port actually accepts connections instead of sleeping a fixed
# time, and fail loudly if the server thread died (for example because the
# port is already taken) or never came up.
TTS_STARTUP_TIMEOUT = 30  # seconds
_tts_deadline = time.monotonic() + TTS_STARTUP_TIMEOUT
while True:
    try:
        socket.create_connection(("127.0.0.1", TTS_PORT), timeout=1).close()
        break
    except OSError:
        if not tts_thread.is_alive() or time.monotonic() >= _tts_deadline:
            raise RuntimeError(
                f"TTS server is not accepting connections on port {TTS_PORT}"
            )
        time.sleep(0.25)

# Create ngrok tunnel for TTS
tts_tunnel = ngrok.connect(TTS_PORT, "http")
TTS_URL = tts_tunnel.public_url

print("="*60)
print("‚úÖ TTS SERVICE DEPLOYED!")
print("="*60)
print(f"üåê Public URL: {TTS_URL}")
print(f"üè• Health Check: {TTS_URL}/ml/tts/health")
print(f"üîä Synthesize: {TTS_URL}/ml/tts/predict")
print("="*60)

In [None]:
# @title 3.3 Deployment Summary - SAVE THESE URLs!
_rule = "=" * 70

print(_rule, "üéâ DEPLOYMENT COMPLETE! Both services are running.", _rule, sep="\n")
print()
print(
    "üìç STT Service (Speech-to-Text):",
    f"   URL: {STT_URL}",
    f"   Health: {STT_URL}/ml/stt/health",
    f"   Transcribe: {STT_URL}/ml/stt/transcribe",
    sep="\n",
)
print()
print(
    "üìç TTS Service (Text-to-Speech):",
    f"   URL: {TTS_URL}",
    f"   Health: {TTS_URL}/ml/tts/health",
    f"   Synthesize: {TTS_URL}/ml/tts/predict",
    sep="\n",
)
print()
print(
    _rule,
    "‚ö†Ô∏è  IMPORTANT: These URLs will change if you restart the notebook!",
    _rule,
    sep="\n",
)

# Shared headers passed by the request cells that follow
HEADERS = {"ngrok-skip-browser-warning": "true"}

## Step 4: Health Checks

In [None]:
# @title Check STT Service Health
def check_stt_health():
    """Call the STT health endpoint and print a short report.

    Returns:
        bool: True when the endpoint answers HTTP 200; False for any other
        status code or when any exception is raised.
    """
    try:
        response = requests.get(f"{STT_URL}/ml/stt/health", headers=HEADERS, timeout=30)
        if response.status_code != 200:
            print(f"‚ùå STT Service returned HTTP {response.status_code}")
            return False

        data = response.json()
        reported_models = data.get('models', [])
        print("‚úÖ STT Service is HEALTHY")
        print(f"   Status: {data.get('status')}")
        print(f"   Models: {len(reported_models)} loaded")
        for entry in reported_models:
            print(f"     - {entry.get('name')}: {entry.get('status')}")
        return True
    except Exception as exc:
        print(f"‚ùå STT Service is UNREACHABLE: {exc}")
        return False

check_stt_health()

In [None]:
# @title Check TTS Service Health
def check_tts_health():
    """Call the TTS health endpoint and print a short report.

    Returns:
        bool: True when the endpoint answers HTTP 200; False for any other
        status code or when any exception is raised.
    """
    try:
        response = requests.get(f"{TTS_URL}/ml/tts/health", headers=HEADERS, timeout=30)
        if response.status_code != 200:
            print(f"‚ùå TTS Service returned HTTP {response.status_code}")
            return False

        data = response.json()
        reported_models = data.get('models', [])
        print("‚úÖ TTS Service is HEALTHY")
        print(f"   Status: {data.get('status')}")
        print(f"   Models: {len(reported_models)} loaded")
        for entry in reported_models:
            print(f"     - {entry.get('name')}: {entry.get('status')}")
        return True
    except Exception as exc:
        print(f"‚ùå TTS Service is UNREACHABLE: {exc}")
        return False

check_tts_health()

## Step 5: STT Testing (Speech-to-Text)

In [None]:
# @title Upload Audio File for Transcription
# `files` is Colab's upload helper. Import it in this cell so the upload works
# on a fresh kernel; nothing else in the notebook brings the name into scope.
from google.colab import files

print("Upload an audio file (WAV, MP3, FLAC, etc.)")
uploaded = files.upload()

if uploaded:
    # Use the first uploaded file when several were selected.
    audio_filename = next(iter(uploaded))
    print(f"\n‚úÖ Uploaded: {audio_filename}")
    print(f"   Size: {len(uploaded[audio_filename])} bytes")
else:
    print("No file uploaded")
    audio_filename = None

In [None]:
# @title Transcribe Uploaded Audio
# @markdown Select the language hint (or leave empty for auto-detect)
language_hint = "en"  # @param ["en", "hi", "ta", "te", "es", "fr", "de", "ja", "zh", ""]

if not audio_filename:
    print("‚ö†Ô∏è Please upload an audio file first")
else:
    print(f"Transcribing: {audio_filename}")
    print(f"Language hint: {language_hint or 'auto-detect'}")
    print("\nProcessing... (this may take 30-60 seconds on first request)")

    start_time = time.time()

    with open(audio_filename, 'rb') as f:
        files_data = {"file": (audio_filename, f)}
        # Send the form field only when a hint was chosen.
        data = {}
        if language_hint:
            data["language_hint"] = language_hint

        response = requests.post(
            f"{STT_URL}/ml/stt/transcribe",
            files=files_data,
            data=data,
            headers=HEADERS,
            timeout=120
        )

    elapsed = time.time() - start_time

    if response.status_code != 200:
        print(f"\n‚ùå Transcription Failed: HTTP {response.status_code}")
        print(response.text)
    else:
        result = response.json()
        print(f"\n‚úÖ Transcription Complete ({elapsed:.2f}s)")
        print("="*50)
        print(f"üìù Text: {result['text']}")
        print(f"üåç Language: {result['language']}")
        print(f"üìä Confidence: {result['confidence']:.2%}")
        print(f"‚è±Ô∏è Duration: {result['meta'].get('duration_seconds', 'N/A')}s")
        print(f"ü§ñ Model: {result.get('modelUsed', 'N/A')}")

        # Preview at most the first ten word timestamps.
        if result.get('timestamps'):
            word_count = len(result['timestamps'])
            print(f"\nüìç Word Timestamps ({word_count} words):")
            for ts in result['timestamps'][:10]:
                print(f"   [{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['word']}")
            if word_count > 10:
                print(f"   ... and {word_count-10} more words")

## Step 6: TTS Testing (Text-to-Speech)

In [None]:
# @title Synthesize Speech from Text
# @markdown Enter the text you want to convert to speech

text_to_synthesize = "Hello! This is a test of the text to speech system."  # @param {type:"string"}
tts_language = "en"  # @param ["en", "hi", "ta", "te", "es", "fr", "de", "ja", "zh-cn"]
speech_speed = 1.0  # @param {type:"slider", min:0.5, max:2.0, step:0.1}

print(
    f"Text: {text_to_synthesize}",
    f"Language: {tts_language}",
    f"Speed: {speech_speed}x",
    sep="\n",
)
print("\nGenerating speech... (this may take 30-60 seconds on first request)")

# JSON body matching the fields of the service's request model
_synthesis_request = {
    "text": text_to_synthesize,
    "language": tts_language,
    "speed": speech_speed
}

start_time = time.time()

response = requests.post(
    f"{TTS_URL}/ml/tts/predict",
    json=_synthesis_request,
    headers={"Content-Type": "application/json", **HEADERS},
    timeout=120
)

elapsed = time.time() - start_time

if response.status_code != 200:
    print(f"\n‚ùå Speech Generation Failed: HTTP {response.status_code}")
    print(response.text)
else:
    result = response.json()
    print(f"\n‚úÖ Speech Generated ({elapsed:.2f}s)")
    print("="*50)
    print(f"‚è±Ô∏è Duration: {result['duration']:.2f}s")
    print(f"üìä Status: {result['status']}")
    print(f"üéØ MOS Score: {result['meta'].get('mos_score', 'N/A')}")
    print(f"üìÅ Server Path: {result['audio_path']}")

## Step 7: Multi-Language Testing

In [None]:
# @title Test TTS with Multiple Languages
test_texts = {
    "en": "Hello, how are you today?",
    "hi": "‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Ü‡§™ ‡§ï‡•à‡§∏‡•á ‡§π‡•à‡§Ç?",
    "es": "Hola, c√≥mo est√°s hoy?",
    "fr": "Bonjour, comment allez-vous?",
    "de": "Hallo, wie geht es Ihnen?",
    "ja": "„Åì„Çì„Å´„Å°„ÅØ„ÄÅ„ÅäÂÖÉÊ∞ó„Åß„Åô„ÅãÔºü",
    "zh-cn": "‰Ω†Â•ΩÔºå‰Ω†‰ªäÂ§©ÊÄé‰πàÊ†∑Ôºü"
}

print("Testing TTS in multiple languages...")
print("="*60)

# Per-language outcome: {"success": bool, ...}
results = {}

for lang, text in test_texts.items():
    try:
        start = time.time()
        response = requests.post(
            f"{TTS_URL}/ml/tts/predict",
            json={"text": text, "language": lang},
            headers={"Content-Type": "application/json", **HEADERS},
            timeout=60
        )
        elapsed = time.time() - start

        if response.status_code != 200:
            print(f"‚ùå {lang.upper()}: Failed (HTTP {response.status_code})")
            results[lang] = {"success": False}
            continue

        data = response.json()
        print(f"‚úÖ {lang.upper()}: {text[:30]}... ({data['duration']:.1f}s audio, {elapsed:.1f}s processing)")
        results[lang] = {"success": True, "duration": data['duration']}

    except Exception as e:
        # Record the failure and carry on with the next language.
        print(f"‚ùå {lang.upper()}: Error ({e})")
        results[lang] = {"success": False, "error": str(e)}

print("="*60)
success_count = len([r for r in results.values() if r['success']])
print(f"\nResults: {success_count}/{len(test_texts)} languages successful")

## Step 8: Performance Measurement

In [None]:
# @title Measure TTS Latency
num_runs = 3  # @param {type:"slider", min:1, max:10, step:1}

print(f"Running {num_runs} TTS requests to measure latency...")
print("="*50)

# Latency in seconds of every successful run
times = []
test_text = "Quick latency test."

for i in range(num_runs):
    # perf_counter is a monotonic clock meant for measuring intervals, so a
    # wall-clock adjustment cannot distort the measurement.
    start = time.perf_counter()
    try:
        response = requests.post(
            f"{TTS_URL}/ml/tts/predict",
            json={"text": test_text, "language": "en"},
            headers={"Content-Type": "application/json", **HEADERS},
            timeout=60
        )
    except requests.RequestException as e:
        # A timeout or connection error fails this run only; the remaining
        # runs still execute, matching the multi-language test's handling.
        print(f"   Run {i+1}: Error ({e})")
        continue
    elapsed = time.perf_counter() - start

    if response.status_code == 200:
        times.append(elapsed)
        print(f"   Run {i+1}: {elapsed:.2f}s")
    else:
        print(f"   Run {i+1}: Failed")

if times:
    print("="*50)
    print(f"\nüìä Latency Statistics:")
    print(f"   Min:  {min(times):.2f}s")
    print(f"   Max:  {max(times):.2f}s")
    print(f"   Avg:  {sum(times)/len(times):.2f}s")
else:
    print("="*50)
    print("No successful runs; latency statistics are unavailable.")

## Step 9: Integration Test (Round Trip)

In [None]:
# @title STT ‚Üí TTS Round Trip Test
# @markdown This test transcribes audio and then synthesizes it back to speech

# Language codes differ between the two services for Chinese: this notebook
# uses "zh" on the STT side and "zh-cn" on the TTS side. Codes not listed
# here are passed through unchanged.
STT_TO_TTS_LANGUAGE = {"zh": "zh-cn"}

if audio_filename and os.path.exists(audio_filename):
    print("Starting round-trip test...")
    print("="*60)

    # Step 1: Transcribe
    print("\n[Step 1] Transcribing audio...")
    start = time.time()

    with open(audio_filename, 'rb') as f:
        stt_response = requests.post(
            f"{STT_URL}/ml/stt/transcribe",
            files={"file": f},
            headers=HEADERS,
            timeout=120
        )

    stt_time = time.time() - start

    if stt_response.status_code == 200:
        stt_result = stt_response.json()
        print(f"   ‚úÖ Transcribed ({stt_time:.2f}s)")
        print(f"   Text: {stt_result['text'][:100]}...")
        print(f"   Language: {stt_result['language']}")

        # Translate the detected language into the code the TTS side expects.
        round_trip_language = STT_TO_TTS_LANGUAGE.get(
            stt_result['language'], stt_result['language']
        )

        # Step 2: Synthesize
        print("\n[Step 2] Synthesizing speech...")
        start = time.time()

        tts_response = requests.post(
            f"{TTS_URL}/ml/tts/predict",
            json={
                "text": stt_result['text'],
                "language": round_trip_language
            },
            headers={"Content-Type": "application/json", **HEADERS},
            timeout=120
        )

        tts_time = time.time() - start

        if tts_response.status_code == 200:
            tts_result = tts_response.json()
            print(f"   ‚úÖ Synthesized ({tts_time:.2f}s)")
            print(f"   Duration: {tts_result['duration']:.2f}s")

            print("\n" + "="*60)
            print("üìä Round-Trip Summary:")
            print(f"   STT Time: {stt_time:.2f}s")
            print(f"   TTS Time: {tts_time:.2f}s")
            print(f"   Total Time: {stt_time + tts_time:.2f}s")
            print("   ‚úÖ Round-trip test PASSED!")
        else:
            print(f"   ‚ùå TTS failed: {tts_response.status_code}")
    else:
        print(f"   ‚ùå STT failed: {stt_response.status_code}")
else:
    print("‚ö†Ô∏è Please upload an audio file first (in the STT section above)")

## Step 10: Custom Testing

In [None]:
# @title Custom TTS Request
# @markdown Use this cell to test with any text

custom_text = "Enter your custom text here"  # @param {type:"string"}
custom_lang = "en"  # @param {type:"string"}
custom_speed = 1.0  # @param {type:"number"}

print("Sending custom TTS request...")
print(f"  Text: {custom_text}")
print(f"  Language: {custom_lang}")
print(f"  Speed: {custom_speed}")

response = requests.post(
    f"{TTS_URL}/ml/tts/predict",
    json={
        "text": custom_text,
        "language": custom_lang,
        "speed": custom_speed
    },
    headers={"Content-Type": "application/json", **HEADERS},
    timeout=120
)

print(f"\nResponse ({response.status_code}):")
try:
    print(json.dumps(response.json(), indent=2))
except ValueError:
    # The body is not JSON (for example an HTML error page from the tunnel);
    # show it raw instead of crashing the cell.
    print(response.text)

## Step 11: Test Summary

In [None]:
# @title Generate Test Summary

_summary_rule = "=" * 60
print(_summary_rule, "TEST SUMMARY", _summary_rule, sep="\n")

print(f"\nüîó Service URLs:")
print(f"   STT: {STT_URL}", f"   TTS: {TTS_URL}", sep="\n")

print(f"\nüè• Service Health:")
stt_ok = check_stt_health()
tts_ok = check_tts_health()

# Overall verdict keyed by (STT healthy, TTS healthy)
_overall_status = {
    (True, True): "   ‚úÖ All services are operational!",
    (True, False): "   ‚ö†Ô∏è Only STT service is operational",
    (False, True): "   ‚ö†Ô∏è Only TTS service is operational",
    (False, False): "   ‚ùå Both services are down",
}

print(f"\nüìã Overall Status:")
print(_overall_status[(bool(stt_ok), bool(tts_ok))])

print("\n" + _summary_rule)