# VoiceDub - YouTube Dubbing Backend (GPU)

This notebook runs the dubbing backend on Google Colab's free T4 GPU.

**Setup:**
1. Go to **Runtime > Change runtime type > T4 GPU**
2. Run **Cell 1** (install) — this will restart the runtime
3. After restart, **skip Cell 1** and run Cells 2, 3, 4 in order

**Features:** Chatterbox TTS (human-like voice) + Whisper (GPU transcription) + OpenAI/Groq/Gemini translation (falls back to Google Translate when no API key is set)

In [None]:
#@title 1. Install Dependencies (run ONCE, restarts runtime)
#@markdown Run this cell first. It installs everything and restarts the runtime.
#@markdown **After restart, skip this cell and go to Cell 2.**

import os

# Clone or update repo
if os.path.exists('/content/app'):
    !cd /content/app && git fetch origin && git reset --hard origin/master
else:
    !git clone https://github.com/sasmalgiri/youtube-dubbing.git /content/app

# Install build tools
!apt-get -qq install -y libsndfile1 > /dev/null 2>&1

# Fix numpy for chatterbox (must be 1.26.x)
!pip install -q numpy==1.26.4

# Install chatterbox deps manually (pip build fails on Colab)
!pip install -q "librosa>=0.11.0" "s3tokenizer" "torch>=2.6.0" "torchaudio>=2.6.0" \
    "transformers==4.46.3" "diffusers==0.29.0" "resemble-perth>=1.0.1" \
    "conformer>=0.3.2" "safetensors>=0.5.3" "spacy-pkuseg" "pykakasi>=2.3.0" \
    "pyloudnorm" "omegaconf"
!pip install -q chatterbox-tts --no-deps

# Install backend deps + pyngrok
!pip install -q fastapi uvicorn[standard] python-multipart pydantic edge-tts \
    faster-whisper deep-translator google-genai groq openai elevenlabs \
    sse-starlette rich yt-dlp pyngrok

print("\n" + "=" * 60)
print("All installed! Restarting runtime for numpy fix...")
print("After restart, SKIP this cell and run Cells 2, 3, 4.")
print("=" * 60)
os._exit(0)

In [None]:
#@title 2. Set API Keys + Setup
#@markdown Enter your API keys and run this cell.

GROQ_API_KEY = "" #@param {type:"string"}
GEMINI_API_KEY = "" #@param {type:"string"}
ELEVENLABS_API_KEY = "" #@param {type:"string"}
OPENAI_API_KEY = "" #@param {type:"string"}
NGROK_AUTH_TOKEN = "" #@param {type:"string"}
NGROK_DOMAIN = "" #@param {type:"string"}

#@markdown ---
#@markdown **Required:** `NGROK_AUTH_TOKEN` + at least one translation key (`GROQ_API_KEY` recommended)
#@markdown
#@markdown **Translation priority:** OpenAI GPT-4o > Groq Llama 3.3 > Gemini > Google Translate
#@markdown
#@markdown **Optional:** `NGROK_DOMAIN`, `ELEVENLABS_API_KEY`, `OPENAI_API_KEY`

import os

# Set environment variables
if GROQ_API_KEY:
    os.environ['GROQ_API_KEY'] = GROQ_API_KEY
if GEMINI_API_KEY:
    os.environ['GEMINI_API_KEY'] = GEMINI_API_KEY
if ELEVENLABS_API_KEY:
    os.environ['ELEVENLABS_API_KEY'] = ELEVENLABS_API_KEY
if OPENAI_API_KEY:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Ensure repo + backend dir exist
if not os.path.exists('/content/app/backend'):
    !git clone https://github.com/sasmalgiri/youtube-dubbing.git /content/app

# Write .env file
os.makedirs('/content/app/backend', exist_ok=True)
with open('/content/app/backend/.env', 'w') as f:
    if GROQ_API_KEY:
        f.write(f'GROQ_API_KEY={GROQ_API_KEY}\n')
    if GEMINI_API_KEY:
        f.write(f'GEMINI_API_KEY={GEMINI_API_KEY}\n')
    if ELEVENLABS_API_KEY:
        f.write(f'ELEVENLABS_API_KEY={ELEVENLABS_API_KEY}\n')
    if OPENAI_API_KEY:
        f.write(f'OPENAI_API_KEY={OPENAI_API_KEY}\n')

# Install deno for yt-dlp (in case YouTube URL is used directly)
!curl -fsSL https://deno.land/install.sh | sh 2>/dev/null
os.environ['PATH'] = '/root/.deno/bin:' + os.environ.get('PATH', '')

# Check GPU and pre-download Whisper model
import torch
print(f"\nGPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device, compute = 'cuda', 'float16'
else:
    print("WARNING: No GPU detected! Go to Runtime > Change runtime type > T4 GPU")
    print("Continuing with CPU (slower transcription, no Chatterbox)...")
    device, compute = 'cpu', 'int8'

print("\nPre-downloading Whisper model...")
%cd /content/app/backend
from faster_whisper import WhisperModel
model = WhisperModel('small', device=device, compute_type=compute)
del model

print("\n" + "=" * 60)
print("Setup complete!")
print(f"  GPU:        {'YES - ' + torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO (CPU mode)'}")
print(f"  Groq:       {'configured (recommended)' if GROQ_API_KEY else 'not set'}")
print(f"  Gemini:     {'configured' if GEMINI_API_KEY else 'not set'}")
print(f"  OpenAI:     {'configured' if OPENAI_API_KEY else 'not set (optional)'}")
print(f"  ElevenLabs: {'configured' if ELEVENLABS_API_KEY else 'not set (optional)'}")
print(f"  ngrok:      {'configured' if NGROK_AUTH_TOKEN else 'MISSING'}")
print(f"  ngrok domain: {NGROK_DOMAIN or 'not set (will use random)'}")
translation = "OpenAI" if OPENAI_API_KEY else "Groq" if GROQ_API_KEY else "Gemini" if GEMINI_API_KEY else "Google Translate (basic)"
print(f"\n  Translation engine: {translation}")
print("=" * 60)
print("\nNow run Cell 3 to start the server!")

In [None]:
#@title 3. Start Backend Server + ngrok Tunnel
import subprocess
import sys
import time
import os
from pyngrok import ngrok, conf

# Ensure deno is in PATH
os.environ['PATH'] = '/root/.deno/bin:' + os.environ.get('PATH', '')

# Set ngrok auth token. Looked up via globals() so that skipping Cell 2 gives
# the instructive error below instead of a bare NameError.
ngrok_auth_token = globals().get('NGROK_AUTH_TOKEN', '')
ngrok_domain = globals().get('NGROK_DOMAIN', '')
if not ngrok_auth_token:
    raise ValueError('Missing NGROK_AUTH_TOKEN — go back to Cell 2')
conf.get_default().auth_token = ngrok_auth_token

# Clean up after a previous run of this cell: without this, the old server
# keeps port 8000 (so the new one cannot bind) and old tunnels stay open.
previous_proc = globals().get('proc')
if previous_proc is not None and previous_proc.poll() is None:
    previous_proc.terminate()
    try:
        previous_proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        previous_proc.kill()
        previous_proc.wait()
ngrok.kill()  # no-op when no ngrok agent is running

# Start uvicorn in background, using the kernel's own interpreter so the
# server sees the same installed packages as this notebook.
# stdout/stderr are merged into one pipe, which Cell 4 reads from.
proc = subprocess.Popen(
    [sys.executable, '-m', 'uvicorn', 'app:app', '--host', '0.0.0.0', '--port', '8000'],
    cwd='/content/app/backend',
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    env=os.environ.copy(),
)
time.sleep(3)

# Fail fast if the server already died (import error, port in use, ...)
# rather than opening a tunnel that points at nothing.
if proc.poll() is not None:
    startup_output = proc.stdout.read().decode('utf-8', errors='replace')
    raise RuntimeError(
        f'Backend exited during startup (exit code {proc.returncode}):\n{startup_output}'
    )

# Connect ngrok
if ngrok_domain:
    public_url = ngrok.connect(8000, 'http', domain=ngrok_domain)
    url = f'https://{ngrok_domain}'
else:
    public_url = ngrok.connect(8000, 'http')
    # connect() returns an NgrokTunnel object; its str() is a descriptive
    # label, not a URL, so read the public_url attribute explicitly.
    url = public_url.public_url

print('=' * 60)
print('Backend running on GPU!')
print(f'\nPUBLIC URL: {url}')
print(f'\nTest: {url}/api/health')
print('\nSet this in your frontend .env.local:')
print(f'  NEXT_PUBLIC_API_URL={url}')
print('=' * 60)

In [None]:
#@title 4. Monitor Server Logs
#@markdown Keep this cell running to see backend logs in real-time.

# `proc` is the server process created by Cell 3.
if 'proc' not in globals():
    raise RuntimeError('Server has not been started — run Cell 3 first')

print('Monitoring server... (this cell keeps running)')
print('Submit a dubbing job from your frontend to see progress here.')
print('-' * 60)

try:
    # readline() blocks until a full line is available and returns b'' only
    # at end-of-file, so this loop streams output as it arrives and keeps
    # reading until the pipe is closed. Reading to EOF (instead of stopping
    # as soon as the process is seen to have exited) ensures the final lines
    # of output -- e.g. a crash traceback -- are still printed.
    for raw_line in iter(proc.stdout.readline, b''):
        print(raw_line.decode('utf-8', errors='replace').rstrip())
    proc.wait()  # reap the process so returncode is populated
    print('\nServer process exited!')
    print(f'Exit code: {proc.returncode}')
except KeyboardInterrupt:
    # Interrupting only stops this reader loop; the server process is untouched.
    print('\nStopped monitoring (server still running)')