# Train Hindi XTTSv2 (Multi-speaker + Speaker Embeddings)
This notebook prepares metadata, extracts ECAPA embeddings, then fine-tunes XTTSv2. Run cells in order on GPU.

In [1]:
import os, sys
PROJ_DIR = os.path.abspath('..')
DATA_DIR = os.path.join(PROJ_DIR, 'data')
AUDIO_DIR = os.path.join(DATA_DIR, 'audio')
META_CSV = os.path.join(DATA_DIR, 'metadata.csv')
EMB_OUT = os.path.join(DATA_DIR, 'embeddings.npy')
OUT_DIR = os.path.join(PROJ_DIR, 'checkpoints')
os.makedirs(OUT_DIR, exist_ok=True)
print('Project:', PROJ_DIR)
print('Data:', DATA_DIR)


Project: d:\IMPORTANT\hindi_voice_cloning_final
Data: d:\IMPORTANT\hindi_voice_cloning_final\data


In [2]:
# Add project root to Python path
import sys
if PROJ_DIR not in sys.path:
	sys.path.append(PROJ_DIR)

# Step 1: Build metadata.csv from JSON
# 'scripts' may not be a package; import the module by file path to avoid ModuleNotFoundError
import importlib.util, importlib.machinery
pm_path = os.path.join(PROJ_DIR, "scripts", "prepare_metadata.py")
spec = importlib.util.spec_from_file_location("prepare_metadata", pm_path)
pm = importlib.util.module_from_spec(spec)
spec.loader.exec_module(pm)

json_path = os.path.join(DATA_DIR, 'commonvoice_hindi.json')
pm.build(json_path, AUDIO_DIR, META_CSV)
print('Saved metadata to', META_CSV)


Wrote 0 rows to d:\IMPORTANT\hindi_voice_cloning_final\data\metadata.csv
Saved metadata to d:\IMPORTANT\hindi_voice_cloning_final\data\metadata.csv


In [3]:
# Step 2: Extract speaker embeddings (ECAPA-TDNN)
import scripts.extract_embeddings as ex
ex.run(META_CSV, AUDIO_DIR, EMB_OUT)
print('Saved embeddings to', EMB_OUT)


  from .autonotebook import tqdm as notebook_tqdm
  from speechbrain.pretrained import EncoderClassifier


Saved embeddings to d:\IMPORTANT\hindi_voice_cloning_final\data\embeddings.npy
Saved embeddings to d:\IMPORTANT\hindi_voice_cloning_final\data\embeddings.npy


In [15]:
# ==============================================
# ‚úÖ HINDI VOICE CLONING TRAINING SCRIPT (XTTS)
# Compatible with Coqui-TTS v0.22.0 + Transformers ‚â•4.33.0
# Includes full runtime patch for missing imports (jieba + soundfile)
# ==============================================

import os, sys, time, shutil, subprocess, torch, torch.nn as nn

# ======================================================
# Step 0 ‚Äì Ensure Dependencies
# ======================================================
# Ensure a transformers-compatible tokenizers version is installed first,
# then install TTS and transformers. Use --upgrade to replace incompatible installs.
required_packages = [
    "tokenizers==0.23.0",
    "TTS==0.22.0",
    "transformers>=4.33.0",
]
for pkg in required_packages:
    subprocess.call([sys.executable, "-m", "pip", "install", "-q", "--upgrade", pkg])

# ======================================================
# üß© Monkey-patch for missing optional imports
# ======================================================
from importlib import import_module

try:
    imp_utils = import_module("transformers.utils.import_utils")

    # Patch missing helper functions dynamically
    if not hasattr(imp_utils, "is_jieba_available"):
        imp_utils.is_jieba_available = lambda: False
        print("‚úÖ Patched: added dummy is_jieba_available()")

    # Patch both correct and typo versions for soundfile
    if not hasattr(imp_utils, "is_soundfile_available"):
        imp_utils.is_soundfile_available = lambda: False
        print("‚úÖ Patched: added dummy is_soundfile_available()")
    if not hasattr(imp_utils, "is_soundfile_availble"):
        imp_utils.is_soundfile_availble = lambda: False
        print("‚úÖ Patched: added dummy is_soundfile_availble() (typo)")
    
    # Patch missing TPU function
    if not hasattr(imp_utils, "is_torch_tpu_available"):
        imp_utils.is_torch_tpu_available = lambda: False
        print("‚úÖ Patched: added dummy is_torch_tpu_available()")
        
    # Patch missing tf_required function
    if not hasattr(imp_utils, "tf_required"):
        def tf_required(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper
        imp_utils.tf_required = tf_required
        print("‚úÖ Patched: added dummy tf_required decorator")

    # Patch missing torch_required function
    if not hasattr(imp_utils, "torch_required"):
        def torch_required(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper
        imp_utils.torch_required = torch_required
        print("‚úÖ Patched: added dummy torch_required decorator")

except Exception as e:
    print(f"‚ö†Ô∏è Transformer patch failed: {e}")

# ======================================================
# Step 1 ‚Äì Imports after patch
# ======================================================
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.models.xtts import Xtts

# ======================================================
# Step 2 ‚Äì Paths
# ======================================================
DATA_DIR = r"D:\IMPORTANT\hindi_voice_cloning_final\dataset"
META_CSV = os.path.join(DATA_DIR, "metadata.csv")
OUT_DIR = r"D:\IMPORTANT\hindi_voice_cloning_final\output"
os.makedirs(OUT_DIR, exist_ok=True)

# Clean stale .lock or log files
for root, _, files in os.walk(OUT_DIR):
    for f in files:
        if "trainer_0_log" in f.lower() or ".lock" in f.lower():
            try:
                os.remove(os.path.join(root, f))
            except Exception:
                pass

# Unique folder
timestamp = time.strftime("%b-%d-%Y_%I-%M-%S%p")
RUN_DIR = os.path.join(OUT_DIR, f"hindi_xtts_final-{timestamp}")
os.makedirs(RUN_DIR, exist_ok=True)
print(f"üßæ Logs and checkpoints will be saved in: {RUN_DIR}")

# ======================================================
# Step 3 ‚Äì Dataset Configuration
# ======================================================
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train=META_CSV,
    path=DATA_DIR,
)

# ======================================================
# Step 4 ‚Äì XTTS Model Configuration
# ======================================================
config = XttsConfig()
config.audio.resample = 16000
config.languages = ["hi"]
config.enable_eos_bos_chars = True
config.text_cleaner = "multilingual_cleaners"
config.model_args.num_chars = 256
config.dataset_config = dataset_config

config.run_name = "hindi_xtts_final"
config.output_path = RUN_DIR
config.batch_size = 8
config.epochs = 8
config.test_delay_epochs = 1
config.num_loader_workers = 4
config.eval_split_max_size = 100
config.save_step = 500
config.print_step = 25
config.save_checkpoints = True
config.use_phonemes = False
config.mixed_precision = True
config.grad_clip = 1.0

# ======================================================
# Step 5 ‚Äì Initialize Model
# ======================================================
print("üîÑ Initializing XTTS model for Hindi training...")
model = Xtts.init_from_config(config)

# Fallback for missing criterion (Tacotron2Loss)
class Tacotron2Loss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
    def forward(self, pred, target):
        return self.mse(pred, target)

if not hasattr(model, "get_criterion"):
    model.get_criterion = lambda: Tacotron2Loss()

# ======================================================
# Step 6 ‚Äì Trainer Setup
# ======================================================
try:
    args = TrainerArgs()
except Exception:
    class DummyArgs:
        def parse_args(self, *a, **kw): return self
    args = DummyArgs()

trainer_log = os.path.join(RUN_DIR, "trainer_0_log.txt")
if os.path.exists(trainer_log):
    try:
        os.remove(trainer_log)
    except PermissionError:
        trainer_log = trainer_log.replace(".txt", "_alt.txt")
        print(f"‚ö†Ô∏è Log file locked, switching to {trainer_log}")

trainer = Trainer(
    args=args,
    config=config,
    output_path=RUN_DIR,
    model=model,
)

# Refresh log file safely
if hasattr(trainer, "log_file") and trainer.log_file:
    try:
        trainer.log_file.close()
    except Exception:
        pass
    trainer.log_file = open(trainer_log, "a", encoding="utf-8")

# ======================================================
# Step 7 ‚Äì Start Training
# ======================================================
print("üöÄ Starting XTTS training for Hindi voice cloning...")

try:
    trainer.fit()
except PermissionError as e:
    print(f"‚ö†Ô∏è PermissionError: {e}")
    print("‚è≥ Retrying after releasing handles...")
    time.sleep(5)
    trainer.log_file = open(trainer_log.replace(".txt", "_retry.txt"), "a", encoding="utf-8")
    trainer.fit()

print(f"‚úÖ Training complete! Checkpoints saved in: {RUN_DIR}")


ImportError: cannot import name 'is_g2p_en_available' from 'transformers.utils' (d:\IMPORTANT\hindi_voice_cloning_final\venv\lib\site-packages\transformers\utils\__init__.py)

In [None]:
# Step 4: Quick inference example (after training)
from TTS.api import TTS
model_file = os.path.join(OUT_DIR, 'best_model.pth')
if os.path.exists(model_file):
    tts = TTS(model_path=model_file, gpu=True)
    # load first embedding and synthesize using it
    import numpy as np
    embobj = np.load(EMB_OUT, allow_pickle=True).item()
    emb = embobj['embeddings'][0]
    tts.tts_to_file(text='‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Ø‡§π ‡§™‡•ç‡§∞‡§∂‡§ø‡§ï‡•ç‡§∑‡§£ ‡§ï‡•á ‡§¨‡§æ‡§¶ ‡§ï‡§æ ‡§ë‡§°‡§ø‡§Ø‡•ã ‡§π‡•à‡•§', speaker_embeddings=emb, file_path=os.path.join(PROJ_DIR, 'test_output.wav'))
    print('Saved test_output.wav')
else:
    print('No model found yet. Train first or point to a model path.')
