# Quick Bengali T3+S3 joint training on 10 samples
This notebook runs a minimal training loop over 10 samples from eucalyptus/shrutilipi_bengali, computing both T3 (text→speech tokens) and S3 (tokens→mel) losses and plotting them.


In [None]:
# Setup Environment and GPU
import sys, subprocess, importlib, os

# pkgs = [
#     ("datasets[audio]", "datasets"),
#     ("librosa", "librosa"),
#     ("soundfile", "soundfile"),
#     ("matplotlib", "matplotlib"),
# ]
# for spec, mod in pkgs:
#     try:
#         importlib.import_module(mod)
#     except ImportError:
#         print(f"Installing {spec}...")
#         subprocess.check_call([sys.executable, "-m", "pip", "install", spec])

import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
    torch.set_float32_matmul_precision("high")
    torch.backends.cudnn.benchmark = True


CUDA available: False


In [4]:
# Configure Run (paths, hyperparameters, N_SAMPLES=10)
from pathlib import Path

ROOT = Path(r"d:\Code\voice_clone\local_test").resolve()
CHECKPOINT_DIR = ROOT / "custom_train" / "checkpoints_bengali_notebook"
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

TOKENIZER_PATH = str(ROOT / "tokenizer_bn_tts.json")
BATCH_SIZE = 2
EPOCHS = 1
ACCUM_STEPS = 1
LEARNING_RATE = 3e-4
N_SAMPLES = 10
S3GEN_SR = 24000
S3_SR = 16000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cpu


In [3]:
# Add Project To Python Path and Imports
import sys, os
sys.path.append(str(ROOT))

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from custom_train.custom_llama_configs import T3Config
from src.chatterbox.models.t3.t3 import T3
from custom_train.custom_modules.custom_cond_enc import T3Cond
from bengalitokenization import BengaliTokenizer
from src.chatterbox.models.s3tokenizer.s3tokenizer import S3Tokenizer, S3_SR as S3_SR_CONST
from src.chatterbox.models.s3gen.s3gen import S3Token2Mel
from src.chatterbox.models.s3gen.utils.mel import mel_spectrogram
from src.chatterbox.models.s3gen.utils.mask import make_pad_mask
from src.chatterbox.models.voice_encoder.voice_encoder import VoiceEncoder
import torch.nn.functional as F

# keep constants consistent
S3_SR = S3_SR_CONST
print("S3_SR:", S3_SR, "S3GEN_SR:", S3GEN_SR)


  from .autonotebook import tqdm as notebook_tqdm


S3_SR: 16000 S3GEN_SR: 24000


In [None]:
# !pip install torchcodec==0.2

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement torchcodec==0.2 (from versions: 0.0.0.dev0, 0.7.0)
ERROR: No matching distribution found for torchcodec==0.2


In [8]:
audio_path = r"D:\Code\voice_clone\local_test\dataset\bengali_audio_files\audio_00002.wav"
print(os.path.exists(audio_path))

True


In [5]:
import soundfile as sf

audio_path = ROOT / "dataset" / "bengali_audio_files" / "audio_00002.wav"
data, samplerate = sf.read(audio_path)

print("Data shape:", data.shape)
print("Sample rate:", samplerate)

Data shape: (191360,)
Sample rate: 16000


In [6]:
# Load local CSV manifest and decode audio manually with soundfile
import pandas as pd
import soundfile as sf
import librosa
import numpy as np
from pathlib import Path

manifest_path = ROOT / "dataset" / "bengali_audio_manifest.csv"
df = pd.read_csv(manifest_path)

# Align column names with our Dataset impl: expects 'audio' and 'transcriptions'
df = df.rename(columns={"audio_path": "audio", "text": "transcriptions"})

# Keep it small for quick run
df = df.iloc[:min(N_SAMPLES, len(df))].copy()

AUDIO_DIR = ROOT / "dataset" / "bengali_audio_files"

# Decode audio files with soundfile and store as numpy arrays
def load_audio(path):
    resolved = ROOT / "dataset" / Path(path)
    try:
        wav, sr = sf.read(resolved)
        if wav.ndim > 1:
            wav = np.mean(wav, axis=1)
        if sr != S3GEN_SR:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=S3GEN_SR)
            sr = S3GEN_SR
        return {"array": wav, "sampling_rate": sr}
    except Exception as e:
        print(f"Error loading {resolved}: {e}")
        return None

# Only load audio if not already present
if "audio" in df.columns:
    df["audio"] = df["audio"].apply(load_audio)

# Optional: add a default accent label
df["accent"] = "standard"

print("Columns:", df.columns.tolist(), "len:", len(df))
print(df.iloc[0])


Columns: ['audio', 'transcriptions', 'accent'] len: 10
audio             {'array': [0.0014218403828098098, 0.0077988284...
transcriptions    আসামে জাতীয় নাগরিক পঞ্জীর চূড়ান্ত খসড়া আগাম...
accent                                                     standard
Name: 0, dtype: object


In [None]:
# # Optional: Patch/Validate Fields (language_id, accent_id)
# # This dataset has only audio and transcriptions. We can add a default accent.

# def add_default_accent(batch):
#     batch['accent'] = 'standard'
#     return batch

# ds = ds.map(add_default_accent)
# print("Columns:", ds.column_names)


In [12]:
# Initialize Tokenizers and Voice Encoder
hp = T3Config.bengali_accent(TOKENIZER_PATH)
hp.emotion_adv = False
text_tok = BengaliTokenizer(TOKENIZER_PATH)
s3_tok = S3Tokenizer().to(device)
ve = VoiceEncoder().to(device)
ve.eval()
for p in ve.parameters():
    p.requires_grad_(False)
print("Config hidden size:", hp.n_channels)


Config hidden size: 1024


In [13]:
# Build PyTorch Dataset and DataLoader (reuse your code)
import torch
from torch.utils.data import DataLoader, Dataset
import librosa

# Reuse helpers from the training script
from custom_train.train_bengali_t3 import audio_to_tensor_pair, BengaliTTSDataset, collate_fn

# Ensure df does not contain rows with failed audio loading
if "audio_array" in df.columns:
    df = df[df["audio_array"].apply(lambda x: x is not None)]
    df = df.reset_index(drop=True)

train_ds = BengaliTTSDataset(df, hp=hp, text_tok=text_tok, s3_tok=s3_tok, ve=ve)
loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)

print("Batch keys:", list(train_ds[0].keys()))
print("text_tokens:", train_ds[0]['text_tokens'].shape, "mel24:", train_ds[0]['mel24'].shape)

batch = next(iter(loader))
print("Batch keys:", list(batch.keys()))
print("text_tokens:", batch['text_tokens'].shape, "mel24:", batch['mel24'].shape)


min value is  tensor(-1.0437)
max value is  tensor(1.0356)
Batch keys: ['text_tokens', 'text_token_lens', 'speech_tokens_t3', 'speech_token_lens_t3', 'speech_tokens_raw', 'speech_token_lens_raw', 'mel24', 'mel_len24', 'speaker_emb', 'language_id', 'accent_id']
min value is  tensor(-1.0437)
max value is  tensor(1.0356)
min value is  tensor(-1.0437)
max value is  tensor(1.0356)
text_tokens: torch.Size([114]) mel24: torch.Size([309, 80])
min value is  tensor(-1.0289)
max value is  tensor(1.0623)
min value is  tensor(-1.0868)
max value is  tensor(1.0811)
Batch keys: ['text_tokens', 'text_token_lens', 'speech_tokens_t3', 'speech_token_lens_t3', 'speech_tokens_raw', 'speech_token_lens_raw', 'mel24', 'mel_len24', 'speaker_emb', 'language_id', 'accent_id']
text_tokens: torch.Size([2, 355]) mel24: torch.Size([2, 1097, 80])


In [None]:
# Initialize Models, Optimizer, and AMP Scaler
t3_model = T3(hp=hp).to(device)
s3_model = S3Token2Mel().to(device)
optimizer = torch.optim.AdamW(list(t3_model.parameters()) + list(s3_model.parameters()), lr=LEARNING_RATE)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

t3_model.train(); s3_model.train()


In [15]:
# Short Training Loop (collect losses)
from tqdm import tqdm

L_txt, L_sp, L_s3, L_total = [], [], [], []

for epoch in range(EPOCHS):
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    step = 0
    optimizer.zero_grad()
    for batch in pbar:
        text_tokens = batch["text_tokens"].to(device)
        text_token_lens = batch["text_token_lens"].to(device)
        speech_tokens_t3 = batch["speech_tokens_t3"].to(device)
        speech_token_lens_t3 = batch["speech_token_lens_t3"].to(device)
        speech_tokens_raw = batch["speech_tokens_raw"].to(device)
        speech_token_lens_raw = batch["speech_token_lens_raw"].to(device)
        mel24 = batch["mel24"].to(device)
        mel_len24 = batch["mel_len24"].to(device)
        spk = batch["speaker_emb"].to(device)

        cond = T3Cond(
            speaker_emb=spk,
            language_id=torch.full((text_tokens.size(0),), 1, dtype=torch.long, device=device),
            accent_id=torch.zeros((text_tokens.size(0),), dtype=torch.long, device=device),
            emotion_adv=None,
        )

        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            # T3 losses
            loss_text, loss_speech = t3_model.loss(
                t3_cond=cond,
                text_tokens=text_tokens,
                text_token_lens=text_token_lens,
                speech_tokens=speech_tokens_t3,
                speech_token_lens=speech_token_lens_t3,
            )

            # S3 losses
            spk_norm = F.normalize(spk, dim=1)
            spk_proj = s3_model.flow.spk_embed_affine_layer(spk_norm)
            mask_tok = (~make_pad_mask(speech_token_lens_raw)).float().unsqueeze(-1).to(device)
            tok_emb = s3_model.flow.input_embedding(torch.clamp(speech_tokens_raw, min=0)) * mask_tok
            h, _ = s3_model.flow.encoder(tok_emb, speech_token_lens_raw)
            h = s3_model.flow.encoder_proj(h)             # (B, Tm, 80)

            Tm = int(min(h.shape[1], int(mel_len24.min().item())))
            h = h[:, :Tm, :].transpose(1, 2).contiguous()
            feat = mel24[:, :Tm, :].transpose(1, 2).contiguous()
            mask_mel = (~make_pad_mask(torch.full_like(mel_len24, Tm))).unsqueeze(1).float().to(device)
            cond_mel = torch.zeros_like(feat)
            loss_s3, _ = s3_model.flow.decoder.compute_loss(x1=feat, mask=mask_mel, mu=h, spks=spk_proj, cond=cond_mel)

            total_loss = loss_text + loss_speech + loss_s3

        scaler.scale(total_loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        L_txt.append(loss_text.item()); L_sp.append(loss_speech.item()); L_s3.append(loss_s3.item()); L_total.append(total_loss.item())
        pbar.set_postfix({"L_txt": f"{L_txt[-1]:.3f}", "L_sp": f"{L_sp[-1]:.3f}", "L_s3": f"{L_s3[-1]:.3f}", "L": f"{L_total[-1]:.3f}"})
        step += 1


Epoch 1/1:   0%|          | 0/5 [00:00<?, ?it/s]

min value is  tensor(-1.0348)
max value is  tensor(1.0371)
min value is  tensor(-1.0868)
max value is  tensor(1.0811)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 1/1:   0%|          | 0/5 [00:08<?, ?it/s]


RuntimeError: Expected target size [2, 16000], got [2, 355]

In [None]:
# Plot Training Loss Curves
plt.figure(figsize=(8,5))
plt.plot(L_txt, label='L_t3_txt')
plt.plot(L_sp, label='L_t3_sp')
plt.plot(L_s3, label='L_s3')
plt.plot(L_total, label='L_total', linewidth=2)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training losses (10 samples)')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Save Checkpoint and List Files
ckpt_path = CHECKPOINT_DIR / "t3_s3_bn_quickrun.pt"
state = {
    "t3_model": t3_model.state_dict(),
    "s3_model": s3_model.state_dict(),
    "hp": hp.__dict__,
}
import torch
torch.save(state, str(ckpt_path))
print("Saved:", ckpt_path)
print("Dir:", list(CHECKPOINT_DIR.iterdir()))
