## Drive Mount

In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

## Setup


In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Selects which work-split CSV is read and which per-user output folder the
# synthesized audio is written under.
userName="Shivam"

In [None]:
# Read this user's work-split CSV and put it in a deterministic order
# (by language, then text) with a fresh 0..n-1 index.
work_split_csv = f'/content/drive/MyDrive/S2S2/WorkSplit/{userName}.csv'
df = (
    pd.read_csv(work_split_csv)
    .sort_values(by=['lang', 'text'])
    .reset_index(drop=True)
)

total_rows = len(df)
total_rows

In [None]:
# Preview the first rows of the sorted frame.
df.head()

## Stratified Chunks

In [None]:
# Fraction of rows belonging to each language (normalize=True yields shares
# rather than raw counts). These shares drive the per-chunk quotas below.
lang_proportions = df['lang'].value_counts(normalize=True)
display(lang_proportions)

In [None]:
# Decide how many rows go in a chunk and how many of them each language gets.
num_chunks = 100
total_rows = len(df)
rows_per_chunk = total_rows // num_chunks

# A language's quota is its overall share of the data scaled to one chunk,
# rounded to a whole number of rows.
rows_per_language_per_chunk = {}
for lang_name, share in lang_proportions.items():
    rows_per_language_per_chunk[lang_name] = round(share * rows_per_chunk)

print(f"Total rows: {total_rows}")
print(f"Rows per chunk (approx): {rows_per_chunk}")
print("Rows per language per chunk:")
for lang_name, quota in rows_per_language_per_chunk.items():
    print(f" - {lang_name}: {quota}")

In [None]:
# Split `df` into chunks whose language mix mirrors the full dataset by drawing
# each language's per-chunk quota without replacement from the rows not yet
# assigned to a chunk.
chunked_dfs = []
remaining_df = df.copy()

RANDOM_SEED = 42

for chunk_id in range(num_chunks):
    current_chunk_rows = []

    for lang, count in rows_per_language_per_chunk.items():
        lang_df = remaining_df[remaining_df['lang'] == lang]
        if len(lang_df) >= count:
            # Seeding per chunk keeps the split identical from one session to
            # the next, which is what makes resuming by chunk index meaningful.
            sampled_rows = lang_df.sample(n=count, replace=False, random_state=RANDOM_SEED + chunk_id)
        else:
            # Fewer rows left than the quota: take whatever remains.
            sampled_rows = lang_df
        current_chunk_rows.append(sampled_rows)
        remaining_df = remaining_df.drop(sampled_rows.index)

    if current_chunk_rows:
        chunked_df = pd.concat(current_chunk_rows)
        chunked_dfs.append(chunked_df.reset_index(drop=True))

# Floor division and quota rounding (including quotas that round to 0 for rare
# languages) can leave rows unassigned after the loop. Append them as extra
# trailing chunks so they are not silently dropped; the first `num_chunks`
# chunks are left exactly as before, so existing chunk indices stay valid.
leftover_step = max(rows_per_chunk, 1)
for leftover_start in range(0, len(remaining_df), leftover_step):
    leftover_rows = remaining_df.iloc[leftover_start:leftover_start + leftover_step]
    chunked_dfs.append(leftover_rows.reset_index(drop=True))

assert sum(len(chunk) for chunk in chunked_dfs) == len(df), \
    "chunking lost or duplicated rows"


In [None]:
#Sanity Check
import random
# Show the language mix of a few randomly chosen chunks; each should resemble
# `lang_proportions`. The sample size is capped at the number of chunks so the
# check does not raise ValueError when fewer than 5 chunks exist.
for chunk in random.sample(chunked_dfs, min(5, len(chunked_dfs))):
    display(chunk['lang'].value_counts(normalize=True))
    print("-" * 30)

## TTS

In [None]:
# Install the coqui-tts package into the runtime (-q keeps pip's output quiet).
!pip install coqui-tts -q

In [None]:
# Install the espeak-ng system package (-y answers prompts automatically).
# NOTE(review): presumably required as the phonemizer backend for the TTS model
# loaded below — confirm.
!apt-get install -y espeak-ng
# !pip install TTS==0.21.3

In [None]:
import os
import asyncio
import pandas as pd

In [None]:
import os

# Root directory on Drive for this user's synthesized audio; each row's
# relative audio path is joined onto it when the file is written.
base_output = f"/content/drive/MyDrive/S2S2/TTS_New/{userName}"
# Distinct values of the 'lang' column.
# NOTE(review): not referenced by any later cell visible here — confirm whether it is still needed.
langs = df["lang"].unique()


In [None]:
# Imported here as well so this cell does not depend on the optional
# sanity-check cell having been run first.
import random

from TTS.api import TTS

# Load the English VCTK VITS model (it exposes multiple speakers) on the GPU.
coqui_tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=True)

speakers = coqui_tts.speakers
print(f"Loaded {len(speakers)} voices. Sample: {speakers[:10]}")

# Draw the voice pool from a dedicated seeded generator so that a restarted
# session (e.g. when resuming from a later chunk) uses the same set of voices.
SPEAKER_SEED = 42
selected_speakers = random.Random(SPEAKER_SEED).sample(speakers, min(10, len(speakers)))  # use 10 random voices

In [None]:
# Zero-based index of the first chunk to synthesize; chunks before it are
# skipped by the processing loop (e.g. when resuming an interrupted run).
start_chunk=74

In [None]:
import os
import time
import asyncio
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Number of worker threads synthesizing rows of one chunk concurrently.
MAX_WORKERS = 16
# Maximum synthesis attempts per row before it is reported as failed.
MAX_RETRIES = 3
# Seconds to wait between failed attempts on the same row.
RETRY_DELAY = 2
# Seconds to pause after finishing each chunk.
CHUNK_PAUSE = 2
# Output files of this many bytes or fewer are treated as invalid audio.
MIN_FILE_SIZE = 2000

def ensure_valid_audio(path):
    """Return True when `path` exists and is larger than MIN_FILE_SIZE bytes."""
    if not os.path.exists(path):
        return False
    size_in_bytes = os.path.getsize(path)
    return size_in_bytes > MIN_FILE_SIZE


def process_row(idx, row):
    """Synthesize one row's English text into its target audio file.

    Args:
        idx: Row label; used only to identify the row in error messages.
        row: Mapping-like row (for example a pandas Series) providing
            ``en_text`` (the text to speak) and ``english_audio_filepath``
            (output path, joined onto ``base_output``).

    Returns:
        None on success, otherwise a string describing why the row failed.
        Synthesis errors are retried up to MAX_RETRIES times and are reported
        through the return value rather than raised.
    """
    raw_text = row.get("en_text", "")
    raw_path = row.get("english_audio_filepath")

    # Missing cells arrive as NaN/None: str(NaN) would be spoken as "nan" and a
    # NaN path has no .lstrip(), so reject both before touching the disk.
    if not isinstance(raw_path, str) or pd.isna(raw_text):
        return f"Invalid input on row {idx}"

    text = str(raw_text).strip()
    # Drop leading slashes so os.path.join keeps the path under base_output.
    rel_path = raw_path.lstrip("/")
    if not text or not rel_path:
        return f"Invalid input on row {idx}"

    out_path = os.path.join(base_output, rel_path)

    last_error = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

            # A voice is picked per attempt from the preselected pool.
            speaker = random.choice(selected_speakers)
            coqui_tts.tts_to_file(
                text=text,
                speaker=speaker,
                file_path=out_path
            )

            if ensure_valid_audio(out_path):
                return None
            raise ValueError("Empty or invalid audio file")

        except Exception as exc:
            # Best-effort by design: remember the cause and retry instead of
            # aborting the whole chunk on one bad row.
            last_error = exc
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)

    return f"Failed on row {idx} after {MAX_RETRIES} attempts: {last_error!r}"

# Synthesize every chunk from `start_chunk` onward, fanning the rows of each
# chunk out to a thread pool and collecting per-row error messages.
# NOTE(review): all workers share the single `coqui_tts` model instance —
# confirm it is safe to call concurrently.
for chunk_idx, chunk in enumerate(chunked_dfs[start_chunk:], start=start_chunk):
    print(f"Processing chunk {chunk_idx+1}/{len(chunked_dfs)}")

    errors = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its row label so failures can be attributed.
        futures = {executor.submit(process_row, idx, row): idx for idx, row in chunk.iterrows()}

        for f in tqdm(as_completed(futures), total=len(futures), desc="Synthesizing"):
            try:
                err = f.result()
            except Exception as exc:
                # result() re-raises anything the worker raised; record it so
                # one bad row cannot abort the remaining chunks.
                err = f"Unhandled error on row {futures[f]}: {exc!r}"
            if err:
                errors.append(err)

    if errors:
        print(f"{len(errors)} errors in chunk {chunk_idx+1}:")
        # Only the first few messages are shown to keep the output readable.
        for e in errors[:5]:
            print("   ", e)

    print(f"Finished chunk {chunk_idx+1}\n")
    time.sleep(CHUNK_PAUSE)