# **Section 1: Environment Setup & Dependencies**
In this section, we configure the runtime environment, mount persistent storage (Google Drive), and authenticate with Hugging Face to access the required datasets.

### **1. Project Setup & Directory Initialization**
We initialize the project structure by creating the necessary subdirectories for raw data, processed features, metadata, and model checkpoints.

In [None]:
# Create the project folder tree (raw, processed, metadata, scripts, checkpoints) under MyDrive.
# NOTE(review): this cell appears before the Drive mount cell; presumably Drive is already
# mounted when it runs, otherwise the folders land on the local VM disk - confirm cell order.
!mkdir -p /content/drive/MyDrive/hindi_dfake/{raw,processed,metadata,scripts,checkpoints}

### **2. Environment Configuration**
We mount Google Drive to ensure persistent storage and dynamically define the root paths (`ROOT_DIR`, `OUT_DIR`, etc.) to avoid hardcoding errors across different sessions.

In [None]:
from google.colab import drive

# Mount Google Drive; force_remount=True is passed so the mount is redone even if one already exists.
drive.mount('/content/drive', force_remount=True)

import os, glob
# Use /content/drive when it exposes MyDrive; otherwise fall back to /content/gdrive.
MOUNT_PATH = "/content/drive" if os.path.isdir("/content/drive/MyDrive") else "/content/gdrive"
print("Drive is already mounted here ‚Üí", MOUNT_PATH)
# Project paths derived from the detected mount point.
ROOT_DIR = f"{MOUNT_PATH}/MyDrive/hindi_dfake"   # project root on Drive
OUT_DIR  = f"{ROOT_DIR}/raw/real_clean/ivr"      # output folder under raw/real_clean
CSV_PATH = f"{ROOT_DIR}/metadata/master_real.csv"  # metadata CSV for the real clips

Mounted at /content/drive
Drive is already mounted here ‚Üí /content/drive


### **3. Hugging Face Authentication**
Authentication is required to download the **IndicVoices-R** and **Common Voice** datasets directly from the Hugging Face Hub.

In [None]:
# Open the interactive Hugging Face login widget in the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

# **Section 2: Synthetic Dataset Construction**
In this section, we generate the "fake" counterpart of our dataset. We employ state-of-the-art Text-to-Speech (TTS) models to synthesize audio that mirrors the content of our real dataset, ensuring a balanced dataset for training.

### **1. Environment Setup & Data Retrieval**
We install the Kaggle CLI and the FFmpeg system package, then upload a `kaggle.json` API token so that raw datasets can be downloaded via the Kaggle API.

In [None]:
# Install the Kaggle command-line client.
!pip install -q kaggle

# Step 1: import the Colab file-upload helper.
from google.colab import files

# Step 2: upload kaggle.json through the browser file picker.
files.upload()          # choose the kaggle.json file from your local machine

# Step 3: move the token into ~/.kaggle and make it readable by the owner only.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [None]:
# Install the ffmpeg system package (quiet mode); later cells call ffmpeg to convert MP3 to WAV.
!apt-get -qq install ffmpeg

In [None]:
# Install the kagglehub Python package (quiet mode).
!pip install -q kagglehub

In [None]:
# Print the active Kaggle CLI configuration to confirm the uploaded token was picked up.
!kaggle config view

Configuration values from /root/.kaggle
- username: anishdb
- path: None
- proxy: None
- competition: None


### **2. Edge-TTS**
We generate 55 hours of synthetic Hindi speech using **Microsoft Edge-TTS**, employing optimized concurrency to efficiently synthesize high-quality audio while ensuring the process can automatically resume from interruptions.

In [None]:
# =================== INSTANT-START, RESUME-SAFE EDGE-TTS ===================
# Safe to run after stopping the previous cell. Won't touch existing WAVs.

# --- CONFIG (tune these) ---
# NOTE: ROOT_DIR and OUT_DIR are rebound here, replacing the values set in the environment cell.
TARGET_HOURS       = 55                                       # total hours of audio wanted in OUT_DIR
ROOT_DIR           = "/content/drive/MyDrive/hindi_dfake"
REAL_CSV           = f"{ROOT_DIR}/metadata/master_real.csv"  # source of the sentences to synthesize
OUT_DIR            = f"{ROOT_DIR}/raw/fake_tts"              # destination folder for generated WAVs
FAKE_CSV           = f"{ROOT_DIR}/metadata/master_fake.csv"  # metadata CSV, opened in append mode

SEED               = 2025                                     # seeds the RNGs and is part of every clip id
MIN_CHARS, MAX_CHARS = 6, 220  # allow slightly longer texts to pack better

# Throughput: start with these; adjust if throttled
MAX_CONCURRENT     = 6                    # size of the semaphore guarding concurrent requests
BATCH_SIZE         = MAX_CONCURRENT * 2   # number of tasks gathered per batch
VOICE_RATE         = "+0%"                # rate value passed to edge_tts.Communicate

# Speed toggles
USE_DIRECT_PCM     = True   # ask for WAV directly
PACK_N             = 3      # pack 3 sentences per request

# Timeouts (seconds)
TTS_TIMEOUT_SEC    = 40     # per request
FFMPEG_TIMEOUT_SEC = 30     # per ffmpeg conversion

# --- SETUP ---
# Install Python dependencies and ffmpeg; installer output is discarded.
!pip -q install edge-tts tqdm soundfile pandas numpy nest_asyncio > /dev/null
!apt-get -qq install ffmpeg > /dev/null

import os, csv, hashlib, random, tempfile, subprocess, shlex, time
from pathlib import Path
import pandas as pd, numpy as np, soundfile as sf
from tqdm.auto import tqdm
import asyncio, nest_asyncio, edge_tts
# Patch asyncio so run_until_complete can be called from inside the notebook.
nest_asyncio.apply()
loop = asyncio.get_event_loop()
# Seed Python's and NumPy's RNGs so sentence shuffling and voice choice repeat across runs.
random.seed(SEED); np.random.seed(SEED)
# Make sure the output folder and the metadata folder exist.
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(Path(FAKE_CSV).parent, exist_ok=True)

# --- Voices: hard-coded to avoid any network lookup delays ---
hi_voices = ["hi-IN-MadhurNeural", "hi-IN-SwaraNeural"]
print("Hindi voices:", hi_voices)

# --- Load texts ---
def load_texts(csv_path):
    """Load, clean, de-duplicate and shuffle the sentences stored in a metadata CSV.

    The first column found among ``text``, ``transcript`` and ``sentence`` is used.
    Missing cells are dropped *before* the string conversion so they never turn
    into the literal text "nan". Newlines are replaced by spaces, surrounding
    whitespace is stripped, and only sentences whose length lies between
    MIN_CHARS and MAX_CHARS (inclusive) are kept.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A list of unique sentences, shuffled in place with the ``random`` module.

    Raises:
        ValueError: if none of the expected text columns is present.
    """
    df = pd.read_csv(csv_path)
    col = next((c for c in ["text", "transcript", "sentence"] if c in df.columns), None)
    if col is None:
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        raise ValueError("master_real.csv must have 'text' (or 'transcript'/'sentence') column.")
    texts = (
        df[col]
        .dropna()                                   # must run before astype(str)
        .astype(str)
        .str.replace("\n", " ", regex=False)        # literal replace, no regex semantics
        .str.strip()
        .loc[lambda s: s.str.len().between(MIN_CHARS, MAX_CHARS)]
        .drop_duplicates()
        .tolist()
    )
    random.shuffle(texts)
    return texts

# Read the sentence pool once; every synthesis request draws from this list.
texts = load_texts(REAL_CSV)
print(f"Loaded {len(texts)} unique sentences.")

# --- Light, on-the-fly packer (no big pre-build) ---
# Separator placed between sentences when several are packed into one request.
JOINER = " ‡•§ "
def text_stream(items, n=PACK_N, max_len=300):
    """Yield an endless stream of texts drawn cyclically from ``items``.

    Iteration starts at a random index and wraps around. Sentences shorter than
    MIN_CHARS (after stripping) are skipped. With ``n <= 1`` each usable
    sentence is yielded on its own; otherwise up to ``n`` consecutive sentences
    are joined with JOINER, stopping early when the running length would exceed
    ``max_len``.

    Args:
        items: list of candidate sentences.
        n: maximum number of sentences per yielded text.
        max_len: soft cap on the packed length. Each join is counted as one
            character, exactly as before, so the generated texts (and the clip
            ids hashed from them) stay unchanged.

    Raises:
        ValueError: on the first ``next()`` if ``items`` holds no sentence of at
            least MIN_CHARS characters. Without this check an empty list fails
            with an opaque error and an unusable list spins forever.
    """
    if not any(len(t.strip()) >= MIN_CHARS for t in items):
        raise ValueError("text_stream needs at least one sentence with MIN_CHARS or more characters.")

    i = random.randrange(len(items))
    while True:
        if n <= 1:
            t = items[i % len(items)].strip()
            i += 1
            if len(t) >= MIN_CHARS:
                yield t
            continue
        # Build one bundle of up to n sentences.
        bundle, cur = [], 0
        while len(bundle) < n:
            t = items[i % len(items)].strip()
            i += 1
            if len(t) < MIN_CHARS:
                continue
            # Only a non-empty bundle can overflow; the first sentence is always accepted.
            if cur and cur + 1 + len(t) > max_len:
                break
            bundle.append(t)
            cur += len(t) if not cur else 1 + len(t)
        yield JOINER.join(bundle)

# --- Truth accounting from disk ---
def folder_seconds(wav_dir: str) -> float:
    """Return the total duration, in seconds, of the ``.wav`` files directly inside ``wav_dir``.

    Only the file headers are inspected (``sf.info``), so the audio samples are
    never decoded; the duration is frames / samplerate, the same quantity the
    previous full read produced. Files that cannot be parsed are skipped.

    Args:
        wav_dir: folder to scan (not recursive).

    Returns:
        Sum of the durations of all readable WAV files, 0.0 if there are none.
    """
    total = 0.0
    for fn in os.listdir(wav_dir):
        if not fn.endswith(".wav"):
            continue
        try:
            meta = sf.info(os.path.join(wav_dir, fn))
            total += meta.frames / float(meta.samplerate)
        except Exception:
            # Unreadable or corrupt file: leave it out of the total. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt a long scan.
            continue
    return total

# Convert the hour target to seconds and measure what is already on disk, so a
# re-run only generates the remaining amount.
secs_cap  = TARGET_HOURS * 3600
secs_done = folder_seconds(OUT_DIR)
print(f"Already on disk (truth): {secs_done/3600:.2f} h")

# --- Known UIDs from disk + CSV (no dupes) ---
def uid_for(text, voice, rate, seed):
    """Build the deterministic clip id for one (text, voice, rate, seed) combination.

    The four values are joined with ``|``, hashed with SHA-1, and the first 16
    hex digits are appended to the ``tts_edge_`` prefix.
    """
    fingerprint = "{}|{}|{}|{}".format(text, voice, rate, seed)
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return "tts_edge_" + digest[:16]

# Ids already present as WAV files in the output folder.
existing_uids = {
    Path(name).stem
    for name in os.listdir(OUT_DIR)
    if name.endswith(".wav") and name.startswith("tts_edge_")
}
# Add the ids already listed in the metadata CSV; an unreadable CSV is ignored.
if os.path.isfile(FAKE_CSV):
    try:
        df_existing = pd.read_csv(FAKE_CSV, usecols=["utt_id"])
        existing_uids.update(df_existing["utt_id"].astype(str).tolist())
    except Exception:
        pass

# The header row is written only when the CSV does not exist yet.
need_header = not os.path.isfile(FAKE_CSV)
# Column order of the metadata CSV.
fields = [
    "utt_id", "path", "duration", "speaker_id", "gender", "text", "label",
    "fake_type", "tts_model", "voice", "rate", "seed",
]

# --- Safe wrappers with timeouts ---
async def tts_save_with_timeout(comm: edge_tts.Communicate, out_path: str, timeout=TTS_TIMEOUT_SEC):
    """Run ``comm.save(out_path)`` under ``asyncio.wait_for`` with the given timeout.

    Returns whatever ``comm.save`` returns; ``asyncio.wait_for`` raises its
    timeout error when the save does not finish within ``timeout`` seconds.
    """
    pending_save = comm.save(out_path)
    outcome = await asyncio.wait_for(pending_save, timeout=timeout)
    return outcome

def ffmpeg_convert_with_timeout(cmd_list, timeout=FFMPEG_TIMEOUT_SEC):
    """Run an ffmpeg command line with a hard time limit.

    ``subprocess.run`` kills and reaps the child both when the timeout expires
    and when the call is interrupted by any other exception, so no ffmpeg
    process is left behind.

    Args:
        cmd_list: the full argument vector, program name first.
        timeout: maximum run time in seconds.

    Returns:
        The bytes the process wrote to stdout.

    Raises:
        TimeoutError: the process did not finish within ``timeout`` seconds.
        RuntimeError: the process exited with a non-zero status; the message is
            its stderr output, or the exit code when stderr is empty.
    """
    try:
        done = subprocess.run(
            cmd_list,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        raise TimeoutError("ffmpeg timed out") from None
    if done.returncode != 0:
        message = done.stderr.decode("utf-8", "ignore").strip()
        raise RuntimeError(message or f"ffmpeg exited with code {done.returncode}")
    return done.stdout

# --- One-item synth (resume-safe; PCM-first; timeouts; fallback) ---
def _remove_quietly(path):
    """Delete ``path`` if possible; a missing or undeletable file is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def _publish_wav(src, dst):
    """Move a finished WAV from ``src`` to ``dst`` without exposing a partial file.

    ``src`` lives in the temp dir and ``dst`` in OUT_DIR, which may be on
    different filesystems; ``os.replace`` can fail in that case, so the file is
    first moved to ``dst + ".part"`` with ``shutil.move`` and then renamed
    inside the destination folder. The ``.part`` suffix keeps an interrupted
    transfer out of the ``*.wav`` resume scan.
    """
    import shutil  # local import: shutil is not among this cell's imports

    staging = dst + ".part"
    try:
        shutil.move(src, staging)
        os.replace(staging, dst)
    finally:
        _remove_quietly(staging)


async def _edge_mp3_to_wav(text, voice, uid, out_wav):
    """Synthesize ``text`` as MP3 with Edge-TTS and convert it to a 16 kHz mono WAV.

    ffmpeg writes into the temp dir; the result is published to ``out_wav`` only
    after a successful conversion, so a failed or timed-out run leaves nothing
    behind in the output folder.
    """
    tmp_dir = tempfile.gettempdir()
    tmp_mp3 = os.path.join(tmp_dir, f"{uid}.mp3")
    tmp_conv = os.path.join(tmp_dir, f"{uid}.conv.wav")
    try:
        await tts_save_with_timeout(
            edge_tts.Communicate(text=text, voice=voice, rate=VOICE_RATE), tmp_mp3
        )
        ffmpeg_convert_with_timeout([
            "ffmpeg", "-loglevel", "error", "-y", "-i", tmp_mp3,
            "-ac", "1", "-ar", "16000", "-f", "wav", tmp_conv
        ])
        _publish_wav(tmp_conv, out_wav)
    finally:
        _remove_quietly(tmp_mp3)
        _remove_quietly(tmp_conv)


async def synth_one(text, voice, sem, pbar):
    """Synthesize one clip, validate it, and report its duration.

    Args:
        text: text to speak.
        voice: Edge-TTS voice name.
        sem: semaphore limiting how many clips are synthesized at once.
        pbar: progress bar, advanced by the clip duration in seconds.

    Returns:
        ``(uid, out_wav, dur, text, voice)`` for a newly written clip, or
        ``None`` when the clip already exists or fails validation (unreadable,
        shorter than 0.5 s, or mean absolute amplitude below 1e-5).

    Raises:
        Any exception from the MP3 fallback path (synthesis timeout, ffmpeg
        failure, ...) propagates to the caller.
    """
    async with sem:
        uid = uid_for(text, voice, VOICE_RATE, SEED)
        out_wav = os.path.join(OUT_DIR, f"{uid}.wav")

        if uid in existing_uids and os.path.isfile(out_wav):
            return None

        produced = False
        if USE_DIRECT_PCM:
            # Try direct PCM first.
            # NOTE(review): not every edge-tts release may accept output_format;
            # any failure here simply falls through to the MP3 path - confirm
            # against the installed version.
            tmp_wav = os.path.join(tempfile.gettempdir(), f"{uid}.wav")
            try:
                await tts_save_with_timeout(
                    edge_tts.Communicate(
                        text=text, voice=voice, rate=VOICE_RATE,
                        output_format="riff-16khz-16bit-mono-pcm"
                    ), tmp_wav
                )
                _publish_wav(tmp_wav, out_wav)
                produced = True
            except Exception:
                produced = False
            finally:
                _remove_quietly(tmp_wav)

        if not produced:
            # Fallback (or the only path when USE_DIRECT_PCM is off): MP3 -> WAV.
            await _edge_mp3_to_wav(text, voice, uid, out_wav)

        # Validate and account
        try:
            data, sr = sf.read(out_wav)
            dur = len(data) / float(sr)
            if dur < 0.5 or float(np.abs(data).mean()) < 1e-5:
                os.remove(out_wav)
                return None
        except Exception:
            _remove_quietly(out_wav)
            return None

        pbar.update(dur)
        return uid, out_wav, dur, text, voice

# --- Batch runner ---
async def run_batch():
    """Generate clips in concurrent batches until ``secs_cap`` seconds are on record.

    Each batch draws texts from ``text_stream``, picks a random voice per text,
    runs ``synth_one`` for all of them concurrently, and appends one metadata
    row per newly created clip to FAKE_CSV (flushed after every batch that
    wrote something).

    Every clip finished in a batch is recorded, even when the target is reached
    part-way through that batch, so no WAV is left on disk without a metadata
    row. Failed requests are counted and shown on the progress bar.

    Raises:
        RuntimeError: when every request fails for many consecutive batches
            (for example during a network outage), instead of looping forever.
    """
    global secs_done, existing_uids
    sem = asyncio.Semaphore(MAX_CONCURRENT)
    ts = text_stream(texts, n=PACK_N)

    max_dead_batches = 25   # consecutive batches in which every single task raised
    dead_batches = 0
    failed_total = 0

    with open(FAKE_CSV, "a", newline="", encoding="utf-8") as fcsv:
        writer = csv.DictWriter(fcsv, fieldnames=fields)
        if need_header:
            writer.writeheader()

        with tqdm(total=max(0, secs_cap - secs_done), unit="s", desc="Edge-TTS hours") as pbar:
            while secs_done < secs_cap:
                batch_size = min(BATCH_SIZE, MAX_CONCURRENT * 2)
                choices = [next(ts) for _ in range(batch_size)]
                voices = [random.choice(hi_voices) for _ in range(batch_size)]
                tasks = [synth_one(t, v, sem, pbar) for t, v in zip(choices, voices)]

                results = await asyncio.gather(*tasks, return_exceptions=True)

                # Surface failures instead of dropping them silently.
                errors = [r for r in results if isinstance(r, BaseException)]
                if errors:
                    failed_total += len(errors)
                    pbar.set_postfix(failed=failed_total)
                dead_batches = dead_batches + 1 if len(errors) == len(results) else 0
                if dead_batches >= max_dead_batches:
                    raise RuntimeError(
                        f"Edge-TTS: {dead_batches} consecutive batches failed completely; "
                        f"last error: {errors[-1]!r}"
                    )

                wrote_any = False
                for res in results:
                    if (not res) or isinstance(res, BaseException):
                        continue
                    uid, out_wav, dur, text, voice = res
                    if uid in existing_uids:
                        continue
                    secs_done += dur
                    existing_uids.add(uid)
                    writer.writerow({
                        "utt_id": uid, "path": out_wav, "duration": round(dur, 3),
                        "speaker_id": "edge_tts", "gender": "u", "text": text,
                        "label": "fake", "fake_type": "tts_edge",
                        "tts_model": "edge-tts", "voice": voice, "rate": VOICE_RATE,
                        "seed": SEED
                    })
                    wrote_any = True
                    # No early break: the remaining results of this batch are
                    # already on disk and need their metadata rows too.

                if wrote_any:
                    fcsv.flush()

# Announce the plan, then drive the async batch runner to completion on the notebook's event loop.
print(f"\nüöÄ Starting synthesis with {MAX_CONCURRENT} workers")
print(f"Target: {TARGET_HOURS}h | On-disk start: {secs_done/3600:.2f}h | Remaining: {(secs_cap-secs_done)/3600:.2f}h")
loop.run_until_complete(run_batch())

# Re-measure the folder so the reported total reflects what is actually on disk.
final_secs = folder_seconds(OUT_DIR)
print(f"\n‚úÖ COMPLETED. On-disk total: {final_secs/3600:.2f} h in {OUT_DIR}")
print(f"üìä Metadata: {FAKE_CSV}")

Hindi voices: ['hi-IN-MadhurNeural', 'hi-IN-SwaraNeural']
Loaded 24713 unique sentences.
Already on disk (truth): 49.16 h

üöÄ Starting synthesis with 6 workers
Target: 55h | On-disk start: 49.16h | Remaining: 5.84h


Edge-TTS hours:   0%|          | 0/21033.7919999999 [00:00<?, ?s/s]


‚úÖ COMPLETED. On-disk total: 55.01 h in /content/drive/MyDrive/hindi_dfake/raw/fake_tts
üìä Metadata: /content/drive/MyDrive/hindi_dfake/metadata/master_fake.csv


### **3. Quality Check: Side-by-Side Comparison**
We randomly select a sample from the real dataset and generate a corresponding synthetic version using Edge-TTS. This creates a direct "Real vs. Fake" pair for manual auditory inspection.

In [None]:
# ===== One-Pair Compare (v4): version-agnostic Edge-TTS (MP3->WAV) =====
# Writes: /content/drive/MyDrive/hindi_dfake/processed/compare/<utt_id>/{real.wav, fake_edge.wav}

# --- CONFIG ---
ROOT = "/content/drive/MyDrive/hindi_dfake"
REAL_CSV = f"{ROOT}/metadata/master_real.csv"      # metadata of the real clips (text + path)
COMPARE_DIR = f"{ROOT}/processed/compare"          # one sub-folder per compared utterance
VOICE_ORDER = ["hi-IN-MadhurNeural", "hi-IN-SwaraNeural"]  # try in this order
VOICE_RATE = "+0%"        # rate value passed to edge_tts.Communicate
TARGET_UTT_ID = None      # set to a specific utt_id or keep None
CHUNK_MAX = 200           # maximum characters per synthesis request
REQ_TIMEOUT = 45          # seconds allowed for one synthesis request
RETRIES = 3               # attempts per voice for each chunk

# --- SETUP ---
# Install Python dependencies and ffmpeg; installer output is discarded.
!pip -q install edge-tts soundfile pandas numpy nest_asyncio > /dev/null
!apt-get -qq install ffmpeg > /dev/null

import os, shutil, random, hashlib, tempfile, time, re, subprocess
from pathlib import Path
import soundfile as sf
import pandas as pd, numpy as np
import nest_asyncio, asyncio, edge_tts
from IPython.display import Audio, display

# Patch asyncio so run_until_complete can be called from inside the notebook.
nest_asyncio.apply()
loop = asyncio.get_event_loop()

def clean_text(s: str) -> str:
    """Normalize a sentence before synthesis.

    ``None`` is treated as an empty string. Newlines become spaces, the ends
    are stripped, runs of whitespace collapse to a single space, and three
    typographic quote sequences are replaced by their plain ASCII quotes.
    """
    flattened = (s or "").replace("\n", " ").strip()
    collapsed = re.sub(r"\s+", " ", flattened)
    for fancy, plain in (("‚Äô", "'"), ("‚Äú", "\""), ("‚Äù", "\"")):
        collapsed = collapsed.replace(fancy, plain)
    return collapsed

def split_hi(text: str, max_len=CHUNK_MAX):
    """Split ``text`` into chunks of at most ``max_len`` characters.

    The text is first cut into sentences at the terminators matched by the
    pattern below (each terminator stays attached to its sentence). Sentences
    are then packed greedily, joined by single spaces, into chunks that do not
    exceed ``max_len``.

    A single sentence longer than ``max_len`` is wrapped: it is cut at the last
    space that fits, or hard-cut when there is no space. Previously such a
    sentence was passed through as one oversize chunk, because the hard-split
    fallback only ran when no sentence had been found at all.

    Args:
        text: the text to split.
        max_len: maximum characters per chunk (expected to be >= 1).

    Returns:
        The list of chunks; chunks shorter than 4 characters are dropped.
    """
    def _wrap(piece):
        # Break one oversize sentence into parts of at most max_len characters.
        parts_out = []
        while len(piece) > max_len:
            cut = piece.rfind(" ", 0, max_len + 1)
            if cut <= 0:
                cut = max_len          # no space available: hard cut
            parts_out.append(piece[:cut].strip())
            piece = piece[cut:].strip()
        if piece:
            parts_out.append(piece)
        return parts_out

    parts = re.split(r"([‡•§.!?])", text)
    sents = []
    # re.split with a capture group alternates sentence, terminator, sentence, ...
    for i in range(0, len(parts), 2):
        punc = parts[i + 1] if i + 1 < len(parts) else ""
        piece = (parts[i] + punc).strip()
        if piece:
            sents.extend(_wrap(piece))

    chunks, buf, n = [], [], 0
    for s in sents:
        if buf and n + 1 + len(s) <= max_len:
            buf.append(s)
            n += 1 + len(s)
            continue
        if buf:
            chunks.append(" ".join(buf))
        buf, n = [s], len(s)
    if buf:
        chunks.append(" ".join(buf))
    return [c for c in chunks if len(c) >= 4]

def ensure_pcm16_16k(src_path, dst_path):
    """Write a 16 kHz, mono, PCM_16 copy of ``src_path`` to ``dst_path``.

    The source file is never modified. Multi-channel audio is averaged to mono
    and audio at any other sample rate is resampled to 16 kHz; previously the
    samples were written unchanged under a 16 kHz header, which alters the
    speed and pitch of any source that is not already 16 kHz.

    If reading, resampling or writing fails, the source is copied byte-for-byte
    as a best-effort fallback and the reason is printed.

    Args:
        src_path: path of the original audio file.
        dst_path: path of the normalized copy to create.
    """
    target_sr = 16000
    try:
        d, sr = sf.read(src_path, always_2d=False)
        if getattr(d, "ndim", 1) == 2:
            d = d.mean(axis=1)
        if sr != target_sr:
            # Imported only when needed; the same call is used by the MMS-TTS cell.
            import librosa
            d = librosa.resample(np.asarray(d, dtype=np.float32), orig_sr=sr, target_sr=target_sr)
        sf.write(dst_path, d, target_sr, subtype="PCM_16")
    except Exception as exc:
        print("ensure_pcm16_16k: normalization failed, copying source as-is:", exc)
        shutil.copyfile(src_path, dst_path)

async def save_chunk_any_version(txt, voice, out_wav, rate=VOICE_RATE, timeout=REQ_TIMEOUT):
    """
    Save a chunk as WAV(16k, mono, PCM_16) regardless of edge-tts version:
    1) edge_tts -> MP3
    2) ffmpeg -> WAV

    The MP3 is written to a temp file named after a hash of text + voice and is
    removed afterwards. ``timeout`` (seconds) bounds the synthesis request and,
    separately, the ffmpeg conversion, so a stuck ffmpeg can no longer block
    the caller indefinitely.

    Raises:
        The timeout error from asyncio.wait_for if synthesis is too slow,
        subprocess.TimeoutExpired if ffmpeg is too slow, and
        subprocess.CalledProcessError if ffmpeg exits with a non-zero status.
    """
    tmp_mp3 = os.path.join(tempfile.gettempdir(), f"cmp_{hashlib.sha1((txt+voice).encode()).hexdigest()[:10]}.mp3")
    try:
        comm = edge_tts.Communicate(text=txt, voice=voice, rate=rate)
        await asyncio.wait_for(comm.save(tmp_mp3), timeout=timeout)  # old API: no format kw
        subprocess.run([
            "ffmpeg", "-loglevel", "error", "-y", "-i", tmp_mp3,
            "-ac", "1", "-ar", "16000", "-sample_fmt", "s16", out_wav
        ], check=True, timeout=timeout)
    finally:
        try:
            os.remove(tmp_mp3)
        except OSError:
            # Nothing to clean up (for example synthesis failed before writing).
            pass

async def synth_concat_edge(text, wav_out):
    """Synthesize ``text`` chunk by chunk and write the concatenation to ``wav_out``.

    The text is cleaned and split into chunks (the whole text is used when the
    splitter returns nothing). For each chunk the voices in VOICE_ORDER are
    tried in turn, each up to RETRIES times with a growing pause after every
    failed attempt. The first success ends the search for that chunk.

    Raises:
        RuntimeError: if a chunk cannot be synthesized with any voice.
    """
    cleaned = clean_text(text)
    pieces = split_hi(cleaned, CHUNK_MAX) or [cleaned]
    audio_parts = []

    for position, piece in enumerate(pieces, 1):
        rendered = False
        for voice in VOICE_ORDER:
            scratch = os.path.join(
                tempfile.gettempdir(),
                f"cmp_{hashlib.sha1((piece+voice).encode()).hexdigest()[:10]}.wav",
            )
            attempt = 0
            while attempt < RETRIES and not rendered:
                attempt += 1
                try:
                    await save_chunk_any_version(piece, voice, scratch)
                    samples, sr = sf.read(scratch, always_2d=False)
                    if getattr(samples, "ndim", 1) == 2:
                        samples = samples.mean(axis=1)
                    audio_parts.append(samples.astype(np.float32))
                    os.remove(scratch)
                    rendered = True
                except Exception:
                    # Drop any partial output, wait, then retry.
                    try: os.remove(scratch)
                    except: pass
                    await asyncio.sleep(min(2*attempt, 6))
            if rendered:
                break
        if not rendered:
            raise RuntimeError(f"Edge-TTS failed for chunk {position}/{len(pieces)}")

    if len(audio_parts) > 1:
        full = np.concatenate(audio_parts, axis=0)
    else:
        full = audio_parts[0]
    sf.write(wav_out, full, 16000, subtype="PCM_16")

# pick a row
# Load the real-clip metadata and find which column holds the sentence text.
df = pd.read_csv(REAL_CSV)
text_col = next((c for c in ["text","transcript","sentence"] if c in df.columns), None)
assert text_col, "master_real.csv needs a text column."

def pick_row():
    """Return the metadata row to use for the real-vs-fake comparison.

    When TARGET_UTT_ID is set, the first row with that ``utt_id`` is returned.
    Ids are compared as strings, so the lookup works whether pandas parsed the
    column as numbers or as text. Otherwise one row is sampled (fixed
    ``random_state``) from the rows whose audio file exists and whose text is
    6 to 400 characters long.

    Raises:
        IndexError: TARGET_UTT_ID is set but no row has that id.
        ValueError: no row has both an existing audio file and usable text.
    """
    if TARGET_UTT_ID:
        hits = df[df["utt_id"].astype(str) == str(TARGET_UTT_ID)]
        if hits.empty:
            raise IndexError(f"utt_id {TARGET_UTT_ID!r} not found in master_real.csv")
        return hits.iloc[0]

    cand = df[df["path"].astype(str).apply(lambda p: Path(p).exists())]
    cand = cand[cand[text_col].astype(str).str.len().between(6, 400)]
    if cand.empty:
        raise ValueError("No row with an existing audio file and a text of 6-400 characters.")
    return cand.sample(1, random_state=777).iloc[0]

# Choose the utterance and pull out its id, cleaned text and source audio path.
row = pick_row()
utt_id = str(row["utt_id"])
text   = clean_text(str(row[text_col]))
real_p = str(row["path"])

# Each compared utterance gets its own folder holding the real and the synthetic clip.
pair_dir = Path(COMPARE_DIR) / utt_id
pair_dir.mkdir(parents=True, exist_ok=True)
real_out = pair_dir / "real.wav"
fake_out = pair_dir / "fake_edge.wav"

# Existing outputs are reused, so re-running the cell does not redo finished work.
if not real_out.exists():
    ensure_pcm16_16k(real_p, real_out)

made = None
if not fake_out.exists():
    try:
        loop.run_until_complete(synth_concat_edge(text, str(fake_out)))
        made = "edge"
    except Exception as e:
        # Synthesis is best-effort here: the failure is reported and the real clip is still shown.
        print("Edge synth failed:", e)

# report & audio
from IPython.display import Audio, display
print(f"UTT: {utt_id}")
print("Text:", (text[:220] + ("..." if len(text) > 220 else "")))
print("Real:", sf.info(str(real_out)), "|", real_out)
if fake_out.exists():
    print("Fake (edge):", sf.info(str(fake_out)), "|", fake_out)
    display(Audio(filename=str(fake_out)))
else:
    print("Fake: (synthesis failed)")
display(Audio(filename=str(real_out)))

UTT: cb0999e7159064f1
Text: ‡§â‡§∏ ‡§¶‡•Å‡§∞‡•ç‡§ò‡§ü‡§®‡§æ ‡§Æ‡•á‡§Ç ‡§ï‡§à ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•Ä ‡§Æ‡•å‡§§ ‡§π‡•ã ‡§ó‡§Ø‡•Ä‡•§
Real: /content/drive/MyDrive/hindi_dfake/processed/compare/cb0999e7159064f1/real.wav
samplerate: 16000 Hz
channels: 1
duration: 3.744 s
format: WAV (Microsoft) [WAV]
subtype: Signed 16 bit PCM [PCM_16] | /content/drive/MyDrive/hindi_dfake/processed/compare/cb0999e7159064f1/real.wav
Fake (edge): /content/drive/MyDrive/hindi_dfake/processed/compare/cb0999e7159064f1/fake_edge.wav
samplerate: 16000 Hz
channels: 1
duration: 3.888 s
format: WAV (Microsoft) [WAV]
subtype: Signed 16 bit PCM [PCM_16] | /content/drive/MyDrive/hindi_dfake/processed/compare/cb0999e7159064f1/fake_edge.wav


### **4. MMS-TTS (Meta)**
We generate ~3,000 clips using **Meta's MMS-TTS** to serve as a held-out test set, so that we can measure how well the model generalizes to a TTS architecture it never saw during training.

In [None]:
# =================== MMS-TTS (Meta) Hindi - BULK GENERATOR (~3k clips) ===================
# - Free, keyless, Colab-ready
# - Writes WAVs:  /content/drive/MyDrive/hindi_dfake/raw/fake_tts_mms
# - Appends CSV:  /content/drive/MyDrive/hindi_dfake/metadata/master_fake.csv
# - Resume-safe, dedup against disk + CSV, quality-gated, 16 kHz mono PCM_16

# Install the Python dependencies; installer output is discarded.
!pip -q install torch torchaudio transformers soundfile librosa pandas numpy tqdm > /dev/null

import os, hashlib, random, json, time
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import torch
from tqdm.auto import tqdm
from transformers import VitsModel, AutoTokenizer

# ------------------ CONFIG ------------------
ROOT        = "/content/drive/MyDrive/hindi_dfake"
REAL_CSV    = f"{ROOT}/metadata/master_real.csv"     # source texts
OUT_DIR     = f"{ROOT}/raw/fake_tts_mms"             # target wav folder (separate from edge_tts)
FAKE_CSV    = f"{ROOT}/metadata/master_fake.csv"     # shared metadata CSV (same path the Edge-TTS cell appends to)

TARGET_CLIPS   = 3000      # number of new clips to generate in this run
SEED           = 2025      # seeds the RNGs below and is part of every clip id
MIN_CHARS      = 6         # shortest sentence accepted
MAX_CHARS      = 220       # longest sentence accepted
PACK_N         = 1         # set to 2 or 3 to join several sentences per clip with JOINER
MAX_PACK_LEN   = 300       # max total chars when packing
QUALITY_MIN_SEC= 0.6       # clips shorter than this are rejected
QUALITY_MIN_MEAN_ABS = 1e-5  # clips with a lower mean absolute amplitude are rejected
CSV_FLUSH_EVERY = 50       # flush metadata to disk every N clips

MODEL_NAME    = "facebook/mms-tts-hin"   # checkpoint name passed to from_pretrained
# The tags below are written verbatim into the metadata rows.
VOICE_TAG     = "default"
FAKE_TAG      = "tts_mms"
RATE_TAG      = "+0%"
SPEAKER_ID    = "mms_tts"
GENDER_TAG    = "u"

# ------------------ SETUP ------------------
# NOTE(review): only Python's and NumPy's RNGs are seeded here; torch is not -
# confirm whether synthesis itself is expected to be reproducible.
random.seed(SEED); np.random.seed(SEED)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(Path(FAKE_CSV).parent).mkdir(parents=True, exist_ok=True)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load model/tokenizer once
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = VitsModel.from_pretrained(MODEL_NAME).to(device).eval()
# Native sample rate of the model output; 16000 is assumed when the config has no such field.
base_sr   = getattr(model.config, "sampling_rate", 16000)

# ------------------ TEXTS ------------------
def load_texts_from_real(csv_path: str) -> List[str]:
    """Load, clean, de-duplicate and shuffle the sentences stored in a metadata CSV.

    The first column found among ``text``, ``transcript`` and ``sentence`` is
    used. Missing cells are dropped *before* the string conversion so they never
    turn into the literal text "nan". Newlines are replaced by spaces,
    surrounding whitespace is stripped, and only sentences whose length lies
    between MIN_CHARS and MAX_CHARS (inclusive) are kept.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A list of unique sentences, shuffled in place with the ``random`` module.

    Raises:
        ValueError: if none of the expected text columns is present.
    """
    df = pd.read_csv(csv_path)
    col = next((c for c in ("text", "transcript", "sentence") if c in df.columns), None)
    if col is None:
        raise ValueError("master_real.csv must have 'text' (or 'transcript'/'sentence') column.")
    texts = (
        df[col]
        .dropna()                               # must run before astype(str)
        .astype(str)
        .str.replace("\n", " ", regex=False)
        .str.strip()
        .loc[lambda s: s.str.len().between(MIN_CHARS, MAX_CHARS)]
        .drop_duplicates()
        .tolist()
    )
    random.shuffle(texts)
    return texts

# Read the sentence pool once; the generator below cycles through it.
texts_base = load_texts_from_real(REAL_CSV)
print(f"Loaded {len(texts_base)} base sentences")

# Separator placed between sentences when several are packed into one clip.
JOINER = " ‡•§ "
def text_stream(items: List[str], n=1, max_len=300):
    """Infinite generator of texts; packs n sentences if n>1.

    Iteration starts at a random index and wraps around ``items``. Sentences
    shorter than MIN_CHARS (after stripping) are skipped. When packing, up to
    ``n`` consecutive sentences are joined with JOINER, stopping early when the
    running length would exceed ``max_len``. Each join is counted as one
    character, exactly as before, so the generated texts (and the clip ids
    hashed from them) stay unchanged.

    Raises:
        ValueError: on the first ``next()`` if ``items`` holds no sentence of at
            least MIN_CHARS characters. Without this check an empty list fails
            with ZeroDivisionError and an unusable list spins forever.
    """
    if not any(len(t.strip()) >= MIN_CHARS for t in items):
        raise ValueError("text_stream needs at least one sentence with MIN_CHARS or more characters.")

    i = random.randrange(len(items))
    while True:
        if n <= 1:
            t = items[i % len(items)].strip()
            i += 1
            if len(t) >= MIN_CHARS:
                yield t
            continue
        bundle, cur = [], 0
        while len(bundle) < n:
            t = items[i % len(items)].strip()
            i += 1
            if len(t) < MIN_CHARS:
                continue
            add = len(t) if cur == 0 else 1 + len(t)
            # Only a non-empty bundle can overflow; the first sentence is always accepted.
            if cur and cur + add > max_len:
                break
            bundle.append(t)
            cur += add
        if bundle:
            yield JOINER.join(bundle)

# Single shared generator; the main loop pulls one text from it per iteration.
ts = text_stream(texts_base, n=PACK_N, max_len=MAX_PACK_LEN)

# ------------------ RESUME/DEDUP ------------------
def uid_for(text: str) -> str:
    """Build the deterministic clip id for ``text``.

    The text, MODEL_NAME, VOICE_TAG and SEED are joined with ``|`` and hashed
    with SHA-1; the first 16 hex digits follow the ``tts_mms_`` prefix.
    Including the model and voice keeps ids from colliding across TTS backends.
    """
    fingerprint = "{}|{}|{}|{}".format(text, MODEL_NAME, VOICE_TAG, SEED)
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return "tts_mms_" + digest[:16]

# Ids already present as WAV files in the output folder.
existing_uids = {
    wav.stem
    for wav in Path(OUT_DIR).glob("*.wav")
    if wav.stem.startswith("tts_mms_")
}

# Add every id already listed in the metadata CSV; an unreadable CSV is ignored.
if Path(FAKE_CSV).exists():
    try:
        df_exist = pd.read_csv(FAKE_CSV, usecols=["utt_id"])
        existing_uids.update(df_exist["utt_id"].astype(str).tolist())
    except Exception:
        pass

print(f"Resume: {len(existing_uids)} items already indexed (disk and/or CSV)")

# ------------------ SYNTHESIS ------------------
def synth_one(text: str):
    """Synthesize ``text`` with the MMS-TTS model and store it as a 16 kHz PCM_16 WAV.

    Returns:
        ``(wav_path, duration_sec)`` for a newly written clip, or
        ``(None, None)`` when the clip already exists, the text tokenizes to
        nothing, or the audio fails the quality gates (shorter than
        QUALITY_MIN_SEC or mean absolute amplitude below QUALITY_MIN_MEAN_ABS).

    Raises:
        Errors from the tokenizer, the model or the file write propagate to the
        caller; they are not converted into ``(None, None)``.
    """
    uid = uid_for(text)
    wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
    if uid in existing_uids and os.path.isfile(wav_path):
        return None, None  # already have it

    # tokenize + generate
    inputs = tokenizer(text, return_tensors="pt")
    if inputs["input_ids"].shape[-1] == 0:
        # No token survived tokenization; there is nothing for the model to speak.
        return None, None
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        out = model(**inputs)
        wav = out.waveform.squeeze().cpu().float().numpy()

    sr = base_sr
    if wav.ndim > 1:
        wav = wav.mean(axis=0)
    if sr != 16000:
        wav = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=16000)
        sr = 16000

    wav = np.clip(wav, -1.0, 1.0).astype(np.float32)
    dur = len(wav) / sr

    # quality gates
    if dur < QUALITY_MIN_SEC or float(np.abs(wav).mean()) < QUALITY_MIN_MEAN_ABS:
        return None, None

    # Write PCM_16 under a ".part" name, then rename within the same folder. An
    # interrupted write therefore never leaves a truncated "*.wav" behind for
    # the resume scan to mistake for a finished clip.
    tmp_path = wav_path + ".part"
    try:
        sf.write(tmp_path, wav, sr, subtype="PCM_16", format="WAV")
        os.replace(tmp_path, wav_path)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # already renamed, or never created
    return wav_path, dur

# ------------------ MAIN LOOP ------------------
def _flush_rows(pending):
    """Append the pending metadata rows to FAKE_CSV, creating the file if needed."""
    if not pending:
        return
    df_out = pd.DataFrame(pending)
    if Path(FAKE_CSV).exists():
        df_out = pd.concat([pd.read_csv(FAKE_CSV), df_out], ignore_index=True)
    df_out.to_csv(FAKE_CSV, index=False)


# NOTE(review): TARGET_CLIPS counts clips made in *this* run; clips already on
# disk from an earlier run are not subtracted - confirm that this is intended.
to_make = max(0, TARGET_CLIPS)
made = 0
rows = []
pbar = tqdm(total=to_make, desc="MMS-TTS clips")

# Stop once this many texts in a row yield nothing new (already generated or
# rejected): the sentence pool is exhausted and the loop would otherwise never end.
max_idle = max(1, len(texts_base)) * max(1, PACK_N)
idle = 0

try:
    while made < to_make:
        if idle >= max_idle:
            print(f"\nNo new text found in the last {idle} candidates; stopping early.")
            break

        text = next(ts)
        uid = uid_for(text)
        if uid in existing_uids:
            idle += 1
            continue

        wav_path, dur = synth_one(text)
        if wav_path is None:
            idle += 1
            continue
        idle = 0

        rows.append({
            "utt_id": uid,
            "path": wav_path,
            "duration": round(float(dur), 3),
            "speaker_id": SPEAKER_ID,
            "gender": GENDER_TAG,
            "text": text,
            "label": "fake",
            "fake_type": FAKE_TAG,
            "tts_model": MODEL_NAME,
            "voice": VOICE_TAG,
            "rate": RATE_TAG,
            "seed": SEED
        })
        existing_uids.add(uid)
        made += 1
        pbar.update(1)

        # periodic flush (resume-safe)
        if (made % CSV_FLUSH_EVERY) == 0:
            _flush_rows(rows)
            rows = []
finally:
    # Final flush. It also runs when the loop is interrupted or raises, so clips
    # already written to disk always get their metadata rows.
    _flush_rows(rows)
    rows = []
    pbar.close()

print("\n‚úÖ Done.")
print(f"   Wrote {made}/{TARGET_CLIPS} clips ‚Üí {OUT_DIR}")
print(f"   Updated metadata ‚Üí {FAKE_CSV}")

Device: cuda
Loaded 24713 base sentences
Resume: 23735 items already indexed (disk and/or CSV)


MMS-TTS clips:   0%|          | 0/3000 [00:00<?, ?it/s]


‚úÖ Done.
   Wrote ~3000 clips ‚Üí /content/drive/MyDrive/hindi_dfake/raw/fake_tts_mms
   Updated metadata ‚Üí /content/drive/MyDrive/hindi_dfake/metadata/master_fake.csv
