# Data Preparation for Indic TTS/STT Training

This notebook helps you download and prepare datasets for training TTS and STT models on Indic languages.

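## Datasets Covered:
- **TTS**: IndicTTS, Kathbath, OpenSLR
- **STT**: Shrutilipi, IndicSUPERB, Common Voice

**Note:** the cells below currently download only Common Voice and the OpenSLR Hindi archive; the other datasets listed above are not fetched by this notebook.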

## Languages Supported:
Hindi, Tamil, Telugu, Bengali, Marathi, Gujarati, Kannada, Malayalam

In [None]:
# Install dependencies
!pip install -q datasets huggingface_hub soundfile librosa tqdm

In [None]:
# Storage setup: mount Google Drive, then create the data directories on it
import os

from google.colab import drive

drive.mount('/content/drive')

# Root folder for everything this notebook writes
OUTPUT_DIR = '/content/drive/MyDrive/indic_speech_data'

# exist_ok=True keeps this cell safe to re-run
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, 'raw'), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, 'manifests'), exist_ok=True)

## 1. Download Datasets

In [None]:
from datasets import load_dataset
from huggingface_hub import snapshot_download
import json
from tqdm import tqdm

# Language code passed to the download and manifest cells below.
# Only 'hi' additionally triggers the OpenSLR Hindi download.
LANGUAGE = 'hi'  # Change to: hi, ta, te, bn, mr, etc.

# Echo the selection so the cell output records which language was used
print(f"Downloading data for language: {LANGUAGE}")

In [None]:
# Download Common Voice dataset (good for both TTS and STT)
def download_common_voice(language, max_samples=5000):
    """Fetch the Common Voice 16.1 train split for ``language`` and save it to disk.

    Args:
        language: Dataset configuration name handed to ``load_dataset``.
        max_samples: Keep only the first ``max_samples`` rows. A falsy value
            (``None`` or ``0``) keeps the whole split.

    Returns:
        The (possibly truncated) dataset, or ``None`` if loading, truncating
        or saving raised an exception (the error is printed, not re-raised).
    """
    print(f"Loading Common Voice for {language}...")

    try:
        cv_train = load_dataset(
            'mozilla-foundation/common_voice_16_1',
            language,
            split='train',
            trust_remote_code=True,
        )

        if max_samples:
            # Never ask for more rows than the split actually has
            keep = min(max_samples, len(cv_train))
            cv_train = cv_train.select(range(keep))

        # Persist the subset under OUTPUT_DIR/raw
        target_dir = f'{OUTPUT_DIR}/raw/common_voice_{language}'
        cv_train.save_to_disk(target_dir)
        print(f"Saved {len(cv_train)} samples to {target_dir}")
    except Exception as err:
        # Best-effort download: report the problem and let the notebook continue
        print(f"Error downloading Common Voice: {err}")
        return None

    return cv_train

# cv_dataset is None when the download failed; later cells check for that
cv_dataset = download_common_voice(language=LANGUAGE, max_samples=5000)

In [None]:
# Download OpenSLR Hindi dataset
def download_openslr_hindi():
    """Download and extract the OpenSLR Hindi TTS archive under ``OUTPUT_DIR/raw``.

    Failures are printed and signalled by returning ``None`` (the same
    convention ``download_common_voice`` uses) instead of aborting the cell
    with a traceback. A zip left behind by a failed download is removed so a
    truncated archive is not mistaken for a complete one later.

    Returns:
        Path of the extraction directory, or ``None`` if the download or the
        extraction failed.
    """
    import os
    import urllib.request
    import zipfile

    # NOTE(review): URL not verified here -- confirm the archive is still
    # published under this OpenSLR resource number.
    url = "https://www.openslr.org/resources/103/hi_in_female.zip"
    raw_dir = f"{OUTPUT_DIR}/raw"
    zip_path = f"{raw_dir}/hi_in_female.zip"
    extract_path = f"{raw_dir}/openslr_hindi"

    print("Downloading OpenSLR Hindi...")
    try:
        # Do not rely on an earlier cell having created the raw directory
        os.makedirs(raw_dir, exist_ok=True)
        urllib.request.urlretrieve(url, zip_path)
    except OSError as e:
        # urllib's URLError/HTTPError/ContentTooShortError are OSError subclasses
        print(f"Error downloading OpenSLR Hindi: {e}")
        try:
            os.remove(zip_path)  # drop a partially written archive, if any
        except OSError:
            pass
        return None

    print("Extracting...")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    except (OSError, zipfile.BadZipFile) as e:
        print(f"Error extracting OpenSLR Hindi: {e}")
        return None

    print(f"Extracted to {extract_path}")
    return extract_path

# OpenSLR data is only fetched for Hindi; bind openslr_path either way so
# later cells can test it instead of hitting a NameError for other languages.
openslr_path = download_openslr_hindi() if LANGUAGE == 'hi' else None

## 2. Generate Training Manifests

In [None]:
import soundfile as sf
import os

def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds, or 0.0 if it cannot be read.

    Args:
        audio_path: Path handed to ``soundfile.info``.

    Returns:
        The ``duration`` attribute reported by soundfile, or ``0.0`` when
        inspecting the file raises any ordinary exception.
    """
    try:
        return sf.info(audio_path).duration
    except Exception:
        # Best-effort lookup. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt/SystemExit through so a long scan can be stopped.
        return 0.0

def _write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as UTF-8 JSON Lines (one object per line)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in entries)


def create_manifest_from_hf_dataset(dataset, output_path, lang_code, dataset_type='stt'):
    """Create train/val JSONL manifests from a HuggingFace dataset.

    Each kept sample becomes one JSON object with the keys ``audio_filepath``,
    ``text``, ``duration``, ``language`` and ``speaker_id``. Samples without an
    audio path or text, or with a duration outside 0.5-30 seconds, are skipped.
    The first 90% of kept entries (in dataset order) go to the train manifest,
    the rest to the validation manifest.

    Args:
        dataset: Iterable of dict-like samples with an ``audio`` mapping
            (``path`` and optionally ``array`` / ``sampling_rate``) and a
            ``sentence`` or ``text`` field.
        output_path: Directory for the manifests; created if missing.
        lang_code: Language code used in the file names and, suffixed with
            ``-IN``, in each entry's ``language`` field.
        dataset_type: File-name prefix, e.g. ``'stt'`` or ``'tts'``.

    Returns:
        Tuple ``(train_path, val_path)`` of the written manifest files.
    """
    entries = []

    for idx, sample in enumerate(tqdm(dataset, desc="Processing samples")):
        # 'audio' can be present but None; treat that like a missing field
        audio = sample.get('audio') or {}
        audio_path = audio.get('path', '')

        # Common Voice uses 'sentence'; fall back to a generic 'text' column
        text = sample.get('sentence', '') or sample.get('text', '')

        if not audio_path or not text:
            continue

        # Prefer the decoded waveform length; otherwise use a 'duration' column
        waveform = audio.get('array')
        if waveform is not None:
            sr = audio.get('sampling_rate') or 16000
            duration = len(waveform) / sr
        else:
            duration = sample.get('duration', 0.0)

        # Filter by duration
        if duration < 0.5 or duration > 30.0:
            continue

        entries.append({
            'audio_filepath': audio_path,
            'text': text,
            'duration': duration,
            'language': f'{lang_code}-IN',
            # Without a client_id, bucket every 100 consecutive rows as one speaker
            'speaker_id': sample.get('client_id', f'speaker_{idx // 100}'),
        })

    # Split train/val (sequential 90/10, no shuffling)
    split_idx = int(len(entries) * 0.9)
    train_entries = entries[:split_idx]
    val_entries = entries[split_idx:]

    # Do not rely on an earlier cell having created the manifest directory
    os.makedirs(output_path, exist_ok=True)
    train_path = f"{output_path}/{dataset_type}_{lang_code}_train.jsonl"
    val_path = f"{output_path}/{dataset_type}_{lang_code}_val.jsonl"

    _write_jsonl(train_path, train_entries)
    _write_jsonl(val_path, val_entries)

    print("Created manifests:")
    print(f"  Train: {train_path} ({len(train_entries)} samples)")
    print(f"  Val: {val_path} ({len(val_entries)} samples)")

    return train_path, val_path

In [None]:
# Build the STT train/val manifests, but only if Common Voice actually loaded
if cv_dataset:
    train_manifest, val_manifest = create_manifest_from_hf_dataset(
        dataset=cv_dataset,
        output_path=f"{OUTPUT_DIR}/manifests",
        lang_code=LANGUAGE,
        dataset_type='stt',
    )

## 3. Audio Quality Validation

In [None]:
import numpy as np

def validate_audio_quality(manifest_path, output_path):
    """Re-check every manifest entry against its audio file and keep the good ones.

    An entry is rejected when its file is missing, cannot be read, lasts less
    than 0.5 s or more than 30 s, or has an RMS level below 0.001. Entries
    whose peak amplitude exceeds 0.99 are kept but tagged with
    ``has_clipping = True``. Kept entries get ``duration`` overwritten with
    the value measured from the file.

    Args:
        manifest_path: JSONL manifest to read; each entry needs ``audio_filepath``.
        output_path: Where the filtered JSONL manifest is written.

    Returns:
        ``output_path``.
    """
    with open(manifest_path, 'r', encoding='utf-8') as src:
        records = [json.loads(raw) for raw in src if raw.strip()]

    kept = []
    rejected = 0

    for record in tqdm(records, desc="Validating audio"):
        wav_path = record['audio_filepath']

        if not os.path.exists(wav_path):
            rejected += 1
            continue

        try:
            samples, rate = sf.read(wav_path, dtype='float32')

            # Collapse multi-channel audio to mono before measuring it
            if samples.ndim > 1:
                samples = np.mean(samples, axis=1)

            seconds = len(samples) / rate

            # Reject on length first; the silence check only runs if length is fine
            if seconds < 0.5 or seconds > 30.0 or np.sqrt(np.mean(samples ** 2)) < 0.001:
                rejected += 1
                continue

            # Clipped audio is kept, just flagged for downstream filtering
            if np.max(np.abs(samples)) > 0.99:
                record['has_clipping'] = True

            record['duration'] = seconds
            kept.append(record)
        except Exception:
            # Any read/decoding problem counts as an invalid sample
            rejected += 1

    # Write validated manifest
    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in kept)

    print(f"Validated: {len(kept)} valid, {rejected} invalid")
    return output_path

# Validate the training manifest
# validated_manifest = validate_audio_quality(train_manifest, f"{OUTPUT_DIR}/manifests/stt_{LANGUAGE}_train_validated.jsonl")

## 4. Summary Statistics

In [None]:
def print_manifest_stats(manifest_path):
    """Print sample count, duration statistics and language counts for a manifest.

    An empty manifest prints a sample count of 0 and returns, instead of
    failing on ``min()``/``max()`` of an empty list.

    Args:
        manifest_path: JSONL manifest whose entries carry a ``duration`` in
            seconds and, optionally, a ``language`` field.
    """
    from collections import Counter

    with open(manifest_path, 'r', encoding='utf-8') as f:
        entries = [json.loads(line) for line in f if line.strip()]

    print(f"\nManifest: {manifest_path}")
    print(f"  Total samples: {len(entries)}")

    if not entries:
        # Nothing to summarise; duration statistics are undefined here
        return

    durations = [e['duration'] for e in entries]
    total = sum(durations)

    print(f"  Total duration: {total/3600:.2f} hours")
    # Plain-Python mean: no dependency on numpy having been imported by another cell
    print(f"  Avg duration: {total / len(durations):.2f}s")
    print(f"  Min duration: {min(durations):.2f}s")
    print(f"  Max duration: {max(durations):.2f}s")

    # Language distribution
    lang_counts = Counter(e.get('language', 'unknown') for e in entries)
    print(f"  Languages: {dict(lang_counts)}")

# Print stats
# print_manifest_stats(train_manifest)
# print_manifest_stats(val_manifest)

## 5. Next Steps

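After running this notebook, you should have the following (where `OUTPUT_DIR` is `/content/drive/MyDrive/indic_speech_data`, as set in the setup cell):
1. Downloaded datasets in `{OUTPUT_DIR}/raw/`
2. Generated train/validation manifests (`.jsonl`) in `{OUTPUT_DIR}/manifests/`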

Next notebooks:
- `02_tts_xtts_finetuning_colab.ipynb` - Fine-tune XTTS for TTS
- `03_stt_whisper_finetuning_colab.ipynb` - Fine-tune Whisper for STT