In [4]:
import json
import os

# Read the lookup table: {language: {wav filename: metadata}}.
json_path = "lookup.json"  # Update path if needed
with open(json_path, "r", encoding="utf-8") as lookup_file:
    lookup = json.load(lookup_file)

# For every language, derive the path each wav is expected to live at
# (dataset/<lang>/<speaker>/<wav>) and remember the ones absent on disk.
missing_files = []
for lang, wav_dict in lookup.items():
    expected_paths = [
        f"dataset/{lang}/{metadata['speaker']}/{wav_filename}"
        for wav_filename, metadata in wav_dict.items()
    ]
    missing_files.extend(
        candidate for candidate in expected_paths if not os.path.exists(candidate)
    )
    print(f"Processed {len(expected_paths)} files for {lang}")

# Summarise the outcome; only a short preview of the missing paths is printed.
if not missing_files:
    print("✅ All files in lookup.json exist in the dataset!")
else:
    print(f"❌ {len(missing_files)} missing files detected!")
    preview = missing_files[:10]  # Show only first 10 for brevity
    for missing_path in preview:
        print(f"   - {missing_path}")


Processed 82010 files for de
Processed 47831 files for tr
Processed 65816 files for en
✅ All files in lookup.json exist in the dataset!


In [5]:
import json
import random
import os

# Input lookup table and destination of the sampled subset
lookup_path = "lookup.json"  # Adjust this path as needed
output_path = "lookup_10.json"  # Output path for the selected subset

# Sampling configuration
TARGET_PERCENTAGE = 0.10  # Select 10% of the dataset
LANGUAGE_DISTRIBUTION = {"de": 0.40, "en": 0.34, "tr": 0.26}  # Adjust based on full dataset ratio
SAMPLING_SEED = 42  # Fixed seed so re-running this cell writes the same subset

# Load lookup.json ({language: {wav filename: metadata}})
with open(lookup_path, "r", encoding="utf-8") as f:
    lookup = json.load(f)

# Fail early, with a readable message, if the distribution names a language
# that the lookup table does not contain (previously a bare KeyError later on).
unknown_langs = sorted(set(LANGUAGE_DISTRIBUTION) - set(lookup))
if unknown_langs:
    raise KeyError(f"Languages missing from {lookup_path}: {unknown_langs}")

# Count total number of audio files per language
language_counts = {lang: len(files) for lang, files in lookup.items()}
total_files = sum(language_counts.values())

# Calculate the number of samples to select per language.
# Both int() calls truncate, so the grand total can fall slightly short of
# total_files * TARGET_PERCENTAGE.
target_samples = int(total_files * TARGET_PERCENTAGE)
selected_samples_per_lang = {
    lang: int(target_samples * ratio) for lang, ratio in LANGUAGE_DISTRIBUTION.items()
}

# Ensure we don't select more than available files
for lang in selected_samples_per_lang:
    selected_samples_per_lang[lang] = min(selected_samples_per_lang[lang], language_counts[lang])

# Randomly select the required number of files per language.
# A dedicated generator keeps the draw reproducible without touching the
# state of the global `random` module used by other cells.
sampling_rng = random.Random(SAMPLING_SEED)
selected_lookup = {}
for lang, num_samples in selected_samples_per_lang.items():
    selected_files = sampling_rng.sample(list(lookup[lang].keys()), num_samples)
    selected_lookup[lang] = {file: lookup[lang][file] for file in selected_files}

# Save the selected subset to lookup_10.json
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(selected_lookup, f, indent=4, ensure_ascii=False)

# Display result summary (last expression is rendered as the cell output)
selected_counts = {lang: len(files) for lang, files in selected_lookup.items()}
selected_counts


{'de': 7826, 'en': 6652, 'tr': 5086}

3545-oval-bazaar-expand-9
Zipping dataset to dataset.zip
Adding dataset/tr/tolga/common_voice_tr_30807057.wav
Sending 'dataset.zip' (59.8 GB)  
Code is: 3545-oval-bazaar-expand-9
On the other computer run

runpodctl receive 3545-oval-bazaar-expand-9

In [6]:
import json
import os
import random

def _write_metadata_file(metadata_path, entries):
    """Write one 'audio_filepath|transcription' line per entry to metadata_path."""
    with open(metadata_path, 'w', encoding='utf-8') as metadata_file:
        for entry in entries:
            metadata_file.write(f"{entry['filepath']}|{entry['text']}\n")


def convert_json_to_text_metadata(json_filepath, output_dir, validation_split=0.15,
                                  seed=None, dataset_root="dataset"):
    """
    Converts a JSON dataset file to train.txt and val.txt metadata files
    in the format 'audio_filepath|transcription' for Coqui-TTS.

    The JSON is expected to be shaped as
    {language: {wav_filename: {"text": ..., "speaker": ...}}}; each audio path
    is built as <dataset_root>/<language>/<speaker>/<wav_filename>.

    Args:
        json_filepath (str): Path to your input JSON dataset file.
        output_dir (str): Directory to save train.txt and val.txt (created if missing).
        validation_split (float): Fraction of data to use for validation (e.g., 0.15 for 15%).
            Must lie in [0, 1].
        seed (int | None): Seed for the train/val shuffle. None (default) keeps the
            previous behaviour of shuffling with the global `random` state; an integer
            makes the split reproducible.
        dataset_root (str): Root folder of the audio files. Defaults to "dataset".

    Raises:
        ValueError: If validation_split is outside [0, 1].
        KeyError: If an entry lacks the "text" or "speaker" field.
    """
    # Out-of-range fractions used to produce silently wrong splits
    # (negative slicing, or an empty train set), so reject them up front.
    if not 0.0 <= validation_split <= 1.0:
        raise ValueError(
            f"validation_split must be between 0 and 1, got {validation_split!r}"
        )

    os.makedirs(output_dir, exist_ok=True)
    train_metadata_file = os.path.join(output_dir, "train.txt")
    val_metadata_file = os.path.join(output_dir, "val.txt")

    with open(json_filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Flatten language -> wav -> info into one list of path/text records.
    # NOTE: the text is written verbatim; a '|' or newline inside a
    # transcription would break the one-record-per-line format.
    data_entries = [
        {
            "filepath": os.path.join(dataset_root, lang_key, wav_info["speaker"], wav_filename),
            "text": wav_info["text"],
        }
        for lang_key, lang_data in json_data.items()
        for wav_filename, wav_info in lang_data.items()
    ]

    # Shuffle data entries for random train/val split; a private generator is
    # used when a seed is given so the global random state is left untouched.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(data_entries)

    # The first val_size shuffled entries form the validation set.
    val_size = int(len(data_entries) * validation_split)
    val_entries = data_entries[:val_size]
    train_entries = data_entries[val_size:]

    # Write train.txt
    _write_metadata_file(train_metadata_file, train_entries)
    print(f" --> train.txt metadata file created at: {train_metadata_file} with {len(train_entries)} entries.")

    # Write val.txt
    _write_metadata_file(val_metadata_file, val_entries)
    print(f" --> val.txt metadata file created at: {val_metadata_file} with {len(val_entries)} entries.")

    print(" --> JSON to text metadata conversion complete.")


# --- Usage ---
if __name__ == "__main__":
    # 10% subset: source JSON and the directory receiving train.txt / val.txt
    input_json_path = "lookup_10.json"  # **Replace with the path to your lookup_10.json file on Runpod**
    output_directory = "metadata_text_files_10percent"

    # Full dataset: source JSON and the directory receiving train.txt / val.txt
    input_json_path_full = "lookup.json"  # **Replace with the path to your full lookup.json file on Runpod**
    output_directory_full = "metadata_text_files_full"

    # Convert the subset first, then the full dataset.
    conversion_jobs = (
        (input_json_path, output_directory),
        (input_json_path_full, output_directory_full),
    )
    for source_json, target_dir in conversion_jobs:
        convert_json_to_text_metadata(source_json, target_dir)

    print(" --> Metadata conversion for both 10% and full datasets completed.")

 --> train.txt metadata file created at: metadata_text_files_10percent/train.txt with 16630 entries.
 --> val.txt metadata file created at: metadata_text_files_10percent/val.txt with 2934 entries.
 --> JSON to text metadata conversion complete.
 --> train.txt metadata file created at: metadata_text_files_full/train.txt with 166309 entries.
 --> val.txt metadata file created at: metadata_text_files_full/val.txt with 29348 entries.
 --> JSON to text metadata conversion complete.
 --> Metadata conversion for both 10% and full datasets completed.


In [1]:
from TTS.config.shared_configs import BaseDatasetConfig

In [None]:
# Dataset that the fine-tuning run will use, described as keyword fields and
# handed to BaseDatasetConfig in a single call.
# NOTE(review): every value below points at an LJSpeech copy under
# /raid/datasets - confirm this is the dataset intended for this run.
_ljspeech_dataset_fields = dict(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/raid/datasets/LJSpeech-1.1_24khz/",
    meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
    language="en",
)
config_dataset = BaseDatasetConfig(**_ljspeech_dataset_fields)