In [4]:
import pandas as pd
import os
from datasets import load_dataset
import shutil
import soundfile as sf

In [2]:
# Show the kernel's working directory; the relative output paths used in the
# merge cell further down resolve against it.
os.getcwd()

'/home/jovyan/work/Datasets'

In [3]:
# Fetch the "train" split of SPRINGLab/IndicTTS_Tamil.
# Later cells read each row's "audio" and "text" fields from `ds`.
ds = load_dataset("SPRINGLab/IndicTTS_Tamil", split="train") # Load the IndicTTS Tamil dataset from Hugging Face

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [4]:
# Peek at the first row to see which fields every sample carries.
print(ds[0].keys())

dict_keys(['audio', 'text', 'gender'])


In [5]:
# Print the dataset summary (feature names and number of rows).
print(ds)

Dataset({
    features: ['audio', 'text', 'gender'],
    num_rows: 9437
})


In [6]:
# Set output paths: the export is laid out as <output_dir>/audio and
# <output_dir>/transcriptions; both folders are created if missing.
output_dir = "/home/jovyan/work/Datasets/IndicTTS_Tamil"
audio_dir, text_dir = (
    os.path.join(output_dir, leaf) for leaf in ("audio", "transcriptions")
)
os.makedirs(audio_dir, exist_ok=True)
os.makedirs(text_dir, exist_ok=True)

In [7]:
# Export every sample as tamil_audio_<i>.wav plus tamil_audio_<i>.txt and
# collect one {"file", "text"} metadata record per sample.
metadata = []
for i, sample in enumerate(ds):
    stem = f"tamil_audio_{i}"
    wav_name = stem + ".wav"
    clip = sample["audio"]
    sentence = sample["text"]

    # Waveform, written with the sampling rate stored alongside the sample.
    sf.write(os.path.join(audio_dir, wav_name), clip["array"], clip["sampling_rate"])

    # Transcription, written as UTF-8 text under the same stem.
    with open(os.path.join(text_dir, stem + ".txt"), "w", encoding="utf-8") as handle:
        handle.write(sentence)

    metadata.append({"file": wav_name, "text": sentence})

    # Progress line every 100th index (the value printed is the index, not a count).
    if i % 100 == 0:
        print(f"Saved {i} files...")

# Write the file -> text index once the loop has finished.
df = pd.DataFrame(metadata)
df.to_csv(os.path.join(output_dir, "metadata.csv"), index=False, encoding="utf-8")
print("Dataset download and save completed.")

Saved 0 files...
Saved 100 files...
Saved 200 files...
Saved 300 files...
Saved 400 files...
Saved 500 files...
Saved 600 files...
Saved 700 files...
Saved 3000 files...
Saved 3100 files...
Saved 3200 files...
Saved 3300 files...
Saved 3400 files...
Saved 3500 files...
Saved 3600 files...
Saved 3700 files...
Saved 3800 files...
Saved 3900 files...
Saved 4000 files...
Saved 4100 files...
Saved 4200 files...
Saved 4300 files...
Saved 4400 files...
Saved 4500 files...
Saved 4600 files...
Saved 4700 files...
Saved 4800 files...
Saved 4900 files...
Saved 5000 files...
Saved 5100 files...
Saved 5200 files...
Saved 5300 files...
Saved 5400 files...
Saved 5500 files...
Saved 5600 files...
Saved 5700 files...
Saved 5800 files...
Saved 5900 files...
Saved 6000 files...
Saved 6100 files...
Saved 6200 files...
Saved 6300 files...
Saved 6400 files...
Saved 6500 files...
Saved 6600 files...
Saved 6700 files...
Saved 6800 files...
Saved 6900 files...
Saved 7000 files...
Saved 7100 files...
Saved 7200

In [9]:
class AudioDatasetValidator:
    """Check that the audio files listed in a metadata CSV exist and measure them.

    The CSV at ``metadata_path`` is read on construction into ``self.df``. Its
    ``file`` column is joined onto ``audio_folder`` to locate each clip.
    """

    def __init__(self, metadata_path, audio_folder):
        self.metadata_path = metadata_path
        self.audio_folder = audio_folder
        self.df = pd.read_csv(metadata_path)

    def _probe(self, file_name):
        """Return the ``(found, duration_sec, duration_ms)`` triple for one file.

        ``found`` is "no" when the path does not exist. A file that exists but
        cannot be read is reported on stdout and yields ("yes", None, None).
        """
        location = os.path.join(self.audio_folder, file_name)
        if not os.path.exists(location):
            return "no", None, None
        try:
            samples, rate = sf.read(location)
            seconds = len(samples) / rate
            millis = seconds * 1000
        except Exception as exc:
            # Best-effort: keep going so one bad file does not abort the scan.
            print(f"Error reading {file_name}: {exc}")
            return "yes", None, None
        return "yes", seconds, millis

    def check_files_and_durations(self):
        """Add ``found``, ``duration_sec`` and ``duration_ms`` columns to ``self.df``.

        One value per row, in row order; durations are ``None`` for files that
        are missing or unreadable.
        """
        probes = [self._probe(row["file"]) for _, row in self.df.iterrows()]
        self.df["found"] = [found for found, _, _ in probes]
        self.df["duration_sec"] = [seconds for _, seconds, _ in probes]
        self.df["duration_ms"] = [millis for _, _, millis in probes]

    def save_updated_metadata(self, output_path=None):
        """Write ``self.df`` as UTF-8 CSV without the index.

        Falls back to ``self.metadata_path`` (overwriting the input file) when
        ``output_path`` is not given or is empty.
        """
        save_path = output_path or self.metadata_path
        self.df.to_csv(save_path, index=False, encoding='utf-8')
        print(f"Updated metadata saved to: {save_path}")
# Usage: annotate the exported metadata with found / duration columns.
# NOTE: save_updated_metadata() is called without an output path, so
# metadata.csv itself is overwritten with the extra columns.
metadata_csv = "/home/jovyan/work/Datasets/IndicTTS_Tamil/metadata.csv" # CSV to validate
audio_dir = "/home/jovyan/work/Datasets/IndicTTS_Tamil/audio"
validator = AudioDatasetValidator(metadata_csv, audio_dir)
validator.check_files_and_durations()
validator.save_updated_metadata()

Updated metadata saved to: /home/jovyan/work/Datasets/IndicTTS_Tamil/metadata.csv


In [10]:
# Filter the validated metadata down to clips of at most MAX_DURATION_SEC
# seconds, rename the columns to path / sentence, save the result and print
# summary statistics for the kept rows.
MAX_DURATION_SEC = 15  # clips longer than this are excluded
metadata_path = "/home/jovyan/work/Datasets/IndicTTS_Tamil/metadata.csv" # Load the updated metadata
df = pd.read_csv(metadata_path)
# Rows with a missing duration (NaN) fail the comparison, so they are dropped too.
df_filtered = df[df["duration_sec"] <= MAX_DURATION_SEC].rename(
    columns={"file": "path", "text": "sentence"}
)
filtered_path = "/home/jovyan/work/Datasets/IndicTTS_Tamil/metadata_filtered.csv" # Save filtered and renamed CSV
df_filtered.to_csv(filtered_path, index=False, encoding="utf-8")
print("\n DataFrame Info:") # Dataset info
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df_filtered.info()
print(df_filtered.head())
print(df_filtered.tail())
print("\n Descriptive Statistics:")
print(df_filtered.describe())
print("Descriptive Statistics (Categorical Columns):") #Describe categorical columns (object types)
print(df_filtered.describe(include=['object']))
num_total = len(df_filtered) # Basic stats
num_unique_files = df_filtered["path"].nunique()
total_duration_sec = df_filtered["duration_sec"].sum()
total_duration_ms = df_filtered["duration_ms"].sum()
total_duration_min = total_duration_sec / 60

print("\n Basic Stats:")
print(f"Total audio files       : {num_total}")
print(f"Unique audio files      : {num_unique_files}")
print(f"Total duration (sec)    : {total_duration_sec} seconds")
# Report the millisecond total here (the seconds total used to be printed under this label).
print(f"Total duration (ms)    : {total_duration_ms} milli seconds")
print(f"Total duration (min)    : {total_duration_min} minutes")


 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 8059 entries, 0 to 9436
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   path          8059 non-null   object 
 1   sentence      8059 non-null   object 
 2   found         8059 non-null   object 
 3   duration_sec  8059 non-null   float64
 4   duration_ms   8059 non-null   float64
dtypes: float64(2), object(3)
memory usage: 377.8+ KB
None
                path                                           sentence found  \
0  tamil_audio_0.wav  அதற்குத் தகுந்தபடி, ஏதாவது கொஞ்சம் பேசி, வேஷம்...   yes   
1  tamil_audio_1.wav  ஆனால், அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும், ...   yes   
2  tamil_audio_2.wav  அப்படியும், பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ...   yes   
3  tamil_audio_3.wav  கோட்டைக்குள் பல்லக்குப் போய்விட்டால், அப்புறம்...   yes   
4  tamil_audio_4.wav  எடுத்த காரியத்தை முடிக்காமல், உயிரோடு திரும்பி...   yes   

   duration_sec  duration_ms  
0  

In [11]:
# === Configuration ===
# Every path lives under BASE_DIR. main() copies from the *_FOLDER paths into
# the *_FINAL paths, driven by the rows of CSV_PATH.
BASE_DIR = "/home/jovyan/work/Datasets/IndicTTS_Tamil"
CSV_PATH = os.path.join(BASE_DIR, "metadata_filtered.csv")  # filtered metadata (has a "path" column)
AUDIO_FOLDER = os.path.join(BASE_DIR, "audio")  # copy source: audio files
TEXT_FOLDER = os.path.join(BASE_DIR, "transcriptions")  # copy source: transcription files
AUDIO_FINAL = os.path.join(BASE_DIR, "audio_final")  # copy destination: audio files
TEXT_FINAL = os.path.join(BASE_DIR, "transcriptions_final")  # copy destination: transcription files


def ensure_folders(*folders):
    """Make sure each given directory exists.

    Missing directories (including intermediate ones) are created; directories
    that already exist are left alone. Calling with no arguments does nothing.
    """
    for directory in folders:
        os.makedirs(directory, exist_ok=True)


def copy_valid_files(df, audio_src, text_src, audio_dst, text_dst):
    """Copy each audio file listed in ``df`` together with its transcription.

    For every value in ``df["path"]`` the transcription name is the audio name
    with its final extension swapped for ``.txt``. A pair is copied only when
    both the audio file (in ``audio_src``) and the transcription (in
    ``text_src``) exist; otherwise a "Skipped" line is printed and nothing is
    copied for that row. ``df`` itself is not modified.

    Args:
        df: DataFrame with a ``path`` column of audio file names.
        audio_src: Folder holding the source audio files.
        text_src: Folder holding the source transcription files.
        audio_dst: Existing folder that receives the audio copies.
        text_dst: Existing folder that receives the transcription copies.

    Returns:
        Number of audio/transcription pairs copied.
    """
    copied_count = 0
    for audio_file in df["path"]:
        # Swap only the trailing extension. str.replace(".wav", ".txt") would
        # rewrite every ".wav" inside the name and leave other extensions as-is.
        stem, _ = os.path.splitext(audio_file)
        text_file = stem + ".txt"

        audio_path = os.path.join(audio_src, audio_file)
        text_path = os.path.join(text_src, text_file)

        if os.path.exists(audio_path) and os.path.exists(text_path):
            shutil.copy(audio_path, os.path.join(audio_dst, audio_file))
            shutil.copy(text_path, os.path.join(text_dst, text_file))
            copied_count += 1
        else:
            print(f"Skipped: Missing {audio_file} or {text_file}")
    return copied_count


def main():
    """Copy the audio/transcription pairs listed in CSV_PATH into the final folders.

    Reads the filtered metadata, creates AUDIO_FINAL and TEXT_FINAL if needed,
    copies every pair whose two files exist, and prints a short report. The
    CSV is only read, never written.
    """
    selection = pd.read_csv(CSV_PATH)
    ensure_folders(AUDIO_FINAL, TEXT_FINAL)

    total_copied = copy_valid_files(
        selection,
        AUDIO_FOLDER,
        TEXT_FOLDER,
        AUDIO_FINAL,
        TEXT_FINAL,
    )

    report = (
        f"\n Total valid audio-transcription pairs copied: {total_copied}",
        f"Audio copied to      : {AUDIO_FINAL}",
        f"Transcriptions copied: {TEXT_FINAL}",
        f" Original CSV unchanged: {CSV_PATH}",
    )
    for line in report:
        print(line)
# === Run the Process ===
# Executed as soon as the cell runs: copies the filtered audio/transcription pairs.
main()


 Total valid audio-transcription pairs copied: 8059
Audio copied to      : /home/jovyan/work/Datasets/IndicTTS_Tamil/audio_final
Transcriptions copied: /home/jovyan/work/Datasets/IndicTTS_Tamil/transcriptions_final
 Original CSV unchanged: /home/jovyan/work/Datasets/IndicTTS_Tamil/metadata_filtered.csv


In [12]:
def save_final_filtered_csv(original_csv_path, target_filename, save_dir):
    """Re-save a CSV under a new file name inside ``save_dir``.

    The source is parsed with pandas and written back as UTF-8 without the
    index, so this is a load-and-rewrite rather than a byte-for-byte file copy.

    Returns:
        The path of the file that was written.
    """
    table = pd.read_csv(original_csv_path)
    destination = os.path.join(save_dir, target_filename)
    table.to_csv(destination, index=False, encoding='utf-8')
    print(f"Final cleaned CSV saved at: {destination}")
    return destination
# === Inputs ===
original_csv = "/home/jovyan/work/Datasets/IndicTTS_Tamil/metadata_filtered.csv"  # filtered metadata to re-save
new_filename = "filtered_with_durations_Indic_Tamil_cleaned.csv"  # name of the copy
destination_dir = "/home/jovyan/work/Datasets"  # folder that receives the copy
# === Save the CSV ===
# As the cell's last expression, the returned path is also echoed as the cell output.
save_final_filtered_csv(original_csv, new_filename, destination_dir)

Final cleaned CSV saved at: /home/jovyan/work/Datasets/filtered_with_durations_Indic_Tamil_cleaned.csv


'/home/jovyan/work/Datasets/filtered_with_durations_Indic_Tamil_cleaned.csv'

**Merging the CV Corpus 19, OpenSLR 65, and IndicTTS Tamil CSV files and audio files**

In [None]:
# Audio file manager
class AudioDatasetManager:
    """Merge audio files from several source folders into one flat output folder.

    Files are matched by file-name suffix (case-sensitive). Source folders are
    processed in the order given; when two sources contain the same file name,
    the first copy wins and later ones are skipped (and reported).

    Args:
        folder_names: Iterable of source folder paths.
        output_folder: Folder that receives the copies; created if missing.
        extensions: Suffix, or iterable of suffixes, to treat as audio.
            Defaults to ``(".wav",)``; pass e.g. ``(".wav", ".mp3")`` to merge
            folders that hold other formats as well.
    """

    def __init__(self, folder_names, output_folder, extensions=(".wav",)):
        self.folder_names = folder_names
        self.output_folder = output_folder
        # Accept a bare string as a single suffix instead of iterating its characters.
        self.extensions = (extensions,) if isinstance(extensions, str) else tuple(extensions)

    def _is_audio(self, file_name):
        """Return True when ``file_name`` ends with one of the configured suffixes."""
        return file_name.endswith(self.extensions)

    def count_audio_files(self, folder):
        """Return the number of entries in ``folder`` that match the configured suffixes."""
        return sum(1 for file in os.listdir(folder) if self._is_audio(file))

    def process_audio_folders(self):
        """Copy matching files from every source folder into the output folder.

        Prints a per-folder count, the number of name collisions skipped (if
        any), and the final count in the output folder.

        Returns:
            ``(total_files, final_count)`` where ``total_files`` maps each
            source folder to its number of matching files and ``final_count``
            is the number of matching files in the output folder afterwards.
        """
        print(" Counting and merging audio files...")
        total_files = {}
        os.makedirs(self.output_folder, exist_ok=True)
        label = "/".join(self.extensions)  # ".wav" for the default configuration

        for folder in self.folder_names:
            # List the folder once and reuse the result for counting and copying.
            audio_files = [file for file in os.listdir(folder) if self._is_audio(file)]
            print(f" Total {label} files in '{folder}': {len(audio_files)}")
            total_files[folder] = len(audio_files)

            skipped = 0
            for file in audio_files:
                dst = os.path.join(self.output_folder, file)
                if os.path.exists(dst):  # Avoid overwriting
                    skipped += 1
                    continue
                shutil.copy2(os.path.join(folder, file), dst)

            # Make collisions visible instead of dropping the files silently.
            if skipped:
                print(f" Skipped {skipped} file(s) from '{folder}' already present in '{self.output_folder}'")

        final_count = self.count_audio_files(self.output_folder)
        print(f"\n Total {label} files in merged folder '{self.output_folder}': {final_count}")
        return total_files, final_count


# CSV manager
class CSVDatasetManager:
    """Concatenate several metadata CSVs into one file and print a summary of it.

    Args:
        file_names: Iterable of CSV paths, concatenated in the order given.
        output_file: Path the merged CSV is written to.
    """

    def __init__(self, file_names, output_file):
        self.file_names = file_names
        self.output_file = output_file

    def inspect_csv(self, file_path):
        """Load one CSV, print its path, column names and row count, and return it."""
        df = pd.read_csv(file_path)
        print(f"\n File: {file_path}")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Rows: {len(df)}")
        return df

    def merge_csvs(self):
        """Stack all input CSVs row-wise, save the result and return it.

        The merged frame gets a fresh 0..n-1 index and is written to
        ``self.output_file`` as UTF-8 without the index column.

        Raises:
            ValueError: from ``pd.concat`` when ``file_names`` is empty.
        """
        dataframes = [self.inspect_csv(path) for path in self.file_names]

        final_df = pd.concat(dataframes, axis=0, ignore_index=True)
        # Explicit encoding, matching the other CSV writes in this notebook.
        final_df.to_csv(self.output_file, index=False, encoding="utf-8")
        print(f"\n Merged CSV saved as '{self.output_file}'")
        return final_df

    def analyze_merged_csv(self, df):
        """Print structure, summary statistics and total duration of ``df``.

        ``df`` must contain ``duration_sec`` and ``duration_ms`` columns.
        """
        print("\n --- Merged CSV Analysis ---")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Total Rows: {len(df)}")

        print("\n Info:")
        # info() prints its report itself and returns None; wrapping it in
        # print() would add a stray "None" line to the output.
        df.info()

        print("\n Describe:")
        print(df.describe())

        print("\n Head:")
        print(df.head())

        print("\n Tail:")
        print(df.tail())

        print("\n Unique values per column:")
        print(df.nunique())

        print("\n Total duration (seconds):", df['duration_sec'].sum())
        print(" Total duration (milliseconds):", df['duration_ms'].sum())

# AUDIO FILES
# Source folders, merged in this order into one flat folder.
# NOTE(review): AudioDatasetManager only picks up .wav files; the recorded run
# found 0 of them in the Common Voice clips folder, whose .mp3 files are
# copied by a separate cell further down.
audio_folders = [
    "/home/jovyan/work/Datasets/cv-corpus/cv-corpus-19.0-2024-09-13/ta/clips",
    "/home/jovyan/work/Datasets/ta_slr65_final_audios",
    "/home/jovyan/work/Datasets/IndicTTS_Tamil/audio_final"
]
# NOTE(review): relative path, so it resolves against the kernel's working
# directory; a later cell addresses the same folder as
# /home/jovyan/work/Datasets/final_merged_tamil_audios.
final_audio_output = "final_merged_tamil_audios"

audio_manager = AudioDatasetManager(audio_folders, final_audio_output)
audio_manager.process_audio_folders()

# CSV FILES
# Per-corpus metadata files, concatenated row-wise in this order.
csv_files = [
    "/home/jovyan/work/Datasets/filtered_with_durations_cv_corpus_19_cleaned.csv",
    "/home/jovyan/work/Datasets/filtered_openslr65_tamil_cleaned.csv",
    "/home/jovyan/work/Datasets/filtered_with_durations_Indic_Tamil_cleaned.csv"
]
# Relative path as well: written into the kernel's working directory.
final_csv_output = "final_merged_tamil_dataset.csv"

csv_manager = CSVDatasetManager(csv_files, final_csv_output)
merged_df = csv_manager.merge_csvs()
csv_manager.analyze_merged_csv(merged_df)

 Counting and merging audio files...
 Total .wav files in '/home/jovyan/work/Datasets/cv-corpus/cv-corpus-19.0-2024-09-13/ta/clips': 0
 Total .wav files in '/home/jovyan/work/Datasets/ta_slr65_final_audios': 4284
 Total .wav files in '/home/jovyan/work/Datasets/IndicTTS_Tamil/audio_final': 8059

 Total .wav files in merged folder 'final_merged_tamil_audios': 12343

 File: /home/jovyan/work/Datasets/filtered_with_durations_cv_corpus_19_cleaned.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 246840

 File: /home/jovyan/work/Datasets/filtered_openslr65_tamil_cleaned.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 4284

 File: /home/jovyan/work/Datasets/filtered_with_durations_Indic_Tamil_cleaned.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 8059

 Merged CSV saved as 'final_merged_tamil_dataset.csv'

 --- Merged CSV Analysis ---
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'd

In [5]:
class AudioMerger:
    """Copy the .mp3 files of one folder into a folder that already holds .wav files.

    Both folders must exist. Files whose name is already present in the target
    are left untouched.
    """

    def __init__(self, wav_folder, mp3_folder):
        self.wav_folder = wav_folder
        self.mp3_folder = mp3_folder

    def count_files(self, folder, extension):
        """Return how many entries of ``folder`` have a name ending in ``extension``."""
        return sum(1 for entry in os.listdir(folder) if entry.endswith(extension))

    def copy_mp3_to_wav_folder(self):
        """Copy every .mp3 from ``mp3_folder`` into ``wav_folder`` and print counts.

        Prints the file counts before the copy, the number of files actually
        copied, and the total number of entries in the target afterwards.
        """
        target, source = self.wav_folder, self.mp3_folder

        print(" Counting files before copying...")
        wav_count_before = self.count_files(target, '.wav')
        mp3_count_before = self.count_files(source, '.mp3')
        total_before = len(os.listdir(target))
        for line in (
            f" WAV files in target folder: {wav_count_before}",
            f" MP3 files to copy: {mp3_count_before}",
            f" Total files in target folder before copy: {total_before}",
        ):
            print(line)

        copied_count = 0
        for name in os.listdir(source):
            if not name.endswith(".mp3"):
                continue
            dst = os.path.join(target, name)
            if os.path.exists(dst):
                # Never overwrite a file that is already in the target.
                continue
            shutil.copy2(os.path.join(source, name), dst)
            copied_count += 1

        print(f"\n Total MP3 files copied: {copied_count}")
        print(f" Total files in target folder after copy: {len(os.listdir(target))}")



# Destination: presumably the merged folder produced by the merge cell above
# (same folder name under the recorded working directory) — confirm.
wav_folder = "/home/jovyan/work/Datasets/final_merged_tamil_audios"
# Source: the Common Voice clips folder, whose files are .mp3.
mp3_folder = "/home/jovyan/work/Datasets/cv-corpus/cv-corpus-19.0-2024-09-13/ta/clips"

merger = AudioMerger(wav_folder, mp3_folder)
merger.copy_mp3_to_wav_folder()

 Counting files before copying...
 WAV files in target folder: 12343
 MP3 files to copy: 246840
 Total files in target folder before copy: 12343

 Total MP3 files copied: 246840
 Total files in target folder after copy: 259183
