In [1]:
import os

# Point every Hugging Face cache at the /mnt/data volume. This has to run
# before the Hugging Face libraries are imported so they pick the values up.
os.environ.update({
    "HF_HOME": "/mnt/data/stt/Datasets/Datasets/huggingface",
    "HF_DATASETS_CACHE": "/mnt/data/stt/Datasets/Datasets/huggingface/datasets",
    "TRANSFORMERS_CACHE": "/mnt/data/stt/Datasets/Datasets/huggingface/transformers",
    "HF_METRICS_CACHE": "/mnt/data/stt/Datasets/Datasets/huggingface/metrics",
})

In [2]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Audio
import shutil
import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio, display

In [3]:
# Show the kernel's current working directory (relative paths resolve against it).
os.getcwd()

'/mnt/data/stt/Datasets/Datasets'

In [4]:
from huggingface_hub import login

In [None]:
# Never paste an access token into the notebook: it ends up in version control
# and in shared copies. Read it from the HF_TOKEN environment variable instead;
# when the variable is unset, token=None makes login() prompt for it interactively.
login(token=os.environ.get("HF_TOKEN"))

In [9]:
import os
import shutil

# Remove the default Hugging Face cache in the home directory (the caches used by
# this notebook are redirected to /mnt/data via the HF_* environment variables).
# Plain Python is used instead of a bare `rm -rf` line, which only works through
# IPython's shell-alias handling. ignore_errors=True mirrors `rm -f`: a missing
# directory is not an error.
shutil.rmtree(os.path.expanduser("~/.cache/huggingface"), ignore_errors=True)

In [None]:
# Load the Tamil training split of IndicVoices (downloads it on first use).
from datasets import load_dataset

indicvoice = load_dataset(
    path="ai4bharat/IndicVoices",
    name="tamil",
    split="train",
)

**Pipeline: from dataset extraction to saving the final CSV**

In [6]:
class IndicVoicesProcessor:
    """Export the Tamil IndicVoices train split to WAV files plus CSV metadata.

    The methods are pipeline stages intended to be called in this order:
    ``load_dataset`` -> ``save_audio_and_metadata`` -> ``validate_and_update_csv``
    -> ``filter_by_duration`` -> ``copy_filtered_audios`` ->
    ``compare_duration_columns`` -> ``summarize_csv``.

    All outputs are written below ``save_dir``:

    * ``indicvoices_tamil_audio/``         every exported WAV file
    * ``tamil_audios_indicvoices_final/``  WAV files that passed the duration filter
    * ``indicvoices_tamil_metadata.csv``   metadata for every exported file
    * ``filtered_tamil_indicvoices.csv``   metadata for the filtered subset
    """

    def __init__(self, save_dir):
        """Derive all output paths from ``save_dir`` and create both audio folders."""
        self.save_dir = save_dir
        self.audio_folder = os.path.join(save_dir, "indicvoices_tamil_audio")
        self.final_audio_folder = os.path.join(save_dir, "tamil_audios_indicvoices_final")
        self.raw_csv_path = os.path.join(save_dir, "indicvoices_tamil_metadata.csv")
        self.filtered_csv_path = os.path.join(save_dir, "filtered_tamil_indicvoices.csv")
        os.makedirs(self.audio_folder, exist_ok=True)
        os.makedirs(self.final_audio_folder, exist_ok=True)

    def load_dataset(self):
        """Return the ``train`` split of the ``tamil`` config of ``ai4bharat/IndicVoices``."""
        # Inside the method body the bare name resolves to the module-level
        # ``datasets.load_dataset`` function, not to this method.
        return load_dataset("ai4bharat/IndicVoices", "tamil", split="train")

    def save_audio_and_metadata(self, dataset):
        """Write every sample to a WAV file and save the metadata CSV.

        Each sample must provide ``audio_filepath`` (a mapping with ``array`` and
        ``sampling_rate``), ``text`` and ``duration``.

        Returns:
            DataFrame with columns ``audio_filename``, ``text``, ``duration``
            (copied from the sample), ``duration_sec`` and ``duration_ms``
            (both computed from the sample count and sampling rate).
        """
        records = []
        for idx in tqdm(range(len(dataset)), desc="Saving audio"):
            sample = dataset[idx]
            audio = sample["audio_filepath"]
            audio_array = audio["array"]
            sr = audio["sampling_rate"]
            duration_sec = len(audio_array) / sr

            filename = f"indicvoices_ta_{idx}.wav"
            sf.write(os.path.join(self.audio_folder, filename), audio_array, samplerate=sr)

            records.append({
                "audio_filename": filename,
                "text": sample["text"],
                "duration": sample["duration"],
                "duration_sec": duration_sec,
                "duration_ms": duration_sec * 1000,
            })

        df = pd.DataFrame(records)
        df.to_csv(self.raw_csv_path, index=False)
        print(f" Saved metadata to {self.raw_csv_path}")
        return df

    def validate_and_update_csv(self, df):
        """De-duplicate by filename, flag files present on disk and rename columns.

        Returns a new DataFrame in which ``audio_filename`` is renamed to ``path``
        and ``text`` to ``sentence``, with a ``found`` column ("yes"/"no") placed
        right after ``sentence``. The input frame is not modified.
        """
        audio_files_set = set(os.listdir(self.audio_folder))
        unique_df = df.drop_duplicates(subset=["audio_filename"]).copy()
        found = unique_df["audio_filename"].map(
            lambda name: "yes" if name in audio_files_set else "no"
        )
        unique_df = unique_df.rename(columns={
            "audio_filename": "path",
            "text": "sentence"
        })

        # Place 'found' directly after 'sentence'
        unique_df.insert(unique_df.columns.get_loc("sentence") + 1, "found", found)

        print(" Unique files checked and updated.")
        return unique_df

    def filter_by_duration(self, df, max_duration=15.0):
        """Keep rows with ``duration_sec <= max_duration`` and save them to the filtered CSV."""
        filtered = df[df["duration_sec"] <= max_duration].copy()
        filtered.to_csv(self.filtered_csv_path, index=False)
        print(f" Filtered metadata saved to {self.filtered_csv_path}")
        return filtered

    def copy_filtered_audios(self, df):
        """Copy every file listed in ``df["path"]`` into the final audio folder.

        Files are copied with ``shutil.copy2`` rather than by spawning a shell
        ``cp`` per file: no process is started for each file and file names
        containing quotes, ``$`` or backticks cannot break (or inject into) a
        shell command. Sources that do not exist are skipped and reported.
        """
        missing = 0
        for fname in tqdm(df["path"], desc="Copying filtered audio files"):
            src = os.path.join(self.audio_folder, fname)
            dst = os.path.join(self.final_audio_folder, fname)
            if not os.path.exists(src):
                missing += 1
                continue
            shutil.copy2(src, dst)

        if missing:
            print(f" Skipped {missing} file(s) that were not found in {self.audio_folder}")

    def compare_duration_columns(self, df):
        """Drop ``duration`` when it equals ``duration_sec`` to three decimals.

        Returns the frame without ``duration`` when every row matches after
        rounding, otherwise the frame unchanged. A frame that has no
        ``duration`` column is returned unchanged with a message saying so.
        """
        if "duration" not in df.columns:
            print(" 'duration' column not present — nothing to compare")
            return df

        if (df["duration"].round(3) == df["duration_sec"].round(3)).all():
            print(" 'duration' and 'duration_sec' are same — dropping 'duration'")
            df = df.drop(columns=["duration"])
        else:
            print(" 'duration' and 'duration_sec' differ — keeping both")
        return df

    def summarize_csv(self, df, label=""):
        """Print head, tail, info, describe, unique paths and total durations; return ``df``."""
        print(f"\n Summary of {label} Dataset:")
        print("-" * 40)
        print("Head:\n", df.head())
        print("\nTail:\n", df.tail())
        print("\nInfo:")
        # DataFrame.info() prints by itself and returns None, so it must not be
        # wrapped in print() (that would add a stray "None" line).
        df.info()
        print("\nDescribe:\n", df.describe())
        print("\nUnique 'path' count:", df["path"].nunique())

        print("\n Total Durations:")
        if "duration" in df.columns:
            print("Sum of 'duration' (sec):", df["duration"].sum())
        print("Sum of 'duration_sec':", df["duration_sec"].sum())
        print("Sum of 'duration_ms':", df["duration_ms"].sum())

        return df


# Run the pipeline stages in order; each stage consumes the previous stage's frame.
processor = IndicVoicesProcessor("/mnt/data/stt/Datasets/Datasets")
dataset = processor.load_dataset()
# Writes one WAV per sample plus the raw metadata CSV.
df_raw = processor.save_audio_and_metadata(dataset)
# De-duplicates, adds the 'found' flag and renames columns to path/sentence.
df_validated = processor.validate_and_update_csv(df_raw)
# Keeps clips up to the default 15 s and writes the filtered CSV.
df_filtered = processor.filter_by_duration(df_validated)
processor.copy_filtered_audios(df_filtered)
# May drop the 'duration' column, which is why the CSV is saved again below.
df_final = processor.compare_duration_columns(df_filtered)
df_final = processor.summarize_csv(df_final, label="Final Filtered")

# Save final cleaned filtered CSV again (after possibly dropping column)
df_final.to_csv(processor.filtered_csv_path, index=False)
print(f"\n Final updated CSV saved to {processor.filtered_csv_path}")


README.md:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/91 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/90 [00:00<?, ?it/s]

valid-00000-of-00001.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/90 [00:00<?, ?files/s]

train-00000-of-00090.parquet:   0%|          | 0.00/525M [00:00<?, ?B/s]

train-00001-of-00090.parquet:   0%|          | 0.00/529M [00:00<?, ?B/s]

train-00002-of-00090.parquet:   0%|          | 0.00/520M [00:00<?, ?B/s]

train-00003-of-00090.parquet:   0%|          | 0.00/502M [00:00<?, ?B/s]

train-00004-of-00090.parquet:   0%|          | 0.00/534M [00:00<?, ?B/s]

train-00005-of-00090.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00006-of-00090.parquet:   0%|          | 0.00/513M [00:00<?, ?B/s]

train-00007-of-00090.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

train-00008-of-00090.parquet:   0%|          | 0.00/533M [00:00<?, ?B/s]

train-00009-of-00090.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00010-of-00090.parquet:   0%|          | 0.00/500M [00:00<?, ?B/s]

train-00011-of-00090.parquet:   0%|          | 0.00/515M [00:00<?, ?B/s]

train-00012-of-00090.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00013-of-00090.parquet:   0%|          | 0.00/515M [00:00<?, ?B/s]

train-00014-of-00090.parquet:   0%|          | 0.00/514M [00:00<?, ?B/s]

train-00015-of-00090.parquet:   0%|          | 0.00/517M [00:00<?, ?B/s]

train-00016-of-00090.parquet:   0%|          | 0.00/554M [00:00<?, ?B/s]

train-00017-of-00090.parquet:   0%|          | 0.00/567M [00:00<?, ?B/s]

train-00018-of-00090.parquet:   0%|          | 0.00/574M [00:00<?, ?B/s]

train-00019-of-00090.parquet:   0%|          | 0.00/591M [00:00<?, ?B/s]

train-00020-of-00090.parquet:   0%|          | 0.00/592M [00:00<?, ?B/s]

train-00021-of-00090.parquet:   0%|          | 0.00/587M [00:00<?, ?B/s]

train-00022-of-00090.parquet:   0%|          | 0.00/605M [00:00<?, ?B/s]

train-00023-of-00090.parquet:   0%|          | 0.00/587M [00:00<?, ?B/s]

train-00024-of-00090.parquet:   0%|          | 0.00/581M [00:00<?, ?B/s]

train-00025-of-00090.parquet:   0%|          | 0.00/581M [00:00<?, ?B/s]

train-00026-of-00090.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

train-00027-of-00090.parquet:   0%|          | 0.00/603M [00:00<?, ?B/s]

train-00028-of-00090.parquet:   0%|          | 0.00/588M [00:00<?, ?B/s]

train-00029-of-00090.parquet:   0%|          | 0.00/593M [00:00<?, ?B/s]

train-00030-of-00090.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00031-of-00090.parquet:   0%|          | 0.00/460M [00:00<?, ?B/s]

train-00032-of-00090.parquet:   0%|          | 0.00/488M [00:00<?, ?B/s]

train-00033-of-00090.parquet:   0%|          | 0.00/474M [00:00<?, ?B/s]

train-00034-of-00090.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

train-00035-of-00090.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00036-of-00090.parquet:   0%|          | 0.00/458M [00:00<?, ?B/s]

train-00037-of-00090.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00038-of-00090.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00039-of-00090.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

train-00040-of-00090.parquet:   0%|          | 0.00/505M [00:00<?, ?B/s]

train-00041-of-00090.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

train-00042-of-00090.parquet:   0%|          | 0.00/457M [00:00<?, ?B/s]

train-00043-of-00090.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

train-00044-of-00090.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00045-of-00090.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00046-of-00090.parquet:   0%|          | 0.00/445M [00:00<?, ?B/s]

train-00047-of-00090.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00048-of-00090.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

train-00049-of-00090.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00050-of-00090.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

train-00051-of-00090.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

train-00052-of-00090.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00053-of-00090.parquet:   0%|          | 0.00/430M [00:00<?, ?B/s]

train-00054-of-00090.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

train-00055-of-00090.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

train-00056-of-00090.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

train-00057-of-00090.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

train-00058-of-00090.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

train-00059-of-00090.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00060-of-00090.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00061-of-00090.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

train-00062-of-00090.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

train-00063-of-00090.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

train-00064-of-00090.parquet:   0%|          | 0.00/371M [00:00<?, ?B/s]

train-00065-of-00090.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

train-00066-of-00090.parquet:   0%|          | 0.00/403M [00:00<?, ?B/s]

train-00067-of-00090.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

train-00068-of-00090.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

train-00069-of-00090.parquet:   0%|          | 0.00/403M [00:00<?, ?B/s]

train-00070-of-00090.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00071-of-00090.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

train-00072-of-00090.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00073-of-00090.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

train-00074-of-00090.parquet:   0%|          | 0.00/389M [00:00<?, ?B/s]

train-00075-of-00090.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

train-00076-of-00090.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

train-00077-of-00090.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

train-00078-of-00090.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00079-of-00090.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00080-of-00090.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

train-00081-of-00090.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

train-00082-of-00090.parquet:   0%|          | 0.00/393M [00:00<?, ?B/s]

train-00083-of-00090.parquet:   0%|          | 0.00/446M [00:00<?, ?B/s]

train-00084-of-00090.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

train-00085-of-00090.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

train-00086-of-00090.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

train-00087-of-00090.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

train-00088-of-00090.parquet:   0%|          | 0.00/433M [00:00<?, ?B/s]

train-00089-of-00090.parquet:   0%|          | 0.00/446M [00:00<?, ?B/s]

Generating valid split:   0%|          | 0/4426 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/353805 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/85 [00:00<?, ?it/s]

Saving audio: 100%|██████████| 353805/353805 [27:35<00:00, 213.76it/s]


 Saved metadata to /mnt/data/stt/Datasets/Datasets/indicvoices_tamil_metadata.csv
 Unique files checked and updated.
 Filtered metadata saved to /mnt/data/stt/Datasets/Datasets/filtered_tamil_indicvoices.csv


Copying filtered audio files: 100%|██████████| 324196/324196 [16:20<00:00, 330.75it/s]


 'duration' and 'duration_sec' differ — keeping both

 Summary of Final Filtered Dataset:
----------------------------------------
Head:
                    path                                           sentence  \
0  indicvoices_ta_0.wav                   குணா ரமேஷ் சுரேஷ் விமல் கார்த்தி   
3  indicvoices_ta_3.wav     ஹாய் ஆல்லி எனக்கு ப்ரீதமோட மியூசிக் பிடிக்கும்   
4  indicvoices_ta_4.wav          மேற்தூவல் பொருட்கள் அனைத்தையும் காட்டவும்   
5  indicvoices_ta_5.wav  இப்போது வர ஃபோட்டோஸெலாம் காட்டிலும் இப்போது மு...   
6  indicvoices_ta_6.wav  புகைப்படத்தை எடுத்து அதில் ஹார்ட் காப்பின்னு ப...   

  found  duration  duration_sec  duration_ms  
0   yes     4.062         4.062       4062.0  
3   yes     4.104         4.104       4104.0  
4   yes     3.525         3.525       3525.0  
5   yes     5.056         5.056       5056.0  
6   yes     9.566         9.566       9566.0  

Tail:
                              path  \
353800  indicvoices_ta_353800.wav   
353801  indicvoices_ta_353801

**Dropping the `duration` column from the updated Tamil CSV file**

In [8]:
# Read the filtered metadata back from disk
csv_path = "/mnt/data/stt/Datasets/Datasets/filtered_tamil_indicvoices.csv"
df = pd.read_csv(csv_path)

# Report the column layout before the change
print(f" Total columns before dropping: {len(df.columns)}")
print(" Column names:", df.columns.tolist())

# Remove 'duration' when present; otherwise just say so
if 'duration' not in df.columns:
    print(" 'duration' column not found.")
else:
    df = df.drop(columns=['duration'])
    print(" 'duration' column dropped.")

# Report the column layout after the change
print(f" Total columns after dropping: {len(df.columns)}")
print(" Updated column names:", df.columns.tolist())

 Total columns before dropping: 6
 Column names: ['path', 'sentence', 'found', 'duration', 'duration_sec', 'duration_ms']
 'duration' column dropped.
 Total columns after dropping: 5
 Updated column names: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']


In [9]:
# Overwrite the CSV that was loaded from csv_path with the current frame (no index column).
df.to_csv(csv_path, index=False)
print(" Updated CSV saved.")

 Updated CSV saved.


**Converting .mp3 files to .wav for the previously merged Tamil audio folder and CSV**

In [1]:
# Install the ffmpeg system package with apt (shell commands, run via `!`).
# Refresh the package index first so the install resolves current versions.
!apt-get update
!apt-get install -y ffmpeg

0% [Working]

Hit:1 http://in.archive.ubuntu.com/ubuntu noble InRelease
Hit:2 http://in.archive.ubuntu.com/ubuntu noble-updates InRelease              
Hit:4 http://security.ubuntu.com/ubuntu noble-security InRelease               
Hit:5 http://in.archive.ubuntu.com/ubuntu noble-backports InRelease            
Hit:6 https://repo.zabbix.com/zabbix-tools/debian-ubuntu noble InRelease       
Hit:7 https://repo.zabbix.com/zabbix/7.0/ubuntu noble InRelease                
Ign:3 https://repo.r1soft.com/apt stable InRelease
Hit:8 https://repo.r1soft.com/apt stable Release
Reading package lists... Done
W: http://repo.r1soft.com/apt/dists/stable/Release.gpg: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W: http://repo.r1soft.com/apt/dists/stable/Release.gpg: Signature by key 12C9C3F52909509BDBCDE12639A4965166BD1D82 uses weak algorithm (dsa2048)
Reading package lists... Done
Building dependency tree... Done
Reading state information.

In [1]:
%pip install pydub

Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil

import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm

In [3]:
class AudioConverter:
    """Build an all-WAV copy of an audio folder together with a matching CSV.

    The input CSV must have a ``path`` column holding file names relative to
    ``input_folder``. ``.mp3`` files are converted to ``.wav`` with pydub,
    ``.wav`` files are copied, and the CSV written to ``output_csv`` lists the
    resulting file names. ``generate_statistics`` additionally needs the
    ``duration_sec`` and ``duration_ms`` columns.
    """

    def __init__(self, input_folder, input_csv, output_folder, output_csv):
        """Store the paths, create ``output_folder`` and load ``input_csv`` into ``self.df``."""
        self.input_folder = input_folder
        self.input_csv = input_csv
        self.output_folder = output_folder
        self.output_csv = output_csv

        os.makedirs(self.output_folder, exist_ok=True)

        self.df = pd.read_csv(self.input_csv)

    def convert_mp3_to_wav(self):
        """Convert/copy every file listed in ``self.df`` into ``output_folder``.

        * ``.mp3`` rows are decoded and exported as ``.wav``; ``path`` is updated.
        * ``.wav`` rows are copied byte-for-byte unless the target already exists.
        * rows with any other extension are kept unchanged and nothing is copied.

        Rows whose conversion or copy fails are reported and dropped.
        ``self.df`` is replaced by the surviving rows.
        """
        updated_rows = []
        for _, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
            original_filename = row['path']
            input_path = os.path.join(self.input_folder, original_filename)

            if original_filename.endswith('.mp3'):
                wav_filename = os.path.splitext(original_filename)[0] + ".wav"
                output_path = os.path.join(self.output_folder, wav_filename)

                try:
                    sound = AudioSegment.from_mp3(input_path)
                    sound.export(output_path, format="wav")
                except Exception as e:
                    print(f"Failed to convert {original_filename}: {e}")
                    continue
                # iterrows() yields a copy of the row, so this only affects the
                # row object that is collected below.
                row['path'] = wav_filename

            elif original_filename.endswith('.wav'):
                output_path = os.path.join(self.output_folder, original_filename)
                if not os.path.exists(output_path):  # Copy if not already present
                    try:
                        # A plain file copy: decoding and re-encoding an existing
                        # WAV through pydub is far slower and rewrites the file
                        # instead of preserving it.
                        shutil.copy2(input_path, output_path)
                    except OSError as e:
                        print(f"Failed to copy {original_filename}: {e}")
                        continue

            updated_rows.append(row)

        # Passing the columns keeps the schema even when no row survived, so the
        # later statistics step does not fail on a column-less frame.
        self.df = pd.DataFrame(updated_rows, columns=self.df.columns)

    def save_updated_csv(self):
        """Write ``self.df`` to ``output_csv`` without the index column."""
        self.df.to_csv(self.output_csv, index=False)
        print(f"\n Updated CSV saved to: {self.output_csv}")

    def generate_statistics(self):
        """Print info, describe, unique path count, head/tail and total durations."""
        print("\n DataFrame Info:")
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() would add a stray "None" line.
        self.df.info()

        print("\n Description:")
        print(self.df.describe())

        print("\n Unique path count:", self.df['path'].nunique())

        print("\n Head:")
        print(self.df.head())

        print("\n Tail:")
        print(self.df.tail())

        total_duration_sec = self.df["duration_sec"].sum()
        total_duration_ms = self.df["duration_ms"].sum()

        print(f"\n⏱ Total Duration (seconds): {total_duration_sec:.2f}")
        print(f"⏱ Total Duration (milliseconds): {total_duration_ms:.2f}")

    def compare_file_counts(self):
        """Print how many .mp3/.wav files the input folder and .wav files the output folder hold."""
        old_files = [f for f in os.listdir(self.input_folder) if f.endswith(('.mp3', '.wav'))]
        new_files = [f for f in os.listdir(self.output_folder) if f.endswith('.wav')]

        print("\n Folder Statistics:")
        print(f" Original Folder ({self.input_folder}) has: {len(old_files)} audio files")
        print(f" New WAV Folder ({self.output_folder}) has: {len(new_files)} .wav audio files")

    def run(self):
        """Run conversion, save the CSV, then print folder counts and statistics."""
        print(" Starting conversion...")
        self.convert_mp3_to_wav()
        self.save_updated_csv()
        self.compare_file_counts()
        self.generate_statistics()
        print("\n Process Completed Successfully.")


# === Run It ===

# Convert the previously merged Tamil folder (mixed .mp3/.wav) into a .wav-only
# folder and write a CSV whose 'path' column points at the .wav files.
converter = AudioConverter(
    input_folder="/mnt/data/stt/Datasets/final_merged_tamil_audios_updated",
    input_csv="/mnt/data/stt/Datasets/final_merged_tamil_dataset_updated.csv",
    output_folder="/mnt/data/stt/Datasets/final_merged_tamil_audios_wav",
    output_csv="/mnt/data/stt/Datasets/final_merged_tamil_dataset_wav.csv"
)

# run() converts, saves the CSV, then prints folder counts and statistics.
converter.run()


 Starting conversion...


100%|██████████| 259137/259137 [12:14:40<00:00,  5.88it/s]  



 Updated CSV saved to: /mnt/data/stt/Datasets/final_merged_tamil_dataset_wav.csv

 Folder Statistics:
 Original Folder (/mnt/data/stt/Datasets/final_merged_tamil_audios_updated) has: 259137 audio files
 New WAV Folder (/mnt/data/stt/Datasets/final_merged_tamil_audios_wav) has: 259137 .wav audio files

 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 259137 entries, 0 to 259136
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   path          259137 non-null  object 
 1   sentence      259137 non-null  object 
 2   found         259137 non-null  object 
 3   duration_sec  259137 non-null  float64
 4   duration_ms   259137 non-null  float64
dtypes: float64(2), object(3)
memory usage: 11.9+ MB
None

 Description:
        duration_sec    duration_ms
count  259137.000000  259137.000000
mean        6.242496    6242.496284
std         2.168037    2168.037302
min         0.072000      72.000000
25%         4

**Merging the audio folders and CSV files for Tamil**

In [3]:
# Audio file manager
class AudioDatasetManager:
    """Merge the ``.wav`` files of several folders into one output folder."""

    def __init__(self, folder_names, output_folder):
        """Remember the source folders and the folder that receives the merged files."""
        self.folder_names = folder_names
        self.output_folder = output_folder

    def count_audio_files(self, folder):
        """Return the number of entries in ``folder`` whose name ends with ``.wav``."""
        return sum(1 for file in os.listdir(folder) if file.endswith('.wav'))

    def process_audio_folders(self):
        """Copy all ``.wav`` files from the source folders into the output folder.

        A file whose name already exists in the output folder is never
        overwritten, so for a name that occurs in several source folders the
        first folder wins. Such skipped files are counted and reported, because
        a silent skip leaves the merged audio out of step with a merged CSV
        that still lists both rows.

        Returns:
            ``(total_files, final_count)``: a dict mapping each source folder to
            its ``.wav`` count, and the ``.wav`` count of the output folder.
        """
        print(" Counting and merging audio files...")
        total_files = {}
        skipped_count = 0
        skipped_examples = []
        os.makedirs(self.output_folder, exist_ok=True)

        for folder in self.folder_names:
            # List the directory once and reuse it for both counting and copying.
            wav_files = [file for file in os.listdir(folder) if file.endswith('.wav')]
            print(f" Total .wav files in '{folder}': {len(wav_files)}")
            total_files[folder] = len(wav_files)

            for file in wav_files:
                src = os.path.join(folder, file)
                dst = os.path.join(self.output_folder, file)
                if os.path.exists(dst):  # Avoid overwriting
                    skipped_count += 1
                    if len(skipped_examples) < 5:
                        skipped_examples.append(src)
                    continue
                shutil.copy2(src, dst)

        if skipped_count:
            print(
                f" Skipped {skipped_count} file(s) whose name already existed in "
                f"'{self.output_folder}' (not overwritten), e.g.: {skipped_examples}"
            )

        final_count = self.count_audio_files(self.output_folder)
        print(f"\n Total .wav files in merged folder '{self.output_folder}': {final_count}")
        return total_files, final_count


# CSV manager
class CSVDatasetManager:
    """Concatenate several metadata CSV files into one and print a summary."""

    def __init__(self, file_names, output_file):
        """Remember the CSV paths to merge and the path of the merged CSV."""
        self.file_names = file_names
        self.output_file = output_file

    def inspect_csv(self, file_path):
        """Read ``file_path``, print its columns and row count, and return the frame."""
        df = pd.read_csv(file_path)
        print(f"\n File: {file_path}")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Rows: {len(df)}")
        return df

    def merge_csvs(self):
        """Stack all input CSVs row-wise, save the result and return it.

        Raises:
            ValueError: if no input files were configured.
        """
        if not self.file_names:
            raise ValueError("No CSV files were given to merge.")

        paths = list(self.file_names)
        dataframes = [self.inspect_csv(path) for path in paths]

        # concat aligns columns by name and fills the gaps with NaN, so a schema
        # mismatch would otherwise go unnoticed.
        reference_columns = set(dataframes[0].columns)
        for path, frame in zip(paths[1:], dataframes[1:]):
            if set(frame.columns) != reference_columns:
                print(
                    f" Warning: columns of '{path}' differ from '{paths[0]}'; "
                    "missing values will be NaN in the merged CSV"
                )

        final_df = pd.concat(dataframes, axis=0, ignore_index=True)
        final_df.to_csv(self.output_file, index=False)
        print(f"\n Merged CSV saved as '{self.output_file}'")
        return final_df

    def analyze_merged_csv(self, df):
        """Print columns, row count, info, describe, head/tail, uniques and total durations.

        ``df`` must contain the ``duration_sec`` and ``duration_ms`` columns.
        """
        print("\n --- Merged CSV Analysis ---")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Total Rows: {len(df)}")

        print("\n Info:")
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() would add a stray "None" line.
        df.info()

        print("\n Describe:")
        print(df.describe())

        print("\n Head:")
        print(df.head())

        print("\n Tail:")
        print(df.tail())

        print("\n Unique values per column:")
        print(df.nunique())

        print("\n Total duration (seconds):", df['duration_sec'].sum())
        print(" Total duration (milliseconds):", df['duration_ms'].sum())

# Both merged outputs go under one explicit directory, so the result does not
# depend on the kernel's current working directory. This is the directory from
# which the merged audio folder is read back later in the notebook.
merged_output_dir = "/mnt/data/stt/Datasets/Datasets"

# AUDIO FILES
audio_folders = [
    "/mnt/data/stt/Datasets/final_merged_tamil_audios_wav",
    "/mnt/data/stt/Datasets/Datasets/tamil_audios_indicvoices_final"
]
final_audio_output = os.path.join(merged_output_dir, "final_merged_tamil_audios")

audio_manager = AudioDatasetManager(audio_folders, final_audio_output)
audio_manager.process_audio_folders()

# CSV FILES (listed in the same order as the audio folders above)
csv_files = [
    "/mnt/data/stt/Datasets/final_merged_tamil_dataset_wav.csv",
    # Stray leading double slash removed from this path.
    "/mnt/data/stt/Datasets/Datasets/filtered_tamil_indicvoices.csv"
]
final_csv_output = os.path.join(merged_output_dir, "final_merged_tamil_dataset.csv")

csv_manager = CSVDatasetManager(csv_files, final_csv_output)
merged_df = csv_manager.merge_csvs()
csv_manager.analyze_merged_csv(merged_df)


 Counting and merging audio files...
 Total .wav files in '/mnt/data/stt/Datasets/final_merged_tamil_audios_wav': 259137
 Total .wav files in '/mnt/data/stt/Datasets/Datasets/tamil_audios_indicvoices_final': 324196

 Total .wav files in merged folder 'final_merged_tamil_audios': 583333

 File: /mnt/data/stt/Datasets/final_merged_tamil_dataset_wav.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 259137

 File: //mnt/data/stt/Datasets/Datasets/filtered_tamil_indicvoices.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 324196

 Merged CSV saved as 'final_merged_tamil_dataset.csv'

 --- Merged CSV Analysis ---
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Total Rows: 583333

 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583333 entries, 0 to 583332
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   path          5833

**Checks on the merged Tamil files**

In [4]:
audio_folder = "/mnt/data/stt/Datasets/Datasets/final_merged_tamil_audios"
# Keep only the entries whose name ends in .wav
audio_files = list(filter(lambda name: name.endswith(".wav"), os.listdir(audio_folder)))
# Report how many there are
print(f" Total number of audio files: {len(audio_files)}")

 Total number of audio files: 583333


In [6]:
import os
from IPython.display import Audio, display

# Listen to a small window of the merged folder. The window is taken from the
# name-sorted directory listing; with the values below it covers 4 entries
# starting at index 583321, i.e. near the end of the listing, not the first files.
PREVIEW_START = 583321
PREVIEW_COUNT = 4

audio_files = sorted(os.listdir(audio_folder))[PREVIEW_START:PREVIEW_START + PREVIEW_COUNT]
for file in audio_files:
    print(f" Playing: {file}")
    display(Audio(filename=os.path.join(audio_folder, file)))

 Playing: tamil_audio_986.wav


 Playing: tamil_audio_987.wav


 Playing: tamil_audio_988.wav


 Playing: tamil_audio_99.wav
