In [1]:
import os
# Point every Hugging Face cache at one shared directory. These variables are
# set before `datasets` is imported in a later cell.
hf_home = "//mnt/data/stt/Datasets/Datasets/huggingface-cache"
os.environ["HF_HOME"] = hf_home

# Each library-specific cache lives in its own sub-directory of HF_HOME.
for env_name, sub_dir in (
    ("HF_DATASETS_CACHE", "datasets"),
    ("TRANSFORMERS_CACHE", "transformers"),
    ("HF_METRICS_CACHE", "metrics"),
):
    os.environ[env_name] = f"{hf_home}/{sub_dir}"

print("HF_DATASETS_CACHE:", os.environ["HF_DATASETS_CACHE"])

HF_DATASETS_CACHE: //mnt/data/stt/Datasets/Datasets/huggingface-cache/datasets


In [2]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Audio
import shutil
import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio, display

In [3]:
# Show the kernel's current working directory.
os.getcwd()

'/mnt/data/stt/Datasets/Datasets'

In [4]:
ds_ur = load_dataset("SPRINGLab/IndicTTS_Odia", split="train") # Load the train split of the IndicTTS Odia dataset from Hugging Face

README.md:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11564 [00:00<?, ? examples/s]

In [5]:
# Print the dataset summary (feature names and row count).
print(ds_ur)

Dataset({
    features: ['audio', 'text', 'gender'],
    num_rows: 11564
})


In [7]:
# Print the list of column names in the dataset.
print(ds_ur.column_names)

['audio', 'text', 'gender']


In [9]:
# Print the first example; its 'audio' entry carries the source path, the
# decoded sample array and the sampling rate, next to 'text' and 'gender'.
print(ds_ur[0])

{'audio': {'path': 'text0001.wav', 'array': array([0., 0., 0., ..., 0., 0., 0.], shape=(141676,)), 'sampling_rate': 48000}, 'text': 'ଗଛ ବଞ୍ଚିଲେ, ଆମେ ବଞ୍ଚିବା ।', 'gender': 0}


In [12]:
# Constants
# Root directory under which all audio folders and CSV files are created.
BASE_DIR = '/mnt/data/stt/Datasets/Datasets'

# Folders: every exported clip, and the duration-filtered subset.
RAW_AUDIO_DIR, FILTERED_AUDIO_DIR = (
    os.path.join(BASE_DIR, folder_name)
    for folder_name in ('odia_audios_indic_tts', 'odia_audios_indictts_final')
)

# Metadata CSVs: one row per exported clip, and one row per filtered clip.
RAW_CSV_PATH, FILTERED_CSV_PATH = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ('odia_indictts_metadata.csv', 'filtered_odia_indictts_final.csv')
)


class IndicTTSOptimized:
    """Export a Hugging Face speech dataset to WAV files plus metadata CSVs.

    The stages, in the order `run` calls them:

    1. `load_and_process` - write every example as a WAV file into
       `RAW_AUDIO_DIR` and build `self.df` (one row per clip).
    2. `save_raw_csv` - write `self.df` to `RAW_CSV_PATH`.
    3. `filter_by_duration` - build `self.filtered_df` from the clips whose
       duration is within the limit and write it to `FILTERED_CSV_PATH`.
    4. `copy_filtered_audio_files` - copy those clips to `FILTERED_AUDIO_DIR`.
    5. `final_analysis` - display summary statistics of `self.filtered_df`.

    Args:
        dataset_name: Dataset identifier passed to `load_dataset`.
        split: Dataset split passed to `load_dataset` (default 'train').
    """

    def __init__(self, dataset_name, split='train'):
        self.dataset_name = dataset_name
        self.split = split
        self.df = None           # metadata of all exported clips
        self.filtered_df = None  # metadata of clips kept by filter_by_duration

    def _require(self, attr, producer):
        """Return `self.<attr>`, raising a clear error if its stage has not run."""
        frame = getattr(self, attr)
        if frame is None:
            raise RuntimeError(
                f"'{attr}' is not available yet; call {producer}() first."
            )
        return frame

    @staticmethod
    def _found_flags(file_names):
        """Return 'yes'/'no' per file name, by presence in RAW_AUDIO_DIR on disk."""
        return [
            'yes' if os.path.exists(os.path.join(RAW_AUDIO_DIR, name)) else 'no'
            for name in file_names
        ]

    def load_and_process(self):
        """Write every example to a WAV file and build the metadata frame.

        Files are named `indic_tts_odia_<idx>.wav`, where `<idx>` is the
        example's position in the dataset, so names are unique by
        construction. `self.df` gets the columns `path` (file name only),
        `sentence`, `found`, `duration_sec` and `duration_ms`.
        """
        dataset = load_dataset(self.dataset_name, split=self.split)
        os.makedirs(RAW_AUDIO_DIR, exist_ok=True)

        records = []

        print("Processing and saving audio files...")

        for idx, item in enumerate(tqdm(dataset)):
            audio = item["audio"]["array"]
            sr = item["audio"]["sampling_rate"]
            file_name = f"indic_tts_odia_{idx}.wav"

            sf.write(os.path.join(RAW_AUDIO_DIR, file_name), audio, sr)
            duration_sec = len(audio) / sr

            records.append({
                'path': file_name,  # store only the filename
                'sentence': item["text"],
                'duration_sec': duration_sec,
                'duration_ms': duration_sec * 1000,
            })

        # Explicit columns keep the frame well-formed even for an empty dataset.
        self.df = pd.DataFrame(
            records,
            columns=['path', 'sentence', 'duration_sec', 'duration_ms'],
        )

        print("Validating audio files...")
        # Check the files on disk rather than an in-memory set of the names
        # just generated, which could never report a missing file.
        self.df.insert(2, 'found', self._found_flags(self.df['path']))

    def save_raw_csv(self):
        """Write the full metadata frame to RAW_CSV_PATH (no index column)."""
        self._require('df', 'load_and_process').to_csv(RAW_CSV_PATH, index=False)
        print(f"Raw CSV saved to: {RAW_CSV_PATH}")

    def filter_by_duration(self, max_duration=15.0):
        """Keep clips with `duration_sec <= max_duration` and save them as CSV.

        Args:
            max_duration: Inclusive upper bound on clip duration, in seconds.

        Raises:
            RuntimeError: If `load_and_process` has not been run yet.
        """
        df = self._require('df', 'load_and_process')

        print(f"Filtering entries with duration <= {max_duration} seconds...")
        kept = df[df['duration_sec'] <= max_duration].copy()
        self.filtered_df = kept.reset_index(drop=True)

        print("Validating filtered audio files...")
        self.filtered_df['found'] = self._found_flags(self.filtered_df['path'])

        self.filtered_df.to_csv(FILTERED_CSV_PATH, index=False)
        print(f"Filtered CSV saved to: {FILTERED_CSV_PATH}")

    def copy_filtered_audio_files(self):
        """Copy each filtered clip from RAW_AUDIO_DIR to FILTERED_AUDIO_DIR.

        Missing source files are skipped, as before, but are now counted and
        reported instead of being ignored silently.

        Raises:
            RuntimeError: If `filter_by_duration` has not been run yet.
        """
        filtered = self._require('filtered_df', 'filter_by_duration')

        os.makedirs(FILTERED_AUDIO_DIR, exist_ok=True)
        print("Copying filtered audio files to final folder...")

        missing = []
        for filename in tqdm(filtered['path']):
            src_path = os.path.join(RAW_AUDIO_DIR, filename)
            if os.path.exists(src_path):
                shutil.copy(src_path, os.path.join(FILTERED_AUDIO_DIR, filename))
            else:
                missing.append(filename)

        if missing:
            print(
                f"Warning: {len(missing)} source file(s) were missing and not "
                f"copied, e.g. {missing[:5]}"
            )

    def final_analysis(self):
        """Display head, tail, info, describe and duration totals of the filtered frame.

        Raises:
            RuntimeError: If `filter_by_duration` has not been run yet.
        """
        filtered = self._require('filtered_df', 'filter_by_duration')

        print("\n--- Filtered CSV Analysis ---")
        print("First 5 rows:")
        display(filtered.head())
        print("\nLast 5 rows:")
        display(filtered.tail())
        print("\nInfo:")
        # DataFrame.info() prints its report and returns None, so it must not
        # be wrapped in display() (that rendered a stray "None").
        filtered.info()
        print("\nDescribe:")
        display(filtered.describe())
        print("\nUnique audio files:", filtered['path'].nunique())

        total_sec = filtered['duration_sec'].sum()
        total_ms = filtered['duration_ms'].sum()
        print(f"\nTotal duration in seconds: {total_sec}")
        print(f"Total duration in milliseconds: {total_ms}")

    def run(self):
        """Run the whole pipeline with a 15-second duration limit."""
        self.load_and_process()
        self.save_raw_csv()
        self.filter_by_duration(max_duration=15.0)
        self.copy_filtered_audio_files()
        self.final_analysis()

# Build the exporter for the Odia IndicTTS dataset and run every stage.
processor = IndicTTSOptimized(dataset_name="SPRINGLab/IndicTTS_Odia")
processor.run()


Processing and saving audio files...


100%|██████████| 11564/11564 [01:23<00:00, 138.23it/s]


Validating audio files...
Raw CSV saved to: /mnt/data/stt/Datasets/Datasets/odia_indictts_metadata.csv
Filtering entries with duration <= 15.0 seconds...
Validating filtered audio files...
Filtered CSV saved to: /mnt/data/stt/Datasets/Datasets/filtered_odia_indictts_final.csv
Copying filtered audio files to final folder...


100%|██████████| 11554/11554 [00:39<00:00, 288.97it/s]


--- Filtered CSV Analysis ---
First 5 rows:





Unnamed: 0,path,sentence,found,duration_sec,duration_ms
0,indic_tts_odia_0.wav,"ଗଛ ବଞ୍ଚିଲେ, ଆମେ ବଞ୍ଚିବା ।",yes,2.951583,2951.583333
1,indic_tts_odia_1.wav,"ନିଇତି ସଞ୍ଜହେଲେ, ଜେଜୀମା ଗପ କହନ୍ତି ।",yes,3.830958,3830.958333
2,indic_tts_odia_2.wav,ପିଲାଏ ଉଠନ୍ତି ନାହିଁ କି ଜେଜୀମା’ର ଗପ ସରେ ନାହିଁ ।,yes,4.106438,4106.4375
3,indic_tts_odia_3.wav,"କିନ୍ତୁ ସବୁ କାହାଣୀର ଆରମ୍ଭ କିମ୍ବା ଶେଷ ହୋଇଥାଏ, ଅଗ...",yes,5.550021,5550.020833
4,indic_tts_odia_4.wav,"ବଣ ଜଙ୍ଗଲର କଥା ନ ଥାଇ, ଜେଜୀମା’ର କାହାଣୀ ନାହିଁ ।",yes,4.443646,4443.645833



Last 5 rows:


Unnamed: 0,path,sentence,found,duration_sec,duration_ms
11549,indic_tts_odia_11558.wav,ଏହିଭଳି ଜଣେ ମୋଟାବୁଦ୍ଧିଆ ପିଲା ତା’ ଜୀବନରେ ଅନେକ ବୋ...,yes,10.461771,10461.770833
11550,indic_tts_odia_11559.wav,ତଥାପି ସେ ନିଜର ଅଧ୍ୟବସାୟ ବଳରେ ଭବିଷ୍ୟତରେ ଜଣେ ବଡ଼ ବ...,yes,7.978542,7978.541667
11551,indic_tts_odia_11560.wav,"ଏଡ଼ିସନ୍ ପିଲାବେଳେ ଭାରି ଗରିବ ଥିଲେ, ବଜାରରେ ଫଳ ଓ ରେ...",yes,12.19975,12199.75
11552,indic_tts_odia_11561.wav,ତାଙ୍କର କଠୋର ପରିଶ୍ରମ ଓ ଆତ୍ମବିଶ୍ୱାସ ତାଙ୍କୁ ଜଣେ ଶ...,yes,7.976917,7976.916667
11553,indic_tts_odia_11563.wav,"ଏଡ଼ିସନ୍ ଙ୍କ ଏକ ବିଶେଷ ଗୁଣ ଥିଲା, ସେ ଯାହା କହୁଥିଲେ ...",yes,7.032,7032.0



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11554 entries, 0 to 11553
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   path          11554 non-null  object 
 1   sentence      11554 non-null  object 
 2   found         11554 non-null  object 
 3   duration_sec  11554 non-null  float64
 4   duration_ms   11554 non-null  float64
dtypes: float64(2), object(3)
memory usage: 451.5+ KB


None


Describe:


Unnamed: 0,duration_sec,duration_ms
count,11554.0,11554.0
mean,5.965736,5965.735999
std,2.21182,2211.820337
min,1.810667,1810.666667
25%,4.29,4290.0
50%,5.569646,5569.645833
75%,7.190844,7190.84375
max,14.789271,14789.270833



Unique audio files: 11554

Total duration in seconds: 68928.11372916667
Total duration in milliseconds: 68928113.72916666


In [15]:
# Directory that holds the exported WAV files.
audio_folder = "/mnt/data/stt/Datasets/Datasets/odia_audios_indic_tts"
# Keep only the directory entries whose name ends with ".wav".
audio_files = list(filter(lambda entry: entry.endswith(".wav"), os.listdir(audio_folder)))
# Report how many were found.
wav_total = len(audio_files)
print(f" Total number of audio files: {wav_total}")

 Total number of audio files: 11564


In [14]:
import os
from IPython.display import Audio
# Preview five clips: entries 10-14 of the lexicographically sorted WAV list.
# Names sort as strings, so "indic_tts_odia_10005.wav" comes before
# "indic_tts_odia_2.wav"; these are not the clips with dataset index 10-14.
# Only ".wav" entries are considered so a stray file is never handed to Audio.
audio_files = sorted(
    name for name in os.listdir(audio_folder) if name.endswith(".wav")
)[10:15]
for file in audio_files:
    print(f" Playing: {file}")
    display(Audio(filename=os.path.join(audio_folder, file)))

 Playing: indic_tts_odia_10005.wav


 Playing: indic_tts_odia_10006.wav


 Playing: indic_tts_odia_10007.wav


 Playing: indic_tts_odia_10008.wav


 Playing: indic_tts_odia_10009.wav
