In [5]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Audio
import shutil
import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio, display

In [16]:
# Show the kernel's current working directory (displayed as the cell's result).
os.getcwd()

'/home/jovyan/work/Datasets'

In [17]:
# Load the Telugu configuration, train split, of ai4bharat/Shrutilipi.
# `load_dataset` is already imported in the notebook's import cell, so it is
# not re-imported here.
shrutilipi = load_dataset("ai4bharat/Shrutilipi", "telugu", split="train")

README.md:   0%|          | 0.00/9.35k [00:00<?, ?B/s]

train-00000-of-00012.parquet:   0%|          | 0.00/462M [00:00<?, ?B/s]

train-00001-of-00012.parquet:   0%|          | 0.00/487M [00:00<?, ?B/s]

train-00002-of-00012.parquet:   0%|          | 0.00/494M [00:00<?, ?B/s]

train-00003-of-00012.parquet:   0%|          | 0.00/488M [00:00<?, ?B/s]

train-00004-of-00012.parquet:   0%|          | 0.00/478M [00:00<?, ?B/s]

train-00005-of-00012.parquet:   0%|          | 0.00/512M [00:00<?, ?B/s]

train-00006-of-00012.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

train-00007-of-00012.parquet:   0%|          | 0.00/494M [00:00<?, ?B/s]

train-00008-of-00012.parquet:   0%|          | 0.00/513M [00:00<?, ?B/s]

train-00009-of-00012.parquet:   0%|          | 0.00/492M [00:00<?, ?B/s]

train-00010-of-00012.parquet:   0%|          | 0.00/517M [00:00<?, ?B/s]

train-00011-of-00012.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56082 [00:00<?, ? examples/s]

In [18]:
# Print the dataset summary (feature names and number of rows).
print(shrutilipi)

Dataset({
    features: ['audio_filepath', 'text', 'duration', 'lang'],
    num_rows: 56082
})


In [20]:
# Print just the column names of the split.
print(shrutilipi.column_names)

['audio_filepath', 'text', 'duration', 'lang']


In [21]:
# Print the first record to see what each field of a sample contains.
print(shrutilipi[0])

{'audio_filepath': {'path': 'Regional-Vijayawada-Telugu-1320-20201217141250_chunk_49.flac', 'array': array([ 0.01376343,  0.01022339,  0.0005188 , ..., -0.06582642,
       -0.0574646 , -0.03210449], shape=(54400,)), 'sampling_rate': 16000}, 'text': 'శివన్ ప్రయోగం ప్రక్రియపై శాస్రవేత్తలతో సమావేశాన్ని నిర్వహించారు', 'duration': 3.4, 'lang': 'te'}


**Pipeline: from dataset extraction to saving the final CSV**

In [23]:
class ShrutilipiProcessor:
    """Export the Telugu train split of ai4bharat/Shrutilipi to WAV files and CSV metadata.

    The methods are meant to be called in pipeline order:
    ``load_dataset`` -> ``save_audio_and_metadata`` -> ``validate_and_update_csv``
    -> ``filter_by_duration`` -> ``copy_filtered_audios`` ->
    ``compare_duration_columns`` -> ``summarize_csv``.
    """

    def __init__(self, save_dir):
        """Derive every output path from ``save_dir`` and create both audio folders.

        Args:
            save_dir: Directory under which the audio folders and CSV files live.
        """
        self.save_dir = save_dir
        # All exported clips go here; the "final" folder only receives the
        # clips that survive the duration filter.
        self.audio_folder = os.path.join(save_dir, "Shrutilipi_telugu_audio")
        self.final_audio_folder = os.path.join(save_dir, "telugu_audios_shrutilipi_final")
        self.raw_csv_path = os.path.join(save_dir, "shrutilipi_telugu_metadata.csv")
        self.filtered_csv_path = os.path.join(save_dir, "filtered_telugu_shrutilipi.csv")
        os.makedirs(self.audio_folder, exist_ok=True)
        os.makedirs(self.final_audio_folder, exist_ok=True)

    def load_dataset(self):
        """Return the "telugu" configuration, train split, of ai4bharat/Shrutilipi."""
        # Inside the method body the bare name resolves to the module-level
        # `datasets.load_dataset`, not to this method.
        return load_dataset("ai4bharat/Shrutilipi", "telugu", split="train")

    def save_audio_and_metadata(self, dataset):
        """Write every sample as ``shrutilipi_te_<idx>.wav`` and save the raw metadata CSV.

        Args:
            dataset: Indexable collection whose items provide
                ``["audio_filepath"]["array"]``, ``["audio_filepath"]["sampling_rate"]``,
                ``["text"]`` and ``["duration"]``.

        Returns:
            DataFrame with columns ``audio_filename``, ``text``, ``duration``
            (as stored in the dataset), ``duration_sec`` and ``duration_ms``
            (both recomputed from the decoded samples).
        """
        columns = ["audio_filename", "text", "duration", "duration_sec", "duration_ms"]
        data = []
        for idx in tqdm(range(len(dataset)), desc="Saving audio"):
            sample = dataset[idx]
            audio_array = sample["audio_filepath"]["array"]
            sr = sample["audio_filepath"]["sampling_rate"]
            # Recompute the length from the samples so it can later be
            # cross-checked against the dataset's own 'duration' field.
            duration_sec = len(audio_array) / sr
            duration_ms = duration_sec * 1000

            filename = f"shrutilipi_te_{idx}.wav"
            filepath = os.path.join(self.audio_folder, filename)
            sf.write(filepath, audio_array, samplerate=sr)

            data.append({
                "audio_filename": filename,
                "text": sample["text"],
                "duration": sample["duration"],
                "duration_sec": duration_sec,
                "duration_ms": duration_ms
            })

        # Passing the column list keeps the schema intact even for an empty
        # dataset, so the later stages do not fail with a KeyError.
        df = pd.DataFrame(data, columns=columns)
        df.to_csv(self.raw_csv_path, index=False)
        print(f" Saved metadata to {self.raw_csv_path}")
        return df

    def validate_and_update_csv(self, df):
        """De-duplicate by filename and flag which files exist in ``audio_folder``.

        Despite the name, nothing is written to disk here; the caller gets a
        new DataFrame and the input is left untouched.

        Returns:
            DataFrame where ``audio_filename`` is renamed to ``path``, ``text``
            to ``sentence``, and a ``found`` column ("yes"/"no") sits directly
            after ``sentence``.
        """
        audio_files_set = set(os.listdir(self.audio_folder))
        unique_df = df.drop_duplicates(subset=["audio_filename"]).copy()
        unique_df["found"] = (
            unique_df["audio_filename"]
            .isin(audio_files_set)
            .map({True: "yes", False: "no"})
        )
        unique_df = unique_df.rename(columns={
            "audio_filename": "path",
            "text": "sentence"
        })

        # Move 'found' after 'sentence'
        found = unique_df.pop("found")
        unique_df.insert(unique_df.columns.get_loc("sentence") + 1, "found", found)

        print(" Unique files checked and updated.")
        return unique_df

    def filter_by_duration(self, df, max_duration=15.0):
        """Keep rows with ``duration_sec <= max_duration`` and write them to the filtered CSV.

        Args:
            df: DataFrame containing a ``duration_sec`` column.
            max_duration: Inclusive upper bound, in seconds.

        Returns:
            A copy of the rows that passed the filter.
        """
        filtered = df[df["duration_sec"] <= max_duration].copy()
        filtered.to_csv(self.filtered_csv_path, index=False)
        print(f" Filtered metadata saved to {self.filtered_csv_path}")
        return filtered

    def copy_filtered_audios(self, df):
        """Copy each file listed in ``df["path"]`` from ``audio_folder`` to ``final_audio_folder``.

        Files are copied in-process with ``shutil.copy`` rather than through a
        shell, so names containing quotes, ``$`` or other shell metacharacters
        are handled and no process is spawned per file. Entries whose source
        file does not exist are skipped and counted; a real copy failure
        (for example a full disk) raises ``OSError`` instead of being ignored.
        """
        missing = 0
        for fname in tqdm(df["path"], desc="Copying filtered audio files"):
            src = os.path.join(self.audio_folder, fname)
            dst = os.path.join(self.final_audio_folder, fname)
            if os.path.isfile(src):
                shutil.copy(src, dst)
            else:
                missing += 1
        if missing:
            # Surface the mismatch: these rows are in the CSV but have no audio.
            print(f" Skipped {missing} listed file(s) not found in {self.audio_folder}")

    def compare_duration_columns(self, df):
        """Drop ``duration`` when it matches ``duration_sec`` to three decimals.

        Returns:
            ``df`` without the ``duration`` column when every row matches,
            otherwise ``df`` unchanged (also when there is no ``duration``
            column to compare). The input frame itself is never modified.
        """
        if "duration" not in df.columns:
            print(" 'duration' column not present — nothing to compare")
            return df

        matches = df["duration"].round(3) == df["duration_sec"].round(3)
        if matches.all():
            print(" 'duration' and 'duration_sec' are same — dropping 'duration'")
            df = df.drop(columns=["duration"])
        else:
            print(" 'duration' and 'duration_sec' differ — keeping both")
        return df

    def summarize_csv(self, df, label=""):
        """Print head, tail, info, describe, unique path count and total durations.

        Args:
            df: DataFrame with ``path``, ``duration_sec`` and ``duration_ms``
                columns (``duration`` is reported too when present).
            label: Text inserted into the summary heading.

        Returns:
            The same ``df`` object, unchanged.
        """
        print(f"\n Summary of {label} Dataset:")
        print("-" * 40)
        print("Head:\n", df.head())
        print("\nTail:\n", df.tail())
        print("\nInfo:")
        # DataFrame.info() prints by itself and returns None, so it must not
        # be wrapped in print().
        df.info()
        print("\nDescribe:\n", df.describe())
        print("\nUnique 'path' count:", df["path"].nunique())

        print("\n Total Durations:")
        if "duration" in df.columns:
            print("Sum of 'duration' (sec):", df["duration"].sum())
        print("Sum of 'duration_sec':", df["duration_sec"].sum())
        print("Sum of 'duration_ms':", df["duration_ms"].sum())

        return df


# Run the whole pipeline once; each stage hands its DataFrame to the next one.
processor = ShrutilipiProcessor("/home/jovyan/work/Datasets")
dataset = processor.load_dataset()
# Writes one WAV per sample into the raw audio folder plus the raw metadata CSV.
df_raw = processor.save_audio_and_metadata(dataset)
# De-duplicates filenames, renames columns to path/sentence and adds the 'found' flag.
df_validated = processor.validate_and_update_csv(df_raw)
# Keeps clips up to the default 15.0 s and writes the filtered CSV a first time.
df_filtered = processor.filter_by_duration(df_validated)
# Copies the surviving clips into the final audio folder.
processor.copy_filtered_audios(df_filtered)
# Drops 'duration' if it matches the recomputed 'duration_sec' to three decimals.
df_final = processor.compare_duration_columns(df_filtered)
# summarize_csv only prints; it returns the frame it was given.
df_final = processor.summarize_csv(df_final, label="Final Filtered")

# Overwrite the filtered CSV written by filter_by_duration, because the
# 'duration' column may have been dropped since then.
df_final.to_csv(processor.filtered_csv_path, index=False)
print(f"\n Final updated CSV saved to {processor.filtered_csv_path}")


Saving audio: 100%|██████████| 56082/56082 [07:27<00:00, 125.23it/s]


 Saved metadata to /home/jovyan/work/Datasets/shrutilipi_telugu_metadata.csv
 Unique files checked and updated.
 Filtered metadata saved to /home/jovyan/work/Datasets/filtered_telugu_shrutilipi.csv


Copying filtered audio files: 100%|██████████| 55354/55354 [14:02<00:00, 65.67it/s]


 'duration' and 'duration_sec' are same — dropping 'duration'

 Summary of Final Filtered Dataset:
----------------------------------------
Head:
                   path                                           sentence  \
0  shrutilipi_te_0.wav  శివన్ ప్రయోగం ప్రక్రియపై శాస్రవేత్తలతో సమావేశా...   
1  shrutilipi_te_1.wav  ఉభయ తెలుగు రాష్టాల్లో ఎన్నికల పచారం ఈ సాయంకాలం...   
2  shrutilipi_te_2.wav  దేశంలోగల జాతీయ ఉన్నత విద్యా సంస్లలకు కేంద్ర మా...   
3  shrutilipi_te_3.wav  ఇప్పటికే ఉభయ తెలుగు రాష్టాల్లోని పధాన పార్టీలు...   
4  shrutilipi_te_4.wav  ఈరోజు కూడా ఉభయ తెలుగు రాష్ట్లోని పలు పాంతాల్లో...   

  found  duration_sec  duration_ms  
0   yes          3.40       3400.0  
1   yes          6.48       6480.0  
2   yes          8.84       8840.0  
3   yes          6.28       6280.0  
4   yes          6.68       6680.0  

Tail:
                           path  \
56077  shrutilipi_te_56077.wav   
56078  shrutilipi_te_56078.wav   
56079  shrutilipi_te_56079.wav   
56080  shrutilipi_te_560

**For the updated (final) audio folder**

In [6]:
# Directory to inspect.
audio_folder = "/mnt/data/stt/Datasets/Datasets/telugu_audios_shrutilipi_final"
# Keep only the directory entries whose name ends in ".wav".
audio_files = [
    name
    for name in os.listdir(audio_folder)
    if name.endswith(".wav")
]
# Report how many were found.
print(f" Total number of audio files: {len(audio_files)}")

 Total number of audio files: 55354


In [7]:
# Take the first five directory entries in plain string order
# (so "..._10.wav" sorts before "..._2.wav") and render a player for each.
audio_files = os.listdir(audio_folder)
audio_files.sort()
audio_files = audio_files[:5]
for file in audio_files:
    print(f" Playing: {file}")
    display(
        Audio(filename=os.path.join(audio_folder, file))
    )

 Playing: shrutilipi_te_0.wav


 Playing: shrutilipi_te_1.wav


 Playing: shrutilipi_te_10.wav


 Playing: shrutilipi_te_100.wav


 Playing: shrutilipi_te_1000.wav


**For original audio folder**

In [8]:
# Directory to inspect.
audio_folder = "/mnt/data/stt/Datasets/Shrutilipi_telugu_audio"
# Keep only the directory entries whose name ends in ".wav".
audio_files = [
    name
    for name in os.listdir(audio_folder)
    if name.endswith(".wav")
]
# Report how many were found.
print(f" Total number of audio files: {len(audio_files)}")

 Total number of audio files: 56082
