In [1]:
import os
os.environ["HF_HOME"] = "//mnt/data/stt/Datasets/Datasets/huggingface-cache"
os.environ["HF_DATASETS_CACHE"] = f"{os.environ['HF_HOME']}/datasets"
os.environ["TRANSFORMERS_CACHE"] = f"{os.environ['HF_HOME']}/transformers"
os.environ["HF_METRICS_CACHE"] = f"{os.environ['HF_HOME']}/metrics"
print("HF_DATASETS_CACHE:", os.environ["HF_DATASETS_CACHE"])

HF_DATASETS_CACHE: //mnt/data/stt/Datasets/Datasets/huggingface-cache/datasets


In [2]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Audio
import shutil
import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio, display

In [3]:
os.getcwd()

'/mnt/data/stt/Datasets/Datasets'

In [4]:
# Now load dataset
from datasets import load_dataset
shrutilipi_pa = load_dataset("ai4bharat/Shrutilipi", "punjabi", split="train")

README.md:   0%|          | 0.00/9.35k [00:00<?, ?B/s]

train-00000-of-00007.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/463M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/458M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/456M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/455M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/458M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21118 [00:00<?, ? examples/s]

In [5]:
print(shrutilipi_pa)

Dataset({
    features: ['audio_filepath', 'text', 'duration', 'lang'],
    num_rows: 21118
})


In [6]:
print(shrutilipi_pa.column_names)

['audio_filepath', 'text', 'duration', 'lang']


In [7]:
print(shrutilipi_pa[0])

{'audio_filepath': {'path': 'Regional-Chandigarh-Punjabi-1820-20181027193228_chunk_1.flac', 'array': array([ 0.02212524,  0.00546265, -0.00515747, ..., -0.00454712,
       -0.00863647, -0.012146  ], shape=(87680,)), 'sampling_rate': 16000}, 'text': 'ਪੰਜਾਬ ਦੇ ਮਾਲ ਮਹਿਕਮੇ ਵੱਲੋਂ ਜ਼ਮੀਨਾਂ ਦੀ ਸਟੀਕ ਨਿਸ਼ਾਨਦੇਹੀ ਵਾਸਤੇ ਖਾਸ ਕਿਸਮ ਦੀਆਂ ਮਸ਼ੀਨਾਂ ਦਾ ਪ੍ਬੰਧ ਕੀਤਾ ਗਿਐ', 'duration': 5.48, 'lang': 'pa'}


**From extraction to saving final csv pipeline**

In [8]:
class ShrutilipiProcessor:
    def __init__(self, save_dir):
        self.save_dir = save_dir
        self.audio_folder = os.path.join(save_dir, "Shrutilipi_punjabi_audio")
        self.final_audio_folder = os.path.join(save_dir, "punjabi_audios_shrutilipi_final")
        self.raw_csv_path = os.path.join(save_dir, "shrutilipi_punjabi_metadata.csv")
        self.filtered_csv_path = os.path.join(save_dir, "filtered_punjabi_shrutilipi.csv")
        os.makedirs(self.audio_folder, exist_ok=True)
        os.makedirs(self.final_audio_folder, exist_ok=True)

    def load_dataset(self):
        return load_dataset("ai4bharat/Shrutilipi", "punjabi", split="train")

    def save_audio_and_metadata(self, dataset):
        data = []
        for idx in tqdm(range(len(dataset)), desc="Saving audio"):
            sample = dataset[idx]
            audio_array = sample["audio_filepath"]["array"]
            sr = sample["audio_filepath"]["sampling_rate"]
            duration_sec = len(audio_array) / sr
            duration_ms = duration_sec * 1000

            filename = f"shrutilipi_pa_{idx}.wav"
            filepath = os.path.join(self.audio_folder, filename)
            sf.write(filepath, audio_array, samplerate=sr)

            data.append({
                "audio_filename": filename,
                "text": sample["text"],
                "duration": sample["duration"],
                "duration_sec": duration_sec,
                "duration_ms": duration_ms
            })

        df = pd.DataFrame(data)
        df.to_csv(self.raw_csv_path, index=False)
        print(f" Saved metadata to {self.raw_csv_path}")
        return df

    def validate_and_update_csv(self, df):
        audio_files_set = set(os.listdir(self.audio_folder))
        unique_df = df.drop_duplicates(subset=["audio_filename"]).copy()
        unique_df["found"] = unique_df["audio_filename"].apply(lambda x: "yes" if x in audio_files_set else "no")
        unique_df.rename(columns={
            "audio_filename": "path",
            "text": "sentence"
        }, inplace=True)

        # Move 'found' after 'sentence'
        found = unique_df.pop("found")
        unique_df.insert(unique_df.columns.get_loc("sentence") + 1, "found", found)

        print(" Unique files checked and updated.")
        return unique_df

    def filter_by_duration(self, df, max_duration=15.0):
        filtered = df[df["duration_sec"] <= max_duration].copy()
        filtered.to_csv(self.filtered_csv_path, index=False)
        print(f" Filtered metadata saved to {self.filtered_csv_path}")
        return filtered

    def copy_filtered_audios(self, df):
        for fname in tqdm(df["path"], desc="Copying filtered audio files"):
            src = os.path.join(self.audio_folder, fname)
            dst = os.path.join(self.final_audio_folder, fname)
            if os.path.exists(src):
                os.system(f'cp "{src}" "{dst}"')

    def compare_duration_columns(self, df):
        if "duration" in df.columns and all(df["duration"].round(3) == df["duration_sec"].round(3)):
            print(" 'duration' and 'duration_sec' are same — dropping 'duration'")
            df = df.drop(columns=["duration"])
        else:
            print(" 'duration' and 'duration_sec' differ — keeping both")
        return df

    def summarize_csv(self, df, label=""):
        print(f"\n Summary of {label} Dataset:")
        print("-" * 40)
        print("Head:\n", df.head())
        print("\nTail:\n", df.tail())
        print("\nInfo:")
        print(df.info())
        print("\nDescribe:\n", df.describe())
        print("\nUnique 'path' count:", df["path"].nunique())

        print("\n Total Durations:")
        if "duration" in df.columns:
            print("Sum of 'duration' (sec):", df["duration"].sum())
        print("Sum of 'duration_sec':", df["duration_sec"].sum())
        print("Sum of 'duration_ms':", df["duration_ms"].sum())

        return df


processor = ShrutilipiProcessor("/mnt/data/stt/Datasets/Datasets")
dataset = processor.load_dataset()
df_raw = processor.save_audio_and_metadata(dataset)
df_validated = processor.validate_and_update_csv(df_raw)
df_filtered = processor.filter_by_duration(df_validated)
processor.copy_filtered_audios(df_filtered)
df_final = processor.compare_duration_columns(df_filtered)
df_final = processor.summarize_csv(df_final, label="Final Filtered")

# Save final cleaned filtered CSV again (after possibly dropping column)
df_final.to_csv(processor.filtered_csv_path, index=False)
print(f"\n Final updated CSV saved to {processor.filtered_csv_path}")


Saving audio: 100%|██████████| 21118/21118 [01:49<00:00, 192.00it/s]


 Saved metadata to /mnt/data/stt/Datasets/Datasets/shrutilipi_punjabi_metadata.csv
 Unique files checked and updated.
 Filtered metadata saved to /mnt/data/stt/Datasets/Datasets/filtered_punjabi_shrutilipi.csv


Copying filtered audio files: 100%|██████████| 20367/20367 [01:11<00:00, 283.76it/s]


 'duration' and 'duration_sec' are same — dropping 'duration'

 Summary of Final Filtered Dataset:
----------------------------------------
Head:
                   path                                           sentence  \
0  shrutilipi_pa_0.wav  ਪੰਜਾਬ ਦੇ ਮਾਲ ਮਹਿਕਮੇ ਵੱਲੋਂ ਜ਼ਮੀਨਾਂ ਦੀ ਸਟੀਕ ਨਿਸ਼...   
1  shrutilipi_pa_1.wav  ਪੰਜਾਬ ਸਰਕਾਰ ਨੇ ਲੁਧਿਆਣਾ ਚ ਬਣਨ ਵਾਲੀ ਸਾਈਕਲ ਵੈਲੀ ਚ...   
2  shrutilipi_pa_2.wav  ਜਨਰਲ ਰਾਵਤ ਨੇ ਕਿਹਾ ਕਿ ਪਥੱਰ ਸਿਟਣ ਵਾਲੇ ਹੋਰ ਕੁਝ ਨਹ...   
3  shrutilipi_pa_3.wav  ਕਾਬਲੇ ਜ਼ਿਕਰ ਏ ਕਿ ਆਜ਼ਾਦ ਭਾਰਤ ਦੇ ਇਤਿਹਾਸ ਅੰਦਰ ਇਹ ...   
4  shrutilipi_pa_4.wav  ਪੰਜਾਬ ਦੇ ਮਾਲ ਮਹਿਕਮੇ ਵੱਲੋਂ ਜ਼ਮੀਨਾਂ ਦੀ ਸਟੀਕ ਨਿਸ਼...   

  found  duration_sec  duration_ms  
0   yes          5.48       5480.0  
1   yes          7.32       7320.0  
2   yes         10.52      10520.0  
3   yes          6.92       6920.0  
4   yes          4.88       4880.0  

Tail:
                           path  \
21113  shrutilipi_pa_21113.wav   
21114  shrutilipi_pa_21114.wav   
21115  shrutilipi_pa_21115.wav   
21116  shrutilipi_pa_211

**Dropping duration column**

In [16]:
import pandas as pd
csv_path = "/mnt/data/stt/Datasets/Datasets/filtered_bengali_shrutilipi.csv" # Load the CSV file
df = pd.read_csv(csv_path)


print(f" Total columns before dropping: {df.shape[1]}") # Show total number of columns before dropping
print(" Column names:", df.columns.tolist())


if 'duration' in df.columns: # Drop the 'duration' column if it exists
    df.drop(columns=['duration'], inplace=True)
    print(" 'duration' column dropped.")
else:
    print(" 'duration' column not found.")


print(f" Total columns after dropping: {df.shape[1]}") # Show total columns after dropping
print(" Updated column names:", df.columns.tolist())


 Total columns before dropping: 6
 Column names: ['path', 'sentence', 'found', 'duration', 'duration_sec', 'duration_ms']
 'duration' column dropped.
 Total columns after dropping: 5
 Updated column names: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']


In [17]:
df.to_csv(csv_path, index=False)
print(" Updated CSV saved.")

 Updated CSV saved.


**For updated audio folder**

In [9]:
audio_folder = "/mnt/data/stt/Datasets/Datasets/punjabi_audios_shrutilipi_final"
# List all .wav files
audio_files = [f for f in os.listdir(audio_folder) if f.endswith(".wav")]
# Count them
print(f" Total number of audio files: {len(audio_files)}")

 Total number of audio files: 20367


In [10]:
import os
from IPython.display import Audio
audio_files = sorted(os.listdir(audio_folder))[:5]  # first 5 files
for file in audio_files:
    print(f" Playing: {file}")
    display(Audio(filename=os.path.join(audio_folder, file)))

 Playing: shrutilipi_pa_0.wav


 Playing: shrutilipi_pa_1.wav


 Playing: shrutilipi_pa_10.wav


 Playing: shrutilipi_pa_100.wav


 Playing: shrutilipi_pa_1000.wav
