In [2]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Audio
import shutil
import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio, display

In [3]:
# Show the kernel's current working directory (rendered as this cell's output).
os.getcwd()

'/mnt/data/stt/Datasets/Datasets/Merged_Datasets'

In [14]:
# Audio file manager
class AudioDatasetManager:
    """Merge the ``.wav`` files of several source folders into one flat output folder.

    Existing files in the output folder are never overwritten. Every file that
    is skipped for that reason is recorded in ``skipped_files`` so that name
    collisions between source folders do not go unnoticed.

    Attributes:
        folder_names: Source folder paths to read ``.wav`` files from.
        output_folder: Destination folder; created on demand.
        skipped_files: List of ``(source_path, reason)`` tuples describing the
            files that were not copied during the most recent
            ``process_audio_folders`` run.
    """

    def __init__(self, folder_names, output_folder):
        self.folder_names = folder_names
        self.output_folder = output_folder
        self.skipped_files = []

    @staticmethod
    def _list_wav_files(folder):
        """Return the entry names in ``folder`` that end with ``.wav``."""
        return [name for name in os.listdir(folder) if name.endswith('.wav')]

    def count_audio_files(self, folder):
        """Return how many entries in ``folder`` end with ``.wav``."""
        return len(self._list_wav_files(folder))

    def process_audio_folders(self):
        """Count the ``.wav`` files per source folder and copy them to the output folder.

        Returns:
            tuple: ``(total_files, final_count)`` where ``total_files`` maps each
            source folder to its ``.wav`` count and ``final_count`` is the number
            of ``.wav`` files in the output folder after merging.
        """
        print(" Counting and merging audio files...")
        total_files = {}
        self.skipped_files = []
        copied_from = {}  # file name -> source folder that supplied it in this run
        os.makedirs(self.output_folder, exist_ok=True)

        for folder in self.folder_names:
            # List each folder once and reuse the listing for counting and copying.
            wav_files = self._list_wav_files(folder)
            print(f" Total .wav files in '{folder}': {len(wav_files)}")
            total_files[folder] = len(wav_files)

            for file in wav_files:
                src = os.path.join(folder, file)
                dst = os.path.join(self.output_folder, file)
                if os.path.exists(dst):
                    # Avoid overwriting, but keep a record of what was left out
                    # and whether it clashed with a file copied in this run.
                    if file in copied_from:
                        reason = f"name collision with '{copied_from[file]}'"
                    else:
                        reason = "already present in output folder"
                    self.skipped_files.append((src, reason))
                    continue
                shutil.copy2(src, dst)
                copied_from[file] = folder

        if self.skipped_files:
            collisions = sum(
                1 for _, reason in self.skipped_files
                if reason.startswith("name collision")
            )
            print(
                f"\n Skipped {len(self.skipped_files)} file(s) that already existed in "
                f"'{self.output_folder}' ({collisions} name collision(s) between source folders)."
            )
            print(f" First skipped files: {self.skipped_files[:5]}")

        final_count = self.count_audio_files(self.output_folder)
        print(f"\n Total .wav files in merged folder '{self.output_folder}': {final_count}")
        return total_files, final_count


# CSV manager
class CSVDatasetManager:
    """Inspect several CSV files, concatenate them row-wise and save the result.

    Attributes:
        file_names: Paths of the CSV files to merge, in the order they are stacked.
        output_file: Path the merged CSV is written to.
    """

    def __init__(self, file_names, output_file):
        self.file_names = file_names
        self.output_file = output_file

    def inspect_csv(self, file_path):
        """Load one CSV, print its path, column names and row count, and return it."""
        df = pd.read_csv(file_path)
        print(f"\n File: {file_path}")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Rows: {len(df)}")
        return df

    def merge_csvs(self):
        """Concatenate all input CSVs, write the result to ``output_file`` and return it.

        Columns that are not present in every input are reported, because
        ``pd.concat`` fills them with NaN for the files that lack them.

        Returns:
            pandas.DataFrame: The merged frame with a fresh 0..n-1 index.

        Raises:
            ValueError: If no input files were given.
        """
        paths = list(self.file_names)
        if not paths:
            raise ValueError("No CSV files were given to merge.")

        dataframes = [self.inspect_csv(path) for path in paths]

        # Report schema differences instead of letting concat silently pad with NaN.
        shared = set(dataframes[0].columns).intersection(
            *(frame.columns for frame in dataframes[1:])
        )
        for path, frame in zip(paths, dataframes):
            extra = [column for column in frame.columns if column not in shared]
            if extra:
                print(f"\n Warning: '{path}' has columns not shared by every file: {extra}")

        final_df = pd.concat(dataframes, axis=0, ignore_index=True)

        # Make sure the destination directory exists before writing.
        out_dir = os.path.dirname(self.output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        final_df.to_csv(self.output_file, index=False)
        print(f"\n Merged CSV saved as '{self.output_file}'")
        return final_df

    def analyze_merged_csv(self, df):
        """Print a textual overview of ``df``: schema, summary stats, samples and totals."""
        print("\n --- Merged CSV Analysis ---")
        print(f" Columns: {df.columns.tolist()}")
        print(f" Total Rows: {len(df)}")

        print("\n Info:")
        # DataFrame.info() prints by itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        df.info()

        print("\n Describe:")
        print(df.describe())

        print("\n Head:")
        print(df.head())

        print("\n Tail:")
        print(df.tail())

        print("\n Unique values per column:")
        print(df.nunique())

        # Duration totals are only reported for the columns that are present.
        if 'duration_sec' in df.columns:
            print("\n Total duration (seconds):", df['duration_sec'].sum())
        else:
            print("\n Total duration (seconds): column 'duration_sec' not found")
        if 'duration_ms' in df.columns:
            print(" Total duration (milliseconds):", df['duration_ms'].sum())
        else:
            print(" Total duration (milliseconds): column 'duration_ms' not found")

# Every input and output of this cell lives under one datasets directory, so the
# paths are derived from a single constant instead of being repeated literally.
DATASETS_DIR = "/mnt/data/stt/Datasets/Datasets"
MERGED_DIR = os.path.join(DATASETS_DIR, "Merged_Datasets")
# One entry per source corpus; each has an audio folder and a filtered CSV.
SOURCE_DATASETS = ["BA", "BA_spo-tut", "indicvoices", "kathbath", "shrutilipi"]

# AUDIO FILES
audio_folders = [
    os.path.join(DATASETS_DIR, f"punjabi_audios_{name}_final")
    for name in SOURCE_DATASETS
]
final_audio_output = os.path.join(MERGED_DIR, "final_merged_punjabi_audios")

audio_manager = AudioDatasetManager(audio_folders, final_audio_output)
# Keep the returned counts so they can be cross-checked against the CSV below.
per_folder_counts, merged_audio_count = audio_manager.process_audio_folders()

# CSV FILES
csv_files = [
    os.path.join(DATASETS_DIR, f"filtered_punjabi_{name}.csv")
    for name in SOURCE_DATASETS
]
final_csv_output = os.path.join(MERGED_DIR, "final_merged_punjabi_dataset.csv")

csv_manager = CSVDatasetManager(csv_files, final_csv_output)
merged_df = csv_manager.merge_csvs()
csv_manager.analyze_merged_csv(merged_df)

# Sanity check: the merged CSV should describe exactly the merged audio files.
if len(merged_df) != merged_audio_count:
    print(
        f"\n Warning: merged CSV has {len(merged_df)} rows but the merged audio "
        f"folder holds {merged_audio_count} .wav files."
    )


 Counting and merging audio files...
 Total .wav files in '/mnt/data/stt/Datasets/Datasets/punjabi_audios_BA_final': 63965


 Total .wav files in '/mnt/data/stt/Datasets/Datasets/punjabi_audios_BA_spo-tut_final': 8484
 Total .wav files in '/mnt/data/stt/Datasets/Datasets/punjabi_audios_indicvoices_final': 184274
 Total .wav files in '/mnt/data/stt/Datasets/Datasets/punjabi_audios_kathbath_final': 83292
 Total .wav files in '/mnt/data/stt/Datasets/Datasets/punjabi_audios_shrutilipi_final': 20367

 Total .wav files in merged folder '/mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_audios': 360382

 File: /mnt/data/stt/Datasets/Datasets/filtered_punjabi_BA.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 63965

 File: /mnt/data/stt/Datasets/Datasets/filtered_punjabi_BA_spo-tut.csv
 Columns: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']
 Rows: 8484

 File: /mnt/data/stt/Datasets/Datasets/filtered_punjabi_indicvoices.csv
 Columns: ['path', 'sentence', 'found', 'duration', 'duration_sec', 'duration_ms']
 Rows: 184274

 File: /mnt/data/stt/Datasets

**Dropping the `duration` column (it is present in only some of the source CSVs)**

In [15]:
import pandas as pd

# Reload the merged CSV from disk.
csv_path = "/mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_dataset.csv"
df = pd.read_csv(csv_path)

# Column inventory before the change.
columns_before = df.columns.tolist()
print(f" Total columns before dropping: {len(columns_before)}")
print(" Column names:", columns_before)

# Remove 'duration' when present and report which branch was taken.
has_duration = 'duration' in columns_before
if has_duration:
    df = df.drop(columns=['duration'])
print(" 'duration' column dropped." if has_duration else " 'duration' column not found.")

# Column inventory after the change.
columns_after = df.columns.tolist()
print(f" Total columns after dropping: {len(columns_after)}")
print(" Updated column names:", columns_after)


 Total columns before dropping: 6
 Column names: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms', 'duration']
 'duration' column dropped.
 Total columns after dropping: 5
 Updated column names: ['path', 'sentence', 'found', 'duration_sec', 'duration_ms']


In [16]:
# Write the frame back to csv_path without the index column, replacing the
# file that `df` was loaded from.
df.to_csv(csv_path, index=False)
print(" Updated CSV saved.")

 Updated CSV saved.


**Validating the merged audio files against the merged CSV**

In [17]:
class AudioDatasetValidator:
    """Check that a dataset CSV and its audio folder are consistent with each other.

    Attributes:
        csv_path: Path of the CSV; its ``path`` column is compared against the
            entry names in ``audio_folder``.
        audio_folder: Folder that is expected to contain the audio files.
        language: Label used only as a prefix in the printed messages.
        df: The loaded CSV, or ``None`` until ``load_csv`` has been called.
        missing_files: ``path`` values with no matching entry in ``audio_folder``
            (filled by ``check_audio_files_exist``).
    """

    def __init__(self, csv_path, audio_folder, language):
        self.csv_path = csv_path
        self.audio_folder = audio_folder
        self.language = language
        self.df = None
        self.missing_files = []

    def _require_loaded(self):
        """Fail with a clear message when the CSV has not been loaded yet."""
        if self.df is None:
            raise RuntimeError("CSV not loaded yet; call load_csv() first.")

    def load_csv(self):
        """Read ``csv_path`` into ``self.df`` and print its row count."""
        self.df = pd.read_csv(self.csv_path)
        print(f"\n [{self.language}] Loaded CSV: {self.csv_path}")
        print(f"Total rows: {len(self.df)}")

    def check_audio_files_exist(self):
        """Compare the CSV ``path`` column with the folder listing and report gaps.

        Raises:
            RuntimeError: If ``load_csv`` has not been called.
        """
        self._require_loaded()
        all_audio_files = set(os.listdir(self.audio_folder))

        # Vectorised membership test; keeps the CSV row order of the misses.
        paths = self.df['path'].astype(str)
        self.missing_files = paths[~paths.isin(all_audio_files)].tolist()

        found = len(self.df) - len(self.missing_files)
        print(f" [{self.language}] Files Found: {found}")
        print(f" [{self.language}] Missing Files: {len(self.missing_files)}")

        if self.missing_files:
            print(f"Some missing files (first 5): {self.missing_files[:5]}")

    def show_csv_statistics(self):
        """Print samples, schema, summary statistics and duration totals of the CSV.

        Raises:
            RuntimeError: If ``load_csv`` has not been called.
        """
        self._require_loaded()
        print(f"\n [{self.language}] CSV Statistics:")
        print("\n Head:")
        print(self.df.head())
        print("\n Tail:")
        print(self.df.tail())
        print("\n Info:")
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() would add a stray "None" line to the report.
        self.df.info()
        print("\n Describe:")
        print(self.df.describe())
        print("\n Unique counts per column:")
        print(self.df.nunique())
        print(f"\n Total duration (sec): {self.df['duration_sec'].sum():.2f}")
        print(f" Total duration (ms): {self.df['duration_ms'].sum():.2f}")

    def count_audio_files_by_type(self):
        """Print how many entries, ``.mp3`` files and ``.wav`` files the folder holds."""
        total_files = os.listdir(self.audio_folder)
        wav_files = [f for f in total_files if f.endswith('.wav')]
        mp3_files = [f for f in total_files if f.endswith('.mp3')]

        print(f"\n [{self.language}] Audio Folder: {self.audio_folder}")
        print(f" Total files: {len(total_files)}")
        print(f" .mp3 files: {len(mp3_files)}")
        print(f" .wav files: {len(wav_files)}")


# Punjabi dataset: validate the merged CSV against the merged audio folder.
punjabi_validator = AudioDatasetValidator(
    csv_path="/mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_dataset.csv",
    audio_folder="/mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_audios",
    language="Panjabi"
)
# Backward-compatible alias: this object used to be bound to a name left over
# from another language's notebook; keep it so any cell still using it works.
marathi_validator = punjabi_validator

punjabi_validator.load_csv()
punjabi_validator.check_audio_files_exist()
punjabi_validator.show_csv_statistics()
punjabi_validator.count_audio_files_by_type()


 [Panjabi] Loaded CSV: /mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_dataset.csv
Total rows: 360382
 [Panjabi] Files Found: 360382
 [Panjabi] Missing Files: 0

 [Panjabi] CSV Statistics:

 Head:
                        path  \
0  bashaanuvaadh_st_pa_0.wav   
1  bashaanuvaadh_st_pa_1.wav   
2  bashaanuvaadh_st_pa_2.wav   
3  bashaanuvaadh_st_pa_3.wav   
4  bashaanuvaadh_st_pa_4.wav   

                                            sentence found  duration_sec  \
0  ਮੇਰਾ ਨਾਮ ਸੰਜਨਾ ਹੈ ਮੈਂ ਤੁਹਾਨੂੰ ਆਪਣੇ ਪਿੰਡ ਦੇ ਪਾਣ...   yes          6.16   
1  ਸਾਡੇ ਪਿੰਡ ਵਿੱਚ ਕਾਫੀਆਂ ਜਗ੍ਹਾ ਖੇਤੀਬਾੜੀ ਹੁੰਦੀ ਹੈ ...   yes          5.76   
2  ਨਹਿਰ ਵਿੱਚ ਸੁੱਕਾ ਹੁੰਦਾ ਹੈ ਉੱਥੇ ਪਾਣੀ ਵੀ ਨਹੀਂ ਆਉਂ...   yes         10.48   
3  ਲੋਕਾਂ ਦਾ ਫਸਲ ਖਰਾਬ ਹੁੰਦਾ ਹੈ ਪਾਣੀ ਨਾ ਮਿਲਣ ਕਾਰਨ ਅ...   yes         14.56   
4  ਉਹਨਾਂ ਦੇ ਅੰਦਰ ਦੀ ਕਈ ਬਿਮਾਰੀਆਂ ਲੱਗ ਜਾਂਦੀਆਂ ਹਨ ਪਾ...   yes          6.96   

   duration_ms  
0       6160.0  
1       5760.0  
2      10480.0  
3      14560.0  
4       6960.0  

 Tail:
                     

**Playing a few of the merged audio files**

In [18]:
audio_folder = "/mnt/data/stt/Datasets/Datasets/Merged_Datasets/final_merged_punjabi_audios"

# Keep only the directory entries whose name ends in .wav, then report how many there are.
audio_files = list(filter(lambda name: name.endswith(".wav"), os.listdir(audio_folder)))
wav_total = len(audio_files)
print(f" Total number of audio files: {wav_total}")

 Total number of audio files: 360382


In [19]:
# Preview the first five .wav files. The order is lexicographic, so names such as
# "..._10.wav" sort before "..._2.wav". Only .wav entries are considered so that a
# stray non-audio file in the folder is never handed to the audio player.
audio_files = sorted(f for f in os.listdir(audio_folder) if f.endswith(".wav"))[:5]

for file in audio_files:
    print(f" Playing: {file}")
    # Audio is IPython.display.Audio here: that import comes after the
    # `datasets` import of the same name and therefore wins.
    display(Audio(filename=os.path.join(audio_folder, file)))

 Playing: bashaanuvaadh_spo-tut_pa_0.wav


 Playing: bashaanuvaadh_spo-tut_pa_1.wav


 Playing: bashaanuvaadh_spo-tut_pa_10.wav


 Playing: bashaanuvaadh_spo-tut_pa_100.wav


 Playing: bashaanuvaadh_spo-tut_pa_1000.wav
