In [1]:
import os
import pandas as pd
from pydub import AudioSegment
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:

def create_audio_chunks(input_csv, ogg_folder, output_base, chunk_duration=5000, validation_ratio=0.2):
    """
    Split .ogg recordings into fixed-length .wav chunks, sorted into
    train/validation folders named after a numeric class index.

    The CSV is split per recording (not per chunk) with a stratified
    train/validation split, so all chunks of one recording land in the same
    split. Each recording is expected at
    ``<ogg_folder>/<primary_label>/<basename of filename>`` and its chunks are
    written to ``<output_base>/<split>/<numeric_label>/<stem>_chunk<i>.wav``.

    A trailing partial chunk (including a recording shorter than one chunk)
    is padded with silence up to ``chunk_duration`` instead of being dropped.
    Missing or unreadable recordings are reported with ``print`` and skipped.

    Note: chunks are exported as WAV, which typically needs far more disk
    space than the OGG sources; make sure the destination has enough room.

    Parameters:
        input_csv (str): Path to the train.csv file. Must contain the columns
            'primary_label', 'filename', 'scientific_name' and 'common_name'.
        ogg_folder (str): Folder where original .ogg files are stored.
        output_base (str): Base folder where 'train' and 'validation' folders will be created.
        chunk_duration (int): Duration of each chunk in milliseconds (default is 5000ms or 5 seconds).
        validation_ratio (float): Fraction of recordings to reserve for validation.

    Returns:
        label_to_info (dict): Maps each numeric label (0, 1, 2, ... in sorted
            'primary_label' order) to a dict with keys 'label',
            'scientific_name' and 'common_name'.

    Raises:
        ValueError: If chunk_duration is not positive.
    """
    if chunk_duration <= 0:
        raise ValueError(
            f"chunk_duration must be a positive number of milliseconds, got {chunk_duration}"
        )

    df = pd.read_csv(input_csv)

    # Numeric class index = position of the label in sorted order.
    labels = sorted(df['primary_label'].unique())
    label_to_index = {label: idx for idx, label in enumerate(labels)}

    # One lookup table holding the first row of every label, instead of
    # re-filtering the whole frame twice per label.
    first_rows = df.drop_duplicates(subset='primary_label').set_index('primary_label')
    label_to_info = {
        idx: {
            'label': label,
            'scientific_name': first_rows.at[label, 'scientific_name'],
            'common_name': first_rows.at[label, 'common_name'],
        }
        for label, idx in label_to_index.items()
    }

    # Split by recording so chunks of one file never straddle both splits.
    train_df, val_df = train_test_split(
        df, test_size=validation_ratio, stratify=df['primary_label'], random_state=42
    )
    splits = (('train', train_df), ('validation', val_df))

    # Create every <split>/<numeric_label> folder up front.
    for split, _ in splits:
        base_dir = os.path.join(output_base, split)
        os.makedirs(base_dir, exist_ok=True)
        for idx in label_to_index.values():
            os.makedirs(os.path.join(base_dir, str(idx)), exist_ok=True)

    def process_and_save(row, split):
        """Chunk one recording and export the chunks into its split/label folder."""
        label = row['primary_label']
        file_name = os.path.basename(row['filename'])

        # str() keeps this working when pandas parsed the labels as integers.
        file_path = os.path.join(ogg_folder, str(label), file_name).replace("\\", "/")

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            return

        try:
            # pydub decodes ogg through ffmpeg, which must be installed.
            audio = AudioSegment.from_file(file_path, format="ogg")
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {file_path}: {e}")
            return

        out_dir = os.path.join(output_base, split, str(label_to_index[label]))
        stem = os.path.splitext(file_name)[0]

        # Ceiling division: the trailing partial chunk is kept and padded below.
        num_chunks = -(-len(audio) // chunk_duration)

        for i in range(num_chunks):
            start_ms = i * chunk_duration
            chunk = audio[start_ms:start_ms + chunk_duration]

            shortfall = chunk_duration - len(chunk)
            if shortfall > 0:
                # Match the recording's frame rate so padding does not force a resample.
                chunk = chunk + AudioSegment.silent(duration=shortfall, frame_rate=audio.frame_rate)

            chunk.export(os.path.join(out_dir, f"{stem}_chunk{i}.wav"), format="wav")

    # One progress tick per recording, across both splits.
    with tqdm(total=len(train_df) + len(val_df), desc="Processing Files", unit="file") as pbar:
        for split, split_df in splits:
            for _, row in split_df.iterrows():
                process_and_save(row, split)
                pbar.update(1)

    return label_to_info

In [3]:
# Where the source data lives and where the chunked dataset should go
input_csv = r"D:/Repos/birdclef-2025/train.csv"
ogg_folder = r"D:/Repos/birdclef-2025/train_audio"
output_base = "data"  # created on demand if it is missing

# Chunk every recording and keep the numeric-label -> species lookup
label_to_info = create_audio_chunks(
    input_csv=input_csv,
    ogg_folder=ogg_folder,
    output_base=output_base,
)


Processing Files:  60%|██████    | 17249/28564 [41:33<27:15,  6.92file/s]  


OSError: [Errno 28] No space left on device

In [None]:
# Show the lookup returned by create_audio_chunks: numeric label ->
# {'label', 'scientific_name', 'common_name'}. Requires the dataset cell
# above to have finished successfully, otherwise label_to_info is undefined.
print(label_to_info)