# BirdCLEF+ 2025 Data Preprocessing V1

## Import libraries

In [19]:
import configparser
import os
import warnings
from pathlib import Path

import pandas as pd
import torch
import torchaudio

warnings.filterwarnings('ignore')

## Config

In [20]:
# Interpolation is disabled so that a literal '%' inside a stored value
# (e.g. a filesystem path) is read back verbatim instead of raising
# InterpolationSyntaxError; no value in this notebook uses %(name)s syntax.
config = configparser.ConfigParser(interpolation=None)

In [33]:
# NOTE: ConfigParser stores every value as a string, so numeric settings must
# be converted back (int(...) / .getint(...)) wherever they are read.

config['project'] = {
    'name': 'birdclef_2025',
    # The project root is taken to be the parent of the current working directory.
    'project_path': str(Path(os.getcwd()).parent)
}

config['data'] = {
    'data_path': config['project']['project_path'] + '/data',
    'birdclef_2025': config['project']['project_path'] + '/data/birdclef_2025',
    'processed_audio': config['project']['project_path'] + '/data/audio_processed'
}

config['audio_params'] = {
    'wav_sec': 5,
    'sample_rate': 32000,
}

config['audio_preprocessing'] = {
    # Fixed clip length in samples. Derived from audio_params (instead of
    # repeating 32000 * 5) so the two settings cannot drift apart.
    'min_segment': (
        config['audio_params'].getint('sample_rate')
        * config['audio_params'].getint('wav_sec')
    ),
    'backend': 'soundfile'
}

## Load data

In [34]:
# Read the training metadata and the taxonomy table from the competition folder.
birdclef_dir = config['data']['birdclef_2025']
meta = pd.read_csv(f'{birdclef_dir}/train.csv')
taxonomy = pd.read_csv(f'{birdclef_dir}/taxonomy.csv')

In [35]:
# Preview the first rows of the metadata loaded from train.csv.
meta.head()

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0


## Data Preprocessing

In [36]:
# Input folder with the raw recordings, and the "v1" output folder that
# receives the fixed-length clips.
train_audio_path = f"{config['data']['birdclef_2025']}/train_audio"
audio_processed_path = f"{config['data']['processed_audio']}/v1"

In [37]:
# Create the output folder (and any missing parents); a no-op if it already exists.
Path(audio_processed_path).mkdir(parents=True, exist_ok=True)

In [46]:
def crop_and_save(index):
    """Crop or zero-pad one training recording to a fixed length and save it.

    Loads the audio file named in row ``index`` of ``meta`` from
    ``train_audio_path``, forces it to exactly ``min_segment`` samples
    (keeping the start of longer clips, right-padding shorter ones with
    zeros) and writes the result to the same relative filename under
    ``audio_processed_path``.

    Args:
        index: Positional (``iloc``) row index into ``meta``.
    """
    # Config values are stored as strings; convert them once up front.
    backend = config['audio_preprocessing']['backend']
    min_segment = int(config['audio_preprocessing']['min_segment'])
    sample_rate = int(config['audio_params']['sample_rate'])

    filename = meta.iloc[index].filename
    sig, source_rate = torchaudio.load(train_audio_path + '/' + filename, backend=backend)

    # The clip is saved with the configured sample rate, so bring the signal
    # to that rate first; otherwise pitch and duration would silently change.
    if source_rate != sample_rate:
        sig = torchaudio.functional.resample(sig, source_rate, sample_rate)

    # Right-pad short clips with silence. new_zeros matches the signal's
    # channel count, dtype and device, so multi-channel files work too.
    missing = min_segment - sig.shape[1]
    if missing > 0:
        sig = torch.cat([sig, sig.new_zeros(sig.shape[0], missing)], dim=1)

    output_name = audio_processed_path + '/' + filename
    # exist_ok avoids the exists()/mkdir() race, and dirname() also handles
    # filenames that have no sub-directory component.
    os.makedirs(os.path.dirname(output_name), exist_ok=True)

    torchaudio.save(uri=output_name, src=sig[:, :min_segment], sample_rate=sample_rate, backend=backend)

In [47]:
# Run the crop/pad step for every row position in the metadata table.
for row_position in range(meta.shape[0]):
    crop_and_save(row_position)