# BirdCLEF+ 2025 Submission Baseline

## Import libraries

In [1]:
import configparser
import os
import warnings
from pathlib import Path
import time
from tqdm import tqdm
import concurrent.futures

import numpy as np
import pandas as pd
import timm
import torch
import torchaudio
import torchaudio.transforms as at
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

warnings.filterwarnings('ignore')

## Config

In [2]:
# Single in-memory settings store for the notebook; later cells read values
# back with explicit int()/float()/bool() casts.
config = configparser.ConfigParser()

In [3]:
# Project identity and the root folder everything else is derived from.
config['project'] = {
    'name': 'birdclef_2025',
    'project_path': '/mnt/d/Projects_D/BirdCLEF_2025',
}

# Data locations, all rooted at the project path.
config['data'] = {
    'data_path': f"{config['project']['project_path']}/data",
    'workspace_path': f"{config['project']['project_path']}/data",
    'birdclef_2025': f"{config['project']['project_path']}/data/birdclef_2025",
    'processed_audio': f"{config['project']['project_path']}/data/audio_processed",
}

# Length (seconds) and sample rate (Hz) of one model input segment.
config['audio_params'] = {
    'wav_sec': 5,
    'sample_rate': 32000,
}

# Segment length in samples (sample rate * seconds) and the torchaudio I/O backend.
config['audio_preprocessing'] = {
    'min_segment': 32000 * 5,
    'backend': 'soundfile',
}

# Arguments forwarded to torchaudio's MelSpectrogram transform.
config['mel_spectrogram'] = {
    'n_fft': 1024,
    'win_length': 1024,
    'hop_length': 512,
    'n_mels': 80,
    'f_min': 20,
    'f_max': 15000,
    'mel_scale': 'htk',
}

# Which timm architecture to build and where its fold checkpoints live.
config['model'] = {
    'model_backbone': 'efficientnet',
    'model_desc': 'efficientnet_b0',
    'model_training': 'cv',
    'model_ver': 'v1',
    'model_path': f"{config['project']['project_path']}/models/efficientnet_b0_cv-v1",
}

# Inference-time settings: batching, segment cache, output folder, TTA switches.
config['testing'] = {
    'debug': 0,
    'batch_size': 60,
    'num_workers': 4,
    'test_audio': f"{config['data']['workspace_path']}/test_audio/segments",
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # Number of wav_sec-long segments per 60 seconds of audio.
    'chunks_size': 60 // int(config['audio_params']['wav_sec']),
    'submission_path': f"{config['data']['workspace_path']}/submissions",
    'use_tta': 0,
    'tta_count': 3,
}

In [4]:
# Create the segment cache and the submission folder up front so later cells can write into them.
for dir_key in ('test_audio', 'submission_path'):
    os.makedirs(config['testing'][dir_key], exist_ok=True)

In [5]:
# Echo the resolved dataset, checkpoint and segment-cache paths for a quick visual check.
print(config['data']['birdclef_2025'], config['model']['model_path'], config['testing']['test_audio'])

/mnt/d/Projects_D/BirdCLEF_2025/data/birdclef_2025 /mnt/d/Projects_D/BirdCLEF_2025/models/efficientnet_b0_cv-v1 /mnt/d/Projects_D/BirdCLEF_2025/data/test_audio/segments


## Load test data

In [6]:
# Use the competition test soundscapes when they are present; otherwise fall
# back to a fixed slice of the training soundscapes and flag the run as debug.
# Listings are sorted because os.listdir returns entries in arbitrary order:
# without sorting, the [:700] debug subset (and the row order of the final
# submission) could differ between runs and machines.
test_dir = '/test_soundscapes'
test_files = sorted(
    file
    for file in os.listdir(config['data']['birdclef_2025'] + test_dir)
    if file.endswith('.ogg')
)

if not test_files:
    test_dir = '/train_soundscapes'
    test_files = sorted(
        file
        for file in os.listdir(config['data']['birdclef_2025'] + test_dir)
        if file.endswith('.ogg')
    )[:700]
    # ConfigParser values are strings, hence '1' rather than 1.
    config['testing']['debug'] = '1'

In [7]:
# The debug flag is stored as a string ('0'/'1'), so cast through int before bool.
print(f"Is Debug: {bool(int(config['testing']['debug']))}")

Is Debug: True


## Dataset

In [8]:
class BirdclefTestDataset(torch.utils.data.Dataset):
    """Serve standardised log-mel spectrograms for pre-cut test segments.

    Each row of ``df`` names one audio segment (column ``row_id``) stored in
    ``config['testing']['test_audio']``. ``__getitem__`` returns
    ``(spectrograms, filename)``; ``spectrograms`` has a leading axis of length
    ``tta_count`` when TTA is enabled and of length 1 otherwise.
    """

    def __init__(self, df):
        self.df = df
        self.config = config
        self.input_path = Path(config['testing']['test_audio'])
        self.min_segment = int(config['audio_preprocessing']['min_segment'])
        self.use_tta = bool(int(config['testing']['use_tta']))
        self.tta_count = int(config['testing']['tta_count'])

        # Build the transform once: its parameters never change between items,
        # and constructing it per sample recomputes the window and mel
        # filterbank every time.
        params = config['mel_spectrogram']
        self.mel_transform = at.MelSpectrogram(
            sample_rate=int(config['audio_params']['sample_rate']),
            n_fft=int(params['n_fft']),
            win_length=int(params['win_length']),
            hop_length=int(params['hop_length']),
            n_mels=int(params['n_mels']),
            f_min=float(params['f_min']),
            f_max=float(params['f_max']),
            mel_scale=params['mel_scale']
        )

    @classmethod
    def normalize_std(cls, spec, eps=1e-23):
        """Standardise ``spec`` to zero mean and unit standard deviation.

        ``eps`` is added to the standard deviation to avoid dividing by zero.
        """
        mean = torch.mean(spec)
        std = torch.std(spec)
        return (spec - mean) / (std + eps)

    def get_mel_spectrogram(self, audio_signal):
        """Return the natural-log mel spectrogram of ``audio_signal``."""
        return torch.log(self.mel_transform(audio_signal))

    def apply_tta(self, spec, tta_idx):
        """Return the test-time-augmentation view of ``spec`` for ``tta_idx``.

        Index 0 is the unmodified spectrogram; any index above 3 also returns
        it unchanged.
        """
        if tta_idx == 0:
            return spec  # Original
        elif tta_idx == 1:
            return torch.flip(spec, dims=[2])  # Reverse along dim 2 (time axis for (C, mel, time) input)
        elif tta_idx == 2:
            return torch.flip(spec, dims=[1])  # Reverse along dim 1 (mel axis for (C, mel, time) input)
        elif tta_idx == 3:
            # NOTE(review): rot90 swaps dims 1 and 2, so this view only stacks
            # with the others in __getitem__ when the spectrogram is square.
            # Unreachable with the configured tta_count of 3.
            return torch.rot90(spec, k=1, dims=[1, 2])  # 90-degree rotation
        else:
            return spec

    def _condition_signal(self, sig):
        """Fit ``sig`` to ``min_segment`` samples, peak-normalise and dither it."""
        n_samples = sig.shape[-1]
        if n_samples < self.min_segment:
            # Zero-pad short segments; otherwise the dither below cannot
            # broadcast and batches would contain unequal lengths.
            sig = torch.nn.functional.pad(sig, (0, self.min_segment - n_samples))
        elif n_samples > self.min_segment:
            sig = sig[..., :self.min_segment]

        peak = torch.max(torch.abs(sig))
        if peak > 0:
            # Skipped for an all-zero clip: 0 / 0 would fill the sample with NaN.
            sig = sig / peak
        # Low-level uniform dither (amplitude about -96 dB re. full scale);
        # intended to keep the mel energies non-zero so the log stays finite.
        return sig + 1.5849e-05 * (torch.rand(1, self.min_segment) - 0.5)

    def __getitem__(self, index):
        """Load segment ``index`` and return ``(spectrogram views, filename)``."""
        filename = self.df.iloc[index].row_id
        sig, _ = torchaudio.load(self.input_path / filename, backend=self.config['audio_preprocessing']['backend'])
        sig = self._condition_signal(sig)

        mel_spec = self.get_mel_spectrogram(sig)
        mel_spec = self.normalize_std(mel_spec)

        if self.use_tta:
            mel_specs = [self.apply_tta(mel_spec, tta_idx) for tta_idx in range(self.tta_count)]
            return torch.stack(mel_specs), filename
        else:
            return mel_spec.unsqueeze(0), filename

    def __len__(self):
        return len(self.df)

## Define model

In [9]:
class BirdclefModel(pl.LightningModule):
    """Lightning wrapper around a timm classifier fed with single-channel inputs."""

    def __init__(self, class_labels):
        super().__init__()
        # Lets Lightning record the constructor arguments as hyper-parameters.
        self.save_hyperparameters()
        self.class_labels = class_labels
        self.num_classes = len(class_labels)
        self.model_backbone = config['model']['model_backbone']
        self.model = self.get_model()

    def get_model(self):
        """Build the timm network named in the config, without pretrained weights."""
        return timm.create_model(
            config['model']['model_desc'],
            pretrained=False,
            num_classes=self.num_classes,
        )

    def forward(self, x):
        """Triplicate ``x`` along dim 1 and run the backbone on the result."""
        three_channel = torch.cat([x, x, x], dim=1)
        return self.model(three_channel)

In [10]:
class EnsembleModel(pl.LightningModule):
    """Average the outputs of several models, optionally over TTA views too.

    Args:
        models: iterable of ``torch.nn.Module`` instances (e.g. one per
            cross-validation fold) that all accept the same 4D input.
    """

    def __init__(self, models):
        super().__init__()
        # ModuleList registers the members as sub-modules, so .to(), .eval()
        # and state_dict() on the ensemble reach them; a plain Python list
        # is invisible to those calls.
        self.models = torch.nn.ModuleList(models)
        self.use_tta = bool(int(config['testing']['use_tta']))
        self.tta_count = int(config['testing']['tta_count'])

    def forward(self, x):
        """Return the model-averaged outputs for ``x``.

        ``x`` is either 5D ``(batch, views, c, h, w)`` or already 4D
        ``(batch, c, h, w)``. With TTA enabled the outputs are averaged over
        the ``views`` axis before being averaged over the models.
        """
        if x.dim() == 5:
            batch_size, n_views = x.shape[0], x.shape[1]
            x = x.reshape(-1, *x.shape[2:])  # Fold the view axis into the batch axis
        else:
            batch_size, n_views = x.size(0), 1

        # [n_models, batch * views, n_classes]
        preds = torch.stack([model(x) for model in self.models], dim=0)

        if self.use_tta:
            # Un-fold the view axis and average over it.
            preds = preds.view(len(self.models), batch_size, n_views, -1).mean(dim=2)

        return preds.mean(dim=0)  # Average across models

## Labels

In [11]:
# The position of each label is used as the model output index for that species,
# so the order must be reproducible. os.listdir returns entries in arbitrary
# order; sorting pins the mapping.
# NOTE(review): this must match the label order used when the checkpoints were
# trained — confirm against the training notebook.
class_labels = sorted(os.listdir(config['data']['birdclef_2025'] + '/train_audio'))
num_classes = len(class_labels)

In [12]:
# Display the number of entries found under train_audio.
num_classes

206

## Process test dataset

In [13]:
chunks_size = int(config['testing']['chunks_size'])

# One list of segment filenames per source file, keyed by the file's position
# in test_files. Each worker only appends to its own list.
test_segment_files_dict = {index: [] for index in range(len(test_files))}


def process_file(file, index):
    """Split one soundscape into ``chunks_size`` pieces and cache them on disk.

    Args:
        file: soundscape filename inside the current ``test_dir``.
        index: position of ``file`` in ``test_files``; selects the list in
            ``test_segment_files_dict`` that receives the segment filenames.

    Segments are named ``<name>_<end second>.ogg`` and are only written when
    the target file does not exist yet.
    """
    test_filename = file.split('.')[0]
    backend = config['audio_preprocessing']['backend']
    wav_sec = int(config['audio_params']['wav_sec'])
    sig, sr = torchaudio.load(config['data']['birdclef_2025'] + test_dir + '/' + file, backend=backend)
    chunks = torch.chunk(sig, chunks_size, dim=1)
    for i, chunk in enumerate(chunks):
        test_segment_filename = config['testing']['test_audio'] + '/' + test_filename + '_' + str((i + 1) * wav_sec) + '.ogg'
        if not os.path.exists(test_segment_filename):
            torchaudio.save(test_segment_filename, chunk, sr, backend=backend)
        test_segment_files_dict[index].append(test_segment_filename.split('/')[-1])


# Use ThreadPoolExecutor to parallelize the processing
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Drain the iterator: Executor.map re-raises a worker's exception only when
    # its result is retrieved, so an unconsumed map silently hides failed files.
    list(executor.map(process_file, test_files, range(len(test_files))))

In [14]:
# Flatten the per-file segment lists into one list, in test_files order.
test_segment_files = [
    segment_name
    for file_index in range(len(test_files))
    for segment_name in test_segment_files_dict[file_index]
]

In [15]:
# Sanity check: reload the first cached segment and show its shape and sample rate.
if test_segment_files:
    first_segment_path = config['testing']['test_audio'] + '/' + test_segment_files[0]
    sig, sr = torchaudio.load(first_segment_path)
    print(sig.shape, sr)

torch.Size([1, 160000]) 32000


In [16]:
# One row per cached segment; the row_id column holds the segment filename (with extension).
test_df = pd.DataFrame(test_segment_files, columns=['row_id'])

In [17]:
# Number of non-null segment rows.
test_df.count()

row_id    8400
dtype: int64

In [18]:
# Preview the first few segment filenames.
test_df.head()

Unnamed: 0,row_id
0,H02_20230420_074000_5.ogg
1,H02_20230420_074000_10.ogg
2,H02_20230420_074000_15.ogg
3,H02_20230420_074000_20.ogg
4,H02_20230420_074000_25.ogg


## Load model

In [19]:
# Load one checkpoint per cross-validation fold. Checkpoint files are named
# <model_desc>_<model_training>_fold<k>-<model_ver>.ckpt inside model_path.
fold_models = []
for fold in range(5):
    checkpoint_path = os.path.join(
        config['model']['model_path'],
        config['model']['model_desc'] + '_' + config['model']['model_training'] + f'_fold{fold}' + '-' + config['model']['model_ver'] + '.ckpt'
    )
    # map_location='cpu' loads the weights straight onto the CPU, where
    # inference runs, regardless of the device the checkpoint was saved from.
    model = BirdclefModel.load_from_checkpoint(checkpoint_path, map_location='cpu', class_labels=class_labels)
    model = model.to(torch.float32).to('cpu').eval()
    fold_models.append(model)

# Create ensemble model with 5-fold models
ensemble_model = EnsembleModel(fold_models)
ensemble_model = ensemble_model.to(torch.float32).to('cpu').eval()

## Make predictions

In [20]:
# Wrap the segment table in a dataset and iterate it in order, keeping the last partial batch.
test_dataset = BirdclefTestDataset(test_df)
test_loader = DataLoader(
    test_dataset,
    batch_size=int(config['testing']['batch_size']),
    shuffle=False,
    num_workers=int(config['testing']['num_workers']),
    drop_last=False,
)

In [21]:
# Column-oriented accumulator: one list of row ids plus one list of scores per label.
pred = {'row_id': []}
pred.update({species_code: [] for species_code in class_labels})

with torch.no_grad():
    for mel_spec, filename in tqdm(test_loader):
        logits = ensemble_model(mel_spec)
        preds = torch.softmax(logits, dim=1).cpu().numpy()

        # Row ids are the segment filenames up to the first dot.
        pred['row_id'].extend(file.split('.')[0] for file in filename)

        # Column i of the batch output belongs to the i-th label.
        for i, label in enumerate(class_labels):
            pred[label].extend(preds[:, i])

100%|██████████| 140/140 [06:41<00:00,  2.87s/it]


## Prepare submission file

In [22]:
# Turn the accumulated columns into a table: row_id followed by one score column per label.
submission = pd.DataFrame(pred)

In [23]:
# Expect (number of segments, number of labels + 1 for row_id).
submission.shape

(8400, 207)

In [24]:
# Preview the first few prediction rows.
submission.head()

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,H02_20230420_074000_5,0.001099,0.000514,0.002223,0.000424,0.002499,0.003292,0.009536,0.00174,0.001652,...,0.000682,0.000387,8.4e-05,0.000807,9.3e-05,4.5e-05,8e-06,0.004755,5.3e-05,4.7e-05
1,H02_20230420_074000_10,0.001678,0.000699,0.00431,0.001935,0.003054,0.003054,0.017736,0.001351,0.004202,...,0.000462,0.000532,0.000251,0.00068,4.3e-05,2.3e-05,1.7e-05,0.011091,3.1e-05,6.4e-05
2,H02_20230420_074000_15,0.00154,0.000458,0.002793,0.001847,0.001785,0.002368,0.01482,0.001194,0.002635,...,0.001386,0.00233,0.001292,0.001194,0.000362,0.000123,8.1e-05,0.010977,0.000123,0.000139
3,H02_20230420_074000_20,0.002555,0.000774,0.004188,0.002378,0.003967,0.002063,0.022302,0.001989,0.006783,...,0.000763,0.000827,0.00061,0.000447,0.000102,9.2e-05,2.3e-05,0.012044,0.000107,7e-05
4,H02_20230420_074000_25,0.002636,0.001123,0.004438,0.001683,0.003998,0.004953,0.022785,0.002362,0.007482,...,0.001214,0.002023,0.000922,0.000471,0.000294,5.2e-05,5.5e-05,0.010885,3e-05,8.6e-05


In [25]:
# Encode the model name, training scheme, version and TTA flag in the output file name.
submission_name = (
    f"{config['model']['model_desc']}-{config['model']['model_training']}-"
    f"{config['model']['model_ver']}-use_tta_{int(config['testing']['use_tta'])}-submission.csv"
)
submission.to_csv(config['testing']['submission_path'] + '/' + submission_name, index=False)

## End