# BirdClef+ 2025 Training Baseline

## Import libraries

In [1]:
import os
import time
import configparser
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torchaudio
import torchaudio.transforms as at
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import timm
import wandb

from score.metric import score

import warnings

warnings.filterwarnings('ignore')

## Config

In [2]:
config = configparser.ConfigParser()

In [8]:
config['project'] = {
    'name': 'birdclef_2025',
    'project_path': '/mnt/d/Projects_D/BirdCLEF_2025/'
}

config['data'] = {
    'data_path': config['project']['project_path'] + '/data',
    'birdclef_2025': config['project']['project_path'] + '/data/birdclef_2025',
    'processed_audio':config['project']['project_path'] + '/data/audio_processed',
}

config['audio_params'] = {
    'wav_sec': 5,
    'sample_rate': 32000,
}

config['audio_preprocessing'] = {
    'min_segment': 32000 * 5,
    'backend': 'soundfile'
}

config['mel_spectrogram'] = {
    'n_fft': 1024,
    'win_length': 1024,
    'hop_length': 512,
    'n_mels': 80,
    'f_min': 20,
    'f_max': 15000,
    'mel_scale': 'htk',
}

config['model'] = {
    'model_backbone': 'efficientnet',
    'model_desc': 'efficientnet_b1',
    'model_ver': 'v1',
    'model_path': config['project']['project_path'] + '/models'
}

config['training'] = {
    'processed_audio': config['data']['processed_audio'] + '/v1',
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'epochs': 8,
    'batch_size': 40,
    'num_workers': 8,
    'learning_rate': 0.001
}

## WandB

In [9]:
config['project']['project_path'] + '/wandb'

'/mnt/d/Projects_D/BirdCLEF_2025//wandb'

In [10]:
os.path.exists(config['project']['project_path'] + '/wandb')

True

In [11]:
wandb.init(project=config['project']['name'], name=f"{config['model']['model_desc']}-{config['model']['model_ver']}", dir=config['project']['project_path'] + '/wandb')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msiddhantmahalle[0m ([33msiddhant-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Load data

In [12]:
meta = pd.read_csv(config['data']['birdclef_2025'] + '/train.csv')
taxonomy = pd.read_csv(config['data']['birdclef_2025'] + '/taxonomy.csv')

In [13]:
meta.head()

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0


In [14]:
class_labels = meta.primary_label.unique()
num_classes = len(class_labels)

## Utils

## Dataset

In [15]:
class BirdclefDataset(Dataset):
    def __init__(self, df, class_labels, mode='train'):
        self.df = df
        self.class_labels = class_labels
        self.mode = mode
        self.config = config
        self.input_path = Path(config['data']['processed_audio']) / 'v1'
        self.min_segment = int(config['audio_preprocessing']['min_segment'])

    @classmethod
    def normalize_std(cls, spec, eps=1e-23):
        mean = torch.mean(spec)
        std = torch.std(spec)
        return (spec - mean) / (std + eps)

    def get_mel_spectrogram(self, audio_signal):
        params = self.config['mel_spectrogram']
        mel_spectrogram = at.MelSpectrogram(
            sample_rate=int(self.config['audio_params']['sample_rate']),
            n_fft=int(params['n_fft']),
            win_length=int(params['win_length']),
            hop_length=int(params['hop_length']),
            n_mels=int(params['n_mels']),
            f_min=float(params['f_min']),
            f_max=float(params['f_max']),
            mel_scale=params['mel_scale']
        )
        mel_spec = mel_spectrogram(audio_signal)
        return torch.log(mel_spec)

    def __getitem__(self, index):
        filename = self.df.iloc[index].filename
        sig, _ = torchaudio.load(self.input_path / filename, backend=self.config['audio_preprocessing']['backend'])
        sig = sig / torch.max(torch.abs(sig))
        sig = sig + 1.5849e-05 * (torch.rand(1, self.min_segment) - 0.5)

        mel_spec = self.get_mel_spectrogram(sig)
        mel_spec = self.normalize_std(mel_spec)

        target = self.df.iloc[index].primary_label
        y = np.array([1 if lbl == target else 0 for lbl in self.class_labels])

        return mel_spec, torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.df)

## Model

In [16]:
class BirdclefModel(pl.LightningModule):
    def __init__(self, class_labels):
        super(BirdclefModel, self).__init__()
        self.save_hyperparameters()
        self.class_labels = class_labels
        self.num_classes = len(self.class_labels)
        self.model_backbone = config['model']['model_backbone']
        self.model = self.get_model()
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.hparams.learning_rate = float( config['training']['learning_rate'])

        # For storing predictions and labels
        self.pred_train = []
        self.label_train = []
        self.pred_val = []
        self.label_val = []

    def get_model(self):

        model_obj = timm.create_model(config['model']['model_desc'], pretrained=True, num_classes=self.num_classes)
        return model_obj

    def forward(self, x):
        x = torch.cat((x, x, x), 1)
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = self.loss_fn(preds, y)

        self.label_train.append(y.detach().cpu().numpy())
        self.pred_train.append(torch.softmax(preds, dim=1).detach().cpu().numpy())

        self.log("train_loss", loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = self.loss_fn(preds, y)

        self.label_val.append(y.detach().cpu().numpy())
        self.pred_val.append(torch.softmax(preds, dim=1).detach().cpu().numpy())

        self.log("val_loss", loss, on_epoch=True)
        return loss

    def on_train_epoch_end(self):
        auc = self.cal_score(self.label_train, self.pred_train)
        self.log("train_auc", auc, on_epoch=True)
        self.pred_train.clear()
        self.label_train.clear()

    def on_validation_epoch_end(self):
        auc = self.cal_score(self.label_val, self.pred_val)
        self.log("val_auc", auc, on_epoch=True)
        self.pred_val.clear()
        self.label_val.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss"
            }
        }

    def cal_score(self, label, pred):
        label = np.concatenate(label)
        pred = np.concatenate(pred)

        label_df = pd.DataFrame(label>0.5, columns=self.class_labels)
        pred_df = pd.DataFrame(pred, columns=self.class_labels)
        label_df['id'] = np.arange(len(label_df))
        pred_df['id'] = np.arange(len(pred_df))

        return score(label_df, pred_df, row_id_column_name='id')

## Training

In [17]:
train_df, val_df = train_test_split(meta, test_size=0.2, random_state=42)

train_dataset = BirdclefDataset(train_df, mode='train', class_labels=class_labels)
train_loader = DataLoader(train_dataset, batch_size=int(config['training']['batch_size']), shuffle=True, num_workers=int(config['training']['num_workers']),drop_last=True)

val_dataset = BirdclefDataset(val_df, mode='val', class_labels=class_labels)
val_loader = DataLoader(val_dataset, batch_size=int(config['training']['batch_size']), shuffle=False, num_workers=int(config['training']['num_workers']),drop_last=True)

In [18]:
model = BirdclefModel(class_labels)

model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

In [19]:
checkpoint_callback = ModelCheckpoint(
    dirpath=config['model']['model_path'] + '/'+ config['model']['model_desc'] + '-' + config['model']['model_ver'],
    filename='{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    monitor='val_loss',
    mode='min'
)

early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")

In [20]:
logger = WandbLogger(log_model='all')

In [21]:
trainer = pl.Trainer(
    max_epochs=int(config['training']['epochs']),
    devices=1,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [22]:
trainer.fit(model, train_loader, val_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | model   | EfficientNet      | 6.8 M  | train
1 | loss_fn | BCEWithLogitsLoss | 0      | train
------------------------------------------------------
6.8 M     Trainable params
0         Non-trainable params
6.8 M     Total params
27.108    Total estimated model params size (MB)
474       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.


Restoring states from the checkpoint path at /mnt/d/Projects_D/BirdCLEF_2025//models/efficientnet_b1-v1/epoch=07-val_loss=0.01.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | model   | EfficientNet      | 6.8 M  | train
1 | loss_fn | BCEWithLogitsLoss | 0      | train
------------------------------------------------------
6.8 M     Trainable params
0         Non-trainable params
6.8 M     Total params
27.108    Total estimated model params size (MB)
474       Modules in train mode
0         Modules in eval mode
Restored all states from the checkpoint at /mnt/d/Projects_D/BirdCLEF_2025//models/efficientnet_b1-v1/epoch=07-val_loss=0.01.ckpt


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [16]:
def extract_metrics(metrics):
    metrics = {k:v.item() for k, v in metrics.items()}
    return metrics

In [17]:
metrics = extract_metrics(trainer.callback_metrics)

In [22]:
for k, v in metrics.items():
    print(k + ":", v)

train_loss: 0.004555719438940287
train_loss_step: 0.0027519643772393465
val_loss: 0.011869397945702076
val_auc: 0.9654795527458191
train_loss_epoch: 0.004555719438940287
train_auc: 0.9978811144828796


In [23]:
wandb.finish()

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█████
train_auc,▁▆▇▇████
train_loss_epoch,█▄▃▃▂▂▁▁
train_loss_step,█▇▅▅▄▄▅▄▄▃▄▂▃▃▄▃▃▄▃▂▃▃▄▂▂▂▂▂▁▂▂▂▂▁▁▂▁▁▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_auc,▁▆▆▇▇███
val_loss,█▄▃▂▁▁▂▂

0,1
epoch,7.0
train_auc,0.99788
train_loss_epoch,0.00456
train_loss_step,0.00511
trainer/global_step,4567.0
val_auc,0.96548
val_loss,0.01187
