# **BirdCLEF 2025 Inference Notebook**
This notebook runs inference on BirdCLEF 2025 test soundscapes and generates a submission file. It supports both single model inference and ensemble inference with multiple models. You can find the pre-processing and training processes in the following notebooks:

- [Transforming Audio-to-Mel Spec. | BirdCLEF'25](https://www.kaggle.com/code/kadircandrisolu/transforming-audio-to-mel-spec-birdclef-25)  
- [EfficientNet B0 Pytorch [Train] | BirdCLEF'25](https://www.kaggle.com/code/kadircandrisolu/efficientnet-b0-pytorch-train-birdclef-25)

**Features**
- Audio Preprocessing
- Test-Time Augmentation (TTA)

In [1]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path

import copy                 
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
from tqdm.auto import tqdm


warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [2]:
class CFG:
    """Static configuration for the inference notebook.

    All values are class attributes; `cfg = CFG()` below is the instance that
    the rest of the notebook reads from.
    """

    # Input locations (Kaggle dataset mounts)
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
    model_path = '/kaggle/input/best-test'

    # Audio & mel parameters
    FS = 32_000           # sample rate passed to librosa
    WINDOW_SIZE = 5       # segment length in seconds
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14_000
    TARGET_SHAPE = (256, 256)

    # Model / training options
    in_channels = 1
    pretrained = False    # no pretrained download; weights come from the checkpoints
    dropout_rate = 0.5
    mixup_alpha = 0.0

    # Feature width per time step for the BiLSTM model. BiLSTM.forward flattens
    # a [B, C, H, W] input to [B, H, C*W], so this must equal channels * width.
    # BiLSTM reads cfg.input_dim directly; without it construction fails.
    # NOTE(review): assumes TARGET_SHAPE is (rows, cols); identical either way
    # for the square shape used here.
    input_dim = in_channels * TARGET_SHAPE[1]

    # Inference
    device = 'cpu'        # change to 'cuda' if available
    batch_size = 16
    use_tta = False
    tta_count = 3
    threshold = 0.5

    # Debug
    debug = False
    debug_count = 3

cfg = CFG()

In [3]:
# Report the inference device, then derive the label space from the taxonomy
# file: one class per `primary_label` entry, in file order.
print(f"Using device: {cfg.device}")
print("Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].to_list()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


**Model 1 BiLSTM**

In [4]:
class SequenceAttentionPool(nn.Module):
    """
    Learnable attention pooling over a token sequence.

    A single learnable query attends over the `seq_len` tokens through
    `nn.MultiheadAttention`, collapsing [B, seq_len, D] to [B, D].

    This class has its own name on purpose: the notebook later defines a
    convolutional `GlobalAttentionPool` for 4-D feature maps, and sharing that
    name would make BiLSTM pick up the wrong pooling layer at construction
    time. Parameter names (`attention.*`, `query`) are unchanged, so existing
    checkpoints still match.
    """
    def __init__(self, input_dim, num_heads=8):
        super().__init__()
        self.attention = nn.MultiheadAttention(input_dim, num_heads, batch_first=True)
        self.query = nn.Parameter(torch.randn(1, 1, input_dim))

    def forward(self, x):
        # x: [batch_size, seq_len, input_dim]
        batch_size = x.size(0)
        query = self.query.expand(batch_size, -1, -1)  # [batch_size, 1, input_dim]

        # The query is the only "target" token, so the output has length 1.
        attn_output, _ = self.attention(query, x, x)   # [batch_size, 1, input_dim]

        return attn_output.squeeze(1)                  # [batch_size, input_dim]


class BiLSTM(nn.Module):
    """
    Bidirectional LSTM classifier: BiLSTM -> attention pooling -> projection
    head -> linear classifier.

    Args:
        cfg: configuration object. Read attributes: `input_dim` (optional, see
            below), `taxonomy_csv` (only when `num_classes` is None), and the
            optional `lstm_hidden_size`, `lstm_num_layers`, `lstm_dropout`,
            `dropout_rate`, `mixup_alpha`. `cfg.num_classes` is written.
        num_classes: number of output classes. When None, the row count of
            `cfg.taxonomy_csv` is used.

    Raises:
        AttributeError: if cfg defines neither `input_dim` nor `TARGET_SHAPE`.
    """
    def __init__(self, cfg, num_classes=None):
        super().__init__()
        self.cfg = cfg

        # Use the passed num_classes, otherwise count the taxonomy rows.
        if num_classes is not None:
            cfg.num_classes = num_classes
        else:
            taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
            cfg.num_classes = len(taxonomy_df)

        # Per-time-step feature width. forward() flattens [B, C, H, W] into
        # [B, H, C*W], so when cfg does not spell out `input_dim` it is derived
        # as channels * spectrogram width instead of failing on the attribute.
        input_dim = getattr(cfg, 'input_dim', None)
        if input_dim is None:
            target_shape = getattr(cfg, 'TARGET_SHAPE', None)
            if target_shape is None:
                raise AttributeError(
                    "cfg must define 'input_dim' or 'TARGET_SHAPE' to size the BiLSTM input"
                )
            input_dim = getattr(cfg, 'in_channels', 1) * target_shape[-1]
        self.input_dim = input_dim

        # BiLSTM layers
        self.lstm_hidden_size = getattr(cfg, 'lstm_hidden_size', 256)
        self.lstm_num_layers = getattr(cfg, 'lstm_num_layers', 2)
        self.lstm_dropout = getattr(cfg, 'lstm_dropout', 0.2)

        self.bilstm = nn.LSTM(
            input_size=self.input_dim,
            hidden_size=self.lstm_hidden_size,
            num_layers=self.lstm_num_layers,
            batch_first=True,
            bidirectional=True,
            # inter-layer dropout is only meaningful with more than one layer
            dropout=self.lstm_dropout if self.lstm_num_layers > 1 else 0
        )

        # Feature dimension after BiLSTM (*2 because bidirectional)
        bilstm_output_dim = self.lstm_hidden_size * 2
        self.feat_dim = bilstm_output_dim

        # Attention pooling over the time axis
        self.pool = SequenceAttentionPool(bilstm_output_dim, num_heads=8)

        # Projection head
        hidden_dim = bilstm_output_dim // 2
        self.proj_head = nn.Sequential(
            nn.Linear(bilstm_output_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p=getattr(cfg, 'dropout_rate', 0.5)),
        )

        # Classifier
        self.classifier = nn.Linear(hidden_dim, cfg.num_classes)

        # Mixup config
        self.mixup_enabled = getattr(cfg, 'mixup_alpha', 0) > 0
        if self.mixup_enabled:
            self.mixup_alpha = cfg.mixup_alpha

    def forward(self, x, targets=None):
        """
        Args:
            x: either [B, seq_len, features] or a 4-D [B, C, H, W] tensor,
               which is reshaped to [B, H, C*W].
            targets: optional; only changes the return signature / mixup path.

        Returns:
            `logits` when `targets` is None; otherwise a tuple
            `(logits, lam, index)` where `lam`/`index` are None unless
            feature-level mixup was applied (training mode, mixup enabled).
        """
        batch_size = x.size(0)

        if len(x.shape) == 4:
            # [B, C, H, W] -> [B, H, C, W] -> [B, H, C*W]: H is the sequence axis
            x = x.permute(0, 2, 1, 3).contiguous()
            x = x.view(batch_size, x.size(1), -1)

        lstm_out, _ = self.bilstm(x)      # [B, seq_len, hidden_size*2]
        pooled = self.pool(lstm_out)      # [B, hidden_size*2]
        proj = self.proj_head(pooled)

        # Mixup on the projected features (training only)
        if self.training and self.mixup_enabled and targets is not None:
            lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            index = torch.randperm(batch_size).to(x.device)

            mixed_proj = lam * proj + (1 - lam) * proj[index, :]
            logits = self.classifier(mixed_proj)

            return logits, lam, index

        # Standard forward pass
        logits = self.classifier(proj)

        if targets is not None:
            return logits, None, None
        else:
            return logits

**Model 2 EfficientNet**

In [5]:
# EfficientNet B0
class GlobalAttentionPool(nn.Module):
    """
    Attention pooling for 4-D feature maps.

    Keys and values are produced by bias-free 1x1 convolutions; one learnable
    query token scores every spatial position and the softmax-weighted sum of
    the values is returned. `num_heads` is stored and used only to derive the
    score scale (`(in_channels // num_heads) ** -0.5`); the attention itself
    is computed over the full channel dimension.
    """
    def __init__(self, in_channels, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        per_head_width = in_channels // num_heads
        self.scale = per_head_width ** -0.5

        # single learnable query token, shape (1, 1, C)
        self.query = nn.Parameter(torch.randn(1, 1, in_channels))

        # 1x1 conv projections for keys and values
        self.to_k = nn.Conv2d(in_channels, in_channels, kernel_size=1, bias=False)
        self.to_v = nn.Conv2d(in_channels, in_channels, kernel_size=1, bias=False)

    def forward(self, x):
        """Pool a (B, C, H, W) feature map down to (B, C)."""
        n_batch, _channels, _height, _width = x.shape

        # flatten the spatial grid into a token axis: (B, C, H, W) -> (B, H*W, C)
        keys = self.to_k(x).flatten(2).transpose(1, 2)
        values = self.to_v(x).flatten(2).transpose(1, 2)

        # one query per batch element: (B, 1, C)
        queries = self.query.expand(n_batch, -1, -1)

        # scaled dot-product scores over positions, normalised with softmax
        scores = torch.bmm(queries, keys.transpose(1, 2)) * self.scale   # (B, 1, H*W)
        weights = torch.softmax(scores, dim=-1)

        pooled = torch.bmm(weights, values)                              # (B, 1, C)
        return pooled.squeeze(1)                                         # (B, C)

class BirdCLEFModel(nn.Module):
    """
    timm backbone followed by (optional) attention pooling, a projection head
    and a linear classifier.

    The constructor reads `cfg.taxonomy_csv` to size the classifier and writes
    the result to `cfg.num_classes`. It also reads `cfg.model_name`,
    `cfg.pretrained`, `cfg.in_channels` and, if present, `cfg.dropout_rate`
    and `cfg.mixup_alpha`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        # number of outputs = number of taxonomy rows
        cfg.num_classes = len(pd.read_csv(cfg.taxonomy_csv))

        # feature extractor
        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=cfg.pretrained,
            in_chans=cfg.in_channels,
            drop_rate=0.2,
            drop_path_rate=0.2,
        )

        # Remove the stock classification head and remember its input width.
        arch = cfg.model_name
        if 'efficientnet' in arch:
            head_attr = 'classifier'
        elif 'resnet' in arch:
            head_attr = 'fc'
        else:
            head_attr = None

        if head_attr is not None:
            width = getattr(self.backbone, head_attr).in_features
            setattr(self.backbone, head_attr, nn.Identity())
        else:
            width = self.backbone.get_classifier().in_features
            self.backbone.reset_classifier(0, '')

        self.feat_dim = width

        # only used when the backbone returns a 4-D feature map (see forward)
        self.pool = GlobalAttentionPool(width, num_heads=8)

        bottleneck = width // 2
        self.proj_head = nn.Sequential(
            nn.Linear(width, bottleneck),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(bottleneck),
            nn.Dropout(p=getattr(cfg, 'dropout_rate', 0.5)),
        )

        self.classifier = nn.Linear(bottleneck, cfg.num_classes)

        # input-level mixup is active only for a positive alpha
        self.mixup_enabled = getattr(cfg, 'mixup_alpha', 0) > 0
        if self.mixup_enabled:
            self.mixup_alpha = cfg.mixup_alpha

    def forward(self, x, targets=None):
        """
        Returns `logits`, or `(logits, loss)` when mixup is applied, i.e. when
        the module is in training mode, mixup is enabled and `targets` is given.
        """
        do_mixup = bool(self.training and self.mixup_enabled and targets is not None)

        targets_a = targets_b = lam = None
        if do_mixup:
            x, targets_a, targets_b, lam = self.mixup_data(x, targets)

        features = self.backbone(x)
        if isinstance(features, dict):
            features = features['features']

        # a 4-D map is collapsed to one vector per sample by attention pooling
        if features.ndim == 4:
            features = self.pool(features)

        logits = self.classifier(self.proj_head(features))

        if not do_mixup:
            return logits

        loss = self.mixup_criterion(F.binary_cross_entropy_with_logits,
                                    logits, targets_a, targets_b, lam)
        return logits, loss

    def mixup_data(self, x, targets):
        """Blend each sample with a randomly permuted partner.

        Returns `(mixed_x, targets, permuted_targets, lam)` with
        `lam ~ Beta(mixup_alpha, mixup_alpha)`.
        """
        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
        perm = torch.randperm(x.size(0), device=x.device)
        blended = lam * x + (1 - lam) * x[perm]
        return blended, targets, targets[perm], lam

    def mixup_criterion(self, criterion, pred, y_a, y_b, lam):
        """Convex combination of the loss against both target sets."""
        loss_a = criterion(pred, y_a)
        loss_b = criterion(pred, y_b)
        return lam * loss_a + (1 - lam) * loss_b

**Ensemble Model**

In [6]:
class BirdCLEFEnsembleModel(nn.Module):
    """
    Weighted blend of several classifiers.

    Each member's output is passed through a sigmoid and the results are
    combined with per-model weights, so `forward` returns probabilities, not
    logits. Weights default to a uniform 1/n and are stored as a buffer of
    shape (n, 1, 1); they are used as given (no renormalisation).
    """

    def __init__(self, models, weights=None):
        super().__init__()
        self.models = nn.ModuleList(models)
        member_count = len(models)
        if weights is None:
            weights = torch.ones(member_count) / member_count
        blend = torch.as_tensor(weights).view(member_count, 1, 1)
        self.register_buffer("weights", blend)

    @torch.no_grad()
    def forward(self, x):
        """Return the weighted sum of member probabilities, shape (B, C)."""
        member_probs = []
        for member in self.models:
            member_probs.append(torch.sigmoid(member(x)))   # (B, C) each
        stacked = torch.stack(member_probs)                  # (n, B, C)
        weighted = stacked * self.weights
        return weighted.sum(0)                               # (B, C)

**Utilities**

In [7]:
def audio2melspec(audio_data, cfg):
    """Turn a waveform into a min-max normalised log-mel spectrogram.

    NaN samples are replaced by the mean of the remaining samples before the
    transform. The returned array is the dB-scaled mel power spectrogram
    rescaled by its own min and max.
    """
    has_nan = np.isnan(audio_data).any()
    if has_nan:
        fill_value = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_kwargs = dict(
        y=audio_data,
        sr=cfg.FS,
        n_fft=cfg.N_FFT,
        hop_length=cfg.HOP_LENGTH,
        n_mels=cfg.N_MELS,
        fmin=cfg.FMIN,
        fmax=cfg.FMAX,
        power=2.0,
    )
    power_spec = librosa.feature.melspectrogram(**mel_kwargs)

    # dB relative to the loudest bin of this segment
    spec_db = librosa.power_to_db(power_spec, ref=np.max)

    # per-segment min-max scaling; the epsilon guards a constant spectrogram
    lowest = spec_db.min()
    highest = spec_db.max()
    return (spec_db - lowest) / (highest - lowest + 1e-8)

def process_audio_segment(audio_data, cfg):
    """Build the float32 model input for one audio segment.

    Segments shorter than `cfg.FS * cfg.WINDOW_SIZE` samples are zero-padded at
    the end, converted with `audio2melspec`, and resized to `cfg.TARGET_SHAPE`
    when the spectrogram shape differs from it.
    """
    window_samples = cfg.FS * cfg.WINDOW_SIZE
    shortfall = window_samples - len(audio_data)
    if shortfall > 0:
        audio_data = np.pad(audio_data, (0, shortfall), mode='constant')

    spectrogram = audio2melspec(audio_data, cfg)

    # NOTE(review): cv2.resize takes dsize as (width, height) while the check
    # compares against array shape (rows, cols); equivalent for the square
    # target used here — confirm before switching to a non-square shape.
    if spectrogram.shape != cfg.TARGET_SHAPE:
        spectrogram = cv2.resize(spectrogram, cfg.TARGET_SHAPE, interpolation=cv2.INTER_LINEAR)

    return spectrogram.astype(np.float32)

In [8]:
def find_model_files(cfg):
    """
    Find all .pth model files below `cfg.model_path` (searched recursively).

    Returns:
        list[str]: checkpoint paths in sorted order. Sorting makes the ensemble
        member order independent of filesystem enumeration order, so runs are
        reproducible and positional ensemble weights stay meaningful.
    """
    model_dir = Path(cfg.model_path)
    return sorted(str(path) for path in model_dir.glob('**/*.pth'))
    
def load_model_from_path(path, cfg, num_classes):
    """
    Load one checkpoint and return the model on `cfg.device` in eval mode.

    The architecture is inferred from the lower-cased path:
      - contains "bilstm"                 -> BiLSTM
      - contains "efficientnet" or "fold" -> BirdCLEFModel with efficientnet_b0
        (fold checkpoints are assumed to be EfficientNet models)

    Args:
        path: checkpoint file. Either a dict holding "model_state_dict" or a
            bare state dict.
        cfg: base configuration; a deep copy is modified, `cfg` itself is not.
        num_classes: number of output classes stored on the copied config.

    Raises:
        ValueError: if the architecture cannot be inferred from `path`.
    """
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    ckpt = torch.load(path, map_location=cfg.device)
    path_lc = str(path).lower()

    if "bilstm" in path_lc:
        arch = "bilstm"
    elif "efficientnet" in path_lc or "fold" in path_lc:
        arch = "efficientnet"
    else:
        raise ValueError(f"Cannot infer backbone type from {path}")

    tmp_cfg = copy.deepcopy(cfg)
    tmp_cfg.num_classes = num_classes
    if arch == "bilstm":
        # pass num_classes explicitly so the taxonomy file is not re-read
        model = BiLSTM(tmp_cfg, num_classes=num_classes)
    else:
        tmp_cfg.model_name = "efficientnet_b0"
        model = BirdCLEFModel(tmp_cfg)

    if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
        state_dict = ckpt["model_state_dict"]
    else:
        state_dict = ckpt

    # strict=False tolerates key differences, but a silent mismatch would leave
    # layers at their random initialisation — report anything that did not line up.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"Warning: {path}: {len(load_result.missing_keys)} missing and "
              f"{len(load_result.unexpected_keys)} unexpected keys in state dict")
        if load_result.missing_keys:
            print(f"  missing (first 5): {load_result.missing_keys[:5]}")
        if load_result.unexpected_keys:
            print(f"  unexpected (first 5): {load_result.unexpected_keys[:5]}")

    return model.to(cfg.device).eval()


def load_models(cfg, num_classes):
    """Load every checkpoint found by `find_model_files` and wrap them in one
    `BirdCLEFEnsembleModel`.

    Raises:
        RuntimeError: if no checkpoint was loaded.
    """
    members = []
    for ckpt_path in find_model_files(cfg):
        members.append(load_model_from_path(ckpt_path, cfg, num_classes))

    if len(members) == 0:
        raise RuntimeError("No .pth files found in model_path")

    # the ensemble is itself a single nn.Module
    return BirdCLEFEnsembleModel(members)



def predict_on_spectrogram(audio_path, models, cfg, species_ids):
    """Predict species probabilities for each full window of one audio file.

    Args:
        audio_path: path of the soundscape file.
        models: callable returning class *probabilities* of shape (1, C) for a
            (1, 1, H, W) input. `BirdCLEFEnsembleModel` already applies the
            sigmoid, so no further activation is applied here.
        cfg: configuration (FS, WINDOW_SIZE, use_tta, tta_count, device).
        species_ids: unused; kept for interface compatibility.

    Returns:
        (row_ids, predictions): parallel lists with one entry per window of
        `cfg.WINDOW_SIZE` seconds. Row ids are "<file stem>_<end second>".
        Errors are reported and end processing of this file; the lists always
        have the same length.
    """
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem
    window_samples = cfg.FS * cfg.WINDOW_SIZE
    n_tries = cfg.tta_count if cfg.use_tta else 1

    try:
        print(f"Processing {soundscape_id}")
        audio_data, _ = librosa.load(audio_path, sr=cfg.FS)
        total_segments = len(audio_data) // window_samples

        for segment_idx in range(total_segments):
            start = segment_idx * window_samples
            seg = audio_data[start:start + window_samples]

            # The spectrogram is identical for every TTA pass: compute it once.
            base_mel = process_audio_segment(seg, cfg)

            preds_per_try = []
            for t in range(n_tries):
                mel = apply_tta(base_mel, t) if cfg.use_tta else base_mel
                # Flipped arrays can be negative-stride views; make them
                # contiguous before handing them to torch.
                x = torch.tensor(np.ascontiguousarray(mel), dtype=torch.float32)
                x = x.unsqueeze(0).unsqueeze(0).to(cfg.device)

                with torch.no_grad():
                    probs = models(x)   # already probabilities, shape (1, C)
                preds_per_try.append(probs.squeeze(0).cpu().numpy())

            # average over TTA (or just take the one if no TTA)
            final_preds = np.mean(preds_per_try, axis=0)

            # Record the row id only once its prediction exists, so the two
            # lists cannot drift apart if a segment fails.
            t_sec = (segment_idx + 1) * cfg.WINDOW_SIZE
            row_ids.append(f"{soundscape_id}_{t_sec}")
            predictions.append(final_preds)

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

    return row_ids, predictions

In [9]:
def apply_tta(spec, tta_idx):
    """Apply test-time augmentation to a 2-D spectrogram.

    Args:
        spec: array indexed as [frequency, time].
        tta_idx: 0 -> unchanged, 1 -> reversed along axis 1 (time),
            2 -> reversed along axis 0 (frequency); any other value returns
            `spec` unchanged.

    Returns:
        The input itself for the identity cases, otherwise a contiguous copy
        of the flipped array. `np.flip` alone returns a negative-stride view,
        which torch cannot convert to a tensor.
    """
    if tta_idx == 1:
        # time reversal (horizontal flip)
        return np.ascontiguousarray(np.flip(spec, axis=1))
    if tta_idx == 2:
        # frequency reversal (vertical flip)
        return np.ascontiguousarray(np.flip(spec, axis=0))
    # index 0 and any unknown index: original spectrogram
    return spec

def run_inference(cfg, models, species_ids):
    """Predict on every `.ogg` file in `cfg.test_soundscapes`.

    In debug mode only the first `cfg.debug_count` files are used. Returns the
    concatenated row ids and prediction vectors of all processed files.
    """
    soundscape_dir = Path(cfg.test_soundscapes)
    test_files = [ogg_path for ogg_path in soundscape_dir.glob('*.ogg')]

    if cfg.debug:
        print(f"Debug mode enabled, using only {cfg.debug_count} files")
        test_files = test_files[:cfg.debug_count]

    print(f"Found {len(test_files)} test soundscapes")

    all_row_ids, all_predictions = [], []
    for audio_path in tqdm(test_files):
        file_rows, file_preds = predict_on_spectrogram(str(audio_path), models, cfg, species_ids)
        all_row_ids += file_rows
        all_predictions += file_preds

    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    """Create the submission dataframe.

    Args:
        row_ids: one id per prediction vector.
        predictions: sequence of per-row score vectors, each with one value
            per entry of `species_ids` (same order).
        species_ids: column labels for the score vectors.
        cfg: provides `submission_csv`, the sample submission whose column
            order the result follows.

    Returns:
        DataFrame with a `row_id` column followed by the sample submission's
        columns. Species missing from `species_ids` are filled with 0.0.

    Raises:
        ValueError: if `row_ids` and `predictions` differ in length, or the
            score vectors do not have `len(species_ids)` entries.
    """
    print("Creating submission dataframe...")

    n_rows = len(row_ids)
    if len(predictions) != n_rows:
        raise ValueError(
            f"Got {n_rows} row ids but {len(predictions)} prediction vectors"
        )

    species_cols = list(species_ids)
    if n_rows:
        scores = np.asarray(predictions).reshape(n_rows, -1)
    else:
        # no soundscapes processed: keep the column layout, zero rows
        scores = np.empty((0, len(species_cols)), dtype=np.float32)
    if scores.shape[1] != len(species_cols):
        raise ValueError(
            f"Prediction vectors have {scores.shape[1]} values but "
            f"{len(species_cols)} species ids were given"
        )

    # One array-backed frame instead of building every column in Python.
    submission_df = pd.DataFrame(
        scores,
        index=pd.Index(row_ids, name='row_id'),
        columns=species_cols,
    )

    sample_sub = pd.read_csv(cfg.submission_csv, index_col='row_id')

    missing_cols = sample_sub.columns.difference(submission_df.columns)
    if len(missing_cols):
        print(f"Warning: Missing {len(missing_cols)} species columns in submission")

    # Follow the sample submission's column order; absent species become 0.0.
    submission_df = submission_df.reindex(columns=sample_sub.columns, fill_value=0.0)

    return submission_df.reset_index()


In [10]:
def main():
    """Run the whole pipeline: load the ensemble, predict on the test
    soundscapes, and write `submission.csv`.

    Reads the notebook-level `cfg`, `num_classes` and `species_ids`.
    """
    started_at = time.time()
    print("Starting BirdCLEF-2025 inference…")
    print(f"TTA enabled: {cfg.use_tta} "
          f"(variations: {cfg.tta_count if cfg.use_tta else 0})")

    # all checkpoints are wrapped into one ensemble module
    model_ensemble = load_models(cfg, num_classes)
    print(f"Loaded {len(model_ensemble.models)} sub-models")
    print("Weights:", model_ensemble.weights.cpu().numpy().flatten())

    # predict every window of every soundscape
    row_ids, predictions = run_inference(cfg, model_ensemble, species_ids)

    # assemble and store the Kaggle submission
    submission_path = "submission.csv"
    create_submission(row_ids, predictions, species_ids, cfg).to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    elapsed = (time.time() - started_at) / 60
    print(f"Inference completed in {elapsed:.2f} minutes")

In [11]:
# Entry point: run the full inference pipeline when this cell/script is
# executed directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Starting BirdCLEF-2025 inference…
TTA enabled: False (variations: 0)
Loaded 2 sub-models
Weights: [0.5 0.5]
Found 0 test soundscapes


0it [00:00, ?it/s]

Creating submission dataframe...
Submission saved to submission.csv
Inference completed in 0.07 minutes
