In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import librosa
import torch.nn.functional as F
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Configuration settings
class Config:
    """Central place for data paths, audio front-end settings and training hyperparameters."""

    # Data paths. Every path below is derived from data_dir, so pointing the notebook
    # at a different copy of the data only requires changing this single line.
    # (Joining an empty base with absolute paths, as before, made data_dir a no-op.)
    data_dir = "/kaggle/input/birdclef-2025"   # base directory for the competition data
    train_metadata = os.path.join(data_dir, "train.csv")
    train_audio_dir = os.path.join(data_dir, "train_audio")              # labelled training recordings
    train_soundscapes_dir = os.path.join(data_dir, "train_soundscapes")  # training soundscape recordings
    test_audio_dir = os.path.join(data_dir, "test_soundscapes")          # test soundscape recordings
    sample_submission = os.path.join(data_dir, "sample_submission.csv")

    # Audio processing parameters
    sample_rate = 32000       # target sampling rate for audio
    clip_duration = 5.0       # duration (in seconds) of each audio clip or segment
    n_mels = 128              # number of mel frequency bins for spectrogram
    n_fft = 1024              # FFT window size
    hop_length = 500          # hop length for STFT (controls time resolution)
    fmin = 40                 # min frequency for mel filter (in Hz)
    fmax = 15000              # max frequency for mel filter (in Hz)

    # Data augmentation flags
    use_soundscape_noise_addition = True
    use_wave_augment = True   # whether to apply waveform-level augmentation on training clips
    use_spec_augment = True   # whether to apply SpecAugment (time/freq masking) on training spectrograms
    use_mixup = True          # whether to apply Mixup augmentation during training

    # SpecAugment parameters
    time_mask_param = 50      # max width of time masking (in time frames)
    freq_mask_param = 15      # max width of frequency masking (in mel bins)

    # Training hyperparameters
    batch_size = 32
    num_epochs = 240
    learning_rate = 1e-3
    weight_decay = 1e-5       # weight decay for optimizer (regularization)
    val_split = 0.1           # fraction of training data to use as validation
    n_splits = 5              # number of cross-validation folds

cfg = Config()


In [None]:
# Read the training metadata and the sample submission; the submission's
# column order defines the class order used throughout the notebook.
train_df = pd.read_csv(cfg.train_metadata)
sub_df = pd.read_csv(cfg.sample_submission)

# 1. Every submission column after 'row_id' is a species code
species_columns = sub_df.columns[1:]

# 2. Species code -> integer class index (position in the submission header)
label_to_index = dict(zip(species_columns, range(len(species_columns))))

# 3. Attach the class index of each recording's primary label
train_df['target'] = train_df['primary_label'].map(label_to_index)

# 4. Keep only rows whose label exists in the submission header, then make the index integral
train_df = train_df.loc[train_df['target'].notna()].reset_index(drop=True)
train_df['target'] = train_df['target'].astype(int)

# 5. Quick sanity check
print("Columns now:", train_df.columns.tolist())
print("Sample targets:", train_df[['primary_label','target']].head())

# Amphibian and Wildlife Sound Classification

This notebook implements a solution for the BirdCLEF 2025 competition: acoustic species identification in the El Silencio Natural Reserve.

In [None]:
# Number of samples in one fixed-length clip (cfg.clip_duration seconds at cfg.sample_rate)
target_samples = int(cfg.sample_rate * cfg.clip_duration)

# Shared torchaudio front end: power mel spectrogram, then conversion to decibels
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=cfg.sample_rate,
    n_fft=cfg.n_fft,
    hop_length=cfg.hop_length,
    n_mels=cfg.n_mels,
    f_min=cfg.fmin,
    f_max=cfg.fmax,
)
amp_to_db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

def load_audio_to_mel(file_path):
    """Read one audio file and return its normalized log-mel spectrogram.

    The audio is mixed down to a single channel, resampled to cfg.sample_rate,
    and centered into a window of exactly target_samples samples (center crop
    when too long, symmetric zero padding when too short). The result of
    mel_transform followed by amp_to_db_transform is min-max scaled to [0, 1].

    Returns a tensor of shape [1, n_mels, time_frames].
    """
    waveform, sr = torchaudio.load(file_path)

    # Collapse multi-channel audio to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the configured sample rate
    if sr != cfg.sample_rate:
        waveform = torchaudio.transforms.Resample(sr, cfg.sample_rate)(waveform)

    # Positive surplus -> crop around the center; negative -> pad both ends
    surplus = waveform.shape[1] - target_samples
    if surplus > 0:
        offset = surplus // 2
        waveform = waveform[:, offset:offset + target_samples]
    elif surplus < 0:
        missing = -surplus
        left = missing // 2
        waveform = torch.nn.functional.pad(waveform, (left, missing - left))

    # Power mel spectrogram -> dB -> [0, 1]
    log_mel = amp_to_db_transform(mel_transform(waveform))
    lowest, highest = log_mel.min(), log_mel.max()
    return (log_mel - lowest) / (highest - lowest + 1e-6)


## Data Loading and Preparation

In [None]:
# Load the training metadata from the path declared in the config instead of
# repeating the absolute path here. The frame read here is the raw metadata and
# has no 'target' column; that column is (re)built before the dataset is defined.
train_df = pd.read_csv(cfg.train_metadata)
print(f"Training data shape: {train_df.shape}")

# Load taxonomy information (taxonomy.csv sits next to train.csv)
taxonomy_df = pd.read_csv(os.path.join(os.path.dirname(cfg.train_metadata), 'taxonomy.csv'))
print(f"Taxonomy data shape: {taxonomy_df.shape}")

# Display a few samples from the training data
train_df.head()

In [None]:
# Per-column count of missing entries in the metadata
print("Missing values in training data:")
print(train_df.isna().sum())

# Distinct primary labels that occur in the training metadata
unique_species = pd.unique(train_df['primary_label'])
print(f"Number of unique species in training data: {len(unique_species)}")

## Audio Feature Extraction

We'll extract mel-spectrogram features from the audio files.

In [None]:
def load_audio_file(file_path, sr=32000, duration=5.0):
    """Load up to `duration` seconds of audio and zero-pad it to a fixed length.

    Args:
        file_path: path of the audio file to read.
        sr: sample rate passed to librosa.load.
        duration: maximum number of seconds to read; shorter audio is
            right-padded with zeros to int(sr * duration) samples.

    Returns:
        A `(waveform, sample_rate)` tuple. If loading fails for any reason the
        error is printed and `(silence, sr)` is returned, so callers always get
        the same tuple shape (previously the error path returned a bare array).
    """
    n_target = int(sr * duration)
    try:
        y, loaded_sr = librosa.load(file_path, sr=sr, duration=duration, res_type='kaiser_fast')
        # Pad if audio is shorter than expected duration
        if len(y) < n_target:
            y = np.pad(y, (0, n_target - len(y)))
        return y, loaded_sr
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        # Best-effort fallback: float32 silence, returned in the same tuple form as the success path
        return np.zeros(n_target, dtype=np.float32), sr

def extract_melspectrogram(y, sr=32000, n_mels=128, n_fft=2048, hop_length=512):
    """Compute a dB-scaled mel spectrogram of waveform `y`.

    Args:
        y: 1-D audio waveform.
        sr: sample rate of `y`.
        n_mels: number of mel bins (rows of the result).
        n_fft: FFT window size.
        hop_length: hop between successive frames, in samples.

    Returns:
        Array of shape (n_mels, frames) in dB relative to the spectrogram maximum.
        If the computation fails, the error is printed and an all-zero array
        with the frame count of a centered STFT is returned.
    """
    try:
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
        )
        melspec_db = librosa.power_to_db(melspec, ref=np.max)
        return melspec_db
    except Exception as e:
        print(f"Error extracting melspectrogram: {e}")
        # librosa centers frames by default, giving 1 + len(y) // hop_length frames.
        # The earlier formula used (len(y) - n_fft), which disagreed with that count
        # and became negative (crashing np.zeros) for inputs shorter than n_fft.
        n_frames = 1 + len(y) // hop_length
        return np.zeros((n_mels, n_frames))

In [None]:
# Function to process training data
def prepare_training_data(df, max_samples_per_species=50, audio_dir="train_audio"):
    """Build mel-spectrogram features and one-hot labels from the training metadata.

    Args:
        df: DataFrame with 'primary_label' and 'filename' columns.
        max_samples_per_species: maximum number of recordings kept per species.
        audio_dir: directory that 'filename' is relative to. Defaults to the
            original relative "train_audio"; pass cfg.train_audio_dir to read
            from the configured data directory.

    Returns:
        (X, y_encoded, species_labels, species_to_idx) where X has a trailing
        channel axis, y_encoded is one-hot with one column per species in
        order of first appearance, species_labels lists those species and
        species_to_idx maps species -> column index.

    Note:
        load_audio_file prints and returns silence on read errors instead of
        raising, so unreadable files still contribute a (silent) sample.
    """
    # tqdm is optional: fall back to plain iteration when it is not installed
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    X = []
    y = []
    species_to_idx = {}       # insertion order == order of first appearance
    samples_per_species = {}  # number of samples kept so far, per species

    for _, row in progress(df.iterrows(), total=len(df), desc="Processing training data"):
        species = row['primary_label']

        # Skip if we already have enough samples for this species
        if samples_per_species.get(species, 0) >= max_samples_per_species:
            continue

        file_path = os.path.join(audio_dir, row['filename'])

        # Load audio and extract features
        try:
            loaded = load_audio_file(file_path)
            # load_audio_file returns (waveform, sample_rate); accept a bare array as well
            y_audio = loaded[0] if isinstance(loaded, tuple) else loaded
            melspec = extract_melspectrogram(y_audio)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

        X.append(melspec)
        y.append(species)
        species_to_idx.setdefault(species, len(species_to_idx))
        samples_per_species[species] = samples_per_species.get(species, 0) + 1

    species_labels = list(species_to_idx)

    # Convert species labels to one-hot encoding
    y_encoded = np.zeros((len(y), len(species_labels)))
    for i, species in enumerate(y):
        y_encoded[i, species_to_idx[species]] = 1.0

    # Add channel dimension for CNN; indexing (unlike reshape) also works when X is empty
    X = np.array(X)[..., np.newaxis]

    return X, y_encoded, species_labels, species_to_idx

In [None]:
import glob
import random
import os

def load_random_noise_mel():
    """
    Choose a random .ogg file from cfg.train_soundscapes_dir and return its
    normalized log-mel spectrogram as produced by load_audio_to_mel.

    Raises FileNotFoundError when the directory contains no .ogg files.
    """
    # Collect every candidate soundscape recording
    pattern = os.path.join(cfg.train_soundscapes_dir, '*.ogg')
    candidates = glob.glob(pattern)
    if len(candidates) == 0:
        raise FileNotFoundError(f"No .ogg files found in {cfg.train_soundscapes_dir}")

    # Draw one file and report which one was used
    chosen = random.choice(candidates)
    print("SOUNDSCAPES:"+chosen)

    # Load and preprocess it
    return load_audio_to_mel(chosen)

In [None]:
import torch
import numpy as np
import random
import librosa

def augment_waveform(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
    """
    Randomly perturb a mono waveform; each of the four steps fires with probability 0.5.

    waveform: [1, n_samples] torch.Tensor on CPU
    sample_rate: int, e.g. 32000
    Returns: augmented float32 waveform [1, *] torch.Tensor. The length can
    differ from the input when time-stretching is applied, so callers that
    need a fixed length must pad or crop afterwards.
    """
    # to numpy 1D, ensure it's float32 from the start
    w = waveform.squeeze(0).cpu().numpy().astype(np.float32)

    # 1) Gaussian noise with a random amplitude
    if random.random() < 0.5:
        amp = np.random.uniform(0.001, 0.015)
        noise = np.random.randn(len(w)).astype(np.float32) * amp  # keep noise in float32
        w = w + noise

    # 2) Time-stretch (changes the number of samples)
    if random.random() < 0.5:
        rate = np.random.uniform(0.8, 1.25)
        w = librosa.effects.time_stretch(w, rate=rate)

    # 3) Pitch-shift (sr and n_steps as keywords)
    if random.random() < 0.5:
        n_steps = np.random.uniform(-4, 4)  # fractional semitones
        w = librosa.effects.pitch_shift(w, sr=sample_rate, n_steps=n_steps)

    # 4) Time-shift (circular roll of up to ±10% of the length)
    if random.random() < 0.5:
        max_shift = int(0.1 * len(w))
        # randint requires low < high, so very short waveforms (max_shift == 0)
        # are left unshifted instead of raising; +1 makes the upper bound inclusive.
        if max_shift > 0:
            shift = np.random.randint(-max_shift, max_shift + 1)
            w = np.roll(w, shift)

    # Ensure w is float32 before converting back to tensor
    w = w.astype(np.float32)

    # back to torch [1, n_samples]
    w_tensor = torch.from_numpy(w).unsqueeze(0)
    return w_tensor

In [None]:
# 1. (Re)build the 'target' column: class index = position of the species in the submission header
sub_df = pd.read_csv(cfg.sample_submission)
species_columns = sub_df.columns[1:]   # every column after 'row_id'
label_to_index = dict(zip(species_columns, range(len(species_columns))))

train_df['target'] = train_df['primary_label'].map(label_to_index)
# Rows whose label is missing from the submission header are dropped
train_df = train_df.loc[train_df['target'].notna()].reset_index(drop=True)
train_df['target'] = train_df['target'].astype(int)
print("Columns now:", train_df.columns.tolist())

# 2. SpecAugment maskers; the dataset applies them only when augmentation is enabled
time_masker = torchaudio.transforms.TimeMasking(
    time_mask_param=cfg.time_mask_param
)
freq_masker = torchaudio.transforms.FrequencyMasking(
    freq_mask_param=cfg.freq_mask_param
)

# assume augment_waveform(waveform: Tensor, sr: int) is defined elsewhere
# assume mel_transform and amp_to_db_transform are defined
# assume freq_masker and time_masker are defined
# assume target_samples (in samples) and cfg.sample_rate, cfg.use_wave_augment, cfg.use_spec_augment are available

class BirdClefDataset(Dataset):
    """Turn rows of the training metadata into (log-mel spectrogram, class index) pairs.

    Each item is read from disk, mixed down to mono, resampled to
    cfg.sample_rate, forced to target_samples samples, optionally augmented,
    and converted to a [0, 1]-normalized dB mel spectrogram with a fixed
    number of time frames.
    """

    def __init__(self, dataframe, audio_dir, augment=False):
        """
        dataframe: DataFrame with 'filename' and 'target' columns.
        audio_dir: Directory where audio files are stored.
        augment: whether to apply waveform+SpecAugment.
        """
        self.data      = dataframe
        self.audio_dir = audio_dir
        self.augment   = augment
        # One Resample module per source sample rate, built on first use and then
        # reused instead of being reconstructed for every item.
        self._resamplers = {}
        # Derive the expected number of time frames from a dummy clip so every
        # spectrogram can be forced to the same width before batching.
        dummy_waveform = torch.zeros(1, int(cfg.sample_rate * cfg.clip_duration))
        with torch.no_grad():
            mel_dummy = mel_transform(dummy_waveform)  # [1, n_mels, T]
        self.fixed_frames = mel_dummy.shape[-1]

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _fit_length(waveform, num_samples, center_crop=False):
        """Crop (from the start, or around the center) or right-pad with zeros to num_samples."""
        length = waveform.shape[1]
        if length > num_samples:
            start = (length - num_samples) // 2 if center_crop else 0
            return waveform[:, start:start + num_samples]
        if length < num_samples:
            return F.pad(waveform, (0, num_samples - length))
        return waveform

    def _resample(self, waveform, sr):
        """Resample waveform from sr to cfg.sample_rate, caching the transform per source rate."""
        if sr == cfg.sample_rate:
            return waveform
        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sr, cfg.sample_rate)
            self._resamplers[sr] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        fp = os.path.join(self.audio_dir, row['filename'])
        fp = fp.strip()  # Ensure no leading/trailing whitespace in filepath

        # 1) Load, mono-mix and resample
        try:
            waveform, sr = torchaudio.load(fp)
        except Exception as e:
            # Best effort: report the failure and substitute a silent clip
            print(f"Error loading audio {fp}: {e}")
            waveform = torch.zeros(1, int(cfg.sample_rate * cfg.clip_duration))
            sr = cfg.sample_rate

        if waveform.shape[0] > 1:
            waveform = waveform.mean(0, keepdim=True)
        waveform = self._resample(waveform, sr)
        sr = cfg.sample_rate

        # Ensure waveform is float for augmentations and model input
        waveform = waveform.float()

        # 2) Enforce the fixed length BEFORE augmentation (keeps the first target_samples samples)
        waveform = self._fit_length(waveform, target_samples)

        # 3) Waveform-level augmentations
        if self.augment and cfg.use_wave_augment:
            waveform = augment_waveform(waveform, sr)

        # Time-stretching can change the length, so enforce it again (center crop / right pad)
        waveform = self._fit_length(waveform, target_samples, center_crop=True)

        # 4) Convert to mel, to dB, normalize to [0, 1]
        mel = mel_transform(waveform.cpu().float())  # [1, n_mels, T]
        mel = amp_to_db_transform(mel)
        mel_min = mel.min()
        mel_max = mel.max()
        if mel_max - mel_min == 0:  # flat spectrogram (e.g. silence): avoid 0/0
            mel = torch.zeros_like(mel)
        else:
            mel = (mel - mel_min) / (mel_max - mel_min + 1e-6)

        # Ensure fixed time frames for mel spectrogram
        T = mel.size(-1)
        if T > self.fixed_frames:
            mel = mel[..., :self.fixed_frames]
        elif T < self.fixed_frames:
            mel = F.pad(mel, (0, self.fixed_frames - T))

        # 5) SpecAugment on mel (each mask applied with probability 0.5)
        if self.augment and cfg.use_spec_augment:
            if random.random() < 0.5:
                mel = freq_masker(mel)
            if random.random() < 0.5:
                mel = time_masker(mel)

        # Plain Python int so the label does not depend on the pandas/numpy scalar type
        label = int(row['target'])
        return mel, label

from sklearn.model_selection import KFold

# K-fold cross-validation splitter: shuffled, with a fixed seed so the folds are reproducible
kf = KFold(
    n_splits=cfg.n_splits,
    shuffle=True,
    random_state=42,
)

# DataLoaders are built per fold inside the training loop

In [None]:

import timm

# Build a pretrained EfficientNetV2 (timm tag 'efficientnetv2_rw_m.agc_in1k') that takes
# single-channel input and emits one logit per species column.
model = timm.create_model(
    'efficientnetv2_rw_m.agc_in1k',
    pretrained=True,
    in_chans=1,
    num_classes=len(species_columns),
)
model.to(device)
print(getattr(model, 'architecture', type(model)))

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay,
)


## Model Definition

We fine-tune a pretrained EfficientNetV2 convolutional neural network (loaded through `timm`) to classify the single-channel log-mel spectrograms.

In [None]:
# Lists to store metrics from each fold
fold_train_losses = []
fold_val_losses = []
fold_val_accuracies = []

# Best weights seen over all folds (lowest validation loss); the inference cell
# loads `best_model_state` when it is set.
best_model_state = None
best_overall_val_loss = float('inf')

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    print(f"--- Fold {fold+1}/{cfg.n_splits} ---")

    # --- Data for current fold ---
    train_data_fold = train_df.iloc[train_idx].reset_index(drop=True)
    val_data_fold = train_df.iloc[val_idx].reset_index(drop=True)

    print(f"Training samples for fold {fold+1}: {len(train_data_fold)}, Validation samples: {len(val_data_fold)}")

    train_dataset_fold = BirdClefDataset(train_data_fold, audio_dir=cfg.train_audio_dir, augment=True)
    val_dataset_fold = BirdClefDataset(val_data_fold, audio_dir=cfg.train_audio_dir, augment=False)

    train_loader_fold = DataLoader(train_dataset_fold, batch_size=cfg.batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader_fold = DataLoader(val_dataset_fold, batch_size=cfg.batch_size, shuffle=False, num_workers=4, pin_memory=True)

    # --- Fresh model, loss and optimizer for each fold ---
    model = timm.create_model('efficientnetv2_rw_m.agc_in1k', pretrained=True, in_chans=1, num_classes=len(species_columns))
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)

    # --- Training loop for the current fold ---
    best_val_loss_fold = float('inf')
    best_val_acc_fold = 0.0        # accuracy measured at the epoch with the best validation loss
    best_model_state_fold = None   # snapshot of the weights at that epoch

    epochs_no_improve = 0
    patience = 5  # early-stopping patience, in epochs without validation-loss improvement

    for epoch in range(1, cfg.num_epochs + 1):
        model.train()
        running_loss = 0.0

        for batch_idx, (inputs, targets) in enumerate(train_loader_fold):
            inputs, targets = inputs.to(device), targets.to(device)

            if cfg.use_mixup:
                # Mixup: blend each sample with a randomly chosen partner from the
                # same batch and weight the two losses by the same coefficient.
                lam = np.random.beta(0.2, 0.2)
                indices = torch.randperm(inputs.size(0)).to(device)  # keep indices on the inputs' device
                inputs_shuffled = inputs[indices]
                targets_shuffled = targets[indices]

                inputs_mixed = lam * inputs + (1 - lam) * inputs_shuffled
                outputs = model(inputs_mixed)
                loss = lam * criterion(outputs, targets) + (1 - lam) * criterion(outputs, targets_shuffled)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader_fold)

        # Validation for the current fold
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in val_loader_fold:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss_val_batch = criterion(outputs, targets)
                val_loss += loss_val_batch.item()
                _, predicted = torch.max(outputs.data, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        avg_val_loss = val_loss / len(val_loader_fold)
        val_acc = correct / total

        improved = False
        if avg_val_loss < best_val_loss_fold:
            best_val_loss_fold = avg_val_loss
            best_val_acc_fold = val_acc
            # state_dict() hands back references to the live parameter tensors and
            # dict.copy() is shallow, so the tensors must be cloned; otherwise the
            # "best" snapshot keeps changing as training continues. Stored on CPU
            # to avoid holding a second copy of the weights in GPU memory.
            best_model_state_fold = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            epochs_no_improve = 0
            improved = True
        else:
            epochs_no_improve += 1

        print(f"Fold {fold+1}, Epoch {epoch}: Train Loss = {avg_train_loss:.4f},"
              f"Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}"
              f"{'  <-- improved' if improved else ''}")

        if epochs_no_improve >= patience:
            print(f"No improvement for {patience} epochs in fold {fold+1}. Early stopping at epoch {epoch}.")
            break

    # After all epochs for the current fold are done
    fold_train_losses.append(avg_train_loss)        # training loss of the last epoch that ran
    fold_val_losses.append(best_val_loss_fold)      # best validation loss of this fold
    fold_val_accuracies.append(best_val_acc_fold)   # accuracy at that same best epoch

    # Remember the best snapshot over all folds for later inference
    if best_model_state_fold is not None and best_val_loss_fold < best_overall_val_loss:
        best_overall_val_loss = best_val_loss_fold
        best_model_state = best_model_state_fold

# --- After all folds are completed ---
print(" K-Fold Cross-Validation Summary")
print(f"Average Training Loss across folds: {np.mean(fold_train_losses):.4f}")
print(f"Average Validation Loss across folds: {np.mean(fold_val_losses):.4f} (Std: {np.std(fold_val_losses):.4f})")
print(f"Average Validation Accuracy across folds: {np.mean(fold_val_accuracies):.4f} (Std: {np.std(fold_val_accuracies):.4f})")


In [None]:
# Load the best checkpoint before inference, if one is available. The name is looked
# up defensively: use `best_model_state` when the notebook defines it, otherwise fall
# back to `best_model_state_fold` left behind by the last training fold.
_best_state = globals().get('best_model_state')
if _best_state is None:
    _best_state = globals().get('best_model_state_fold')
if _best_state is not None:
    model.load_state_dict(_best_state)
model.eval()

# Prepare submission dataframe for output
submission_df = sub_df.copy()

# Group row_ids by soundscape file: everything before the last underscore is the audio id
submission_df['audio_id'] = submission_df['row_id'].apply(lambda x: x.rsplit('_', 1)[0])
unique_files = submission_df['audio_id'].unique()

# Iterate over each unique soundscape file
for audio_id in unique_files:
    audio_path = os.path.join(cfg.test_audio_dir, audio_id + ".ogg")
    if not os.path.exists(audio_path):
        # Rows of a missing file keep the values copied from the sample submission
        print(f"Warning: audio file {audio_path} not found. Skipping.")
        continue
    # Load full audio
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono
    if sr != cfg.sample_rate:
        resampler = torchaudio.transforms.Resample(sr, cfg.sample_rate)
        waveform = resampler(waveform)
        sr = cfg.sample_rate
    waveform = waveform.squeeze(0)  # shape: [samples]
    total_len = waveform.shape[0]
    # Get all rows for this audio file
    file_rows = submission_df[submission_df['audio_id'] == audio_id]
    for idx, row in file_rows.iterrows():
        row_id = row['row_id']
        # The part after the last underscore is the segment end time in seconds;
        # each segment covers the 5 seconds before it.
        end_time = int(row_id.split('_')[-1])
        start_time = end_time - 5
        # Convert times to sample indices, clamped to the recording
        start_sample = int(start_time * cfg.sample_rate)
        end_sample = int(end_time * cfg.sample_rate)
        if start_sample < 0: start_sample = 0
        if end_sample > total_len: end_sample = total_len
        segment_waveform = waveform[start_sample:end_sample]
        # Pad or crop segment_waveform to exactly target_samples samples
        segment_len = segment_waveform.shape[0]
        if segment_len < target_samples:
            pad_needed = target_samples - segment_len
            segment_waveform = torch.nn.functional.pad(segment_waveform, (0, pad_needed))
        elif segment_len > target_samples:
            segment_waveform = segment_waveform[:target_samples]
        # Now compute spectrogram for this segment
        segment_waveform = segment_waveform.unsqueeze(0)   # add channel dim -> [1, samples]
        mel_spec = mel_transform(segment_waveform)         # [1, n_mels, time]
        mel_spec_db = amp_to_db_transform(mel_spec)        # [1, n_mels, time] in dB
        # Normalize to [0,1] (same min-max scaling as in training)
        mel_spec_db = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-6)
        # The model was trained on batches shaped [batch, 1, n_mels, time]; add the
        # missing batch dimension instead of feeding a 3-D tensor.
        mel_spec_db = mel_spec_db.unsqueeze(0).to(device)  # [1, 1, n_mels, time]
        # Predict with model
        with torch.no_grad():
            output = model(mel_spec_db)                    # [1, num_classes]
            probs = torch.softmax(output, dim=1)           # probabilities over classes
            probs = probs.squeeze(0).cpu().numpy()         # shape: (num_classes,)
        # Fill the probabilities into the submission dataframe for this row
        submission_df.loc[idx, species_columns] = probs


In [None]:
# Drop the helper column; errors='ignore' keeps this cell re-runnable after the column is gone
submission_df = submission_df.drop(columns=['audio_id'], errors='ignore')
# Write the predictions to a new file in the working directory. The previous target
# was the competition's own sample_submission.csv under /kaggle/input, which is the
# input dataset rather than an output location.
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved with shape:", submission_df.shape)
print(submission_df.head(10))
