In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import torch
import torchvision
from skimage.transform import resize

In [2]:
# Fix NumPy's global random state so NumPy-based randomness repeats across runs
SEED = 42
np.random.seed(SEED)

In [3]:
# Class labels are the entries of the training-audio directory, in sorted order
train_audio_dir = '/kaggle/input/birdclef-2025/train_audio/'
class_labels = os.listdir(train_audio_dir)
class_labels.sort()

In [4]:
# Define the model architecture
class BirdCLEFModel(torch.nn.Module):
    """ResNet18-based classifier over single-channel spectrograms.

    A torchvision ResNet18 is built without pre-trained weights (custom
    weights are loaded separately). Its first convolution is replaced so it
    accepts 1-channel input, and its final fully connected layer is replaced
    so it emits ``num_classes`` outputs. A sigmoid is applied to those outputs.

    Args:
        num_classes: Number of output units of the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # `weights=None` is the current torchvision spelling of the deprecated
        # `pretrained=False`: build the architecture with random initialisation.
        # The attribute names (`resnet`, `sigmoid`) are kept as-is because they
        # determine the state_dict keys of saved weights.
        self.resnet = torchvision.models.resnet18(weights=None)
        # Replace the first conv layer so it takes 1-channel input (spectrograms)
        self.resnet.conv1 = torch.nn.Conv2d(
            in_channels=1,
            out_channels=64,
            kernel_size=(7, 7),
            stride=(2, 2),
            padding=(3, 3),
            bias=False
        )
        # Replace the final fully connected layer so it has num_classes outputs
        self.resnet.fc = torch.nn.Linear(self.resnet.fc.in_features, num_classes)
        self.sigmoid = torch.nn.Sigmoid()  # For probability outputs

    def forward(self, x):
        """Run the backbone and squash its outputs with a sigmoid.

        Args:
            x: Batch of single-channel inputs, e.g. shaped
                (batch, 1, height, width).

        Returns:
            Tensor with one value in (0, 1) per class for each input.
        """
        logits = self.resnet(x)
        return self.sigmoid(logits)  # Output probabilities between 0 and 1

In [6]:
# Instantiate the model with one output per class label
model = BirdCLEFModel(num_classes=len(class_labels))

# Location of the trained weights inside the attached Kaggle dataset.
# Note: change this path if your weights dataset is mounted elsewhere.
weights_path = '/kaggle/input/model_weight/pytorch/default/1/model_weights.pth'

# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs. It prevents arbitrary code execution from a
# tampered checkpoint and silences the torch.load FutureWarning.
state_dict = torch.load(
    weights_path,
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()  # Set to evaluation mode

  torch.load(


BirdCLEFModel(
  (resnet): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, trac

In [7]:
# Function to convert audio to mel-spectrogram
def audio_to_spectrogram(audio, sr, n_mels=128, fmin=0, fmax=None, n_fft=1024, hop_length=512):
    """Compute a dB-scaled mel-spectrogram of an audio signal.

    Args:
        audio: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank (``None`` leaves the
            choice to librosa).
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The mel power spectrogram converted to decibels, measured relative to
        the spectrogram's own maximum value.
    """
    mel_params = dict(
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    mel_power = librosa.feature.melspectrogram(y=audio, sr=sr, **mel_params)
    # ref=np.max uses the spectrogram's peak as the dB reference
    return librosa.power_to_db(mel_power, ref=np.max)

In [8]:
# Collect the full paths of all .ogg files in the test soundscape directory,
# ordered by file name
test_soundscape_path = '/kaggle/input/birdclef-2025/test_soundscapes/'
ogg_names = [
    name
    for name in sorted(os.listdir(test_soundscape_path))
    if name.endswith('.ogg')
]
test_soundscapes = [os.path.join(test_soundscape_path, name) for name in ogg_names]

In [9]:
# Start with an empty predictions table: a 'row_id' column followed by one
# column per class label
submission_columns = ['row_id', *class_labels]
predictions = pd.DataFrame(columns=submission_columns)

In [10]:
# Process each soundscape
CHUNK_SECONDS = 5        # length of each prediction window, in seconds
SPEC_SHAPE = (128, 256)  # fixed spectrogram size fed to the model
soundscape_frames = []   # one predictions DataFrame per soundscape

for soundscape in test_soundscapes:
    # Load audio at its native sampling rate
    sig, rate = librosa.load(soundscape, sr=None)
    chunk_length = CHUNK_SECONDS * rate  # window length in samples
    # Only full windows are scored; a trailing partial window is dropped
    num_chunks = len(sig) // chunk_length
    if num_chunks == 0:
        continue  # nothing to predict for this file

    # One resized mel-spectrogram per full window
    spectrograms = [
        resize(
            audio_to_spectrogram(sig[i * chunk_length:(i + 1) * chunk_length], rate),
            SPEC_SHAPE,
            anti_aliasing=True,
        )
        for i in range(num_chunks)
    ]

    # Force float32: depending on the scikit-image version, resize may return
    # float64, which would not match the model's float32 parameters.
    # Resulting shape: (num_chunks, 1, 128, 256)
    S_tensor = torch.as_tensor(np.stack(spectrograms), dtype=torch.float32).unsqueeze(1)
    with torch.no_grad():
        outputs = model(S_tensor)
    probs = outputs.cpu().numpy()  # Shape: (num_chunks, num_classes)

    # Generate row_ids (e.g., soundscape_1_5, soundscape_1_10, ...).
    # splitext strips only the final extension, so file names that contain
    # additional dots are not truncated.
    soundscape_id = os.path.splitext(os.path.basename(soundscape))[0]
    row_ids = [
        f"{soundscape_id}_{(i + 1) * CHUNK_SECONDS}" for i in range(num_chunks)
    ]

    # Create DataFrame for this soundscape
    soundscape_preds = pd.DataFrame(probs, columns=class_labels)
    soundscape_preds.insert(0, 'row_id', row_ids)
    soundscape_frames.append(soundscape_preds)

# Concatenate once instead of inside the loop: this avoids re-copying all
# earlier rows on every iteration and avoids concatenating with an empty frame.
# When nothing was predicted, `predictions` keeps the value it had before this
# cell ran.
if soundscape_frames:
    predictions = pd.concat(soundscape_frames, axis=0, ignore_index=True)

In [12]:
# Write the predictions table to the submission file without the index column
submission_path = 'submission.csv'
predictions.to_csv(submission_path, index=False)
print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
