In [None]:
!pip install /kaggle/input/torchlibrosa/torchlibrosa-0.0.4-py3-none-any.whl

In [None]:
import sys
sys.path.insert(0, '/kaggle/input/geffnet-blend2/')

In [None]:
from geffnet import tf_efficientnet_b4_ns, tf_efficientnet_b3_ns, tf_efficientnet_b0_ns, tf_efficientnet_b2_ns

In [None]:
import os
import gc
import time
import math
import shutil
import random
import warnings
warnings.filterwarnings("ignore")
import typing as tp
from pathlib import Path

import cv2
import librosa
import tqdm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.utils.data as data

from sklearn.preprocessing import LabelEncoder
import torchlibrosa
from argparse import Namespace

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and the torch
    CPU/CUDA generators, and exports ``PYTHONHASHSEED``.

    Args:
        seed: value handed to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # cudnn.deterministic is intentionally left at its default:
    # torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(1213)

In [None]:
# Dataset locations, resolved relative to the parent of the working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"
RAW_DATA = INPUT_ROOT / "birdsong-recognition"
TRAIN_AUDIO_DIR = RAW_DATA / "train_audio"
TEST_AUDIO_DIR = RAW_DATA / "test_audio"

if TEST_AUDIO_DIR.exists():
    test = pd.read_csv(RAW_DATA / "test.csv")
else:
    # The competition test audio is not mounted: fall back to the
    # "birdcall-check" dataset for both the audio files and the test table.
    TEST_AUDIO_DIR = INPUT_ROOT / "birdcall-check" / "test_audio"
    test = pd.read_csv(INPUT_ROOT / "birdcall-check" / "test.csv")

In [None]:
# The training table is only needed to recover the ebird_code <-> index mapping.
train = pd.read_csv(RAW_DATA / "train.csv")
label_encoder = LabelEncoder()
label_encoder.fit(train.ebird_code.values)

In [None]:
class model_config:
    """Static settings shared by the models, the dataset and the inference loop."""

    # Not read by any code visible in this notebook.
    ROOT_PATH = "../input/train_audio"

    # --- audio / labels ---
    num_classes = 264
    max_duration = 5      # chunk length in seconds (see TestDataset._transform)
    sample_rate = 32000   # target sample rate used when loading test audio
    # Whether prediction_for_clip applies a sigmoid; the inference loop
    # overwrites this flag for every model of the ensemble.
    sigmoid = True

    # --- batching ---
    batch_size = 32
    num_workers = 4
    nmels = 128           # feature count of ResNet38's input BatchNorm

    # Passed to LogMel; Cnn14_DecisionLevelAtt also reads "n_mels" from it.
    melspectrogram_parameters = dict(
        n_mels=128,
        fmin=20,
        fmax=16000,
        hop_length=320,
        n_fft=1024,
    )

    # --- post-processing / misc ---
    threshold = 0.3         # minimum averaged score for a species to be reported
    augm_spec_prob = 0.0    # stored by ResNet38 as spec_augm_prob
    res_type = "kaiser_best"  # resampling type used by TestDataset._load_audio_faster

In [None]:
def init_layer(layer):
    """Xavier-uniform initialise a layer's weight and zero its bias, if it has one.

    Args:
        layer: module exposing ``weight`` (and optionally ``bias``), e.g. a
            Linear or Convolutional layer.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.0)


def init_bn(bn):
    """Reset a Batchnorm layer's affine parameters: weight to 1, bias to 0."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.0)

    
class LogMel(nn.Module):
    """Waveform front end: torchlibrosa spectrogram -> mel filter bank -> scaled log.

    Args:
        params: mapping with the keys ``n_fft``, ``hop_length``, ``n_mels``,
            ``fmin`` and ``fmax``.
    """

    def __init__(self, params):
        super().__init__()
        cfg = Namespace(**params)

        self.spectrogram_extractor = torchlibrosa.stft.Spectrogram(
            n_fft=cfg.n_fft,
            hop_length=cfg.hop_length,
        )
        # The filter bank is built with is_log=False; the logarithm is taken
        # explicitly in forward(). The sample rate is fixed at 32 kHz here.
        self.logmel_extractor = torchlibrosa.stft.LogmelFilterBank(
            sr=32000,
            n_fft=cfg.n_fft,
            n_mels=cfg.n_mels,
            top_db=None,
            fmin=cfg.fmin,
            fmax=cfg.fmax,
            is_log=False,
        )

    def forward(self, sound):
        """Return ``log(mel + 1e-8) / 10`` for the given waveform batch."""
        mel = self.logmel_extractor(self.spectrogram_extractor(sound))
        # 1e-8 keeps the logarithm finite where the mel energy is zero.
        return torch.log(1e-8 + mel) / 10.

class AttBlock(nn.Module):
    """Attention pooling over the time axis, one attention map per output class.

    Two parallel 1x1 convolutions produce attention logits (``att``) and
    per-segment class scores (``cla``). The attention logits are clamped,
    softmax-normalised over time and used as weights to sum the class scores.

    Args:
        n_in: number of input channels.
        n_out: number of output classes.
        activation: ``"linear"`` or ``"sigmoid"``; applied to the class scores.
        temperature: stored on the module; not used by ``forward``.
    """

    def __init__(self, n_in, n_out, activation="linear", temperature=1.0):
        super(AttBlock, self).__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=n_in,
            out_channels=n_out,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=n_in,
            out_channels=n_out,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        # Not used in forward(); kept so the module's parameter names stay
        # the same as before (checkpoints may contain these keys).
        self.bn_att = nn.BatchNorm1d(n_out)
        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions and the batch-norm layer."""
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool ``x`` over time.

        Args:
            x: tensor of shape (n_samples, n_in, n_time).

        Returns:
            Tuple ``(pooled, norm_att, cla)`` where ``pooled`` is
            (n_samples, n_out) and the other two are (n_samples, n_out, n_time).
        """
        # Clamping bounds the logits so the softmax cannot saturate on one frame.
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: if ``self.activation`` is neither ``"linear"`` nor
                ``"sigmoid"`` (previously this silently returned ``None``).
        """
        if self.activation == "linear":
            return x
        if self.activation == "sigmoid":
            return torch.sigmoid(x)
        raise ValueError(
            "Unsupported activation {!r}; expected 'linear' or 'sigmoid'".format(
                self.activation
            )
        )


class ConvBlock(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages followed by a pooling step.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by both convolutions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        # Both convolutions keep the spatial size (3x3, stride 1, padding 1)
        # and have no bias because a batch-norm follows each of them.
        shared = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **shared)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **shared)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Initialise the convolutions first, then the batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
        """Run both conv stages and pool the result.

        Args:
            input: feature map fed to the first convolution.
            pool_size: kernel size of the pooling step.
            pool_type: ``"max"``, ``"avg"`` or ``"avg+max"`` (sum of both).

        Raises:
            Exception: if ``pool_type`` is none of the supported values.
        """
        feats = F.relu_(self.bn1(self.conv1(input)))
        feats = F.relu_(self.bn2(self.conv2(feats)))

        if pool_type == "max":
            return F.max_pool2d(feats, kernel_size=pool_size)
        if pool_type == "avg":
            return F.avg_pool2d(feats, kernel_size=pool_size)
        if pool_type == "avg+max":
            averaged = F.avg_pool2d(feats, kernel_size=pool_size)
            maxed = F.max_pool2d(feats, kernel_size=pool_size)
            return averaged + maxed
        raise Exception("Incorrect argument!")


def interpolate(x, ratio):
    """Upsample along the time axis by repeating every time step ``ratio`` times.

    Used to undo the temporal resolution lost to the CNN's downsampling.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    n_batch, n_steps, n_classes = x.shape
    # Insert a length-1 axis after time, tile it, then fold it back into time.
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.reshape(n_batch, n_steps * ratio, n_classes)


def pad_framewise_output(framewise_output, frames_num):
    """Extend framewise_output along time up to ``frames_num`` frames.

    The padding repeats the value of the last frame.

    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames the result should have
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1:, :]

    # tensor for padding: the last frame tiled `missing` times along time
    tail = last_frame.repeat(1, missing, 1)

    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, tail), dim=1)


class Cnn14_DecisionLevelAtt(nn.Module):
    """Six-ConvBlock CNN over log-mel features with attention pooling (AttBlock).

    Args:
        classes_num: number of output classes.
        config: object exposing ``melspectrogram_parameters`` (read for
            ``n_mels`` and forwarded to LogMel).
    """

    def __init__(self, classes_num, config):
        super().__init__()
        self.interpolate_ratio = 32  # Downsampled ratio

        mel_params = config.melspectrogram_parameters
        self.bn0 = nn.BatchNorm2d(mel_params["n_mels"])

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        # The attention block applies a sigmoid to its class scores itself.
        self.att_block = AttBlock(2048, classes_num, activation="sigmoid")
        self.logmel = LogMel(mel_params)

        self.init_weight()

    def init_weight(self):
        """Initialise the layers that do not initialise themselves."""
        init_bn(self.bn0)
        init_layer(self.fc1)

    def forward(self, x):
        """Return the clip-level output of the attention block for waveforms ``x``.

        Only the clipwise output is returned; the framewise output (which
        would use interpolate / pad_framewise_output) is not computed.
        """
        x = self.logmel(x)

        # bn0 normalises over dim 3 of the log-mel tensor (it is built with
        # n_mels features), so that axis is swapped into the channel slot.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        # Blocks 1-5 halve both spatial axes; each is followed by dropout.
        halving_blocks = (
            self.conv_block1,
            self.conv_block2,
            self.conv_block3,
            self.conv_block4,
            self.conv_block5,
        )
        for block in halving_blocks:
            x = block(x, pool_size=(2, 2), pool_type="avg")
            x = F.dropout(x, p=0.2, training=self.training)

        # The last block keeps the resolution (1x1 pooling).
        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
        x = F.dropout(x, p=0.2, training=self.training)

        # Average away the last axis, then smooth along the remaining one
        # with the sum of a max- and an average-pool of width 3.
        x = torch.mean(x, dim=3)
        x = (
            F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
            + F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        )
        x = F.dropout(x, p=0.5, training=self.training)

        # fc1 acts on the channel axis, hence the transposes around it.
        x = F.relu_(self.fc1(x.transpose(1, 2))).transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        clipwise_output, _, _ = self.att_block(x)
        return clipwise_output

In [None]:
def _resnet_conv3x3(in_planes, out_planes):
    """Build a bias-free 3x3 convolution (stride 1, padding 1)."""
    # stride, groups and dilation are left at nn.Conv2d's defaults of 1.
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=1, bias=False)


def _resnet_conv1x1(in_planes, out_planes):
    """Build a bias-free 1x1 convolution with stride 1."""
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=1,
        stride=1,
        bias=False,
    )


class _ResnetBasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Downsampling for ``stride == 2`` is done with a 2x2 average pool in front
    of the first convolution; the convolutions themselves always use stride 1.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        if groups != 1 or base_width != 64:
            raise ValueError('_ResnetBasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in _ResnetBasicBlock")

        self.stride = stride

        self.conv1 = _resnet_conv3x3(inplanes, planes)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = _resnet_conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        # Optional module applied to the shortcut so it matches the main path.
        self.downsample = downsample

        self.init_weights()

    def init_weights(self):
        """Initialise convs and norms; the last norm's scale is then zeroed."""
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            init_layer(conv)
            init_bn(norm)
        # With a zero scale on bn2 the residual branch initially outputs
        # only bn2's bias, i.e. zero after init_bn.
        nn.init.constant_(self.bn2.weight, 0)

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        shortcut = x

        out = F.avg_pool2d(x, kernel_size=(2, 2)) if self.stride == 2 else x

        out = self.relu(self.bn1(self.conv1(out)))
        out = F.dropout(out, p=0.1, training=self.training)
        out = self.bn2(self.conv2(out))

        if self.downsample is not None:
            shortcut = self.downsample(shortcut)

        out += shortcut
        return self.relu(out)


class _ResnetBottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions.

    The output has ``planes * expansion`` channels. For ``stride == 2`` the
    main path is downsampled by a 2x2 average pool before the first conv.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        width = int(planes * (base_width / 64.)) * groups
        out_planes = planes * self.expansion

        self.stride = stride

        self.conv1 = _resnet_conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = _resnet_conv3x3(width, width)
        self.bn2 = norm_layer(width)
        self.conv3 = _resnet_conv1x1(width, out_planes)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        # Optional module applied to the shortcut so it matches the main path.
        self.downsample = downsample

        self.init_weights()

    def init_weights(self):
        """Initialise convs and norms; the last norm's scale is then zeroed."""
        pairs = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in pairs:
            init_layer(conv)
            init_bn(norm)
        nn.init.constant_(self.bn3.weight, 0)

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        # The shortcut starts from the un-pooled input.
        shortcut = x
        pooled = F.avg_pool2d(x, kernel_size=(2, 2)) if self.stride == 2 else x

        out = self.relu(self.bn1(self.conv1(pooled)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = F.dropout(out, p=0.1, training=self.training)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            shortcut = self.downsample(shortcut)

        out += shortcut
        return self.relu(out)


class _ResNet(nn.Module):
    """Stack of four residual stages with 64, 128, 256 and 512 base planes.

    Args:
        block: residual block class (exposes ``expansion``).
        layers: number of blocks in each of the four stages.
        zero_init_residual: accepted but not read by this class.
        groups, width_per_group: forwarded to every block.
        replace_stride_with_dilation: ``None`` or three booleans, one per
            strided stage, requesting dilation instead of stride.
        norm_layer: normalisation layer factory; defaults to ``nn.BatchNorm2d``.
    """

    def __init__(self, block, layers, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super().__init__()

        self._norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element indicates whether the 2x2 stride of the matching
            # stage should be replaced with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        dilate2, dilate3, dilate4 = replace_stride_with_dilation
        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=dilate2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=dilate3)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=dilate4)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``."""
        norm_layer = self._norm_layer
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        out_planes = planes * block.expansion

        # The shortcut needs a projection when the stage changes resolution
        # or channel count; it is only built for strides 1 and 2.
        downsample = None
        needs_projection = stride != 1 or self.inplanes != out_planes
        if needs_projection and stride in (1, 2):
            projection = _resnet_conv1x1(self.inplanes, out_planes)
            norm = norm_layer(out_planes)
            if stride == 1:
                downsample = nn.Sequential(projection, norm)
            else:
                downsample = nn.Sequential(nn.AvgPool2d(kernel_size=2), projection, norm)
            init_layer(projection)
            init_bn(norm)

        # Only the first block of a stage may downsample.
        first = block(self.inplanes, planes, stride, downsample, self.groups,
                      self.base_width, previous_dilation, norm_layer)
        self.inplanes = out_planes
        remaining = [
            block(self.inplanes, planes, groups=self.groups,
                  base_width=self.base_width, dilation=self.dilation,
                  norm_layer=norm_layer)
            for _ in range(1, blocks)
        ]

        return nn.Sequential(first, *remaining)

    def forward(self, x):
        """Run the four stages in order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x

class ResNet38(nn.Module):
    """ConvBlock stem + _ResNet([3, 4, 6, 3]) + ConvBlock head with a linear classifier.

    Args:
        classes_num: number of output classes.
        config: object exposing ``nmels``, ``melspectrogram_parameters``,
            ``max_duration`` and ``augm_spec_prob``.
    """

    def __init__(self, classes_num, config):
        super(ResNet38, self).__init__()

        self.bn0 = nn.BatchNorm2d(config.nmels)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)

        self.resnet = _ResNet(block=_ResnetBasicBlock, layers=[3, 4, 6, 3], zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=512, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048)
        self.classifier = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()

        self.logmel = LogMel(config.melspectrogram_parameters)
        self.spec_augm = torchlibrosa.augmentation.SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2 * (config.max_duration // 5),
            freq_drop_width=8,
            freq_stripes_num=2,
        )
        # Stored for reference; forward() uses a fixed 0.33 threshold instead.
        self.spec_augm_prob = config.augm_spec_prob

    def init_weights(self):
        """Initialise the layers that do not initialise themselves."""
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.classifier)

    def forward(self, input):
        """Return raw (pre-sigmoid) clip-level scores for the waveforms ``input``."""
        x = self.logmel(input)
        if self.training:
            # Per-sample switch: True keeps the clean spectrogram, False takes
            # the SpecAugment-ed one. The mask is moved to the device of `x`
            # (the code previously referenced an undefined global `device`).
            keep_clean = (torch.rand(x.size(0)) > 0.33).to(x.device)
            x = torch.where(keep_clean.view(-1, 1, 1, 1), x, self.spec_augm(x))

        # bn0 normalises over dim 3 of the log-mel tensor, so that axis is
        # swapped into the channel slot and back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.resnet(x)
        x = F.avg_pool2d(x, kernel_size=(2, 2))
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.conv_block_after1(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = torch.mean(x, dim=3)

        # Global pooling over the remaining axis: max + mean.
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        # The classifier consumes the fc1 features directly (no dropout in
        # between), exactly as before; the unused dropped-out copy is gone.
        clipwise_output = self.classifier(x)

        return clipwise_output

In [None]:
def get_model(config):
    """Build the full ensemble: six architectures with five fold checkpoints each.

    ``config`` is mutated along the way (``model`` for every entry, plus
    ``vertical_encoding`` / ``batch_norm`` for the EfficientNet entries)
    because the model constructors receive it.

    Returns:
        Three parallel lists ``(models, sigmoids, weights)``: the eval-mode
        CUDA models, whether a sigmoid must be applied to each model's
        output, and each model's ensemble weight.
    """

    def effnet(factory):
        # EfficientNet variants expect these two flags on the config.
        def build(cfg):
            cfg.vertical_encoding = True
            cfg.batch_norm = True
            return factory(cfg, pretrained=False, num_classes=cfg.num_classes)
        return build

    def plain(model_cls):
        return lambda cfg: model_cls(cfg.num_classes, cfg)

    # (config.model tag, builder, checkpoint directory, apply sigmoid, weight)
    ensemble = (
        ("b4", effnet(tf_efficientnet_b4_ns), "blend5-b4-best", True, 1),  # b4_128_xeno_pshift_bg
        ("b3", effnet(tf_efficientnet_b3_ns), "blend5-b3-london", True, 1),
        ("b2", effnet(tf_efficientnet_b2_ns), "blend5-b2-xbet", True, 1),
        ("b0", effnet(tf_efficientnet_b0_ns), "blend5-b0-detka", True, 1),
        # The attention head of this model already applies a sigmoid.
        ("cnn14_att", plain(Cnn14_DecisionLevelAtt), "blend5-cnn14-mega", False, 2),
        ("r38", plain(ResNet38), "blend5-r38-oxford", True, 2),
    )

    models, sigmoids, weights = [], [], []
    for tag, build, checkpoint_dir, use_sigmoid, weight in ensemble:
        for fold in range(5):
            config.model = tag
            model = build(config).cuda()

            state_dict = torch.load(f"../input/{checkpoint_dir}/fold_{fold}_test_1_05.pth")
            # strict=False: missing or unexpected checkpoint keys are tolerated.
            model.load_state_dict(state_dict, strict=False)
            model.eval()

            models.append(model)
            sigmoids.append(use_sigmoid)
            weights.append(weight)

    return models, sigmoids, weights

In [None]:
from librosa.core.audio import __audioread_load as audioread_load

In [None]:
class TestItem:
    """Everything the inference loop needs for one test recording.

    Attributes are the audio id, its site, the rows of the test table that
    belong to it, and the chunked audio converted to a torch tensor.
    """

    def __init__(self, audio_id, site, df, spectrograms):
        self.spectrograms = torch.from_numpy(spectrograms)
        self.df = df
        self.site = site
        self.audio_id = audio_id

    def pin_memory(self):
        """Move the tensor to pinned memory in place and return ``self``."""
        pinned = self.spectrograms.pin_memory()
        self.spectrograms = pinned
        return self

    def __len__(self):
        """Number of test-table rows attached to this recording."""
        return len(self.df)


class TestDataset(data.Dataset):
    """One item per unique ``audio_id`` of the test table.

    Each item is a TestItem holding the recording cut into fixed-length
    chunks of ``config.max_duration`` seconds at ``config.sample_rate``.

    Args:
        test: test table with at least the columns ``audio_id`` and ``site``.
        config: object exposing ``sample_rate`` and ``max_duration`` (and
            optionally ``res_type`` for resampling).
    """

    def __init__(self, test, config):
        self.test = test
        self.config = config
        self.unique_audio_id = test.audio_id.unique()

    def __len__(self):
        return len(self.unique_audio_id)

    def _load_audio(self, filename):
        """Load ``<filename>.mp3`` as mono float32 at the configured sample rate."""
        clip, _ = librosa.load(
            TEST_AUDIO_DIR / (filename + ".mp3"), sr=self.config.sample_rate, mono=True
        )

        return clip.astype(np.float32)

    def _load_audio_faster(self, filename):
        """Same result type as ``_load_audio`` but decoding through the
        module-level ``audioread_load`` alias and resampling explicitly."""
        path = TEST_AUDIO_DIR / (filename + '.mp3')
        clip, sr_native = audioread_load(path, offset=0.0, duration=None, dtype=np.float32)
        clip = librosa.to_mono(clip)
        if sr_native > 0:
            # Keyword arguments: newer librosa releases no longer accept the
            # sample rates positionally.
            clip = librosa.resample(
                clip,
                orig_sr=sr_native,
                target_sr=self.config.sample_rate,
                res_type=getattr(self.config, "res_type", "kaiser_best"),
            )

        return clip.astype(np.float32)

    def _transform(self, audio, max_chunks):
        """Cut ``audio`` into ``max_chunks`` rows of one chunk length each.

        Audio longer than needed is truncated; audio shorter than needed is
        zero-padded at the end (previously this raised on reshape).

        Returns:
            Array of shape (max_chunks, sample_rate * max_duration).
        """
        chunk_len = self.config.sample_rate * self.config.max_duration
        wanted = chunk_len * max_chunks

        audio = audio[:wanted]
        shortfall = wanted - len(audio)
        if shortfall > 0:
            audio = np.pad(audio, (0, shortfall), mode="constant")

        return audio.reshape(max_chunks, chunk_len)

    def __getitem__(self, idx: int):
        audio_id = self.unique_audio_id[idx]
        # Boolean mask on the dataset's own table (not the global one); this
        # also works for ids that contain quote characters.
        rows = self.test[self.test.audio_id == audio_id].reset_index(drop=True)
        site = rows.iloc[0].site
        audio = self._load_audio_faster(audio_id)

        if site != "site_3":
            # One chunk per row of the test table for this recording.
            num_chunks = len(rows)
        else:
            # site_3: as many whole chunks as the recording holds, but at
            # least one so that very short clips still yield a prediction.
            chunk_len = self.config.sample_rate * self.config.max_duration
            num_chunks = max(1, len(audio) // chunk_len)

        specs = self._transform(audio, num_chunks)
        return TestItem(audio_id, site, rows, specs)

def mo_(output):
    """Return the first element of a plain tuple output, anything else unchanged."""
    # Exact type check on purpose: tuple subclasses are passed through as-is.
    return output[0] if type(output) == tuple else output
    
def prediction_for_clip(model, spectrograms, config):
    """Run ``model`` over all chunks of one recording in mini-batches.

    Args:
        model: module to evaluate; it is switched to eval mode.
        spectrograms: tensor whose first axis indexes the chunks.
        config: object exposing ``batch_size`` and ``sigmoid``.

    Returns:
        The concatenated model outputs, passed through a sigmoid when
        ``config.sigmoid`` is truthy.
    """
    model.eval()

    step = config.batch_size
    n_batches = math.ceil(spectrograms.shape[0] / step)

    with torch.no_grad():
        outputs = []
        for start in range(0, n_batches * step, step):
            chunk = spectrograms[start:start + step]
            if chunk.ndim == 3:
                chunk = chunk.unsqueeze(0)

            # mo_ unwraps models that return a tuple.
            outputs.append(mo_(model(chunk)))

        merged = torch.cat(outputs, dim=0)
        return merged.sigmoid() if config.sigmoid else merged

In [None]:
# Build the whole ensemble once; `sigmoids` and `weights` are parallel to `models`.
models, sigmoids, weights = get_model(model_config)

In [None]:
predictions_top = []   # one averaged score vector per submission row
test_items_top = []    # row_id matching each entry of predictions_top

# No batch_size is given, and the identity collate_fn hands each batch over
# as a plain list, from which the TestItem is taken by index below.
test_loader = torch.utils.data.DataLoader(
    TestDataset(test, model_config),
    collate_fn=lambda x: x,
    num_workers=4,
    shuffle=False,
    pin_memory=True,
)

for loaded in tqdm.tqdm(test_loader, disable=False):
    test_item = loaded[0]
    spectrograms = test_item.spectrograms.cuda()

    predictions_all = []
    for model, to_sigmoid, weight in zip(models, sigmoids, weights):
        # prediction_for_clip reads the sigmoid switch from the shared config.
        model_config.sigmoid = to_sigmoid
        predictions = prediction_for_clip(model, spectrograms, model_config)
        predictions_all.append(predictions)

        # A weight of 2 is realised by counting the model twice in the mean.
        if weight == 2:
            predictions_all.append(predictions)

    predictions = torch.stack(predictions_all, dim=0).mean(dim=0)
    predictions = predictions.detach().cpu().numpy()

    if test_item.site == "site_3":
        # site_3 recordings get a single row: take the per-class maximum
        # over all chunks and keep a leading axis of length 1.
        predictions = predictions.max(axis=0, keepdims=True)

    assert len(predictions) == len(test_item)

    predictions_top.extend(predictions)
    test_items_top.extend(test_item.df.row_id.values)

In [None]:
# Threshold every averaged score vector and translate the surviving class
# indices back to ebird codes; rows without any hit are labelled "nocall".
prediction_dict = {}
for row_id, scores in zip(test_items_top, predictions_top):
    labels = np.argwhere(scores >= model_config.threshold).ravel().tolist()

    if labels:
        prediction_dict[row_id] = " ".join(label_encoder.inverse_transform(labels))
    else:
        prediction_dict[row_id] = "nocall"

In [None]:
# Assemble the submission table (one row per row_id) and write it to disk.
row_id = list(prediction_dict)
birds = list(prediction_dict.values())
submission = pd.DataFrame({"row_id": row_id, "birds": birds})
submission.to_csv("submission.csv", index=False)

In [None]:
submission.head(50)