# Inference Kernel

## Initialization

### Imports

In [None]:
import os
import cv2
import sys
import time
import math

import random
import librosa
import warnings
import torchaudio
import torchvision
import numpy as np
import pandas as pd
import typing as tp
import IPython.display as ipd
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torch.nn.modules.utils import _pair
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU


# Let pandas display up to 500 rows and 500 columns before truncating
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

### Utils

In [None]:
def load_audio(path, sr):
    """
    Reads an audio file with librosa as a mono signal at the requested rate

    Arguments:
        path {str or Path} -- Location of the audio file
        sr {int} -- Sampling rate passed to librosa

    Returns:
        Samples returned by librosa.load (the returned rate is discarded)
    """
    return librosa.load(path, sr=sr, mono=True, res_type="kaiser_fast")[0]

In [None]:
def load_model_weights(model, weights):
    """
    Loads a saved state dict into a model, in place

    Arguments:
        model {torch module} -- Model receiving the weights
        weights {str or Path} -- Path to the file saved with the state dict
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    # map_location lets GPU-saved checkpoints be read on CPU-only machines
    checkpoint = torch.load(weights, map_location=torch.device(target))
    model.load_state_dict(checkpoint)

In [None]:
def seed_everything(seed):
    """
    Seeds the Python, NumPy and PyTorch random generators for reproducibility of results

    Arguments:
        seed {int} -- Value given to every generator
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = True  # autotuner stays enabled here (the alternative is False)

## Data

In [None]:
# Seed every random generator once, before anything else runs
SEED = 1213
seed_everything(SEED)

In [None]:
# Input folders, resolved from the parent of the current working directory
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"
RAW_DATA = INPUT_ROOT / "birdsong-recognition"
TRAIN_AUDIO_DIR = RAW_DATA / "train_audio"  # listed below to build CLASSES

In [None]:
# Load train.csv from the raw data folder
train = pd.read_csv(RAW_DATA / "train.csv")

In [None]:
# Prefer the test audio shipped with the raw data; when that folder is absent,
# fall back to the "birdcall-check" folder. test.csv sits next to the audio
# folder in both layouts.
TEST_AUDIO_DIR = RAW_DATA / "test_audio"

if not TEST_AUDIO_DIR.exists():
    TEST_AUDIO_DIR = INPUT_ROOT / "birdcall-check" / "test_audio"

test = pd.read_csv(TEST_AUDIO_DIR.parent / "test.csv")

## Parameters

In [None]:
# Class names are the sorted entries of the training audio folder; their
# position in this list is the class index used by the post-processing code
CLASSES = sorted(os.listdir(TRAIN_AUDIO_DIR))
NUM_CLASSES = len(CLASSES)
NUM_WORKERS = 4  # passed as num_workers to the DataLoader in predict()

In [None]:
class AudioParams:
    """Audio and spectrogram settings read by the dataset and inference code."""
    sr = 32000  # sampling rate used when loading audio and computing melspectrograms
    stride = 5  # not read anywhere in the visible code
    true_kernel_size = 5  # window length in seconds used by convert_site_3

    img_size = None  # target height given to resize(); None keeps the spectrogram size
    
    # Melspectrogram
    n_mels = 128  # forwarded as n_mels to librosa's melspectrogram
    fmin = 20  # forwarded as fmin
    fmax = 16000  # forwarded as fmax

## Dataset

In [None]:
def convert_site_3(df, clip_length, params):
    """
    Builds one row per full window for a site_3 recording

    Arguments:
        df {pandas dataframe} -- Rows of the recording; only the first audio_id is read
        clip_length {int} -- Number of samples in the recording
        params {AudioParams-like} -- Provides sr and true_kernel_size (seconds)

    Returns:
        pandas dataframe -- Columns site, row_id, seconds (window end) and audio_id
    """
    window = params.true_kernel_size
    n_windows = clip_length // (params.sr * window)  # a trailing partial window is dropped
    clip_id = df['audio_id'].values[0]

    end_times = [(k + 1) * window for k in range(n_windows)]

    return pd.DataFrame(data={
        'site': ['site_3'] * n_windows,
        'row_id': [f'site_3_{clip_id}_{int(t)}' for t in end_times],
        'seconds': end_times,
        'audio_id': [clip_id] * n_windows,
    })

In [None]:
def compute_melspec(y, params):
    """
    Computes a mel spectrogram in decibels

    Arguments:
        y {numpy array} -- Audio samples
        params {AudioParams-like} -- Provides sr, n_mels, fmin and fmax

    Returns:
        numpy array -- float32 spectrogram converted with librosa.power_to_db
    """
    # The waveform is passed by keyword: recent librosa releases no longer
    # accept it positionally, and older ones accept the keyword as well.
    melspec = librosa.feature.melspectrogram(
        y=y,
        sr=params.sr,
        n_mels=params.n_mels,
        fmin=params.fmin,
        fmax=params.fmax
    )

    return librosa.power_to_db(melspec).astype(np.float32)

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a single-channel array into a 3-channel uint8 image

    The input is repeated on a new last axis, standardized, then min-max
    scaled to [0, 255].

    Arguments:
        X {numpy array} -- Single-channel input (e.g. a spectrogram)

    Keyword Arguments:
        eps {float} -- Guards the division and the flat-image check (default: {1e-6})
        mean {float or array} -- Mean for standardization; computed from X when None (default: {None})
        std {float or array} -- Std for standardization; computed from X when None (default: {None})

    Returns:
        numpy array -- uint8 image with a trailing axis of size 3; all zeros when the input is flat
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Explicit None checks: "mean or X.mean()" would discard a
    # legitimate 0 and raise on array-valued statistics.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = (255 * (X - _min) / (_max - _min)).astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def resize(image, size=None):
    """
    Rescales a 3-dimensional image to a given height, keeping its aspect ratio

    Arguments:
        image {numpy array} -- Image whose shape unpacks as (height, width, channels)

    Keyword Arguments:
        size {int} -- Target height; None returns the image untouched (default: {None})

    Returns:
        numpy array -- Resized image, or the input itself when size is None
    """
    if size is None:
        return image

    height, width, _channels = image.shape
    target = (int(width * size / height), size)  # passed to cv2 as (width, height)
    return cv2.resize(image, target)


def normalize(image, mean=None, std=None):
    """
    Scales an image by 1/255, optionally standardizes it, and moves axis 2 to the front

    Arguments:
        image {numpy array} -- Image with channels on axis 2

    Keyword Arguments:
        mean {float or array} -- Mean to subtract; used only together with std (default: {None})
        std {float or array} -- Std to divide by; used only together with mean (default: {None})

    Returns:
        numpy array -- float32 array with the former axis 2 first
    """
    scaled = image / 255.0

    if not (mean is None or std is None):
        scaled = (scaled - mean) / std

    channels_first = np.moveaxis(scaled, 2, 0)
    return channels_first.astype(np.float32)

In [None]:
class TestDataset(data.Dataset):
    """
    Dataset yielding one spectrogram image per window of a single recording

    Arguments:
        df {pandas dataframe} -- Rows of the recording, with 'site' and 'seconds' (window end) columns
        clip {numpy array} -- Audio samples of the whole recording
        params {AudioParams-like} -- Provides sr, img_size and true_kernel_size

    For site_3 recordings the rows are replaced by the window table built by
    convert_site_3.
    """
    def __init__(self, df, clip, params):
        self.df = df
        self.clip = clip
        self.params = params

        if df['site'].values[0] == 'site_3':
            self.df = convert_site_3(df, len(clip), params)

    def __len__(self):
        return len(self.df)

    def _crop(self, idx):
        """Returns the float32 samples of the idx-th window."""
        # Positional lookup, so a dataframe with a non-default index also works
        end_seconds = int(self.df['seconds'].iloc[idx])
        # Same window length as convert_site_3; 5 s when params does not define it
        window = getattr(self.params, 'true_kernel_size', 5)
        # Clamp at 0: a negative start would wrap around to the end of the clip
        start_seconds = max(int(end_seconds - window), 0)

        start_index = self.params.sr * start_seconds
        end_index = self.params.sr * end_seconds

        return self.clip[start_index:end_index].astype(np.float32)

    def __getitem__(self, idx):
        y = self._crop(idx)

        melspec = compute_melspec(y, self.params)

        image = mono_to_color(melspec)
        image = resize(image, self.params.img_size)
        image = normalize(image, mean=None, std=None)

        return image

## Modeling


### ResNeSt
> From https://github.com/zhanghang1989/ResNeSt

In [None]:
class SplAtConv2d(Module):
    """Split-Attention Conv2d

    A grouped convolution produces ``radix`` feature splits; an attention
    vector computed from their pooled sum re-weights the splits before they
    are summed back into ``channels`` output channels.
    """
    def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0),
                 dilation=(1, 1), groups=1, bias=True,
                 radix=2, reduction_factor=4,
                 rectify=False, rectify_avg=False, norm_layer=None,
                 dropblock_prob=0.0, **kwargs):
        super(SplAtConv2d, self).__init__()
        padding = _pair(padding)
        self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
        self.rectify_avg = rectify_avg
        inter_channels = max(in_channels*radix//reduction_factor, 32)
        self.radix = radix
        self.cardinality = groups
        self.channels = channels
        self.dropblock_prob = dropblock_prob
        if self.rectify:
            # Optional dependency, imported only when rectified convolutions are requested
            from rfconv import RFConv2d
            self.conv = RFConv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
                                 groups=groups*radix, bias=bias, average_mode=rectify_avg, **kwargs)
        else:
            self.conv = Conv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
                               groups=groups*radix, bias=bias, **kwargs)
        self.use_bn = norm_layer is not None
        if self.use_bn:
            self.bn0 = norm_layer(channels*radix)
        self.relu = ReLU(inplace=True)
        self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality)
        if self.use_bn:
            self.bn1 = norm_layer(inter_channels)
        self.fc2 = Conv2d(inter_channels, channels*radix, 1, groups=self.cardinality)
        if dropblock_prob > 0.0:
            self.dropblock = DropBlock2D(dropblock_prob, 3)
        self.rsoftmax = rSoftMax(radix, groups)

    def forward(self, x):
        x = self.conv(x)
        if self.use_bn:
            x = self.bn0(x)
        if self.dropblock_prob > 0.0:
            x = self.dropblock(x)
        x = self.relu(x)

        batch, rchannel = x.shape[:2]
        # Channels per split. The former torch.__version__ string comparison is
        # gone: it ordered versions lexicographically ('1.10' < '1.5') and both
        # of its branches split on the same size.
        split_size = rchannel // self.radix
        if self.radix > 1:
            splited = torch.split(x, split_size, dim=1)
            gap = sum(splited)
        else:
            gap = x
        gap = F.adaptive_avg_pool2d(gap, 1)
        gap = self.fc1(gap)

        if self.use_bn:
            gap = self.bn1(gap)
        gap = self.relu(gap)

        atten = self.fc2(gap)
        atten = self.rsoftmax(atten).view(batch, -1, 1, 1)

        if self.radix > 1:
            attens = torch.split(atten, split_size, dim=1)
            out = sum([att*split for (att, split) in zip(attens, splited)])
        else:
            out = atten * x
        return out.contiguous()


class rSoftMax(nn.Module):
    """Softmax across the radix splits, or a sigmoid when radix is 1."""
    def __init__(self, radix, cardinality):
        super().__init__()
        self.radix = radix
        self.cardinality = cardinality

    def forward(self, x):
        batch = x.size(0)
        if self.radix > 1:
            # (batch, cardinality, radix, -1) -> radix moved to dim 1 for the softmax
            x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2)
            x = F.softmax(x, dim=1)
            x = x.reshape(batch, -1)
        else:
            x = torch.sigmoid(x)
        return x

In [None]:
class DropBlock2D:
    """Placeholder for DropBlock: instantiating it always raises NotImplementedError."""

    def __init__(self, *args, **kwargs):
        raise NotImplementedError

class GlobalAvgPool2d(nn.Module):
    """Global average pooling over the input's spatial dimensions"""

    def __init__(self):
        super().__init__()

    def forward(self, inputs):
        # Pool each feature map down to 1x1, then flatten everything after dim 0
        pooled = F.adaptive_avg_pool2d(inputs, 1)
        return pooled.view(inputs.shape[0], -1)

class Bottleneck(nn.Module):
    """ResNet Bottleneck

    Residual block made of a 1x1 convolution, a 3x3 stage (``conv2``) and a
    1x1 convolution expanding to ``planes * 4`` channels. The result is added
    to the input, which goes through ``downsample`` first when one is given.
    """
    # pylint: disable=unused-argument
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 radix=1, cardinality=1, bottleneck_width=64,
                 avd=False, avd_first=False, dilation=1, is_first=False,
                 rectified_conv=False, rectify_avg=False,
                 norm_layer=None, dropblock_prob=0.0, last_gamma=False):
        """Builds the layers of the block; norm_layer must be provided since it is called unconditionally."""
        super(Bottleneck, self).__init__()
        group_width = int(planes * (bottleneck_width / 64.)) * cardinality
        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False)
        self.bn1 = norm_layer(group_width)
        self.dropblock_prob = dropblock_prob
        self.radix = radix
        # The average-pooling downsample is only used when the block strides or is flagged as first
        self.avd = avd and (stride > 1 or is_first)
        self.avd_first = avd_first

        if self.avd:
            # The pooling layer takes over the stride, so conv2 runs with stride 1
            self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
            stride = 1

        if dropblock_prob > 0.0:
            self.dropblock1 = DropBlock2D(dropblock_prob, 3)
            # NOTE(review): dropblock2 is created for radix == 1 but forward only
            # uses it in the radix == 0 branch -- confirm against upstream
            if radix == 1:
                self.dropblock2 = DropBlock2D(dropblock_prob, 3)
            self.dropblock3 = DropBlock2D(dropblock_prob, 3)

        if radix >= 1:
            # Split-attention convolution (applies its own normalization and ReLU)
            self.conv2 = SplAtConv2d(
                group_width, group_width, kernel_size=3,
                stride=stride, padding=dilation,
                dilation=dilation, groups=cardinality, bias=False,
                radix=radix, rectify=rectified_conv,
                rectify_avg=rectify_avg,
                norm_layer=norm_layer,
                dropblock_prob=dropblock_prob)
        elif rectified_conv:
            from rfconv import RFConv2d
            self.conv2 = RFConv2d(
                group_width, group_width, kernel_size=3, stride=stride,
                padding=dilation, dilation=dilation,
                groups=cardinality, bias=False,
                average_mode=rectify_avg)
            self.bn2 = norm_layer(group_width)
        else:
            self.conv2 = nn.Conv2d(
                group_width, group_width, kernel_size=3, stride=stride,
                padding=dilation, dilation=dilation,
                groups=cardinality, bias=False)
            self.bn2 = norm_layer(group_width)

        self.conv3 = nn.Conv2d(
            group_width, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm_layer(planes*4)

        if last_gamma:
            # Zero the scale of the last normalization layer
            from torch.nn.init import zeros_
            zeros_(self.bn3.weight)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        """Applies the three convolution stages and adds the residual connection."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        if self.dropblock_prob > 0.0:
            out = self.dropblock1(out)
        out = self.relu(out)

        # Average-pooling downsample placed before conv2 ...
        if self.avd and self.avd_first:
            out = self.avd_layer(out)

        out = self.conv2(out)
        # Only the plain-convolution variants need normalization and ReLU here
        if self.radix == 0:
            out = self.bn2(out)
            if self.dropblock_prob > 0.0:
                out = self.dropblock2(out)
            out = self.relu(out)

        # ... or after it, depending on avd_first
        if self.avd and not self.avd_first:
            out = self.avd_layer(out)

        out = self.conv3(out)
        out = self.bn3(out)
        if self.dropblock_prob > 0.0:
            out = self.dropblock3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNet(nn.Module):
    """ResNet Variants
    Parameters
    ----------
    block : Block
        Class for the residual block (Bottleneck in this file).
    layers : list of int
        Numbers of blocks in each of the four stages
    radix, groups, bottleneck_width : int
        Forwarded to every block (groups is passed as cardinality).
    num_classes : int, default 1000
        Number of classification classes.
    dilated : bool, default False
        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
        typically used in Semantic Segmentation.
    dilation : int, default 1
        2 or 4 select the dilated variants of the last stages.
    deep_stem : bool, default False
        Use three 3x3 convolutions instead of a single 7x7 one in the stem.
    stem_width : int, default 64
        Width of the deep stem.
    avg_down : bool, default False
        Use average pooling followed by a 1x1 convolution in the shortcut branches.
    avd, avd_first : bool
        Forwarded to every block.
    final_drop : float, default 0.0
        Dropout probability before the classifier; no dropout layer when 0.
    norm_layer : object
        Normalization layer used in backbone network (default: :class:`torch.nn.BatchNorm2d`).
    Reference:
        - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
    # pylint: disable=unused-variable
    def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64,
                 num_classes=1000, dilated=False, dilation=1,
                 deep_stem=False, stem_width=64, avg_down=False,
                 rectified_conv=False, rectify_avg=False,
                 avd=False, avd_first=False,
                 final_drop=0.0, dropblock_prob=0,
                 last_gamma=False, norm_layer=nn.BatchNorm2d):
        self.cardinality = groups
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        self.inplanes = stem_width*2 if deep_stem else 64
        self.avg_down = avg_down
        self.last_gamma = last_gamma
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        super(ResNet, self).__init__()
        self.rectified_conv = rectified_conv
        self.rectify_avg = rectify_avg
        if rectified_conv:
            from rfconv import RFConv2d
            conv_layer = RFConv2d
        else:
            conv_layer = nn.Conv2d
        conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
        # Stem: three 3x3 convolutions (deep stem) or a single 7x7 convolution
        if deep_stem:
            self.conv1 = nn.Sequential(
                conv_layer(3, stem_width, kernel_size=3, stride=2, padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width, kernel_size=3, stride=1, padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width*2, kernel_size=3, stride=1, padding=1, bias=False, **conv_kwargs),
            )
        else:
            self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3,
                                   bias=False, **conv_kwargs)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # The first two stages never receive dropblock_prob
        self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer, is_first=False)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
        # The last two stages trade stride for dilation in the dilated variants
        if dilated or dilation == 4:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=4, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        elif dilation==2:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           dilation=1, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        else:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        self.avgpool = GlobalAvgPool2d()
        self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialization: normal with std sqrt(2 / n) for convolutions,
        # scale 1 and bias 0 for normalization layers.
        # NOTE(review): this loop runs after the blocks are built, so it appears
        # to reset the zeroed bn3 scale set by last_gamma -- confirm against upstream
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, norm_layer):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None,
                    dropblock_prob=0.0, is_first=True):
        """Builds one stage of ``blocks`` residual blocks and updates self.inplanes."""
        downsample = None
        # A shortcut projection is needed whenever the resolution or the channel count changes
        if stride != 1 or self.inplanes != planes * block.expansion:
            down_layers = []
            if self.avg_down:
                if dilation == 1:
                    down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride,
                                                    ceil_mode=True, count_include_pad=False))
                else:
                    down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1,
                                                    ceil_mode=True, count_include_pad=False))
                down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
                                             kernel_size=1, stride=1, bias=False))
            else:
                down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
                                             kernel_size=1, stride=stride, bias=False))
            down_layers.append(norm_layer(planes * block.expansion))
            downsample = nn.Sequential(*down_layers)

        layers = []
        # The first block carries the stride and the shortcut; its own dilation
        # is 1 for stage dilation 1 or 2, and 2 for stage dilation 4
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, downsample=downsample,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=1, is_first=is_first, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, downsample=downsample,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=2, is_first=is_first, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))

        # Remaining blocks keep the resolution and use the stage dilation
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=dilation, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Runs the stem, the four stages, global pooling and the classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        #x = x.view(x.size(0), -1)
        x = torch.flatten(x, 1)
        if self.drop:
            x = self.drop(x)
        x = self.fc(x)

        return x

In [None]:
# Keyword arguments for the ResNet constructor, keyed by model name.
# get_model replaces the classifier afterwards, so "num_classes" here only
# sizes the initial fc layer.
MODEL_CONFIGS = {
    "resnest50_fast_1s1x64d":
    {
        "num_classes": 264,
        "block": Bottleneck,
        "layers": [3, 4, 6, 3],
        "radix": 1,
        "groups": 1,
        "bottleneck_width": 64,
        "deep_stem": True,
        "stem_width": 32,
        "avg_down": True,
        "avd" : True,
        "avd_first" : True
    },
    "resnest101":
    {
        "num_classes": 264,
        "block": Bottleneck,
        "layers": [3, 4, 23, 3],
        "radix": 2,
        "groups": 1,
        "bottleneck_width": 64,
        "deep_stem": True,
        "stem_width": 64,
        "avg_down": True,
        "avd" : True,
        "avd_first" : False
    },
}

### Model loader

In [None]:
def get_model(name):
    """
    Builds a network from its name and gives it a NUM_CLASSES-way classifier

    The name is matched by substring, in this order: "resnest" (built from
    MODEL_CONFIGS), "resnext101", "resnext50" (torchvision, pretrained=False).

    Arguments:
        name {str} -- Name of the model

    Raises:
        NotImplementedError: No known architecture matches the name

    Returns:
        torch module -- Model in eval mode, on GPU when available, else on CPU
    """
    builders = (
        ("resnest", lambda: ResNet(**MODEL_CONFIGS[name])),
        ("resnext101", lambda: torchvision.models.resnext101_32x8d(pretrained=False)),
        ("resnext50", lambda: torchvision.models.resnext50_32x4d(pretrained=False)),
    )
    build = next((fn for key, fn in builders if key in name), None)
    if build is None:
        raise NotImplementedError
    model = build()

    # Swap the classification head for one sized to the competition classes
    in_features = model.fc.in_features
    del model.fc
    model.fc = nn.Linear(in_features, NUM_CLASSES)

    target = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(torch.device(target))
    model.eval()

    return model

## Inference

### Predict

In [None]:
def predict(model, dataset, batch_size=16):
    """
    Runs a model over a dataset and returns sigmoid probabilities

    Batches are sent to the device the model lives on, so this also works on
    CPU-only machines.

    Arguments:
        model {torch module} -- Model to predict with
        dataset {torch dataset} -- Dataset yielding model inputs

    Keyword Arguments:
        batch_size {int} -- Batch size (default: {16})

    Returns:
        numpy array -- Probabilities of shape (len(dataset), NUM_CLASSES)
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA
    param = next(model.parameters(), None)
    if param is not None:
        device = param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=device.type == "cuda",  # pinning is only useful for GPU transfers
    )

    # The empty float64 head keeps the (0, NUM_CLASSES) result for an empty
    # dataset; concatenating once avoids re-copying the array at every batch.
    batches = [np.empty((0, NUM_CLASSES))]
    with torch.no_grad():
        for x in loader:
            y_pred = model(x.to(device)).detach()
            batches.append(torch.sigmoid(y_pred).cpu().numpy())

    return np.concatenate(batches)

### Post-process

In [None]:
def post_process_site_12(preds, threshold=0.5, maxpreds=3):
    """
    Turns per-window probabilities into one label string per window

    Probabilities below the threshold are zeroed, each window is scored with
    half of its previous and next windows added, and at most maxpreds classes
    whose score reaches the threshold are kept, best first.

    Arguments:
        preds {numpy array} -- Probabilities of shape (n_windows, n_classes)

    Keyword Arguments:
        threshold {float} -- Minimum probability / score to keep a class (default: {0.5})
        maxpreds {int} -- Maximum number of classes per window (default: {3})

    Returns:
        list of str -- Space-separated class names per window ('' when none is kept)
    """
    keep = preds >= threshold
    preds = preds * keep  # zero out everything under the threshold

    pad = np.zeros((1, preds.shape[-1]))
    next_preds = np.concatenate([preds[1:], pad])   # window i + 1, zeros after the last one
    prev_preds = np.concatenate([pad, preds[:-1]])  # window i - 1, zeros before the first one

    score = preds + 0.5 * next_preds + 0.5 * prev_preds

    n_birds = np.clip((score >= threshold).sum(-1), 0, maxpreds)

    class_labels = []
    for window_score, count in zip(score, n_birds):
        best = np.argsort(- window_score)[:count]
        class_labels.append(" ".join(CLASSES[c] for c in best))

    return class_labels

In [None]:
def post_process_site_3(preds, threshold=0.5, maxpreds=3):
    """
    Turns per-window probabilities into a single label string for the whole clip

    Probabilities below the threshold are zeroed and the rest is summed over
    windows; at most maxpreds classes whose sum reaches the threshold are
    kept, best first.

    Arguments:
        preds {numpy array} -- Probabilities of shape (n_windows, n_classes)

    Keyword Arguments:
        threshold {float} -- Minimum probability / score to keep a class (default: {0.5})
        maxpreds {int} -- Maximum number of classes (default: {3})

    Returns:
        str -- Space-separated class names ('' when none is kept)
    """
    keep = preds >= threshold
    score = np.sum(preds * keep, 0)  # one score per class

    n_birds = np.clip((score >= threshold).sum(-1), 0, maxpreds)
    best = np.argsort(- score)[:n_birds]

    return " ".join(CLASSES[c] for c in best)

In [None]:
def max_pred_gen(site, duration):
    """
    Returns the maximum number of birds to predict

    Sites other than site_3 always get 3. For site_3 the limit grows with the
    clip duration: 2 up to 7, 3 up to 15, 5 up to 30, 7 up to 60, 10 beyond.

    Arguments:
        site {str} -- Site of the recording
        duration {int} -- Duration of the clip

    Returns:
        int -- Maximum number of predicted birds
    """
    if site != "site_3":
        return 3

    limits = ((7, 2), (15, 3), (30, 5), (60, 7))
    return next((cap for longest, cap in limits if longest >= duration), 10)

In [None]:
def reformat_preds(preds, df, site):
    """
    Builds the submission rows of one recording

    Arguments:
        preds {list of str or str} -- Predicted labels, aligned with the rows of df
        df {pandas dataframe} -- Rows of the recording; only row_id is read
        site {str} -- Site of the recording (unused)

    Returns:
        pandas dataframe -- Columns row_id and birds, empty labels replaced by 'nocall'
    """
    columns = {
        "row_id": df['row_id'].values,
        "birds": preds
    }
    prediction_df = pd.DataFrame(columns)

    birds = prediction_df['birds']
    prediction_df['birds'] = birds.replace([''], 'nocall')

    return prediction_df

### Inference

In [None]:
def inference(test_df, test_audio, configs, params, threshold=0.5):
    """
    Predicts every recording of the test set by averaging all models

    Arguments:
        test_df {pandas dataframe} -- Test rows with audio_id, site, seconds and row_id
        test_audio {Path} -- Folder containing the "<audio_id>.mp3" files
        configs {list of dict} -- One dict per model, with keys "name" and "weights"
        params {AudioParams-like} -- Audio parameters

    Keyword Arguments:
        threshold {float} -- Threshold used by the post-processing (default: {0.5})

    Returns:
        pandas dataframe -- Columns row_id and birds for all recordings
    """
    unique_audio_id = test_df.audio_id.unique()

    def load(name, path):
        # One freshly built network per checkpoint
        net = get_model(name)
        load_model_weights(net, path)
        return net

    models = [[load(config['name'], path) for path in config["weights"]] for config in configs]

    print(f'\t -> Using {len(models)} models, with {len(models[0])} weights per model.')

    pred_dfs = []
    for audio_id in unique_audio_id:
        audio_df = test_df[test_df['audio_id'] == audio_id].reset_index(drop=True)
        site = audio_df["site"].values[0]

        print(f'\nMaking predictions for audio {audio_id} in {site} ')

        clip = load_audio(test_audio / (audio_id + ".mp3"), params.sr)
        clip_duration = len(clip) // params.sr

        dataset = TestDataset(audio_df, clip, params)

        # Average the probabilities of every checkpoint of every model
        preds = np.mean(
            [predict(net, dataset, batch_size=16) for group in models for net in group], 0
        )

        maxpreds = max_pred_gen(site, clip_duration)
        print(f'Limiting the number of birds to {maxpreds}')

        post_process = post_process_site_3 if site == 'site_3' else post_process_site_12
        preds_pp = post_process(preds, threshold=threshold, maxpreds=maxpreds)

        print("Predicted classes :", preds_pp)

        pred_dfs.append(reformat_preds(preds_pp, audio_df, site))

    return pd.concat(pred_dfs, axis=0, sort=False).reset_index(drop=True)

### Inference with voting

In [None]:
def vote(preds, min_votes=3):
    """
    Keeps the labels proposed at least min_votes times

    Empty labels are ignored: they stand for "no prediction" and would
    otherwise end up as blank entries in the joined output.

    Arguments:
        preds {list of str} -- Labels proposed by all the voters, with repetitions

    Keyword Arguments:
        min_votes {int} -- Minimum number of occurrences to keep a label (default: {3})

    Returns:
        list of str -- Retained labels, in order of first appearance
    """
    votes = Counter(label for label in preds if label)
    return [label for label, count in votes.items() if count >= min_votes]

In [None]:
def inference_voting(test_df, test_audio, configs, params, threshold=0.5, min_votes=3):
    """
    Predicts every recording of the test set with a vote between models

    The checkpoints of each config are averaged and post-processed
    separately; a label is kept for a window when at least min_votes configs
    predicted it.

    Arguments:
        test_df {pandas dataframe} -- Test rows with audio_id, site, seconds and row_id
        test_audio {Path} -- Folder containing the "<audio_id>.mp3" files
        configs {list of dict} -- One dict per model, with keys "name" and "weights"
        params {AudioParams-like} -- Audio parameters

    Keyword Arguments:
        threshold {float} -- Threshold used by the post-processing (default: {0.5})
        min_votes {int} -- Number of configs that must agree on a label (default: {3})

    Returns:
        pandas dataframe -- Columns row_id and birds for all recordings
    """
    unique_audio_id = test_df.audio_id.unique()

    models = []
    for config in configs:
        models_ = []
        for weights in config["weights"]:
            model = get_model(config['name'])
            load_model_weights(model, weights)
            models_.append(model)
        models.append(models_)

    print(f'\t -> Using {len(models)} models, with {len(models[0])} weights per model.')

    pred_dfs = []
    for audio_id in unique_audio_id:

        audio_df = test_df[test_df['audio_id'] == audio_id].reset_index(drop=True)
        site = audio_df["site"].values[0]

        print(f'\nMaking predictions for audio {audio_id} in {site} ')

        clip = load_audio(test_audio / (audio_id + ".mp3"), params.sr)
        clip_duration = len(clip) // params.sr

        dataset = TestDataset(audio_df, clip, params)

        # Depends only on the recording, so it is computed once rather than per config
        maxpreds = max_pred_gen(site, clip_duration)
        print(f'Limiting the number of birds to {maxpreds}')

        all_preds = []
        for config_models in models:
            # Average the checkpoints of this config
            preds = np.mean(
                [predict(model, dataset, batch_size=16) for model in config_models], 0
            )

            if site == 'site_3':
                # A single string for the whole clip: wrap it to mimic the per-window lists
                preds_pp = [post_process_site_3(preds, threshold=threshold, maxpreds=maxpreds)]
            else:
                preds_pp = post_process_site_12(preds, threshold=threshold, maxpreds=maxpreds)

            all_preds.append(preds_pp)
            print("Predicted classes :", preds_pp)

        final_preds = []
        for per_config in zip(*all_preds):
            # split() without a separator yields no token for an empty
            # prediction, so "no bird" never counts as a votable label
            labels = [label for pred in per_config for label in pred.split()]
            final_preds.append(' '.join(vote(labels, min_votes=min_votes)))

        print("\n    -> Voted classes :", final_preds)

        pred_df = reformat_preds(final_preds, audio_df, site)
        pred_dfs.append(pred_df)

    sub = pd.concat(pred_dfs, axis=0, sort=False).reset_index(drop=True)
    return sub

## Prediction

### Used models

In [None]:
# Ensemble definition: one {"name", "weights"} dict is appended per model below
configs = []

In [None]:
# ResNeXt-50 entry: five checkpoint files, indices 0 to 4
model_name = "resnext50_32x4d"
weights = [f"../input/birds-cp-2/{model_name}_extra_{i}.pt" for i in range(5)]

# Stop right away if a checkpoint file is missing
for w in weights:
    assert os.path.isfile(w), f"Weights {w} not found"
    
configs.append({
    "name": model_name,
    "weights": weights,
})

In [None]:
# ResNeXt-101 entry: five checkpoint files, indices 0 to 4.
# get_model matches this name on "resnext101" and builds torchvision's resnext101_32x8d
model_name = "resnext101_32x8d_wsl"
weights = [f"../input/birds-cp-2/{model_name}_extra_{i}.pt" for i in range(5)]

# Stop right away if a checkpoint file is missing
for w in weights:
    assert os.path.isfile(w), f"Weights {w} not found"
    
configs.append({
    "name": model_name,
    "weights": weights,
})

In [None]:
# ResNeSt-50 entry (built from MODEL_CONFIGS): five checkpoint files, indices 0 to 4
model_name = "resnest50_fast_1s1x64d"
weights = [f"../input/birds-cp-1/{model_name}_conf_{i}.pt" for i in range(5)]

# Stop right away if a checkpoint file is missing
for w in weights:
    assert os.path.isfile(w), f"Weights {w} not found"
    
configs.append({
    "name": model_name,
    "weights": weights,
})

In [None]:
# model_name = "resnest50_fast_1s1x64d"
# weights = [f"../input/birds-cp-1/{model_name}_mixup5_{i}.pt" for i in range(5)]

# for w in weights:
#     assert os.path.isfile(w), f"Weights {w} not found"
    
# configs.append({
#     "name": model_name,
#     "weights": weights,
# })

In [None]:
# Display the assembled ensemble configuration as the cell output
configs

### Running inference

In [None]:
threshold = 0.5  # probability / score cut-off passed to the post-processing
min_votes = 2  # number of configs that must agree on a label in inference_voting

In [None]:
# Silence all warnings for the inference run
warnings.filterwarnings("ignore")

# Voting ensemble is used; the plain averaging variant is kept below for reference
# submission = inference(test, TEST_AUDIO_DIR, configs, AudioParams, threshold=threshold)
submission = inference_voting(test, TEST_AUDIO_DIR, configs, AudioParams, threshold=threshold, min_votes=min_votes)

### Submission

In [None]:
# Write the predictions to submission.csv, without the dataframe index
submission.to_csv("submission.csv", index=False)
# submission