In [None]:
!pip install onnxruntime --no-index --find-links=file:///kaggle/input/save-out-pip-libraries-without-internet/onnxrunt/

In [None]:
import os
import gc
import math
import random
from pathlib import Path

import cv2
import librosa

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.utils.data as data

In [None]:
import onnxruntime

In [None]:
# Directory layout, resolved relative to the notebook's working directory:
# <cwd>/../input/birdsong-recognition/test_audio
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT.joinpath("input")
RAW_DATA = INPUT_ROOT.joinpath("birdsong-recognition")
TEST_AUDIO_DIR = RAW_DATA.joinpath("test_audio")

In [None]:
# Pick the test set: the competition's own test_audio folder when it is present,
# otherwise the "birdcall-check" stand-in dataset.
# The decision is made from the fixed RAW_DATA path rather than from
# TEST_AUDIO_DIR itself, and both branches assign TEST_AUDIO_DIR, so re-running
# this cell always yields a matching (TEST_AUDIO_DIR, test) pair.
if (RAW_DATA / "test_audio").exists():
    TEST_AUDIO_DIR = RAW_DATA / "test_audio"
    test = pd.read_csv(RAW_DATA / "test.csv")
else:
    TEST_AUDIO_DIR = INPUT_ROOT / "birdcall-check" / "test_audio"
    test = pd.read_csv(INPUT_ROOT / "birdcall-check" / "test.csv")

In [None]:
# Submission template; read through RAW_DATA so every competition path is
# derived from the same root as the other inputs.
sub = pd.read_csv(RAW_DATA / "sample_submission.csv")
# Optional fallback (disabled): write a constant-prediction submission up front
# so a file exists even if inference fails later.
#sub['ebird_code'] = 'evegro'
#sub.to_csv("submission.csv", index=False)  # this will be overwritten if everything goes well

In [None]:
# Sample rate (Hz) used for the audio throughout the notebook.
TARGET_SR = 32000

# Description of the baseline ResNeSt checkpoint: architecture name, whether to
# start from pretrained weights, number of output classes, and weight file.
model_config = dict(
    base_model_name="resnest50_fast_1s1x64d",
    pretrained=False,
    num_classes=264,
    trained_weights="../input/training-birdsong-baseline-resnest50-fast/best_model.pth",
)

# Mel-spectrogram arguments for the baseline model (band count and frequency range).
melspectrogram_parameters = dict(n_mels=128, fmin=20, fmax=16000)

In [None]:
# eBird species code -> class index. The index of a code is simply its position
# in the whitespace-separated list below (264 codes, indices 0..263), so the
# order of this list must not change.
BIRD_CODE = {
    code: index
    for index, code in enumerate("""
        aldfly ameavo amebit amecro amegfi
        amekes amepip amered amerob amewig
        amewoo amtspa annhum astfly baisan
        baleag balori banswa barswa bawwar
        belkin1 belspa2 bewwre bkbcuc bkbmag1
        bkbwar bkcchi bkchum bkhgro bkpwar
        bktspa blkpho blugrb1 blujay bnhcow
        boboli bongul brdowl brebla brespa
        brncre brnthr brthum brwhaw btbwar
        btnwar btywar buffle buggna buhvir
        bulori bushti buwtea buwwar cacwre
        calgul calqua camwar cangoo canwar
        canwre carwre casfin caster1 casvir
        cedwax chispa chiswi chswar chukar
        clanut cliswa comgol comgra comloo
        commer comnig comrav comred comter
        comyel coohaw coshum cowscj1 daejun
        doccor dowwoo dusfly eargre easblu
        easkin easmea easpho eastow eawpew
        eucdov eursta evegro fiespa fiscro
        foxspa gadwal gcrfin gnttow gnwtea
        gockin gocspa goleag grbher3 grcfly
        greegr greroa greyel grhowl grnher
        grtgra grycat gryfly haiwoo hamfly
        hergul herthr hoomer hoowar horgre
        horlar houfin houspa houwre indbun
        juntit1 killde labwoo larspa lazbun
        leabit leafly leasan lecthr lesgol
        lesnig lesyel lewwoo linspa lobcur
        lobdow logshr lotduc louwat macwar
        magwar mallar3 marwre merlin moublu
        mouchi moudov norcar norfli norhar2
        normoc norpar norpin norsho norwat
        nrwswa nutwoo olsfly orcwar osprey
        ovenbi1 palwar pasfly pecsan perfal
        phaino pibgre pilwoo pingro pinjay
        pinsis pinwar plsvir prawar purfin
        pygnut rebmer rebnut rebsap rebwoo
        redcro redhea reevir1 renpha reshaw
        rethaw rewbla ribgul rinduc robgro
        rocpig rocwre rthhum ruckin rudduc
        rufgro rufhum rusbla sagspa1 sagthr
        savspa saypho scatan scoori semplo
        semsan sheowl shshaw snobun snogoo
        solsan sonspa sora sposan spotow
        stejay swahaw swaspa swathr treswa
        truswa tuftit tunswa veery vesspa
        vigswa warvir wesblu wesgre weskin
        wesmea wessan westan wewpew whbnut
        whcspa whfibi whtspa whtswi wilfly
        wilsni1 wiltur winwre3 wlswar wooduc
        wooscj2 woothr y00475 yebfly yebsap
        yehbla yelwar yerwar yetvir
    """.split())
}

# Class index -> eBird species code (inverse of BIRD_CODE).
INV_BIRD_CODE = dict(zip(BIRD_CODE.values(), BIRD_CODE.keys()))

In [None]:
# Spectrogram settings kept as a plain nested dict (preferred here over YAML).
# TestDataset reads this hop_length when it applies PCEN to its
# "Efficientnet B3" image.
settings = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 300,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 300,      # number of mel bands; more bands = finer frequency detail
                'fmin': 650,        # lowest frequency used (0 is the usual choice)
                'fmax': 16000,      # highest frequency used (usually sr / 2)
                'n_fft': 2400,      # n_mels * 8
                'hop_length': 533,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings for the EfficientNet-B1 input, as a plain nested dict.
# TestDataset reads this hop_length when it applies PCEN to its
# "Efficientnet B1" image.
settings_effnetb1 = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 240,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 240,      # number of mel bands; more bands = finer frequency detail
                'fmin': 1000,       # lowest frequency used (0 is the usual choice)
                'fmax': 12500,      # highest frequency used (usually sr / 2)
                'n_fft': 1920,      # n_mels * 8
                'hop_length': 666,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings named "thefirst", as a plain nested dict.
# NOTE(review): presumably paired with TestDataset's melspec_thefirst /
# img_size_thefirst arguments (one of the EfficientNet-B2 inputs) — confirm at the call site.
settings_thefirst = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 260,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 260,      # number of mel bands; more bands = finer frequency detail
                'fmin': 20,         # lowest frequency used (0 is the usual choice)
                'fmax': 15000,      # highest frequency used (usually sr / 2)
                'n_fft': 1500,      # departs from the usual n_mels * 8 rule of thumb
                'hop_length': 800,  # departs from the usual 32000 * 5 / n_mels rule of thumb
            },
        },
    },
}

In [None]:
# Spectrogram settings named "thesecond", as a plain nested dict.
# NOTE(review): presumably paired with TestDataset's melspec_thesecond /
# img_size_thesecond arguments (one of the EfficientNet-B2 inputs) — confirm at the call site.
settings_thesecond = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 260,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 260,      # number of mel bands; more bands = finer frequency detail
                'fmin': 260,        # lowest frequency used (0 is the usual choice)
                'fmax': 16000,      # highest frequency used (usually sr / 2)
                'n_fft': 2080,      # n_mels * 8
                'hop_length': 615,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings named "inception", as a plain nested dict.
# NOTE(review): presumably paired with TestDataset's melspec_inception /
# img_size_inception arguments (the Inception V4 input) — confirm at the call site.
settings_inception = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 299,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 299,      # number of mel bands; more bands = finer frequency detail
                'fmin': 299,        # lowest frequency used (0 is the usual choice)
                'fmax': 16000,      # highest frequency used (usually sr / 2)
                'n_fft': 2392,      # n_mels * 8
                'hop_length': 535,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings named "se", as a plain nested dict.
# NOTE(review): presumably paired with TestDataset's melspec_se / img_size_se
# arguments (the SE-ResNeXt50 input) — confirm at the call site.
settings_se = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over.
            'img_size': 224,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 224,      # number of mel bands; more bands = finer frequency detail
                'fmin': 224,        # lowest frequency used (0 is the usual choice)
                'fmax': 16000,      # highest frequency used (usually sr / 2)
                'n_fft': 1792,      # n_mels * 8
                'hop_length': 714,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings for the mixup-trained EfficientNet-B2 input, as a plain
# nested dict. TestDataset reads this hop_length when it applies PCEN to its
# "EfficientnetB2 Mixup" image.
settings_mixup = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over (larger than n_mels here).
            'img_size': 260,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 208,      # number of mel bands; more bands = finer frequency detail
                'fmin': 2000,       # lowest frequency used (0 is the usual choice)
                'fmax': 15000,      # highest frequency used (usually sr / 2)
                'n_fft': 1664,      # n_mels * 8
                'hop_length': 769,  # roughly 32000 * 5 / n_mels
            },
        },
    },
}

In [None]:
# Spectrogram settings for the label-smoothing EfficientNet-B2 input, as a plain
# nested dict. TestDataset reads this hop_length when it applies PCEN to its
# "EfficientnetB2 Labelsmooth" image.
settings_ls = {
    'dataset': {
        'params': {
            # Image size of the spectrogram the CNN runs over (larger than n_mels here).
            'img_size': 260,
            # Mel filterbank arguments, see
            # https://librosa.org/doc/latest/generated/librosa.filters.mel.html#librosa.filters.mel
            'melspectrogram_parameters': {
                'n_mels': 168,       # number of mel bands; more bands = finer frequency detail
                'fmin': 800,         # lowest frequency used (0 is the usual choice)
                'fmax': 14000,       # highest frequency used (usually sr / 2)
                'n_fft': 1344,       # n_mels * 8
                'hop_length': 1024,  # departs from the usual 32000 * 5 / n_mels rule of thumb
            },
        },
    },
}

In [None]:
def mono_to_color(X: np.ndarray,
                  eps=1e-6):
    """Convert a single-channel array into a 3-channel uint8 image.

    Adapted from https://www.kaggle.com/daisukelab/creating-fat2019-preprocessed-data

    The input is copied into three identical channels along a new last axis,
    standardized with the global mean and standard deviation, then min-max
    scaled to the range [0, 255].

    Args:
        X: Array to convert (e.g. a 2-D spectrogram).
        eps: Added to the standard deviation to avoid division by zero, and
            used as the minimum value range required for scaling.

    Returns:
        ``uint8`` array of shape ``X.shape + (3,)``. It is all zeros when the
        standardized values do not span more than ``eps``.
    """
    # Replicate the single channel three times: (...,) -> (..., 3).
    stacked = np.stack([X, X, X], axis=-1)

    # Zero-mean, (approximately) unit-variance.
    centered = stacked - stacked.mean()
    scaled = centered / (centered.std() + eps)

    lowest, highest = scaled.min(), scaled.max()
    span = highest - lowest
    if not (span > eps):
        # Flat (or non-comparable) input: nothing to stretch, return black.
        return np.zeros_like(scaled, dtype=np.uint8)

    # Stretch to [0, 255]; the uint8 cast truncates towards zero.
    return (255 * (scaled - lowest) / span).astype(np.uint8)


class TestDataset(data.Dataset):
    """Serve per-model spectrogram images for the rows of one test recording.

    Every row of ``df`` must provide ``site`` and ``row_id``. Rows whose site
    is not ``"site_3"`` must also provide ``seconds``, the end time of the
    5-second window to score. ``clip`` holds the audio samples of the
    recording and is treated as 32 kHz audio.

    Nine images are produced per 5-second window, one per model input, always
    in this order:

    1. EfficientNet-B3 (mel spectrogram + PCEN)
    2. ResNeSt (mel spectrogram in dB)
    3. EfficientNet-B1 (mel spectrogram + PCEN)
    4. EfficientNet-B2 "thefirst" (dB)
    5. EfficientNet-B2 "thesecond" (dB)
    6. Inception V4 (dB)
    7. SE-ResNeXt50 (dB)
    8. EfficientNet-B2 mixup (PCEN)
    9. EfficientNet-B2 label smoothing (PCEN)

    ``__getitem__`` returns the 11-tuple
    ``(img1, row_id, site, img2, img3, img4, img5, img6, img7, img8, img9)``.
    Each image is a float32 array in [0, 1] laid out channels-first,
    ``(3, img_size, width)``. For ``"site_3"`` rows the whole clip is cut into
    consecutive 5-second windows (a trailing shorter remainder is dropped) and
    each entry is the stack of images over those windows.

    The PCEN pipelines take their ``hop_length`` from the module-level
    ``settings``, ``settings_effnetb1``, ``settings_mixup`` and ``settings_ls``
    dictionaries, which must be defined before items are requested.
    """

    SAMPLE_RATE = 32000   # sample rate (Hz) assumed for ``clip``
    WINDOW_SECONDS = 5    # length of the audio window behind every image

    # The dict defaults below are shared between instances; they are only read,
    # never modified, by this class.
    def __init__(self, df: pd.DataFrame, clip: np.ndarray,
                 img_size=300, melspectrogram_parameters={}, imgsize_resnest=224, resnest_melspec_params={},
                 img_size_b1=240, melspec_b1= {}, img_size_thefirst=260, melspec_thefirst={}, 
                 img_size_thesecond=260, melspec_thesecond={},
                 img_size_inception=299, melspec_inception={},
                 img_size_se=224, melspec_se={},
                 img_size_mixup=260, melspec_mixup={},
                 img_size_ls=260, melspec_ls={}
                ):
        """Store the row table, the audio and one (image size, mel kwargs) pair per model.

        Each ``img_size*`` / ``imgsize*`` argument is the image height used for
        one model, and the matching ``melspec*`` / ``*_params`` dict is
        forwarded as keyword arguments to ``librosa.feature.melspectrogram``.
        """
        self.df = df
        self.clip = clip
        self.img_size = img_size
        self.melspectrogram_parameters = melspectrogram_parameters
        self.imgsize_resnest = imgsize_resnest
        self.resnest_melspec_params = resnest_melspec_params
        self.imgsize_b1 = img_size_b1
        self.b1_params = melspec_b1
        self.imgsize_thefirst = img_size_thefirst
        self.melspec_thefirst = melspec_thefirst
        self.imgsize_thesecond = img_size_thesecond
        self.melspec_thesecond = melspec_thesecond
        self.imgsize_inception = img_size_inception
        self.melspec_inception = melspec_inception
        self.imgsize_se = img_size_se
        self.melspec_se = melspec_se
        self.imgsize_mixup = img_size_mixup
        self.melspec_mixup = melspec_mixup
        self.imgsize_ls = img_size_ls
        self.melspec_ls = melspec_ls

    def __len__(self):
        """Number of rows in ``df``."""
        return len(self.df)

    @staticmethod
    def _pcen_hop(cfg):
        """Read the hop length stored in one of the module-level settings dicts."""
        return cfg['dataset']['params']['melspectrogram_parameters']['hop_length']

    def _pipelines(self):
        """Return one ``(mel_kwargs, img_size, pcen_hop_length)`` triple per model.

        The order is the order of the images in the ``__getitem__`` result.
        ``pcen_hop_length`` is ``None`` for the pipelines that use
        ``power_to_db`` instead of PCEN. The settings dicts are looked up on
        every call, so they only have to exist once items are requested.
        """
        return (
            (self.melspectrogram_parameters, self.img_size, self._pcen_hop(settings)),       # EfficientNet-B3
            (self.resnest_melspec_params, self.imgsize_resnest, None),                       # ResNeSt
            (self.b1_params, self.imgsize_b1, self._pcen_hop(settings_effnetb1)),            # EfficientNet-B1
            (self.melspec_thefirst, self.imgsize_thefirst, None),                            # EfficientNet-B2 (first)
            (self.melspec_thesecond, self.imgsize_thesecond, None),                          # EfficientNet-B2 (second)
            (self.melspec_inception, self.imgsize_inception, None),                          # Inception V4
            (self.melspec_se, self.imgsize_se, None),                                        # SE-ResNeXt50
            (self.melspec_mixup, self.imgsize_mixup, self._pcen_hop(settings_mixup)),        # EfficientNet-B2 mixup
            (self.melspec_ls, self.imgsize_ls, self._pcen_hop(settings_ls)),                 # EfficientNet-B2 label smoothing
        )

    def _render(self, y, mel_kwargs, img_size, pcen_hop_length=None):
        """Turn one audio window into a channels-first float32 image in [0, 1].

        Args:
            y: 1-D float32 audio samples.
            mel_kwargs: Keyword arguments for ``librosa.feature.melspectrogram``.
            img_size: Height of the output image; the width is scaled by the
                same factor so the aspect ratio is kept.
            pcen_hop_length: When given, the mel spectrogram is compressed with
                PCEN using this hop length; otherwise it is converted to dB.
        """
        # The audio is passed by keyword: recent librosa releases reject a
        # positional signal, while older ones accept the keyword as well.
        melspec = librosa.feature.melspectrogram(y=y, sr=self.SAMPLE_RATE, **mel_kwargs)
        if pcen_hop_length is None:
            melspec = librosa.power_to_db(melspec).astype(np.float32)
        else:
            melspec = librosa.pcen(melspec, sr=self.SAMPLE_RATE, hop_length=pcen_hop_length,
                                   gain=0.80, bias=10, power=0.5, time_constant=0.06, eps=1e-6)

        image = mono_to_color(melspec)
        height, width, _ = image.shape
        # cv2.resize takes the target size as (width, height).
        image = cv2.resize(image, (int(width * img_size / height), img_size))
        image = np.moveaxis(image, 2, 0)  # HWC -> CHW
        return (image / 255.0).astype(np.float32)

    def __getitem__(self, idx: int):
        """Build the images for row ``idx`` (looked up by label with ``df.loc``)."""
        sample = self.df.loc[idx, :]
        site = sample.site
        row_id = sample.row_id
        pipelines = self._pipelines()

        if site == "site_3":
            # No per-window rows for this site: score every full window of the clip.
            window = self.SAMPLE_RATE * self.WINDOW_SECONDS
            y = self.clip.astype(np.float32)
            per_model = [[] for _ in pipelines]
            for start in range(0, len(y), window):
                y_batch = y[start:start + window]
                if len(y_batch) != window:
                    # Trailing remainder shorter than one window is dropped.
                    break
                for bucket, spec in zip(per_model, pipelines):
                    bucket.append(self._render(y_batch, *spec))
            stacked = [np.asarray(bucket) for bucket in per_model]
            return (stacked[0], row_id, site, *stacked[1:])

        # Other sites: the row names the end of the window it refers to.
        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - self.WINDOW_SECONDS)
        start_index = self.SAMPLE_RATE * start_seconds
        end_index = self.SAMPLE_RATE * end_seconds
        y = self.clip[start_index:end_index].astype(np.float32)

        rendered = [self._render(y, *spec) for spec in pipelines]
        return (rendered[0], row_id, site, *rendered[1:])

### model

* This code is forked from the authors' original ResNeSt implementation. [GitHub](https://github.com/zhanghang1989/ResNeSt)

In [None]:
class SplAtConv2d(Module):
    """Split-Attention Conv2d.

    A grouped convolution produces ``channels * radix`` feature maps. The
    ``radix`` splits are summed, globally average-pooled and passed through
    two 1x1 convolutions (``fc1`` -> ``fc2``) to obtain per-split attention
    weights (normalised by ``rSoftMax``), which are then used to blend the
    splits back into ``channels`` output maps.

    Parameters
    ----------
    in_channels : int
        Number of channels of the input tensor.
    channels : int
        Number of output channels (the main convolution emits
        ``channels * radix`` maps internally).
    kernel_size, stride, padding, dilation, bias
        Forwarded to the main convolution.
    groups : int
        Cardinality. The main convolution uses ``groups * radix`` groups and
        the two attention convolutions use ``groups`` groups.
    radix : int
        Number of splits. With ``radix == 1`` no split is performed and the
        attention is applied directly to the convolution output.
    reduction_factor : int
        The attention bottleneck has
        ``max(in_channels * radix // reduction_factor, 32)`` channels.
    rectify, rectify_avg : bool
        When ``rectify`` is set and the padding is non-zero, ``RFConv2d`` from
        the external ``rfconv`` package (imported lazily) replaces ``Conv2d``.
    norm_layer : callable or None
        Factory called with a channel count; when ``None`` no normalisation
        layers are created.
    dropblock_prob : float
        When greater than 0 a ``DropBlock2D(dropblock_prob, 3)`` is created
        and applied after the first normalisation.
    **kwargs
        Extra keyword arguments forwarded to the main convolution.
    """
    def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0),
                 dilation=(1, 1), groups=1, bias=True,
                 radix=2, reduction_factor=4,
                 rectify=False, rectify_avg=False, norm_layer=None,
                 dropblock_prob=0.0, **kwargs):
        super(SplAtConv2d, self).__init__()
        padding = _pair(padding)
        # Rectified convolution is only used when there is padding to rectify.
        self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
        self.rectify_avg = rectify_avg
        inter_channels = max(in_channels*radix//reduction_factor, 32)
        self.radix = radix
        self.cardinality = groups
        self.channels = channels
        self.dropblock_prob = dropblock_prob
        if self.rectify:
            from rfconv import RFConv2d
            self.conv = RFConv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
                                 groups=groups*radix, bias=bias, average_mode=rectify_avg, **kwargs)
        else:
            self.conv = Conv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
                               groups=groups*radix, bias=bias, **kwargs)
        self.use_bn = norm_layer is not None
        if self.use_bn:
            self.bn0 = norm_layer(channels*radix)
        self.relu = ReLU(inplace=True)
        self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality)
        if self.use_bn:
            self.bn1 = norm_layer(inter_channels)
        self.fc2 = Conv2d(inter_channels, channels*radix, 1, groups=self.cardinality)
        if dropblock_prob > 0.0:
            self.dropblock = DropBlock2D(dropblock_prob, 3)
        self.rsoftmax = rSoftMax(radix, groups)

    @staticmethod
    def _torch_older_than_1_5():
        """Return True when the installed torch is older than release 1.5.

        The version is compared numerically on (major, minor). Comparing the
        raw version strings is wrong because it is lexicographic: for example
        ``'1.10' < '1.5'`` evaluates to True.
        """
        try:
            major, minor = (int(part) for part in torch.__version__.split('.')[:2])
        except ValueError:
            # Unparseable version string: assume a modern build.
            return False
        return (major, minor) < (1, 5)

    def _split_size(self, rchannel):
        """Channel count of one radix split, cast to ``int`` on legacy torch."""
        size = rchannel // self.radix
        return int(size) if self._torch_older_than_1_5() else size

    def forward(self, x):
        """Apply the split-attention convolution to ``x`` and return the blended maps."""
        x = self.conv(x)
        if self.use_bn:
            x = self.bn0(x)
        if self.dropblock_prob > 0.0:
            x = self.dropblock(x)
        x = self.relu(x)

        batch, rchannel = x.shape[:2]
        if self.radix > 1:
            # Cut the channel axis into `radix` equal splits and fuse them by summation.
            splited = torch.split(x, self._split_size(rchannel), dim=1)
            gap = sum(splited)
        else:
            gap = x
        # Global context: one value per channel.
        gap = F.adaptive_avg_pool2d(gap, 1)
        gap = self.fc1(gap)

        if self.use_bn:
            gap = self.bn1(gap)
        gap = self.relu(gap)

        atten = self.fc2(gap)
        atten = self.rsoftmax(atten).view(batch, -1, 1, 1)

        if self.radix > 1:
            # One attention slice per split; the output is the attention-weighted sum.
            attens = torch.split(atten, self._split_size(rchannel), dim=1)
            out = sum([att*split for (att, split) in zip(attens, splited)])
        else:
            out = atten * x
        return out.contiguous()

class rSoftMax(nn.Module):
    """Radix-wise softmax used to normalise split-attention logits.

    For ``radix > 1`` the input is viewed as
    ``(batch, cardinality, radix, -1)``, the cardinality and radix axes are
    swapped, a softmax is taken across the radix axis and the result is
    flattened to ``(batch, -1)``. For ``radix <= 1`` an element-wise sigmoid
    is applied and the input shape is kept.
    """

    def __init__(self, radix, cardinality):
        super().__init__()
        self.radix = radix
        self.cardinality = cardinality

    def forward(self, x):
        # Single split: plain gating, no normalisation across splits needed.
        if not self.radix > 1:
            return torch.sigmoid(x)

        n_samples = x.size(0)
        grouped = x.view(n_samples, self.cardinality, self.radix, -1)
        # After the transpose the radix axis is dim 1, so softmax competes across splits.
        weights = F.softmax(grouped.transpose(1, 2), dim=1)
        return weights.reshape(n_samples, -1)

In [None]:
class DropBlock2D:
    """Placeholder for DropBlock regularisation.

    Instantiating it always raises ``NotImplementedError``, so any model
    configured with a positive drop-block probability fails at construction.
    """

    def __init__(self, *_args, **_kwargs):
        raise NotImplementedError

class GlobalAvgPool2d(nn.Module):
    """Global average pooling over the input's spatial dimensions.

    The output is flattened to two dimensions: ``(batch, -1)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inputs):
        n_samples = inputs.size(0)
        # Pool every feature map down to a single value, then drop the 1x1 spatial axes.
        pooled = nn.functional.adaptive_avg_pool2d(inputs, 1)
        return pooled.view(n_samples, -1)

class Bottleneck(nn.Module):
    """ResNet / ResNeSt bottleneck block: 1x1 conv -> 3x3 conv -> 1x1 conv plus a residual add.

    Parameters
    ----------
    inplanes : int
        Channels of the input tensor.
    planes : int
        Base width; the block outputs ``planes * expansion`` channels.
    stride : int
        Stride of the 3x3 stage. When ``avd`` is active the stride is moved
        to an average-pooling layer and the convolution runs with stride 1.
    downsample : nn.Module or None
        Applied to the input to build the residual when given.
    radix : int
        ``>= 1`` uses ``SplAtConv2d`` for the 3x3 stage; otherwise a plain
        (or rectified) convolution followed by ``bn2`` and ReLU is used.
    cardinality, bottleneck_width : int
        The inner width is ``int(planes * bottleneck_width / 64) * cardinality``.
    avd, avd_first : bool
        Enable the average-pool down-sampling layer (only when ``stride > 1``
        or ``is_first``) and choose whether it runs before or after the 3x3 stage.
    dilation : int
        Dilation (and padding) of the 3x3 stage.
    is_first : bool
        See ``avd``.
    rectified_conv, rectify_avg : bool
        Use ``RFConv2d`` from the external ``rfconv`` package (imported lazily).
    norm_layer : callable
        Factory called with a channel count to build each normalisation layer.
    dropblock_prob : float
        When greater than 0, ``DropBlock2D`` layers are created and applied
        after each normalisation.
    last_gamma : bool
        Zero-initialise the weight of the last normalisation layer.
    """
    # pylint: disable=unused-argument
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 radix=1, cardinality=1, bottleneck_width=64,
                 avd=False, avd_first=False, dilation=1, is_first=False,
                 rectified_conv=False, rectify_avg=False,
                 norm_layer=None, dropblock_prob=0.0, last_gamma=False):
        super(Bottleneck, self).__init__()
        group_width = int(planes * (bottleneck_width / 64.)) * cardinality
        # Use the class-level expansion (instead of a literal 4) so the block's
        # output width always agrees with what ResNet derives from `block.expansion`.
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False)
        self.bn1 = norm_layer(group_width)
        self.dropblock_prob = dropblock_prob
        self.radix = radix
        self.avd = avd and (stride > 1 or is_first)
        self.avd_first = avd_first

        if self.avd:
            # The pooling layer takes over the striding from the 3x3 convolution.
            self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
            stride = 1

        if dropblock_prob > 0.0:
            self.dropblock1 = DropBlock2D(dropblock_prob, 3)
            # forward() applies dropblock2 on the radix == 0 path, so it must be
            # registered there; radix == 1 is kept for backward compatibility.
            if radix in (0, 1):
                self.dropblock2 = DropBlock2D(dropblock_prob, 3)
            self.dropblock3 = DropBlock2D(dropblock_prob, 3)

        if radix >= 1:
            self.conv2 = SplAtConv2d(
                group_width, group_width, kernel_size=3,
                stride=stride, padding=dilation,
                dilation=dilation, groups=cardinality, bias=False,
                radix=radix, rectify=rectified_conv,
                rectify_avg=rectify_avg,
                norm_layer=norm_layer,
                dropblock_prob=dropblock_prob)
        elif rectified_conv:
            from rfconv import RFConv2d
            self.conv2 = RFConv2d(
                group_width, group_width, kernel_size=3, stride=stride,
                padding=dilation, dilation=dilation,
                groups=cardinality, bias=False,
                average_mode=rectify_avg)
            self.bn2 = norm_layer(group_width)
        else:
            self.conv2 = nn.Conv2d(
                group_width, group_width, kernel_size=3, stride=stride,
                padding=dilation, dilation=dilation,
                groups=cardinality, bias=False)
            self.bn2 = norm_layer(group_width)

        self.conv3 = nn.Conv2d(
            group_width, out_planes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(out_planes)

        if last_gamma:
            from torch.nn.init import zeros_
            zeros_(self.bn3.weight)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        """Run the three convolution stages and add the (optionally down-sampled) input."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        if self.dropblock_prob > 0.0:
            out = self.dropblock1(out)
        out = self.relu(out)

        if self.avd and self.avd_first:
            out = self.avd_layer(out)

        out = self.conv2(out)
        if self.radix == 0:
            # Plain-convolution path: SplAtConv2d is not used here, so the block
            # applies normalisation and activation itself.
            out = self.bn2(out)
            if self.dropblock_prob > 0.0:
                out = self.dropblock2(out)
            out = self.relu(out)

        if self.avd and not self.avd_first:
            out = self.avd_layer(out)

        out = self.conv3(out)
        out = self.bn3(out)
        if self.dropblock_prob > 0.0:
            out = self.dropblock3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNet(nn.Module):
    """ResNet-style backbone (stem, four residual stages, global pooling, linear head).

    Parameters
    ----------
    block : class
        Residual block class. It must expose an ``expansion`` attribute and
        accept the keyword arguments passed in ``_make_layer`` (for example
        ``Bottleneck``).
    layers : sequence of int
        Number of blocks in each of the four stages (``layers[0]`` .. ``layers[3]``).
    radix, groups, bottleneck_width : int
        Forwarded to every block (``groups`` is passed as ``cardinality``).
    num_classes : int, default 1000
        Output size of the final linear layer.
    dilated : bool, default False
        When True (or when ``dilation == 4``) stages 3 and 4 keep stride 1 and
        use dilation 2 and 4 instead of striding.
    dilation : int, default 1
        ``4`` behaves like ``dilated=True``; ``2`` keeps the stride in stage 3
        and uses stride 1 with dilation 2 in stage 4; any other value uses
        stride 2 in both stages.
    deep_stem : bool, default False
        Use three 3x3 convolutions instead of a single 7x7 convolution as stem.
    stem_width : int, default 64
        Width of the deep stem (its output has ``stem_width * 2`` channels).
    avg_down : bool, default False
        Down-sample the residual branch with average pooling followed by a
        stride-1 1x1 convolution instead of a strided 1x1 convolution.
    rectified_conv, rectify_avg : bool
        Use ``RFConv2d`` from the external ``rfconv`` package (imported lazily).
    avd, avd_first : bool
        Forwarded to every block.
    final_drop : float, default 0.0
        Dropout probability applied before the linear head; no dropout layer
        is created when it is not positive.
    dropblock_prob : float, default 0
        Forwarded to the blocks of stages 3 and 4 only.
    last_gamma : bool, default False
        Forwarded to every block.
    norm_layer : callable, default ``nn.BatchNorm2d``
        Normalisation layer factory; it is also used in an ``isinstance``
        check during weight initialisation.

    Reference:
        - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
    # pylint: disable=unused-variable
    def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64,
                 num_classes=1000, dilated=False, dilation=1,
                 deep_stem=False, stem_width=64, avg_down=False,
                 rectified_conv=False, rectify_avg=False,
                 avd=False, avd_first=False,
                 final_drop=0.0, dropblock_prob=0,
                 last_gamma=False, norm_layer=nn.BatchNorm2d):
        # Hyper-parameters read later by _make_layer when it builds the blocks.
        self.cardinality = groups
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        # `inplanes` tracks the channel count entering the next stage; it starts
        # at the stem's output width and is updated by every _make_layer call.
        self.inplanes = stem_width*2 if deep_stem else 64
        self.avg_down = avg_down
        self.last_gamma = last_gamma
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        super(ResNet, self).__init__()
        self.rectified_conv = rectified_conv
        self.rectify_avg = rectify_avg
        if rectified_conv:
            from rfconv import RFConv2d
            conv_layer = RFConv2d
        else:
            conv_layer = nn.Conv2d
        conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
        if deep_stem:
            # Deep stem: three 3x3 convolutions, only the first one is strided.
            self.conv1 = nn.Sequential(
                conv_layer(3, stem_width, kernel_size=3, stride=2, padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width, kernel_size=3, stride=1, padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width*2, kernel_size=3, stride=1, padding=1, bias=False, **conv_kwargs),
            )
        else:
            # Classic stem: a single strided 7x7 convolution.
            self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3,
                                   bias=False, **conv_kwargs)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stages 1 and 2 never receive dropblock_prob.
        self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer, is_first=False)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
        if dilated or dilation == 4:
            # Keep the spatial resolution in stages 3 and 4; enlarge the
            # receptive field with dilation instead of striding.
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=4, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        elif dilation==2:
            # Stage 3 strides as usual; only stage 4 is dilated.
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           dilation=1, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        else:
            # No dilation: both stages halve the resolution.
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        self.avgpool = GlobalAvgPool2d()
        self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialisation: convolution weights are drawn from
        # N(0, sqrt(2 / n)) with n = kernel_h * kernel_w * out_channels;
        # normalisation layers start with weight 1 and bias 0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, norm_layer):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None,
                    dropblock_prob=0.0, is_first=True):
        """Build one stage made of ``blocks`` residual blocks.

        The first block receives ``stride`` and, when the shape changes, a
        ``downsample`` branch; the remaining blocks keep the shape. As a side
        effect ``self.inplanes`` is set to ``planes * block.expansion``.

        Raises
        ------
        RuntimeError
            If ``dilation`` is not 1, 2 or 4.
        """
        downsample = None
        # A projection is needed on the residual branch whenever the first
        # block changes the resolution or the channel count.
        if stride != 1 or self.inplanes != planes * block.expansion:
            down_layers = []
            if self.avg_down:
                if dilation == 1:
                    down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride,
                                                    ceil_mode=True, count_include_pad=False))
                else:
                    down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1,
                                                    ceil_mode=True, count_include_pad=False))
                down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
                                             kernel_size=1, stride=1, bias=False))
            else:
                down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
                                             kernel_size=1, stride=stride, bias=False))
            down_layers.append(norm_layer(planes * block.expansion))
            downsample = nn.Sequential(*down_layers)

        layers = []
        # The first block of a stage uses a smaller dilation than the rest of
        # the stage: 1 for stage dilation 1 or 2, and 2 for stage dilation 4.
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, downsample=downsample,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=1, is_first=is_first, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, downsample=downsample,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=2, is_first=is_first, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))

        # Every following block (and the next stage) consumes the expanded width.
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes,
                                radix=self.radix, cardinality=self.cardinality,
                                bottleneck_width=self.bottleneck_width,
                                avd=self.avd, avd_first=self.avd_first,
                                dilation=dilation, rectified_conv=self.rectified_conv,
                                rectify_avg=self.rectify_avg,
                                norm_layer=norm_layer, dropblock_prob=dropblock_prob,
                                last_gamma=self.last_gamma))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run stem, the four stages, global pooling, optional dropout and the linear head."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # GlobalAvgPool2d already returns a 2-D tensor, so the flatten below
        # leaves the shape unchanged.
        x = self.avgpool(x)
        #x = x.view(x.size(0), -1)
        x = torch.flatten(x, 1)
        if self.drop:
            x = self.drop(x)
        x = self.fc(x)

        return x

In [None]:
# Prefer CUDA when it is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Prediction loop

In [None]:
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    Tensors that track gradients are detached first, because ``numpy()``
    cannot be called on a tensor that requires grad.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [None]:
def _ensemble_probabilities(sessions, inputs):
    """Blend the sigmoid outputs of several ONNX sessions with a quadratic mean.

    ``sessions[i]`` is run on ``inputs[i]``. For each session the first output
    is passed through a sigmoid and squared; the squares are summed in the
    given order, divided by the number of sessions and square-rooted.
    Returns an array with the same shape as a single session's first output.
    """
    squared_sum = None
    for session, tensor in zip(sessions, inputs):
        feed = {session.get_inputs()[0].name: to_numpy(tensor)}
        logits = session.run(None, feed)[0]
        squared = (1 / (1 + np.exp(-logits))) ** 2
        # Accumulate left-to-right so the floating-point result does not depend
        # on how many outputs are held in memory at once.
        squared_sum = squared if squared_sum is None else squared_sum + squared
    return np.sqrt(squared_sum / len(sessions))


def prediction_for_clip(test_df: pd.DataFrame, 
                        clip: np.ndarray, 
                        model, # EffNetB3 + PCEN, trained 50-60 epochs
                        model4, # EffNetB3 + PCEN, trained 60-70 epochs + RohanRao External
                        model13, # EffNetB1 + PCEN, trained 40-45 epochs
                        model16, # EffnetB2
                        model19, # EffnetB2
                        model_inception,
                        model_inception2,
                        model_se,
                        model_mixup,
                        model_ls,
                        model_resnest,
                        mel_params: dict, 
                        mel_params_resnest: dict,
                        mel_params_b1: dict,
                        mel_params_thefirst: dict,
                        mel_params_thesecond: dict,
                        mel_params_inception: dict,
                        mel_params_se: dict,
                        mel_params_mixup: dict,
                        mel_params_ls: dict,
                        threshold=0.5,
                        maxpreds=3, # New param --> @kkiller
                       ):
    """Predict bird labels for every row of ``test_df`` that belongs to one audio clip.

    Original code:  @hidehisaarai1213
    First refacto : @ttahara
    Second refacto: @kkiller

    Each of the eleven ``model*`` arguments is an ONNX Runtime session
    (anything exposing ``get_inputs()`` and ``run()``). Every session is fed
    one of the nine images produced by ``TestDataset``; ``model``/``model4``
    share the first image and ``model_inception``/``model_inception2`` share
    the sixth. The per-model sigmoid probabilities are combined with a
    quadratic mean (see ``_ensemble_probabilities``).

    * ``site_1`` / ``site_2`` rows: one forward pass; at most ``maxpreds``
      labels are emitted.
    * other rows: the image stack is processed in chunks of 16 and the
      class-wise maximum over all frames is thresholded; ``maxpreds`` is not
      applied.

    Parameters
    ----------
    test_df : pd.DataFrame
        Rows to predict; passed to ``TestDataset`` unchanged.
    clip : np.ndarray
        Audio samples; passed to ``TestDataset`` unchanged.
    mel_params* : dict
        Mel-spectrogram settings forwarded to ``TestDataset`` for each input.
    threshold : float
        Minimum ensemble probability for a class to be reported.
    maxpreds : int
        Cap on the number of labels for ``site_1`` / ``site_2`` rows.

    Returns
    -------
    dict
        ``row_id`` -> space-separated bird codes ordered by decreasing
        probability, or ``"nocall"`` when no class reaches ``threshold``.
    """
    dataset = TestDataset(df=test_df,
                          clip=clip,
                          img_size=300,
                          melspectrogram_parameters=mel_params,
                          imgsize_resnest=224,
                          resnest_melspec_params=mel_params_resnest,
                          img_size_b1=240, melspec_b1=mel_params_b1,
                          img_size_thefirst=260, melspec_thefirst=mel_params_thefirst,
                          img_size_thesecond=260, melspec_thesecond=mel_params_thesecond,
                          img_size_inception=299, melspec_inception=mel_params_inception,
                          img_size_se=224, melspec_se=mel_params_se,
                          img_size_mixup=260, melspec_mixup=mel_params_mixup,
                          img_size_ls=260, melspec_ls=mel_params_ls)

    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)

    # Order matters: it fixes which image each session sees and the order in
    # which the squared probabilities are summed.
    sessions = (model, model4, model13, model16, model19, model_resnest,
                model_inception, model_inception2, model_se, model_mixup, model_ls)
    short_clip_sites = ("site_1", "site_2")
    batch_size = 16  # chunk size for long clips, to avoid prediction on a large batch

    prediction_dict = {}
    for image, row_id, site, image2, image3, image4, image5, image6, image7, image8, image9 in loader:
        site = site[0]
        row_id = row_id[0]
        # inputs[i] is the image consumed by sessions[i].
        inputs = (image, image, image3, image4, image5, image2,
                  image6, image6, image7, image8, image9)

        if site in short_clip_sites:
            # Single segment: keep the first (only) item of the batch.
            proba = _ensemble_probabilities(sessions, inputs)[0]
        else:
            # Drop the DataLoader batch axis so axis 0 indexes the frames of the clip.
            frames = [tensor.squeeze(0) for tensor in inputs]
            n_frames = frames[0].size(0)

            chunk_probas = []
            for start in range(0, n_frames, batch_size):
                chunks = []
                for frame_stack in frames:
                    chunk = frame_stack[start:start + batch_size]
                    if chunk.ndim == 3:
                        chunk = chunk.unsqueeze(0)
                    chunks.append(chunk)
                chunk_probas.append(_ensemble_probabilities(sessions, chunks))
                # Release the per-chunk arrays before the next chunk is processed.
                gc.collect()

            # A clip that yields no frames has nothing to predict on; report it
            # as "nocall" instead of failing inside np.vstack([]).
            proba = np.vstack(chunk_probas).max(0) if chunk_probas else None

        if proba is None:
            labels = []
        else:
            n_events = int((proba >= threshold).sum())
            # Indices of the classes above the threshold, most probable first.
            labels = np.argsort(-proba)[:n_events].tolist()

        if len(labels) == 0:
            prediction_dict[row_id] = "nocall"
        else:
            labels_str_list = [INV_BIRD_CODE[label] for label in labels]
            # Only apply maxpreds to site1 and site2
            if site in short_clip_sites:
                labels_str_list = labels_str_list[:maxpreds]
            prediction_dict[row_id] = " ".join(labels_str_list)
    return prediction_dict

In [None]:
def prediction(test_df: pd.DataFrame,
               test_audio: Path,
               model_config: dict,
               mel_params: dict,
               resnest_melparams: dict,
               b1_melparams: dict,
               thefirst_melparams: dict,
               thesecond_melparams: dict,
               inception_melparams: dict,
               #res50_melspec_melparams: dict,
               se_melparams: dict,
               mixup_melparams: dict,
               ls_melparams: dict,
               target_sr: int,
               threshold=0.5,
               maxpreds = 3, # New param --> @kkiller
              ):
    """Run the ONNX model ensemble over every audio file listed in ``test_df``.

    Eleven ONNX inference sessions are created once, then each unique
    ``audio_id`` is decoded from ``<test_audio>/<audio_id>.mp3``, converted to
    mono, resampled to 32 kHz and handed to ``prediction_for_clip`` together
    with the rows of ``test_df`` that belong to that file.

    Parameters
    ----------
    test_df : pd.DataFrame
        Test metadata; must contain an ``audio_id`` column. The remaining
        columns are consumed by ``prediction_for_clip``.
    test_audio : Path
        Directory holding the ``.mp3`` files.
    model_config : dict
        Not referenced in this function; kept so existing callers keep working.
    mel_params, resnest_melparams, b1_melparams, thefirst_melparams,
    thesecond_melparams, inception_melparams, se_melparams, mixup_melparams,
    ls_melparams : dict
        Mel-spectrogram settings, forwarded unchanged to
        ``prediction_for_clip``.
    target_sr : int
        Not referenced in this function: clips are always resampled to the
        hard-coded 32000 Hz below. Kept for interface compatibility.
    threshold : float
        Forwarded unchanged to ``prediction_for_clip``.
    maxpreds : int
        Forwarded unchanged to ``prediction_for_clip``.

    Returns
    -------
    pd.DataFrame
        Columns ``row_id`` and ``birds``: the keys and values of the dicts
        returned by ``prediction_for_clip``, concatenated over all audio
        files. Empty (but with both columns) when ``test_df`` has no rows.
    """
    def load_session(filename):
        # Every exported model lives in the same Kaggle dataset directory.
        return onnxruntime.InferenceSession("../input/save-out-onnx-models/" + filename)

    # Sessions keyed by the keyword name `prediction_for_clip` expects.
    # Trailing comments keep the author's original PyTorch loader calls as a
    # provenance hint for each exported file.
    sessions = {
        "model": load_session("model.onnx"),  # get_model('../input/effnetb3-with-augs-v2-run3/epoch_55_valloss_3.238097043361179.pt')
        "model4": load_session("model4.onnx"),  # get_model('../input/effnetb3-with-augs-v2-run4-vlad-rohan/epoch_67_valloss_0.10827804547696074.pt')
        "model13": load_session("model13.onnx"),  # get_model_b1('../input/effnetb1-with-augs-v2-run2/epoch_47_valloss_0.144121603593846.pt')
        "model16": load_session("model16.onnx"),  # get_model_b2('../input/effnetb2-one/effnetb2_melspec_v2_epochepoch_47_valloss_0.062170032011575935.pt')
        "model19": load_session("model19.onnx"),  # get_model_b2('../input/effnetb2-two/effnetb2_melspec_epochepoch_47_valloss_0.06680513079018882.pt')
        "model_inception": load_session("model_inception.onnx"),
        "model_inception2": load_session("model_inception2.onnx"),
        "model_se": load_session("model_se.onnx"),
        "model_mixup": load_session("model_mixup.onnx"),
        "model_ls": load_session("model_ls.onnx"),
        "model_resnest": load_session("model_resnest.onnx"),  # get_model_resnest(model_config)
    }

    # Sample rate every clip is converted to before inference (the original
    # code called this `cpmp_sr`). Deliberately not taken from `target_sr`:
    # `prediction_for_clip` is not told the rate, so it must stay fixed here.
    resample_sr = 32000

    prediction_dfs = []
    for audio_id in test_df["audio_id"].unique():
        clip, sr_native = librosa.core.audio.__audioread_load(test_audio / (audio_id + '.mp3'),
                                                              offset=0,
                                                              duration=None,
                                                              dtype=np.float32)
        clip = librosa.to_mono(clip)
        if sr_native > 0:
            # Keyword form is accepted by every librosa release, whereas the
            # positional form is rejected from librosa 0.10 onwards.
            clip = librosa.resample(clip,
                                    orig_sr=sr_native,
                                    target_sr=resample_sr,
                                    res_type='kaiser_fast')

        # Boolean mask instead of DataFrame.query on an f-string, so ids that
        # contain quote characters cannot break the expression.
        test_df_for_audio_id = test_df[test_df["audio_id"] == audio_id].reset_index(drop=True)

        prediction_dict = prediction_for_clip(test_df_for_audio_id,
                                              clip=clip,
                                              **sessions,
                                              mel_params=mel_params,
                                              mel_params_resnest=resnest_melparams,
                                              mel_params_b1=b1_melparams,
                                              mel_params_thefirst=thefirst_melparams,
                                              mel_params_thesecond=thesecond_melparams,
                                              mel_params_inception=inception_melparams,
                                              mel_params_se=se_melparams,
                                              mel_params_mixup=mixup_melparams,
                                              mel_params_ls=ls_melparams,
                                              threshold=threshold,
                                              maxpreds=maxpreds, # New param --> @kkiller
                                             )

        prediction_dfs.append(pd.DataFrame({
            "row_id": list(prediction_dict.keys()),
            "birds": list(prediction_dict.values())
        }))

    if not prediction_dfs:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected schema instead.
        return pd.DataFrame({"row_id": [], "birds": []})

    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

## Prediction

In [None]:
# Build the final submission: run the ensemble over the test set and write
# the predictions to `submission.csv`.
# Every settings dict stores its mel-spectrogram configuration under the same
# nested key path, so the per-model keyword arguments are generated from
# (keyword, settings) pairs instead of being spelled out one by one.
# (The res50_melspec entry stays disabled, as in the original call.)
submission = prediction(
    test_df=test,
    test_audio=TEST_AUDIO_DIR,
    model_config=model_config,
    resnest_melparams=melspectrogram_parameters,
    target_sr=TARGET_SR,
    threshold=0.33,
    maxpreds=3,  # New param --> @kkiller
    **{
        kwarg_name: cfg['dataset']['params']['melspectrogram_parameters']
        for kwarg_name, cfg in (
            ('mel_params', settings),
            ('b1_melparams', settings_effnetb1),
            ('thefirst_melparams', settings_thefirst),
            ('thesecond_melparams', settings_thesecond),
            ('inception_melparams', settings_inception),
            ('se_melparams', settings_se),
            ('mixup_melparams', settings_mixup),
            ('ls_melparams', settings_ls),
        )
    },
)
submission.to_csv("submission.csv", index=False)