In [None]:
!pip install ../input/timmfork
!pip install ../input/torchlibrosa/torchlibrosa-0.0.5-py3-none-any.whl

## prep

In [None]:
import cv2
import audioread
import logging
import os, sys
import random
import time
import warnings

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata
import torchaudio
import torchaudio.transforms as T

from contextlib import contextmanager
from pathlib import Path
from typing import Optional

from albumentations.core.transforms_interface import ImageOnlyTransform
from sklearn.metrics import f1_score, classification_report
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation
from tqdm.notebook import tqdm

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (models are wrapped in
    # DataParallel elsewhere in this notebook).
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False  # type: ignore


set_seed(1213)

In [None]:
class CFG:
    """Static configuration namespace (class attributes only, never instantiated).

    Only ``target_columns`` is referenced by the visible inference code (through
    ``CLASSES``); the remaining fields describe training settings.
    """

    ######################
    # Globals #
    ######################
    seed = 1213
    epochs = 35
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_05"
    minimize_metric = False

    ######################
    # Data #
    ######################
    # NOTE(review): train_csv has a different directory layout than the two
    # sibling paths ("input/birdclef-2021" appears twice) -- confirm it is intended.
    train_datadir = Path("../../input/birdclef-2021/birdclef-2021/train_short_audio")
    train_csv = "../../input/birdclef-2021/input/birdclef-2021/train_metadata.csv"
    train_soundscape = "../../input/birdclef-2021/birdclef-2021/train_soundscape_labels.csv"

    ######################
    # Dataset #
    ######################
    transforms = {
        "train": [],
        "valid": [],
        "test": []
    }
    period = 20
    n_mels = 256
    fmin = 0
    fmax = 16000
    n_fft = 2048
    hop_length = 512
    sample_rate = 32000
    # NOTE(review): n_mels/fmin here (224 / 20) differ from the attributes above
    # (256 / 0) -- confirm which set the consumers actually read.
    melspectrogram_parameters = {
        "n_mels": 224,
        "fmin": 20,
        "fmax": 16000
    }

    # Class labels; the list holds 397 entries, matching ``num_classes`` below.
    target_columns = [
        'acafly', 'acowoo', 'aldfly', 'ameavo', 'amecro',
        'amegfi', 'amekes', 'amepip', 'amered', 'amerob',
        'amewig', 'amtspa', 'andsol1', 'annhum', 'astfly',
        'azaspi1', 'babwar', 'baleag', 'balori', 'banana',
        'banswa', 'banwre1', 'barant1', 'barswa', 'batpig1',
        'bawswa1', 'bawwar', 'baywre1', 'bbwduc', 'bcnher',
        'belkin1', 'belvir', 'bewwre', 'bkbmag1', 'bkbplo',
        'bkbwar', 'bkcchi', 'bkhgro', 'bkmtou1', 'bknsti', 'blbgra1',
        'blbthr1', 'blcjay1', 'blctan1', 'blhpar1', 'blkpho',
        'blsspa1', 'blugrb1', 'blujay', 'bncfly', 'bnhcow', 'bobfly1',
        'bongul', 'botgra', 'brbmot1', 'brbsol1', 'brcvir1', 'brebla',
        'brncre', 'brnjay', 'brnthr', 'brratt1', 'brwhaw', 'brwpar1',
        'btbwar', 'btnwar', 'btywar', 'bucmot2', 'buggna', 'bugtan',
        'buhvir', 'bulori', 'burwar1', 'bushti', 'butsal1', 'buwtea',
        'cacgoo1', 'cacwre', 'calqua', 'caltow', 'cangoo', 'canwar',
        'carchi', 'carwre', 'casfin', 'caskin', 'caster1', 'casvir',
        'categr', 'ccbfin', 'cedwax', 'chbant1', 'chbchi', 'chbwre1',
        'chcant2', 'chispa', 'chswar', 'cinfly2', 'clanut', 'clcrob',
        'cliswa', 'cobtan1', 'cocwoo1', 'cogdov', 'colcha1', 'coltro1',
        'comgol', 'comgra', 'comloo', 'commer', 'compau', 'compot1',
        'comrav', 'comyel', 'coohaw', 'cotfly1', 'cowscj1', 'cregua1',
        'creoro1', 'crfpar', 'cubthr', 'daejun', 'dowwoo', 'ducfly', 'dusfly',
        'easblu', 'easkin', 'easmea', 'easpho', 'eastow', 'eawpew', 'eletro',
        'eucdov', 'eursta', 'fepowl', 'fiespa', 'flrtan1', 'foxspa', 'gadwal',
        'gamqua', 'gartro1', 'gbbgul', 'gbwwre1', 'gcrwar', 'gilwoo',
        'gnttow', 'gnwtea', 'gocfly1', 'gockin', 'gocspa', 'goftyr1',
        'gohque1', 'goowoo1', 'grasal1', 'grbani', 'grbher3', 'grcfly',
        'greegr', 'grekis', 'grepew', 'grethr1', 'gretin1', 'greyel',
        'grhcha1', 'grhowl', 'grnher', 'grnjay', 'grtgra', 'grycat',
        'gryhaw2', 'gwfgoo', 'haiwoo', 'heptan', 'hergul', 'herthr',
        'herwar', 'higmot1', 'hofwoo1', 'houfin', 'houspa', 'houwre',
        'hutvir', 'incdov', 'indbun', 'kebtou1', 'killde', 'labwoo', 'larspa',
        'laufal1', 'laugul', 'lazbun', 'leafly', 'leasan', 'lesgol', 'lesgre1',
        'lesvio1', 'linspa', 'linwoo1', 'littin1', 'lobdow', 'lobgna5', 'logshr',
        'lotduc', 'lotman1', 'lucwar', 'macwar', 'magwar', 'mallar3', 'marwre',
        'mastro1', 'meapar', 'melbla1', 'monoro1', 'mouchi', 'moudov', 'mouela1',
        'mouqua', 'mouwar', 'mutswa', 'naswar', 'norcar', 'norfli', 'normoc', 'norpar',
        'norsho', 'norwat', 'nrwswa', 'nutwoo', 'oaktit', 'obnthr1', 'ocbfly1',
        'oliwoo1', 'olsfly', 'orbeup1', 'orbspa1', 'orcpar', 'orcwar', 'orfpar',
        'osprey', 'ovenbi1', 'pabspi1', 'paltan1', 'palwar', 'pasfly', 'pavpig2',
        'phivir', 'pibgre', 'pilwoo', 'pinsis', 'pirfly1', 'plawre1', 'plaxen1',
        'plsvir', 'plupig2', 'prowar', 'purfin', 'purgal2', 'putfru1', 'pygnut',
        'rawwre1', 'rcatan1', 'rebnut', 'rebsap', 'rebwoo', 'redcro', 'reevir1',
        'rehbar1', 'relpar', 'reshaw', 'rethaw', 'rewbla', 'ribgul', 'rinkin1',
        'roahaw', 'robgro', 'rocpig', 'rotbec', 'royter1', 'rthhum', 'rtlhum',
        'ruboro1', 'rubpep1', 'rubrob', 'rubwre1', 'ruckin', 'rucspa1', 'rucwar',
        'rucwar1', 'rudpig', 'rudtur', 'rufhum', 'rugdov', 'rumfly1', 'runwre1',
        'rutjac1', 'saffin', 'sancra', 'sander', 'savspa', 'saypho', 'scamac1',
        'scatan', 'scbwre1', 'scptyr1', 'scrtan1', 'semplo', 'shicow', 'sibtan2',
        'sinwre1', 'sltred', 'smbani', 'snogoo', 'sobtyr1', 'socfly1', 'solsan',
        'sonspa', 'soulap1', 'sposan', 'spotow', 'spvear1', 'squcuc1', 'stbori',
        'stejay', 'sthant1', 'sthwoo1', 'strcuc1', 'strfly1', 'strsal1', 'stvhum2',
        'subfly', 'sumtan', 'swaspa', 'swathr', 'tenwar', 'thbeup1', 'thbkin',
        'thswar1', 'towsol', 'treswa', 'trogna1', 'trokin', 'tromoc', 'tropar',
        'tropew1', 'tuftit', 'tunswa', 'veery', 'verdin', 'vigswa', 'warvir',
        'wbwwre1', 'webwoo1', 'wegspa1', 'wesant1', 'wesblu', 'weskin', 'wesmea',
        'westan', 'wewpew', 'whbman1', 'whbnut', 'whcpar', 'whcsee1', 'whcspa',
        'whevir', 'whfpar1', 'whimbr', 'whiwre1', 'whtdov', 'whtspa', 'whwbec1',
        'whwdov', 'wilfly', 'willet1', 'wilsni1', 'wiltur', 'wlswar', 'wooduc',
        'woothr', 'wrenti', 'y00475', 'yebcha', 'yebela1', 'yebfly', 'yebori1',
        'yebsap', 'yebsee1', 'yefgra1', 'yegvir', 'yehbla', 'yehcar1', 'yelgro',
        'yelwar', 'yeofly1', 'yerwar', 'yeteup1', 'yetvir']

    ######################
    # Loaders #
    ######################
    loader_params = {
        "train": {
            "batch_size": 64,
            "num_workers": 20,
            "shuffle": True
        },
        "valid": {
            "batch_size": 64,
            "num_workers": 20,
            "shuffle": False
        },
        "test": {
            "batch_size": 64,
            "num_workers": 20,
            "shuffle": False
        }
    }

    ######################
    # Split #
    ######################
    split = "StratifiedKFold"
    split_params = {
        "n_splits": 5,
        "shuffle": True,
        "random_state": 1213
    }

    ######################
    # Model #
    ######################
    base_model_name = [
        "tf_efficientnet_b0_ns",
        "tf_efficientnet_b0_ns"
    ]
    pooling = "max"
    pretrained = True
    num_classes = 397
    in_channels = 1

    ######################
    # Criterion #
    ######################
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    ######################
    # Optimizer #
    ######################
    optimizer_name = "Adam"
    optimizer_params = {
        "lr": 0.001
    }
    # For SAM optimizer (previously assigned twice with the same value).
    base_optimizer = "Adam"

    ######################
    # Scheduler #
    ######################
    scheduler_name = "CosineAnnealingLR"
    scheduler_params = {
        "T_max": 10
    }

## models

In [None]:
## MODELS
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero the bias when present.

    Layers without a ``bias`` attribute, or whose bias is ``None``, only get
    their weight initialised.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is None:
        return
    bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer's affine parameters to scale 1 and shift 0."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.)


def init_weights(model):
    """Initialise one module according to its class name.

    The dispatch is on ``model.__class__.__name__`` (substring match):

    * ``Conv2d``    -> Xavier-uniform weights (gain sqrt(2)), zero bias
    * ``BatchNorm`` -> weights ~ N(1, 0.02), zero bias
    * ``GRU``       -> orthogonal init for every parameter with more than one dim
    * ``Linear``    -> weights ~ N(0, 0.01), zero bias

    Modules of any other class are left untouched. Bias / affine parameters
    that are ``None`` (e.g. ``bias=False``) are skipped instead of raising.
    """
    classname = model.__class__.__name__
    if "Conv2d" in classname:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        if model.bias is not None:
            model.bias.data.fill_(0)
    elif "BatchNorm" in classname:
        if model.weight is not None:
            model.weight.data.normal_(1.0, 0.02)
        if model.bias is not None:
            model.bias.data.fill_(0)
    elif "GRU" in classname:
        for weight in model.parameters():
            if weight.dim() > 1:
                # Was spelled `orghogonal_`, which raised AttributeError.
                nn.init.orthogonal_(weight.data)
    elif "Linear" in classname:
        model.weight.data.normal_(0, 0.01)
        if model.bias is not None:
            model.bias.data.zero_()


def do_mixup(x: torch.Tensor, mixup_lambda: torch.Tensor):
    """Blend each even-indexed sample with the odd-indexed sample that follows it.

    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,) per-sample mixing weights
    Returns:
      out: (batch_size, ...) where
           out[i] = x[2i] * mixup_lambda[2i] + x[2i + 1] * mixup_lambda[2i + 1]
    """
    # Moving the batch axis to the end lets the 1-D weights broadcast over it.
    weighted_even = x[0::2].transpose(0, -1) * mixup_lambda[0::2]
    weighted_odd = x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    mixed = weighted_even + weighted_odd
    return mixed.transpose(0, -1)


class Mixup(object):
    """Generator of paired mixup coefficients drawn from Beta(alpha, alpha)."""

    def __init__(self, mixup_alpha, random_seed=1234):
        """Store ``mixup_alpha`` and create a private, seeded NumPy RandomState."""
        self.random_state = np.random.RandomState(random_seed)
        self.mixup_alpha = mixup_alpha

    def get_lambda(self, batch_size):
        """Draw one coefficient per pair of samples.

        Args:
          batch_size: int
        Returns:
          float32 tensor laid out as [lam_0, 1 - lam_0, lam_1, 1 - lam_1, ...];
          one pair is produced for every step of ``range(0, batch_size, 2)``,
          so an odd ``batch_size`` yields ``batch_size + 1`` values.
        """
        alpha = self.mixup_alpha
        n_pairs = len(range(0, batch_size, 2))
        coefficients = np.empty(2 * n_pairs, dtype=np.float32)
        for pair in range(n_pairs):
            lam = self.random_state.beta(alpha, alpha, 1)[0]
            coefficients[2 * pair] = lam
            coefficients[2 * pair + 1] = 1. - lam

        return torch.from_numpy(coefficients)


def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every time step ``ratio`` times.

    Used to undo the temporal downsampling of the CNN encoder.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    n_batch, n_steps, n_classes = x.shape
    repeated = x.unsqueeze(2).expand(n_batch, n_steps, ratio, n_classes)
    return repeated.reshape(n_batch, n_steps * ratio, n_classes)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Resize the frame axis of ``framewise_output`` to exactly ``frames_num``.

    The tensor is treated as a single-channel image and resized with bilinear
    interpolation (``align_corners=True``); the class axis keeps its size.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, target number of frames
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    n_classes = framewise_output.size(2)
    as_image = framewise_output.unsqueeze(1)
    resized = F.interpolate(
        as_image,
        size=(frames_num, n_classes),
        mode="bilinear",
        align_corners=True)
    return resized.squeeze(1)


def gem(x: torch.Tensor, p=3, eps=1e-6):
    """Generalised-mean pooling over the last two dimensions of ``x``.

    Values are clamped to at least ``eps``, raised to the power ``p``, averaged
    over the full (H, W) window, and the ``1 / p`` root is taken.
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, window)
    return pooled.pow(1. / p)


class GeM(nn.Module):
    """Generalised-mean pooling layer with a learnable exponent ``p``."""

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        # One-element parameter so the exponent is trained with the network.
        self.p = nn.Parameter(torch.ones(1) * p)

    def forward(self, x):
        """Apply ``gem`` with the current exponent and the stored ``eps``."""
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        name = self.__class__.__name__
        exponent = self.p.data.tolist()[0]
        return f"{name}(p={exponent:.4f}, eps={self.eps})"


class AttBlockV2(nn.Module):
    """Attention pooling head built from two parallel 1x1 ``Conv1d`` layers.

    ``att`` produces per-class attention scores that are squashed with tanh and
    softmax-normalised over the last (time) axis; ``cla`` produces per-class
    scores for every time step. The clip-level output is the attention-weighted
    sum of the (optionally sigmoid-activated) ``cla`` scores over time.

    Args:
        in_features: number of input channels.
        out_features: number of output classes.
        activation: ``"linear"`` (identity) or ``"sigmoid"``.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """

    _ACTIVATIONS = ("linear", "sigmoid")

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear"):
        super().__init__()

        # Fail fast: an unknown activation used to make nonlinear_transform()
        # return None, which only surfaced as a TypeError inside forward().
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self._ACTIVATIONS}")

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions with ``init_layer``."""
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for ``x`` of shape (n_samples, n_in, n_time)."""
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured output activation to ``x``."""
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            f"Unsupported activation {self.activation!r}; "
            f"expected one of {self._ACTIVATIONS}")

    

class TimmSED(nn.Module):
    """Sound-event-detection model: timm encoder followed by an attention head.

    The input is an already computed spectrogram batch. The encoder feature map
    is averaged over its third axis, smoothed along the remaining time axis,
    passed through a linear layer and pooled by ``AttBlockV2``.

    Args:
        encoder: timm model name.
        pretrained: forwarded to ``timm.create_model``.
        classes: number of output classes.
        in_channels: forwarded to ``timm.create_model`` as ``in_chans``.
        use_coordconv: when True, a normalised frequency-coordinate map is
            concatenated to the input along the channel axis before encoding.
        attn_activation: activation name passed to ``AttBlockV2``.
        **kwargs: extra keyword arguments for ``timm.create_model``.
    """

    def __init__(self, encoder: str, pretrained=False, classes=397, in_channels=2, use_coordconv=False, attn_activation="sigmoid", **kwargs):
        super().__init__()

        base_model = timm.create_model(
            encoder, pretrained=pretrained, in_chans=in_channels, **kwargs)
        self.encoder = base_model

        # Width of the encoder's final feature map; the attribute that exposes
        # it depends on the timm architecture.
        if hasattr(base_model, "fc"):
            in_features = base_model.fc.in_features
        elif hasattr(base_model, "num_features"):
            in_features = base_model.num_features
        else:
            in_features = base_model.classifier.in_features
        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, classes, activation=attn_activation)
        self.features = self.encoder.forward_features
        self.use_coordconv = use_coordconv
        self.init_weight()

    def init_weight(self):
        """Initialise the intermediate linear layer."""
        init_layer(self.fc1)

    def forward(self, x):
        """Run the model.

        Returns:
            dict with ``framewise_output``, ``segmentwise_output``,
            ``segmentwise_logit``, ``logit``, ``framewise_logit`` and the
            clip-level prediction under both ``cls`` (original key) and
            ``clipwise_output`` (the key used by the GPU model variants).
        """
        # (batch_size, channels, freq, frames)
        B, C, H, W = x.size()

        if self.use_coordconv:
            # Doubles the channel count, so the encoder presumably has to be
            # created with in_channels == 2 * C -- TODO confirm with callers.
            coords = torch.arange(0, H, dtype=x.dtype, device=x.device)[None, None, :, None]
            coords = coords.repeat(B, C, 1, W) / H
            x = torch.cat([x, coords], dim=1)

        x = self.features(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

        # Raw (pre-activation) class scores, computed once and shared by the
        # clip-level and segment-level logits.
        raw_scores = self.att_block.cla(x)
        logit = torch.sum(norm_att * raw_scores, dim=2)
        segmentwise_logit = raw_scores.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = W // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, W)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, W)

        output_dict = {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "segmentwise_logit": segmentwise_logit,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "cls": clipwise_output,
            "clipwise_output": clipwise_output
        }

        return output_dict
    
## MODELS gpu

from dataclasses import dataclass

@dataclass
class MelConfig:
    """Mel-spectrogram settings consumed by the GPU model variants.

    The fields are passed to ``torchaudio.transforms.MelSpectrogram`` in the
    model constructors; ``n_mels`` also sizes the input batch-norm layer.
    """
    sample_rate: int = 32000  # passed as MelSpectrogram(sample_rate=...)
    n_fft: int = 2048  # used for both n_fft and win_length
    hop_length: int = 512  # passed as MelSpectrogram(hop_length=...)
    n_mels: int = 256  # number of mel bins; also the BatchNorm2d feature count

        
class TimmSEDGPU(nn.Module):
    """SED model that computes its own log-mel spectrogram inside ``forward``.

    Pipeline: ``MelSpectrogram`` + ``AmplitudeToDB`` (run with autocast
    disabled) -> batch norm over the mel axis -> SpecAugment (training only) ->
    timm encoder -> temporal smoothing -> linear layer -> ``AttBlockV2``.

    Args:
        encoder: timm model name.
        mel_config: object exposing ``sample_rate``, ``n_fft``, ``hop_length``
            and ``n_mels``.
        pretrained: forwarded to ``timm.create_model``.
        classes: number of output classes.
        in_channels: forwarded to ``timm.create_model`` as ``in_chans``.
        attn_activation: activation name passed to ``AttBlockV2``.
        **kwargs: extra keyword arguments for ``timm.create_model``.
    """

    def __init__(self, encoder: str,
                 mel_config: MelConfig,
                 pretrained=False,
                 classes=24,
                 in_channels=1,
                 attn_activation='sigmoid',
                 **kwargs
                 ):
        super().__init__()
        # melextractor
        self.logmel_extractor = nn.Sequential(
            T.MelSpectrogram(sample_rate=mel_config.sample_rate, n_fft=mel_config.n_fft, win_length=mel_config.n_fft,
                             hop_length=mel_config.hop_length, power=2.0, n_mels=mel_config.n_mels),
            T.AmplitudeToDB()
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_config.n_mels)

        base_model = timm.create_model(
            encoder, pretrained=pretrained, in_chans=in_channels, **kwargs)
        self.encoder = base_model

        # Width of the encoder's final feature map; the attribute that exposes
        # it depends on the timm architecture.
        if hasattr(base_model, "fc"):
            in_features = base_model.fc.in_features
        elif hasattr(base_model, "num_features"):
            in_features = base_model.num_features
        else:
            in_features = base_model.classifier.in_features
        self.features = self.encoder.forward_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, classes, activation=attn_activation)

        self.init_weight()

    def init_weight(self):
        """Initialise the intermediate linear layer and the input batch norm."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, input):
        """Run the model on a waveform batch.

        Returns:
            dict with ``framewise_output``, ``segmentwise_output``,
            ``segmentwise_logit``, ``logit``, ``framewise_logit`` and
            ``clipwise_output``.
        """
        # The spectrogram is computed with autocast explicitly disabled.
        with torch.cuda.amp.autocast(False):
            x = self.logmel_extractor(input).unsqueeze(1)

        frames_num = x.size(3)

        # Move the mel axis into the channel position so bn0 normalises per mel bin.
        x = x.transpose(1, 2)
        x = self.bn0(x)
        x = x.transpose(1, 2)

        if self.training:
            # The augmenter is applied with the last two axes swapped.
            x = self.spec_augmenter(x.transpose(2, 3)).transpose(2, 3)

        # (batch_size, channels, freq, frames)
        x = self.features(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

        # Raw (pre-activation) class scores, computed once and shared by the
        # clip-level and segment-level logits.
        raw_scores = self.att_block.cla(x)
        logit = torch.sum(norm_att * raw_scores, dim=2)
        segmentwise_logit = raw_scores.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "segmentwise_logit": segmentwise_logit,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

        return output_dict
    
class TimmRNNSEDGPU(nn.Module):
    """SED model with on-the-fly log-mel extraction and a bidirectional GRU.

    Pipeline: ``MelSpectrogram`` + ``AmplitudeToDB`` (run with autocast
    disabled) -> batch norm over the mel axis -> SpecAugment (training only) ->
    timm encoder -> temporal smoothing -> 2-layer bidirectional GRU -> linear
    layer -> ``AttBlockV2``. The GRU doubles the feature width, so the linear
    layer and the attention head operate on ``2 * in_features`` channels.

    Args:
        encoder: timm model name.
        mel_config: object exposing ``sample_rate``, ``n_fft``, ``hop_length``
            and ``n_mels``.
        pretrained: forwarded to ``timm.create_model``.
        classes: number of output classes.
        in_channels: forwarded to ``timm.create_model`` as ``in_chans``.
        attn_activation: activation name passed to ``AttBlockV2``.
        **kwargs: extra keyword arguments for ``timm.create_model``.
    """

    def __init__(self, encoder: str,
                 mel_config: MelConfig,
                 pretrained=False,
                 classes=24,
                 in_channels=1,
                 attn_activation='sigmoid',
                 **kwargs
                 ):
        super().__init__()
        # melextractor
        self.logmel_extractor = nn.Sequential(
            T.MelSpectrogram(sample_rate=mel_config.sample_rate, n_fft=mel_config.n_fft, win_length=mel_config.n_fft,
                             hop_length=mel_config.hop_length, power=2.0, n_mels=mel_config.n_mels),
            T.AmplitudeToDB()
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_config.n_mels)

        base_model = timm.create_model(
            encoder, pretrained=pretrained, in_chans=in_channels, **kwargs)
        self.encoder = base_model

        # Width of the encoder's final feature map; the attribute that exposes
        # it depends on the timm architecture.
        if hasattr(base_model, "fc"):
            in_features = base_model.fc.in_features
        elif hasattr(base_model, "num_features"):
            in_features = base_model.num_features
        else:
            in_features = base_model.classifier.in_features
        self.features = self.encoder.forward_features

        self.gru = torch.nn.GRU(input_size=in_features, hidden_size=in_features,
                                num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(in_features*2, in_features*2, bias=True)
        self.att_block = AttBlockV2(
            in_features*2, classes, activation=attn_activation)

        self.init_weight()

    def init_weight(self):
        """Initialise the intermediate linear layer and the input batch norm."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, input):
        """Run the model on a waveform batch.

        Returns:
            dict with ``framewise_output``, ``segmentwise_output``,
            ``segmentwise_logit``, ``logit``, ``framewise_logit`` and
            ``clipwise_output``.
        """
        # The spectrogram is computed with autocast explicitly disabled.
        with torch.cuda.amp.autocast(False):
            x = self.logmel_extractor(input).unsqueeze(1)

        frames_num = x.size(3)

        # Move the mel axis into the channel position so bn0 normalises per mel bin.
        x = x.transpose(1, 2)
        x = self.bn0(x)
        x = x.transpose(1, 2)

        if self.training:
            # The augmenter is applied with the last two axes swapped.
            x = self.spec_augmenter(x.transpose(2, 3)).transpose(2, 3)

        # (batch_size, channels, freq, frames)
        x = self.features(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        (x, _) = self.gru(x)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

        # Raw (pre-activation) class scores, computed once and shared by the
        # clip-level and segment-level logits.
        raw_scores = self.att_block.cla(x)
        logit = torch.sum(norm_att * raw_scores, dim=2)
        segmentwise_logit = raw_scores.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "segmentwise_logit": segmentwise_logit,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

        return output_dict

In [None]:
# Base model / spectrogram / optimizer configuration. `load_model` reads
# "encoder_params", and the inference cell reads "n_mels", "n_fft" and
# "hop_length" when building the dataset.
conf = {
    "network": "TimmSED",
    "encoder": "tf_efficientnet_b3_ns",
    # Keyword arguments forwarded to the model constructors.
    "encoder_params": {
        "classes": 397,
        "drop_path_rate": 0.2,
        "attn_activation": "linear",
        "use_coordconv": False,
        "in_channels": 1,
    },
    "multiplier": 1,
    "use_secondary": True,
    "sample_sec": 20,
    # Spectrogram settings.
    "n_mels": 256,
    "n_fft": 2048,
    "hop_length": 512,
    "crop_width": 1240,
    # Training-time optimizer / schedule settings.
    "optimizer": {
        "batch_size": 32,
        "type": "AdamW",
        "weight_decay": 1e-2,
        "learning_rate": 0.0002,
        "schedule": {
            "type": "cosine",
            "mode": "step",
            "epochs": 100,
            "params": {
                "T_max": 40000,
                "eta_min": 1e-5,
            },
        },
    },
}

def load_model(name: str, gpu_mel: bool, path: Path, conf: dict, prefix: str, suffix: str, fold: int):
    """Build the network described by ``name`` and load its checkpoint.

    ``name`` selects the architecture:

    * ``"v2m_gpu"`` / ``"v2s_gpu"``: EfficientNetV2-M / -S (in21k) encoders.
    * any other name: ``tf_efficientnet_<first '_' token of name>_ns``.
    * a name containing ``"gpu"`` builds a model that computes its own mel
      spectrogram; ``"80"`` in the name selects the RNN variant with 80 mel
      bins, ``"320"`` selects 320 mel bins (EfficientNetV2-B1/B3 when the name
      contains ``b1``/``b3``), otherwise 256 mel bins are used.
    * names without ``"gpu"`` build ``TimmSED``.

    Args:
        name: architecture tag, see above.
        gpu_mel: unused; kept for call compatibility.
        path: checkpoint file to load.
        conf: configuration dict; only ``conf["encoder_params"]`` is read and
            the dict is never modified.
        prefix, suffix, fold: unused; kept for call compatibility.

    Returns:
        The model wrapped in ``DataParallel``, on CUDA, in eval mode.
    """
    # Work on a copy: the original implementation rewrote conf["encoder"],
    # conf["network"] and conf["encoder_params"] in place, so every call leaked
    # state into the shared config and into later calls.
    encoder_params = dict(conf["encoder_params"])

    if name == 'v2m_gpu':
        encoder = 'tf_efficientnetv2_m'
    elif name == 'v2s_gpu':
        encoder = 'tf_efficientnetv2_s_in21k'
    else:
        encoder = f"tf_efficientnet_{name.split('_')[0]}_ns"

    weights_path = path

    if 'gpu' in name:
        # The GPU models forward unknown kwargs to timm.create_model, so the
        # TimmSED-only flag must not be passed to them.
        encoder_params.pop('use_coordconv', None)

        # MelConfig is instantiated per model; assigning n_mels on the class
        # itself (as before) changed the default for every other user.
        if '80' in name:
            print("LOADING RNN(80) GPU MODEL")
            encoder = 'tf_efficientnetv2_b1'
            model = TimmRNNSEDGPU(encoder=encoder, mel_config=MelConfig(n_mels=80), **encoder_params)
        elif '320' in name:
            print("LOADING 320 GPU MODEL")
            if 'b1' in name:
                encoder = 'tf_efficientnetv2_b1'
            elif 'b3' in name:
                encoder = 'tf_efficientnetv2_b3'
            model = TimmSEDGPU(encoder=encoder, mel_config=MelConfig(n_mels=320), **encoder_params)
        else:
            print("LOADING DEFAULT(256) GPU MODEL")
            model = TimmSEDGPU(encoder=encoder, mel_config=MelConfig(n_mels=256), **encoder_params)
    else:
        encoder_params['use_coordconv'] = False
        model = TimmSED(encoder=encoder, **encoder_params)

    model = torch.nn.DataParallel(model).cuda()
    print("=> loading checkpoint '{}'".format(weights_path))
    # NOTE: torch.load unpickles the file -- only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(weights_path, map_location="cpu")
    print("epoch", checkpoint['epoch'])
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

In [None]:
# Ensemble members as explicit (architecture tag, checkpoint file) pairs so the
# tag and the file can no longer drift apart. The ORDER matters: the ensemble
# weights defined in the inference cell are matched to the models by position.
ENSEMBLE_DIR = Path("../input/ensemble-01")
ENSEMBLE_MEMBERS = [
    ("v2s_gpu", "e30oof_TimmSEDGPU_tf_efficientnetv2_s_in21k_0_256_2048_512_last"),
    ("v2s_gpu", "e40oof_TimmSEDGPU_tf_efficientnetv2_s_in21k_1_256_2048_512_best_f1"),
    # ("v2s_gpu", "e30oof_TimmSEDGPU_tf_efficientnetv2_s_in21k_2_256_2048_512_best_f1"),
    # ("v2s_gpu", "oof_TimmSEDGPU_tf_efficientnetv2_s_in21k_3_256_2048_512_best_f1"),
    ("v2s_gpu", "TimmSEDGPU_tf_efficientnetv2_s_in21k_0_256_2048_512_best_f1 (1)"),
    ("v2s_gpu", "TimmSEDGPU_tf_efficientnetv2_s_in21k_1_256_2048_512_best_f1"),
    ("v2s_gpu", "TimmSEDGPU_tf_efficientnetv2_s_in21k_2_256_2048_512_best_f1"),
    ("v2s_gpu", "TimmSEDGPU_tf_efficientnetv2_s_in21k_4_256_2048_512_best_f1"),
    ("b1_gpu", "b1_5secTimmSEDGPU_tf_efficientnet_b1_ns_0_256_2048_512_best_f1"),
    ("b1_gpu", "b1_5secTimmSEDGPU_tf_efficientnet_b1_ns_1_256_2048_512_best_f1"),
    ("b1_gpu", "b1_5secTimmSEDGPU_tf_efficientnet_b1_ns_3_256_2048_512_best_f1"),
    ("b1_gpu", "b1_pseudoTimmSEDGPU_tf_efficientnet_b1_ns_0_256_2048_512_best_f1"),
    ("b1_gpu", "b1_pseudoTimmSEDGPU_tf_efficientnet_b1_ns_3_256_2048_512_best_f1"),
    # ("b1_gpu", "b1TimmSEDGPU_tf_efficientnet_b1_ns_1_256_2048_512_best_f1"),
    ("b1_gpu", "b1TimmSEDGPU_tf_efficientnet_b1_ns_0_256_2048_512_best_f1"),
    ("b1_gpu", "b1TimmSEDGPU_tf_efficientnet_b1_ns_2_256_2048_512_best_f1"),
    # ("b1_gpu", "b1TimmSEDGPU_tf_efficientnet_b1_ns_3_256_2048_512_best_f1"),
    ("b1_gpu", "b1TimmSEDGPU_tf_efficientnet_b1_ns_4_256_2048_512_best_f1"),
    ("b2_gpu", "b2_TimmSEDGPU_tf_efficientnet_b2_ns_3_256_2048_512_best_f1"),
    ("b3_gpu", "b3fTimmSEDGPU_tf_efficientnet_b3_ns_2_256_2048_512_best_f1"),
    ("b3_gpu", "b3fTimmSEDGPU_tf_efficientnet_b3_ns_3_256_2048_512_best_f1"),
    ("b5_gpu", "pseudo_TimmSEDGPU_tf_efficientnet_b5_ns_0_256_2048_512_best_f1"),
    ("b5_gpu", "pseudo_TimmSEDGPU_tf_efficientnet_b5_ns_1_256_2048_512_best_f1"),
    ("b5_gpu", "pseudo_TimmSEDGPU_tf_efficientnet_b5_ns_2_256_2048_512_best_f1"),
    ("b5_gpu", "pseudo_TimmSEDGPU_tf_efficientnet_b5_ns_3_256_2048_512_best_f1"),
    ("b7_gpu", "e41_oof_TimmSEDGPU_tf_efficientnet_b7_ns_0_256_2048_512_last"),
    ("b1", "b1TimmSED_tf_efficientnet_b1_ns_0_256_2048_512_best_f1"),
    ("b3", "e80_TimmSED_tf_efficientnet_b3_ns_0_256_2048_512_best_f1 (1)"),
    ("b5", "e63_TimmSED_tf_efficientnet_b5_ns_0_256_2048_512_best_f1"),
    ("b7", "e60_TimmSED_tf_efficientnet_b7_ns_0_256_2048_512_best_f1"),
    ("b1_gpu", "b1_oof_regular_100epochsTimmSEDGPU_tf_efficientnet_b1_ns_0_256_2048_512_best_f1"),
    ("b1_gpu_320", "v2b1_nmels320TimmSEDGPU_tf_efficientnetv2_b1_0_320_2048_512_best_f1"),
    ("b3_gpu_320", "v2b3_nmels320_psTimmSEDGPU_tf_efficientnetv2_b3_0_320_2048_512_best_f1"),
]

# The original parallel lists are kept (derived from the pairs) for any later
# cell that still refers to them.
names = [member_name for member_name, _ in ENSEMBLE_MEMBERS]
weights_paths = [ENSEMBLE_DIR / file_name for _, file_name in ENSEMBLE_MEMBERS]

models = []
for name, weights_path in zip(names, weights_paths):
    # The second argument mirrors the old `'gpu' in names[i]` branch.
    model = load_model(name, 'gpu' in name, weights_path, conf, "", "best_f1", 0)
    models.append(model)

## data

In [None]:
SR = 32000  # sample rate in Hz assumed when chunking audio and computing mel spectrograms
F_MIN = 0.0  # not referenced by the visible code
F_MAX = SR // 2  # passed as `fmax` to melspectrogram (half the sample rate)
TOP_DB = 80  # `top_db` for power_to_db; the dB output is also divided by this value
MIN_VALUE = -1  # not referenced by the visible code

class SoundscapesDataset(torch.utils.data.Dataset):
    """Dataset yielding every ``.ogg`` soundscape as a stack of overlapping chunks.

    Each item is ``(chunks, file_id, start_seconds)``:

    * ``chunks``: float tensor ``(n_chunks, SR * seconds)``; chunk ``i`` starts
      at ``start_seconds[i]`` and chunks that run past the end of the recording
      are zero-padded.
    * ``file_id``: file name without the ``.ogg`` extension.
    * ``start_seconds``: integer tensor of chunk start times, spaced
      ``seconds // 2`` apart.

    Args:
        data_path: directory containing the ``.ogg`` files.
        n_mels, n_fft, hop_length: mel-spectrogram settings used by ``as_mel``.
        seconds: chunk length in seconds; must divide ``total_seconds``.
        total_seconds: nominal recording length in seconds.
    """

    def __init__(self,
                 data_path,
                 n_mels,
                 n_fft,
                 hop_length,
                 seconds=20,
                 total_seconds=600
                 ):
        super().__init__()
        assert total_seconds % seconds == 0
        self.data_path = data_path
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        # Sorted so the item order does not depend on the filesystem.
        self.names = [f[:-len(".ogg")] for f in sorted(os.listdir(data_path)) if f.endswith(".ogg")]
        self.target_columns = CLASSES
        self.seconds = seconds
        self.total_seconds = total_seconds

    def __len__(self):
        return len(self.names)

    def as_mel(self, audio: np.ndarray):
        """Return the mel spectrogram of ``audio`` in dB (relative to its max), divided by ``TOP_DB``."""
        # `y=` is passed by keyword: newer librosa releases no longer accept
        # the signal as a positional argument.
        audio = melspectrogram(y=audio, sr=SR, n_mels=self.n_mels, hop_length=self.hop_length, n_fft=self.n_fft,
                               fmax=F_MAX)
        audio = librosa.power_to_db(audio, ref=np.max, top_db=TOP_DB) / TOP_DB
        return audio

    @staticmethod
    def _slice_chunks(audio, start_samples, chunk_length):
        """Cut ``audio`` into fixed-length rows, zero-padding rows that run past its end."""
        chunks = np.zeros((len(start_samples), chunk_length), dtype=np.float32)
        for row, begin in enumerate(start_samples):
            piece = audio[begin:begin + chunk_length]
            chunks[row, :len(piece)] = piece
        return chunks

    def __getitem__(self, idx):
        file_id = self.names[idx]
        # Load at SR so the sample arithmetic below is valid whatever the
        # file's native sample rate is.
        audio, _ = librosa.load(os.path.join(self.data_path, f"{file_id}.ogg"), sr=SR)
        chunk_length = SR * self.seconds
        hop_seconds = self.seconds // 2

        start_seconds = list(range(0, self.total_seconds - hop_seconds, hop_seconds))
        # Sample offsets are derived from the reported start times so the two
        # always agree (they diverged for odd `seconds` before).
        start_samples = [start * SR for start in start_seconds]

        chunks = torch.from_numpy(self._slice_chunks(audio, start_samples, chunk_length)).float()
        start_seconds = torch.from_numpy(np.array(start_seconds))

        return chunks, file_id, start_seconds
    
# Class labels shared with SoundscapesDataset (stored there as `target_columns`).
CLASSES = CFG.target_columns

from librosa.feature import melspectrogram

def as_mel(audio):
    """Convert a 1-D waveform into a normalised log-mel spectrogram.

    Uses fixed settings: 32 kHz sample rate, 256 mel bands, FFT size 2048,
    hop length 512 and an upper frequency bound of half the sample rate.

    Args:
        audio: waveform as a NumPy array.

    Returns:
        The mel power spectrogram converted to dB relative to its maximum,
        clipped to an 80 dB range and divided by 80 (values in ``[-1, 0]``).
    """
    sample_rate = 32000
    top_db = 80

    # `y` must be passed by keyword: librosa >= 0.10 rejects a positional waveform.
    mel_power = melspectrogram(y=audio, sr=sample_rate, n_mels=256, hop_length=512, n_fft=2048,
                               fmax=sample_rate // 2)
    return librosa.power_to_db(mel_power, ref=np.max, top_db=top_db) / top_db

## infer

In [None]:

# Use the test soundscapes when any .ogg file is present there; otherwise fall
# back to the train soundscapes so the notebook can still be run end to end.
test_ogg_files = list(Path("../input/birdclef-2021/test_soundscapes/").glob("*.ogg"))
TEST = len(test_ogg_files) != 0
data_path = (
    "../input/birdclef-2021/test_soundscapes/"
    if TEST
    else "../input/birdclef-2021/train_soundscapes/"
)

TARGET_SR = 32000
PERIOD = 40  # chunk length in seconds handed to the dataset

# Spectrogram settings come from the model config; the window length is PERIOD.
spectrogram_kwargs = {key: conf[key] for key in ("n_mels", "n_fft", "hop_length")}
test_dataset = SoundscapesDataset(data_path=data_path, seconds=PERIOD, **spectrogram_kwargs)

print('Number of models: {}'.format(len(models)))

# Per-model blending weights, indexed in the same order as `models`.
raw_weights = [
    0.08165454021559648,
    0.21907731174089815,
    0.2224014690785836,
    0.05246388396627374,
    0.283327943267436,
    0.05529993267155051,
    0.8195556605964778,
    0.8012870998577118,
    0.18139450344600117,
    0.01733149684253748,
    0.07826873890476346,
    0.29824756907541783,
    0.19770169253477266,
    0.8314541626366747,
    0.013507043322341372,
    0.24902706130653013,
    0.2852405486945102,
    0.7589629054922609,
    0.8169482342380096,
    0.20799895247189154,
    0.07652219249891853,
    0.9510807537360569,
    0.8421765003052626,
    0.6524239243745986,
    0.45931291527150997,
    0.9521269680606435,
    0.4172814307521808,
    0.537796980088361,
    0.07247043664860703,
]

# Normalise so that the weights sum to one.
weight_total = sum(raw_weights)
weights = [w / weight_total for w in raw_weights]

# Maps row_id ("<audio_id>_<site>_<end_second>") to the per-class score vector
# of that 5-second window.
data = {}
with torch.no_grad():
    loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, num_workers=4, shuffle=False)
    for audio, name, _starts in tqdm(loader):
        audio = audio[0]  # drop the batch dimension of size 1 -> one row per chunk
        prediction_arrays_even = []
        prediction_arrays_odd = []

        for frame in range(audio.size(0)):
            # The CPU mel spectrogram depends only on the chunk, not on the model,
            # so compute it lazily once and reuse it for every non-'gpu' model.
            mel_np = None
            probs = None
            for m_ix, model in enumerate(models):

                if 'gpu' in names[m_ix]:
                    # 'gpu' models receive the raw waveform
                    mel = audio[frame].cuda()
                    mel = torch.unsqueeze(mel, 0)
                else:
                    if mel_np is None:
                        mel_np = as_mel(audio[frame].numpy())
                    # .cuda() makes a fresh device copy per model, so the cached
                    # array is never shared with a model's input tensor
                    mel = torch.from_numpy(mel_np).unsqueeze(0).unsqueeze(0).cuda()

                with torch.cuda.amp.autocast():
                    output = model(mel)
                output = output["framewise_logit"]
                output = torch.sigmoid(output).cpu().numpy()
                output = weights[m_ix] * output

                # weighted sum of the per-model framewise probabilities
                if probs is None:
                    probs = output
                else:
                    probs += output

            probs = probs[0]  # assumes the output is (1, frames, classes) - TODO confirm
            # Chunks overlap by half, so even and odd chunks each tile the
            # recording on their own, the odd ones shifted by half a chunk.
            if frame % 2 == 0:
                prediction_arrays_even.append(probs)
            else:
                prediction_arrays_odd.append(probs)

        prediction_even = np.concatenate(prediction_arrays_even, axis=0)
        prediction_odd = np.concatenate(prediction_arrays_odd, axis=0)

        # Average the two timelines where they overlap; the odd timeline starts
        # half a chunk into the even one.
        offset = len(prediction_arrays_even[0]) // 2
        prediction_even[offset:offset + len(prediction_odd)] = (
            prediction_even[offset:offset + len(prediction_odd)] + prediction_odd) / 2

        # One submission row per 5-second window of the 600-second recording.
        for subframe, split in enumerate(np.array_split(prediction_even, 600 // 5, axis=0)):
            audio_id, site, *_ = name[0].split("_")
            seconds = str(int((subframe + 1) * 5))
            row_id = f"{audio_id}_{site}_{seconds}"
            # NOTE(review): PERIOD * 2 frames are trimmed from each end of the
            # window before taking the per-class max; this presumes every window
            # has more than 4 * PERIOD frames - confirm for the models in use.
            data[row_id] = split[PERIOD * 2:-PERIOD * 2].max(0)


## Post-processing

In [None]:
# Disabled threshold search, kept inside a string literal so this cell runs nothing.
# It grid-searches a nocall threshold (nct) and a call threshold (ct) against the
# train soundscape labels, scoring samples-averaged F1 separately on call and
# nocall rows and combining them as 0.54 * nocall + 0.46 * call. The best pair is
# stored in saved_best_nct / saved_best_ct.
# NOTE(review): the code uses np.object, which was removed in NumPy 1.24; replace it
# with the builtin `object` before re-enabling this cell.
"""
train_soundscapes = pd.read_csv("../input/birdclef-2021/train_soundscape_labels.csv").sort_values(by="row_id").reset_index()

pred_dict = data
num_classes = len(CLASSES)

gt = np.zeros([len(train_soundscapes), num_classes + 1])
pred_probs = np.zeros([len(train_soundscapes), num_classes])
gt_ids = np.empty((len(train_soundscapes),), dtype=np.object)

for i, gt_row in train_soundscapes.iterrows():
    gt_ids[i] = gt_row.row_id
    pred_probs[i] = pred_dict[gt_row.row_id]
    gt_row = gt_row.birds.split()
    for g in gt_row:
        if g == "nocall":
            gt[i, num_classes] = 1
        else:
            gt[i, CLASSES.index(g)] = 1

nocall_thresholds = np.linspace(0.25, 0.5, num=40)
call_thresholds = np.linspace(0.17, 0.35, num=40)
# nocall_thresholds = np.linspace(0.3, 0.4, num=2)
# call_thresholds = np.linspace(0.2, 0.3, num=2)
best_overall = 0
best_sum_overall = 0
best_calls_f1 = 0
for nct in nocall_thresholds:
    for ct in call_thresholds:
        # init nocalls
        pred_nocall = np.zeros([len(train_soundscapes), ])
        nct_preds = pred_probs > nct
        nct_preds = nct_preds.sum(1) == 0

        #set to nocalls and use different threshold for others
        preds = np.zeros_like(gt)
        preds[:, :num_classes] = pred_probs > ct
        preds[:, num_classes] = nct_preds

        gt_calls = gt[gt[:, num_classes] == 0]
        pred_gt_calls = preds[gt[:, num_classes] == 0]
        calls_score = f1_score(gt_calls, pred_gt_calls, average="samples")
        gt_nocalls = gt[gt[:, num_classes] == 1]
        pred_gt_nocalls = preds[gt[:, num_classes] == 1]
        nocalls_score = f1_score(gt_nocalls, pred_gt_nocalls, average="samples")
        f1_overall = 0.54 * nocalls_score + 0.46 * calls_score
        if f1_overall > best_overall:
            ## save predictions for submission
            
            saved_best_ct = ct
            saved_best_nct = nct
            best_overall = f1_overall
            print("##########found better thresholds#########")
            print(f"Call/Nocall Cls t: {nct:.04f} : Call T {ct:0.4f}")
            print(f"F1 calls:{calls_score:.04f} | F1 nocalls:{nocalls_score:.04f} ")
            print(f"Overall {f1_overall :.04f}")
        sum_overall = f1_overall + calls_score
        if sum_overall > best_sum_overall:
            print("##########alternative thresholds#########")
            print(f"Call/Nocall Cls t: {nct:.04f} : Call T {ct:0.4f}")
            print(f"F1 calls:{calls_score:.04f} | F1 nocalls:{nocalls_score:.04f} ")
            print(f"Overall {f1_overall :.04f}")
            print(f"sum of f1 and calls_score {sum_overall:.04f}")
            best_sum_overall = sum_overall
        if calls_score > best_calls_f1:
            print("##########found best calls f1#########")
            print(f"Call/Nocall Cls t: {nct:.04f} : Call T {ct:0.4f}")
            print(f"F1 calls:{calls_score:.04f} | F1 nocalls:{nocalls_score:.04f} ")
            print(f"Overall {f1_overall :.04f}")
            best_calls_f1 = calls_score
"""

In [None]:
# Turn the per-window score vectors in `data` into submission labels.
row_ids = []
birds   = []

# Thresholds fixed from an earlier threshold search:
# nct = saved_best_nct
# ct  = saved_best_ct
nct = 0.3077  # a window is "nocall" when no class score exceeds this
ct  = 0.2485  # a class is reported when its (masked) score exceeds this

# Per-site multiplicative masks over the classes, keyed by site code.
masks = dict(np.load("../input/train-masks/masks.npz"))

for k, v in data.items():
    row_ids.append(k)

    if np.sum(v > nct) == 0:
        birds.append("nocall")
    else:
        # row ids look like "<audio_id>_<site>_<seconds>"
        site = k.split("_")[1]
        mask = masks[site]
        v = v * mask

        pds = [CFG.target_columns[x[0]] for x in np.argwhere(v > ct).tolist()]
        # The site mask can remove every species that passed the thresholds;
        # emit "nocall" in that case instead of an empty label.
        birds.append(" ".join(sorted(pds)) if pds else "nocall")

In [None]:
# Build the frame first and write it in a second step: to_csv returns None when
# given a path, so chaining it would leave `submission` bound to None.
submission = pd.DataFrame({
        "row_id": row_ids,
        "birds": birds,
    })
submission.to_csv("submission.csv", index=False)