# Inference using 21 classes

#### Code copied with minimal changes from this Notebook:
https://www.kaggle.com/code/myso1987/birdclef2022-pytorch-resnet34-starter-lb-0-50

# Install required packages

In [None]:
!cp -r ../input/timm-pytorch-image-models .

In [None]:
!pip install timm-pytorch-image-models/pytorch-image-models-master/

In [None]:
!cp -r ../input/torchlibrosa .

In [None]:
!pip install torchlibrosa/torchlibrosa-0.0.5-py3-none-any.whl

In [None]:
#!pip install catalyst==20.12

## Libraries

In [None]:
import os
import json
import tqdm
import random
import shutil
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchaudio
import torchaudio.transforms as T
from torchvision.models.resnet import ResNet, BasicBlock

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score 

In [None]:
import gc
import os
import math
import random
import warnings

import albumentations as A
import cv2
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import timm
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

from pathlib import Path
from typing import List

from albumentations.pytorch import ToTensorV2
from albumentations.core.transforms_interface import ImageOnlyTransform
from sklearn import model_selection
from sklearn import metrics
from timm.models.layers import SelectAdaptivePool2d
from torch.optim.optimizer import Optimizer
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation
import logging
from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
from tqdm import tqdm

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic configuration.

    Args:
        seed: Value used to seed every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Auto-tuning may select different kernels from run to run, which defeats
    # the purpose of seeding, so it is switched off together with the above.
    torch.backends.cudnn.benchmark = False  # type: ignore

def init_logger(log_file='train.log'):
    """Return this module's logger, writing bare messages to console and file.

    Handlers attached by an earlier call are removed and closed first, so
    calling the function again (for example by re-running the cell) does not
    duplicate every log line or leak open file handles.

    Args:
        log_file: Path of the file that receives a copy of every message.

    Returns:
        The configured ``logging.Logger`` (level ``INFO``).
    """
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Iterate over a copy: removeHandler mutates logger.handlers.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger
    
def get_logger(out_file=None):
    """Reset the root logger and attach timestamped ``INFO`` handlers.

    All handlers currently on the root logger are dropped. A console handler
    is always attached; a file handler is attached as well when ``out_file``
    is given. A "logger set up" message is emitted before returning.

    Args:
        out_file: Optional path of a log file.

    Returns:
        The root ``logging.Logger``.
    """
    root = logging.getLogger()
    line_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    def _configured(handler):
        # Every handler shares the same format and threshold.
        handler.setFormatter(line_format)
        handler.setLevel(logging.INFO)
        return handler

    root.handlers = []
    root.setLevel(logging.INFO)
    root.addHandler(_configured(logging.StreamHandler()))

    if out_file is not None:
        root.addHandler(_configured(logging.FileHandler(out_file)))

    root.info("logger set up")
    return root

def get_device() -> torch.device:
    """Return the CUDA device when one is available, otherwise the CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

In [None]:
# Log to the console and to main.log, then seed the random number generators.
logger = get_logger("main.log")
set_seed(1213)

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = get_device()

## Data Loading

In [None]:
# Competition input directory. It already ends with a slash, so the paths
# below are built without adding another separator.
root_path = "../input/birdclef-2022/"
input_path = root_path + 'train_audio/'
# Local working directory for this notebook.
out_path = "./train/"

# Create the working directory; re-running the cell must not fail.
os.makedirs(out_path, exist_ok=True)

train_meta = pd.read_csv(root_path + 'train_metadata.csv')

# Labels listed in scored_birds.json.
with open(root_path + 'scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)

In [None]:
# Display the number of entries loaded from scored_birds.json.
len(scored_birds)

In [None]:
# Display the entries loaded from scored_birds.json.
scored_birds

### Use only the 21 scored classes

In [None]:
# Keep only the rows whose primary label is one of the scored birds.
train_meta_21classes = train_meta.loc[train_meta['primary_label'].isin(scored_birds)]
# Sorted label vocabularies: scored subset first, then every label in the metadata.
bird_label_21classes = sorted(train_meta_21classes["primary_label"].drop_duplicates())
bird_label_total = sorted(train_meta["primary_label"].drop_duplicates())
print(bird_label_21classes, "\n", bird_label_total)

In [None]:
#from ast import literal_eval
#train_meta['secondary_labels'] = train_meta['secondary_labels'].map(lambda x: literal_eval(x))

In [None]:
#total = []
#second_labels =  train_meta['secondary_labels'].tolist()
#for s in second_labels:
#    total.extend(s)
#secondary_labels = list(set(total))

In [None]:
class CFG:
    """Settings for the ``tf_efficientnet_b0_ns`` model over the scored birds.

    Plain namespace of class attributes; it is never instantiated here.
    """

    # ---------------- Globals ----------------
    seed = 1213
    epochs = 35
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_05"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("../input/birdclef-2022/train_audio")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Every stage uses the same single "Normalize" transform.
    transforms = {
        stage: [{"name": "Normalize"}] for stage in ("train", "valid", "test")
    }
    period = 20
    n_mels = 256
    fmin = 20
    fmax = 16000
    n_fft = 2048
    hop_length = 512
    sample_rate = 32000
    # Mirrors the three values above.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": fmin, "fmax": fmax}

    target_columns = scored_birds

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=5, shuffle=True, random_state=1213)

    # ---------------- Model ----------------
    base_model_name = "tf_efficientnet_b0_ns"
    pooling = "max"
    pretrained = True
    num_classes = 21
    n_pretrain_classes = 131
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    # Inner optimizer name, kept for the SAM optimizer.
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.001)

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"
    scheduler_params = dict(T_max=10)

In [None]:
class CFG_NFNETMIXUP:
    """Settings for the ``eca_nfnet_l0`` mixup model over every label.

    Plain namespace of class attributes; it is never instantiated here.
    """

    # ---------------- Globals ----------------
    fold = 4
    DEBUG = False
    mixed_precision = False
    period = 5
    hop_length = 320
    seed = 1213
    epochs = 100
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_02"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("../input/birdclef-2022/train_audio")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Every stage uses the same single "Normalize" transform.
    transforms = {
        stage: [{"name": "Normalize"}] for stage in ("train", "valid", "test")
    }

    n_mels = 256
    # NOTE: the switch to 256 mel bins still needs to be compared.
    fmin = 10
    fmax = 16000
    n_fft = 2048

    sample_rate = 32000
    # NOTE(review): `fmin` above is 10 but this dict carries 20 -- confirm
    # which of the two the downstream code actually reads.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": 20, "fmax": fmax}

    target_columns = bird_label_total

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=5, shuffle=True, random_state=1213)

    # ---------------- Model ----------------
    base_model_name = "eca_nfnet_l0"
    pooling = "max"
    pretrained = False
    num_classes = len(bird_label_total)
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.0005)  # "weight_decay": 1e-2 is disabled

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"

In [None]:
class CFG_EFFV2DOUBLEMIXUP:
    """Settings for the ``tf_efficientnetv2_s_in21k`` double-mixup model.

    Plain namespace of class attributes; it is never instantiated here.
    ``MixupTimmEFFV2SED`` reads its spectrogram settings from this class.
    """

    # ---------------- Globals ----------------
    fold = 4
    DEBUG = False
    mixed_precision = False
    period = 5
    hop_length = 320
    seed = 1213
    epochs = 100
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_02"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("../input/birdclef-2022/train_audio")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Every stage uses the same single "Normalize" transform.
    transforms = {
        stage: [{"name": "Normalize"}] for stage in ("train", "valid", "test")
    }

    n_mels = 128
    # NOTE: a switch to 256 mel bins still needs to be compared.
    fmin = 10
    fmax = 16000
    n_fft = 1024

    sample_rate = 32000
    # NOTE(review): `fmin` above is 10 but this dict carries 20 -- confirm
    # which of the two the downstream code actually reads.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": 20, "fmax": fmax}

    target_columns = bird_label_total

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=4, shuffle=True, random_state=1213)

    # ---------------- Model ----------------
    base_model_name = "tf_efficientnetv2_s_in21k"
    pooling = "max"
    pretrained = False
    num_classes = len(bird_label_total)
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.0005)  # "weight_decay": 1e-2 is disabled

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"

In [None]:
class CFG_EFFV2DOUBLEMIXUP_64MEL:
    """Settings for the 64-mel ``tf_efficientnetv2_s_in21k`` double-mixup model.

    Plain namespace of class attributes; it is never instantiated here.
    """

    # ---------------- Globals ----------------
    fold = 4
    DEBUG = False
    mixed_precision = False
    period = 5
    hop_length = 512
    seed = 1213
    epochs = 100
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_02"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("../input/birdclef-2022/train_audio")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Every stage uses the same single "Normalize" transform.
    transforms = {
        stage: [{"name": "Normalize"}] for stage in ("train", "valid", "test")
    }

    n_mels = 64
    # NOTE: a switch to 256 mel bins still needs to be compared.
    fmin = 10
    fmax = 16000
    n_fft = 2048

    sample_rate = 32000
    # NOTE(review): `fmin` above is 10 but this dict carries 20 -- confirm
    # which of the two the downstream code actually reads.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": 20, "fmax": fmax}

    target_columns = bird_label_total

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=4, shuffle=True, random_state=1213)

    # ---------------- Model ----------------
    base_model_name = "tf_efficientnetv2_s_in21k"
    pooling = "max"
    pretrained = False
    num_classes = len(bird_label_total)
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.0005)  # "weight_decay": 1e-2 is disabled

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"

In [None]:
class CFG_EFFV2MDOUBLEMIXUP:
    """Settings for the ``tf_efficientnetv2_m_in21k`` double-mixup model.

    Plain namespace of class attributes; it is never instantiated here.
    """

    # ---------------- Globals ----------------
    fold = 4
    DEBUG = False
    mixed_precision = False
    period = 5
    hop_length = 320
    seed = 1213
    epochs = 100
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_02"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("../input/birdclef-2022/train_audio")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Every stage uses the same single "Normalize" transform.
    transforms = {
        stage: [{"name": "Normalize"}] for stage in ("train", "valid", "test")
    }

    n_mels = 128
    # NOTE: a switch to 256 mel bins still needs to be compared.
    fmin = 10
    fmax = 16000
    n_fft = 1024

    sample_rate = 32000
    # NOTE(review): `fmin` above is 10 but this dict carries 20 -- confirm
    # which of the two the downstream code actually reads.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": 20, "fmax": fmax}

    target_columns = bird_label_total

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=4, shuffle=True, random_state=1213)

    # ---------------- Model ----------------
    base_model_name = "tf_efficientnetv2_m_in21k"
    pooling = "max"
    pretrained = False
    num_classes = len(bird_label_total)
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.0005)  # "weight_decay": 1e-2 is disabled

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"

In [None]:
class CFG_NFNET0_SPEC_64MEL_15SEC:
    """Settings for the 64-mel ``eca_nfnet_l0`` model with a 15-second period.

    Plain namespace of class attributes; it is never instantiated here.
    ``MixupTimmNFNETSPECSED`` reads its spectrogram settings from this class.
    """

    # ---------------- Globals ----------------
    fold = 0
    DEBUG = False
    mixed_precision = False
    # NOTE(review): the name says "5sec" while `period` below is 15 -- confirm
    # that the mismatch is intentional before relying on this name.
    savename = f"totalclass-doublemixup-5sec-length2048-64mel-512hop-weightmixed-nfnet-pretrain-fold-{fold}"
    period = 15
    hop_length = 512
    seed = 888
    epochs = 100
    train = True
    folds = [0]
    img_size = 224
    main_metric = "epoch_f1_at_02"
    minimize_metric = False

    # ---------------- Data ----------------
    train_datadir = Path("train")
    train_csv = train_meta
    # train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"

    # ---------------- Dataset ----------------
    # Commented-out alternative for "train":
    # PitchShift, PinkNoise, RandomVolume, Normalize.
    transforms = {
        "train": [
            {"name": step}
            for step in ("GaussianNoise", "PinkNoise", "RandomVolume", "Normalize")
        ],
        "valid": [{"name": "Normalize"}],
        "test": [{"name": "Normalize"}],
    }

    n_mels = 64
    # NOTE: a switch to 256 mel bins still needs to be compared.
    fmin = 10
    fmax = 16000
    n_fft = 2048

    sample_rate = 32000
    # NOTE(review): `fmin` above is 10 but this dict carries 20 -- confirm
    # which of the two the downstream code actually reads.
    melspectrogram_parameters = {"n_mels": n_mels, "fmin": 20, "fmax": fmax}

    target_columns = bird_label_total

    # ---------------- Loaders ----------------
    loader_params = {
        "train": dict(batch_size=32, num_workers=20, shuffle=True),
        "valid": dict(batch_size=64, num_workers=20, shuffle=False),
        "test": dict(batch_size=64, num_workers=20, shuffle=False),
    }

    # ---------------- Split ----------------
    split = "StratifiedKFold"
    split_params = dict(n_splits=4, shuffle=True, random_state=888)

    # ---------------- Model ----------------
    base_model_name = "eca_nfnet_l0"
    pooling = "max"
    pretrained = False
    num_classes = len(bird_label_total)
    in_channels = 1

    # ---------------- Criterion ----------------
    loss_name = "BCEFocal2WayLoss"
    loss_params: dict = {}

    # ---------------- Optimizer ----------------
    optimizer_name = "Adam"
    base_optimizer = "Adam"
    optimizer_params = dict(lr=0.0005)  # "weight_decay": 1e-2 is disabled

    # ---------------- Scheduler ----------------
    scheduler_name = "CosineAnnealingLR"
    # scheduler_params is disabled here (it used "eta_min": 1e-5, and a
    # further commented-out "T_max": 25).

In [None]:
#MixupTimmNFNETSPECSED
#CFG_NFNET0_SPEC_64MEL_15SEC

In [None]:
class MixupTimmNFNETSPECSED(nn.Module):
    """Sound-event-detection model: log-mel front end, timm encoder, attention head.

    The spectrogram and log-mel settings are read from
    ``CFG_NFNET0_SPEC_64MEL_15SEC``. In training mode the log-mel features
    are mixed up (once, and a second time with probability 0.5) and
    SpecAugment is applied after the input batch norm.

    Args:
        base_model_name: Model name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Number of outputs of the attention head.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()
        cfg = CFG_NFNET0_SPEC_64MEL_15SEC

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=cfg.n_fft, hop_length=cfg.hop_length, win_length=cfg.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
            fmin=cfg.fmin, fmax=cfg.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        self.mixup = Mixup()
        self.bn0 = nn.BatchNorm2d(cfg.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # The encoder is the timm model without its last two child modules.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])
        if hasattr(base_model, "head"):
            in_features = base_model.head.fc.in_features
        else:
            in_features = base_model.classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialise the layers created by this class (not the encoder)."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, batch):
        """Run the model on a ``(waveform, target)`` pair.

        Args:
            batch: Two-item sequence ``(waveform, target)``. The target is
                only used for mixup and is echoed back in the result.

        Returns:
            Dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit``, ``clipwise_output`` and
            ``target`` (the mixed-up target in training mode, the input
            target otherwise).
        """
        waveform, target = batch

        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(waveform)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]

        out_target = target
        if self.training:
            # NOTE(review): MixupTimmEFFV2SED unpacks three values from the
            # same Mixup() call -- confirm which arity Mixup really returns.
            x, out_target = self.mixup(x, target)
            if np.random.uniform() < 0.5:
                x, out_target = self.mixup(x, out_target)

        # Batch-normalise over the mel axis by moving it to the channel dim.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        if self.training:
            x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        x = self.encoder(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x = (F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
             + F.avg_pool1d(x, kernel_size=3, stride=1, padding=1))

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Evaluate the classification layer once and reuse the result; it used
        # to be called twice on the same input (assumes `cla` is deterministic
        # for a given input).
        segment_logits = self.att_block.cla(x)
        logit = torch.sum(norm_att * segment_logits, dim=2)
        segmentwise_logit = segment_logits.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        return {
            "framewise_output": framewise_output,  # segmentwise_output after interpolation
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output,
            "target": out_target,
        }

In [None]:
class MixupTimmEFFV2SED(nn.Module):
    """Sound-event-detection model: log-mel front end, timm encoder, attention head.

    The spectrogram and log-mel settings are read from
    ``CFG_EFFV2DOUBLEMIXUP``. In training mode the log-mel features are mixed
    up (once, and a second time with probability 0.5).

    Args:
        base_model_name: Model name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Number of outputs of the attention head.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()
        cfg = CFG_EFFV2DOUBLEMIXUP

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=cfg.n_fft, hop_length=cfg.hop_length, win_length=cfg.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
            fmin=cfg.fmin, fmax=cfg.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        self.mixup = Mixup()
        self.bn0 = nn.BatchNorm2d(cfg.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # The encoder is the timm model without its last two child modules.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])
        if hasattr(base_model, "head"):
            in_features = base_model.head.fc.in_features
        else:
            in_features = base_model.classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialise the layers created by this class (not the encoder)."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, batch):
        """Run the model on a ``(waveform, target)`` pair.

        Args:
            batch: Two-item sequence ``(waveform, target)``. The target is
                only used for mixup and is echoed back in the result.

        Returns:
            Dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit``, ``clipwise_output`` and
            ``target`` (the mixed-up target in training mode, the input
            target otherwise).
        """
        waveform, target = batch
        if CFG_EFFV2DOUBLEMIXUP.DEBUG:
            print("mixup before image shape: ", waveform.shape, "target before mixup shape: ", target.shape)

        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(waveform)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]

        out_target = target
        if self.training:
            # The third value returned by mixup is not used here.
            # NOTE(review): MixupTimmNFNETSPECSED unpacks only two values from
            # the same Mixup() call -- confirm which arity Mixup really returns.
            x, out_target, _ = self.mixup(x, target)
            if np.random.uniform() < 0.5:
                x, out_target, _ = self.mixup(x, out_target)

        # Batch-normalise over the mel axis by moving it to the channel dim.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = x.transpose(2, 3)
        x = self.encoder(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x = (F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
             + F.avg_pool1d(x, kernel_size=3, stride=1, padding=1))

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Evaluate the classification layer once and reuse the result; it used
        # to be called twice on the same input (assumes `cla` is deterministic
        # for a given input).
        segment_logits = self.att_block.cla(x)
        logit = torch.sum(norm_att * segment_logits, dim=2)
        segmentwise_logit = segment_logits.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        return {
            "framewise_output": framewise_output,  # segmentwise_output after interpolation
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output,
            "target": out_target,
        }

In [None]:
# Debug switch: when True, CFG.epochs is cut to 1 for a quick run. It is off here.
# If you'd like to get results on par with those of the inference notebook, you'll need to train the model for around 30 epochs.
DEBUG = False
if DEBUG:
    CFG.epochs = 1

## Utils

In [None]:
#sample_rate = 32000
#n_fft = 4096
#win_length = None
#hop_length = 512
#n_mels = 256
#min_sec_proc = sample_rate*5

#mel_spectrogram = T.MelSpectrogram(
#    sample_rate=sample_rate,
#    n_fft=n_fft,
#    win_length=win_length,
#    hop_length=hop_length,
#    center=True,
#    pad_mode="reflect",
#    power=2.0,
#    norm='slaney',
#    onesided=True,
#    n_mels=n_mels,
#    mel_scale="htk",
#)

In [None]:
class WaveformDataset(Dataset):
    """Dataset yielding fixed-length mono waveforms with one-hot labels.

    Each item reads the audio file named in the ``filename`` column of ``df``
    (relative to ``datadir``), keeps only the first channel when the decoded
    array is not one-dimensional, crops or zero-pads it to ``period`` seconds
    and pairs it with a one-hot vector over ``CFG.target_columns`` built from
    the ``primary_label`` column.

    Args:
        df: Table with ``filename`` and ``primary_label`` columns.
        datadir: Directory the ``filename`` entries are relative to.
        img_size: Stored on the instance; not used by this class itself.
        waveform_transforms: Optional callable applied to the waveform after
            cropping/padding.
        period: Clip length in seconds.
        validation: When true, cropping/padding always starts at offset 0;
            otherwise a random offset is drawn from ``np.random``.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 datadir: Path,
                 img_size=224,
                 waveform_transforms=None,
                 period=5,
                 validation=False):
        self.df = df
        self.datadir = datadir
        self.img_size = img_size
        self.waveform_transforms = waveform_transforms
        self.period = period
        self.validation = validation

    def __len__(self):
        return len(self.df)

    def _fit_length(self, y: np.ndarray, effective_length: int) -> np.ndarray:
        """Crop or zero-pad ``y`` to ``effective_length`` samples as float32."""
        len_y = len(y)
        if len_y < effective_length:
            # Too short: place the clip inside a zero buffer.
            if not self.validation:
                start = np.random.randint(effective_length - len_y)
            else:
                start = 0
            padded = np.zeros(effective_length, dtype=y.dtype)
            padded[start:start + len_y] = y
            y = padded
        elif len_y > effective_length:
            # Too long: take one window of the requested length.
            if not self.validation:
                start = np.random.randint(len_y - effective_length)
            else:
                start = 0
            y = y[start:start + effective_length]
        return y.astype(np.float32)

    def __getitem__(self, idx: int):
        """Return ``{"image": waveform, "targets": one_hot}`` for row ``idx``.

        Raises:
            ValueError: If the row's ``primary_label`` is not present in
                ``CFG.target_columns``.
        """
        # ``idx`` is a position in ``range(len(self))``, so index positionally.
        # Label-based ``.loc`` only works when the frame happens to carry a
        # default 0..n-1 index (e.g. it fails after a train/valid split that
        # was not followed by ``reset_index``).
        sample = self.df.iloc[idx]
        wav_name = sample["filename"]
        ebird_code = sample["primary_label"]

        y, sr = sf.read(self.datadir / wav_name)
        if y.ndim != 1:
            # Multi-channel file: keep the first channel only.
            y = y[:, 0]

        # int() keeps the length usable for slicing even if period is a float.
        effective_length = int(sr * self.period)
        y = self._fit_length(y, effective_length)

        y = np.nan_to_num(y)

        if self.waveform_transforms:
            y = self.waveform_transforms(y)

        # Transforms may reintroduce NaN/inf, so clean up once more.
        y = np.nan_to_num(y)

        labels = np.zeros(len(CFG.target_columns), dtype=float)
        labels[CFG.target_columns.index(ebird_code)] = 1.0

        return {
            "image": y,
            "targets": labels
        }

In [None]:
def get_transforms(phase: str):
    """Build the waveform transform pipeline configured for ``phase``.

    ``CFG.transforms`` is expected to map a phase name to a list of
    ``{"name": ..., "params": ...}`` entries (``params`` optional). Each name
    is looked up among this module's globals; names that are not defined are
    skipped.

    Returns:
        A ``Compose`` of the instantiated transforms, or ``None`` when nothing
        is configured or no configured name could be resolved.
    """
    config = CFG.transforms
    if config is None:
        return None

    phase_config = config[phase]
    if phase_config is None:
        return None

    namespace = globals()
    built = []
    for entry in phase_config:
        name = entry["name"]
        kwargs = entry.get("params")
        if kwargs is None:
            kwargs = {}
        factory = namespace.get(name)
        if factory is None:
            # Unknown transform name: ignore it.
            continue
        built.append(factory(**kwargs))

    return Compose(built) if built else None
        
        
class Normalize:
    """Peak-normalize a waveform so its largest absolute sample becomes 1."""

    def __call__(self, y: np.ndarray):
        """Return ``y`` divided by its peak absolute value.

        An all-zero input is returned as zeros instead of being divided by
        zero, which would fill the output with NaN.
        """
        max_vol = np.abs(y).max()
        # Silent clip: use a unit scale so the result stays finite.
        scale = 1 if max_vol == 0 else max_vol
        y_vol = y * 1 / scale
        return np.asfortranarray(y_vol)


class Compose:
    """Chain several callables, feeding each one the previous result."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Apply every transform in order and return the final value."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result

In [None]:
def init_layer(layer):
    """Xavier-uniform initialize ``layer.weight`` and zero its bias, if any."""
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer to bias 0 and weight 1."""
    for param, value in ((bn.bias, 0.), (bn.weight, 1.0)):
        param.data.fill_(value)


def init_weights(model):
    """Initialize a single module in place, chosen by its class name.

    The class name is matched by substring, so subclasses whose names contain
    ``Conv2d``, ``BatchNorm``, ``GRU`` or ``Linear`` are handled too:

    * Conv2d: Xavier-uniform weights (gain ``sqrt(2)``), zero bias.
    * BatchNorm: weights ~ N(1, 0.02), zero bias.
    * GRU: orthogonal init for every parameter with more than one dimension.
    * Linear: weights ~ N(0, 0.01), zero bias.

    Modules of any other class are left untouched. Missing (``None``) weights
    or biases, e.g. ``bias=False`` or ``affine=False``, are skipped.
    """
    classname = model.__class__.__name__
    weight = getattr(model, "weight", None)
    bias = getattr(model, "bias", None)

    if classname.find("Conv2d") != -1:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        if bias is not None:
            bias.data.fill_(0)
    elif classname.find("BatchNorm") != -1:
        if weight is not None:
            weight.data.normal_(1.0, 0.02)
        if bias is not None:
            bias.data.fill_(0)
    elif classname.find("GRU") != -1:
        for param in model.parameters():
            # Only matrices can be orthogonalized; bias vectors are skipped.
            if len(param.size()) > 1:
                nn.init.orthogonal_(param.data)
    elif classname.find("Linear") != -1:
        model.weight.data.normal_(0, 0.01)
        if bias is not None:
            bias.data.zero_()


def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every step ``ratio`` times.

    Used to undo the time-resolution reduction introduced by a CNN's
    downsampling.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    n_batch, n_steps, n_classes = x.shape
    # Insert a repeat axis after time, tile it, then fold it back into time.
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.reshape(n_batch, n_steps * ratio, n_classes)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Resize ``framewise_output`` along time to exactly ``frames_num`` frames.

    The tensor is treated as a single-channel image and resampled with
    bilinear interpolation (``align_corners=True``); the class axis keeps its
    size.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, target number of frames
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    target_size = (frames_num, framewise_output.size(2))
    as_image = framewise_output.unsqueeze(1)
    resized = F.interpolate(
        as_image,
        size=target_size,
        align_corners=True,
        mode="bilinear")
    return resized.squeeze(1)


def gem(x: torch.Tensor, p=3, eps=1e-6):
    """Generalized-mean pooling over the last two dimensions.

    Values are clamped from below at ``eps``, raised to the power ``p``,
    averaged over the full spatial window and raised to ``1 / p``.

    Input Tensor : (batch_size, channels, height, width)
    Output Tensor : (batch_size, channels, 1, 1)
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, window)
    return pooled.pow(1. / p)


class GeM(nn.Module):
    """Generalized-mean pooling layer whose exponent ``p`` is learnable.

    Args:
        p: Initial value of the pooling exponent.
        eps: Lower clamp applied to the input before exponentiation.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        initial_p = torch.ones(1) * p
        self.p = nn.Parameter(initial_p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with ``gem`` using the current exponent."""
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        name = self.__class__.__name__
        exponent = self.p.data.tolist()[0]
        return name + f"(p={exponent:.4f}, eps={self.eps})"


class AttBlockV2(nn.Module):
    """Attention pooling head over the time axis.

    Two kernel-size-1 ``Conv1d`` layers map ``in_features`` channels to
    ``out_features`` channels: ``att`` produces attention scores and ``cla``
    produces per-time-step class scores.

    Args:
        in_features: Number of input channels.
        out_features: Number of output channels (classes).
        activation: ``"linear"`` (identity) or ``"sigmoid"``, applied to the
            output of ``cla``.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialize both convolutions with ``init_layer``."""
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Pool per-step class scores into one score per class.

        Args:
            x: (n_samples, in_features, n_time)

        Returns:
            Tuple ``(pooled, norm_att, cla)`` where
            ``pooled`` is (n_samples, out_features),
            ``norm_att`` is (n_samples, out_features, n_time) and sums to 1
            along the time axis, and
            ``cla`` is (n_samples, out_features, n_time).
        """
        # tanh bounds the attention scores before the softmax over time.
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        # Attention-weighted sum over the time axis.
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: If ``self.activation`` is neither ``"linear"`` nor
                ``"sigmoid"``.
        """
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)
        # Previously fell through and returned None, which only surfaced later
        # as an obscure TypeError inside forward().
        raise ValueError(
            f"Unsupported activation {self.activation!r}; "
            "expected 'linear' or 'sigmoid'")


class TimmSED(nn.Module):
    """Attention-based sound event detection model on a timm backbone.

    The forward pass converts a waveform batch to a log-mel spectrogram, runs
    it through the backbone with its last two child modules removed, and
    feeds the result to an ``AttBlockV2`` head.

    Args:
        base_model_name: Model name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Number of outputs of the attention head.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.

    NOTE: submodules are registered in a fixed order (front end, augmenter,
    ``bn0``, ``encoder``, ``fc1``, ``att_block``). ``PretrainedTimmSED``
    slices ``children()`` of this class by position, so keep the order stable.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()

        # Waveform -> spectrogram
        stft_kwargs = dict(
            n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Spectrogram -> log-mel
        logmel_kwargs = dict(
            sr=CFG.sample_rate, n_fft=CFG.n_fft, n_mels=CFG.n_mels,
            fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # Built for parity with the training model; forward() does not use it.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        backbone = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Drop the backbone's last two children and keep the rest as encoder.
        self.encoder = nn.Sequential(*list(backbone.children())[:-2])

        # The backbone's classifier layer tells us the encoder's channel width.
        head = backbone.fc if hasattr(backbone, "fc") else backbone.classifier
        in_features = head.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialize ``fc1`` and reset ``bn0``."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, input):
        """Run the model on a batch of waveforms.

        Returns:
            dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit`` and ``clipwise_output``.
        """
        spec = self.spectrogram_extractor(input)
        # (batch_size, 1, time_steps, mel_bins) according to the original notes
        logmel = self.logmel_extractor(spec)
        frames_num = logmel.shape[2]

        # bn0 has CFG.n_mels channels: move dim 3 into the channel slot and back.
        feats = self.bn0(logmel.transpose(1, 3)).transpose(1, 3)

        # Swap the last two axes before the encoder
        # (original note: batch_size, channels, freq, frames).
        feats = self.encoder(feats.transpose(2, 3))

        # Average out dim 2, leaving (batch_size, channels, frames).
        feats = torch.mean(feats, dim=2)

        # Smooth along the last axis: max-pool plus average-pool, window 3.
        pooled_max = F.max_pool1d(feats, kernel_size=3, stride=1, padding=1)
        pooled_avg = F.avg_pool1d(feats, kernel_size=3, stride=1, padding=1)
        feats = pooled_max + pooled_avg

        feats = F.dropout(feats, p=0.5, training=self.training)
        # fc1 acts on the channel axis, so it has to be last for the Linear.
        feats = F.relu_(self.fc1(feats.transpose(1, 2))).transpose(1, 2)
        feats = F.dropout(feats, p=0.5, training=self.training)

        # Attention head: clip-level scores, attention map and per-segment
        # (sigmoid) scores.
        clipwise_output, norm_att, segmentwise_output = self.att_block(feats)
        # Raw (pre-sigmoid) counterparts taken straight from the cla conv.
        logit = torch.sum(norm_att * self.att_block.cla(feats), dim=2)
        segmentwise_logit = self.att_block.cla(feats).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Stretch segment-level results back to the input frame count.
        framewise_output = pad_framewise_output(
            interpolate(segmentwise_output, interpolate_ratio), frames_num)
        framewise_logit = pad_framewise_output(
            interpolate(segmentwise_logit, interpolate_ratio), frames_num)

        return {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

In [None]:
class PretrainedTimmSED(TimmSED):
    """``TimmSED`` variant whose encoder is taken from a second ``TimmSED``.

    The constructor first builds a full ``TimmSED`` via ``super().__init__``,
    then replaces the front end, ``bn0``, ``encoder``, ``fc1`` and
    ``att_block`` with freshly built modules. The replacement encoder is the
    ``encoder`` child of another ``TimmSED`` instance created from ``CFG``.

    Args:
        base_model_name: Passed to ``super().__init__`` and printed.
            NOTE(review): the encoder that ends up being used is built from
            ``CFG.base_model_name`` instead - confirm that is intended.
        pretrained: NOTE(review): not used; both ``TimmSED`` instances are
            created with ``pretrained=False``.
        num_classes: Number of outputs of the final attention head.
        in_channels: Passed to ``super().__init__``; the inner base model
            uses ``CFG.in_channels``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=21, in_channels=1):
        # The parent is always built with 131 classes and pretrained=False;
        # its head is replaced further down.
        super().__init__(base_model_name=base_model_name, pretrained=False, num_classes=131,in_channels=in_channels)
        # Spectrogram extractor (rebuilt with the same arguments as in TimmSED)
        self.spectrogram_extractor = Spectrogram(n_fft=CFG.n_fft, hop_length=CFG.hop_length,
                                                 win_length=CFG.n_fft, window="hann", center=True, pad_mode="reflect",
                                                 freeze_parameters=True)

        # Logmel feature extractor (rebuilt with the same arguments as in TimmSED)
        self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft,
                                                 n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)
        self.bn0 = nn.BatchNorm2d(CFG.n_mels)
        print("timm base_model_name : ", base_model_name)
        # Second TimmSED that supplies the encoder. The hard-coded 131 is
        # presumably the class count of the pretraining task - TODO confirm.
        base_model = TimmSED(
            base_model_name=CFG.base_model_name,
            pretrained=False,
            num_classes=131,
            in_channels=CFG.in_channels)
        
        # Loading the pretraining checkpoint is disabled here, so base_model
        # keeps its freshly initialized weights at this point.
        #ckpt = torch.load(CFG.pretrain_checkpoint)
        #base_model.load_state_dict(ckpt['model_state_dict'])
        

        
        # TimmSED registers its children in the order: spectrogram_extractor,
        # logmel_extractor, spec_augmenter, bn0, encoder, fc1, att_block.
        # [4:-2] therefore keeps only `encoder`, which is wrapped in another
        # Sequential (parameter names become "encoder.0....").
        layers = list(base_model.children())[4:-2]
        self.encoder = nn.Sequential(*layers)

        #if hasattr(base_model, "fc"):
        #    in_features = base_model.fc.in_features
        #else:
        #    in_features = base_model.classifier.in_features
        #self.fc1 = nn.Linear(in_features, in_features, bias=True)
        # NOTE(review): 1280 is hard-coded; it must equal the channel width of
        # the encoder output for CFG.base_model_name - verify when changing
        # the backbone.
        self.fc1 = nn.Linear(1280, 1280, bias=True)
        self.att_block = AttBlockV2(
            1280, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialize ``fc1`` and reset ``bn0``."""
        init_layer(self.fc1)
        init_bn(self.bn0)
        return
    
    def forward(self, input):
        """Run the model on a batch of waveforms.

        Unlike ``TimmSED.forward``, ``spec_augmenter`` is applied when the
        module is in training mode.

        Returns:
            dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit`` and ``clipwise_output``.
        """
        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(input)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        # bn0 has CFG.n_mels channels: move dim 3 into the channel slot and back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
        
        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        #print("before encoder : ", x, x.shape)
        #print(self.encoder)
        x = self.encoder(x)
        #print("after encoder : ", x, x.shape)
        # average over dim 2 -> (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # smoothing along the last axis: max-pool plus average-pool, window 3
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        # (batch_size, channels, frames)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        # fc1 acts on the channel axis, so it is moved last for the Linear
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        # x shape : (batch_size, channels, frames)
        # clipwise_output shape : (batch_size, num_classes)
        # norm_att shape : (batch_size, num_classes, frames)
        # segmentwise_output shape : (batch_size, num_classes, frames) -> sigmoid(self.att_block.cla(x))
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # raw (pre-sigmoid) counterparts taken straight from the cla conv
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)
        
        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output, # applied interpolation at segmentwise_output
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

        return output_dict

In [None]:
class TimmNFNETGRUSED(nn.Module):
    """Sound event detection model: timm backbone, bidirectional GRU, attention head.

    Args:
        base_model_name: Model name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Number of outputs of the attention head.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.

    NOTE: submodules are registered in a fixed order (front end, augmenter,
    ``bn0``, ``encoder``, ``gru``, ``fc1``, ``att_block``).
    ``PretrainedNFNETGRUTimmSED`` slices ``children()`` of this class by
    position, so keep the order stable.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()

        # Waveform -> spectrogram
        stft_kwargs = dict(
            n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Spectrogram -> log-mel
        logmel_kwargs = dict(
            sr=CFG.sample_rate, n_fft=CFG.n_fft, n_mels=CFG.n_mels,
            fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # Applied in forward() only while training.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        backbone = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Drop the backbone's last two children and keep the rest as encoder.
        self.encoder = nn.Sequential(*list(backbone.children())[:-2])

        # Channel width comes from the backbone's classification layer.
        if hasattr(backbone, "head"):
            classifier = backbone.head.fc
        else:
            classifier = backbone.classifier
        in_features = classifier.in_features

        self.gru = torch.nn.GRU(
            input_size=in_features, hidden_size=in_features,
            num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
        # The bidirectional GRU doubles the feature size; fc1 maps it back.
        self.fc1 = nn.Linear(in_features*2, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialize ``fc1`` and reset ``bn0``."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, input):
        """Run the model on a batch of waveforms.

        Returns:
            dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit`` and ``clipwise_output``.
        """
        spec = self.spectrogram_extractor(input)
        # (batch_size, 1, time_steps, mel_bins) according to the original notes
        logmel = self.logmel_extractor(spec)
        frames_num = logmel.shape[2]

        # bn0 has CFG.n_mels channels: move dim 3 into the channel slot and back.
        feats = self.bn0(logmel.transpose(1, 3)).transpose(1, 3)

        if self.training:
            feats = self.spec_augmenter(feats)

        # Swap the last two axes before the encoder
        # (original note: batch_size, channels, freq, frames).
        feats = self.encoder(feats.transpose(2, 3))

        # Average out dim 2, leaving (batch_size, channels, frames).
        feats = torch.mean(feats, dim=2)

        # Smooth along the last axis: max-pool plus average-pool, window 3.
        pooled_max = F.max_pool1d(feats, kernel_size=3, stride=1, padding=1)
        pooled_avg = F.avg_pool1d(feats, kernel_size=3, stride=1, padding=1)
        feats = pooled_max + pooled_avg

        feats = F.dropout(feats, p=0.5, training=self.training)
        # batch_first GRU and fc1 both want the channel axis last.
        sequence, _ = self.gru(feats.transpose(1, 2))
        feats = F.relu_(self.fc1(sequence)).transpose(1, 2)
        feats = F.dropout(feats, p=0.5, training=self.training)

        # Attention head: clip-level scores, attention map and per-segment
        # (sigmoid) scores.
        clipwise_output, norm_att, segmentwise_output = self.att_block(feats)
        # Raw (pre-sigmoid) counterparts taken straight from the cla conv.
        logit = torch.sum(norm_att * self.att_block.cla(feats), dim=2)
        segmentwise_logit = self.att_block.cla(feats).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Stretch segment-level results back to the input frame count.
        framewise_output = pad_framewise_output(
            interpolate(segmentwise_output, interpolate_ratio), frames_num)
        framewise_logit = pad_framewise_output(
            interpolate(segmentwise_logit, interpolate_ratio), frames_num)

        return {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }


class PretrainedNFNETGRUTimmSED(TimmNFNETGRUSED):
    """``TimmNFNETGRUSED`` variant whose encoder comes from a second instance.

    The constructor first builds a full ``TimmNFNETGRUSED`` via
    ``super().__init__``, then replaces the front end, ``bn0``, ``encoder``,
    ``gru``, ``fc1`` and ``att_block`` with freshly built modules. The
    replacement encoder is the ``encoder`` child of another
    ``TimmNFNETGRUSED`` instance.

    Args:
        base_model_name: Passed to ``super().__init__`` and printed.
            NOTE(review): the encoder that ends up being used is always built
            from ``"eca_nfnet_l0"`` - confirm that is intended.
        pretrained: Passed to ``super().__init__`` only; the inner base model
            is created with ``pretrained=False``.
        num_classes: Number of outputs of the final attention head.
        in_channels: Passed to ``super().__init__``; the inner base model
            uses ``CFG.in_channels``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=21, in_channels=1):
        # The parent head is sized with CFG.n_pretrain_classes; it is replaced
        # further down. The parent constructor calls self.init_weight(), which
        # resolves to the override in this class.
        super().__init__(base_model_name=base_model_name, pretrained=pretrained, num_classes=CFG.n_pretrain_classes,in_channels=in_channels)
        # Spectrogram extractor (rebuilt with the same arguments as in the parent)
        self.spectrogram_extractor = Spectrogram(n_fft=CFG.n_fft, hop_length=CFG.hop_length,
                                                 win_length=CFG.n_fft, window="hann", center=True, pad_mode="reflect",
                                                 freeze_parameters=True)

        # Logmel feature extractor (rebuilt with the same arguments as in the parent)
        self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft,
                                                 n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)
        self.bn0 = nn.BatchNorm2d(CFG.n_mels)
        print("timm base_model_name : ", base_model_name)
        # Second model that supplies the encoder.
        # NOTE(review): 131 is hard-coded here while super().__init__ uses
        # CFG.n_pretrain_classes - presumably the same value; verify.
        base_model = TimmNFNETGRUSED(
            base_model_name="eca_nfnet_l0",
            pretrained=False,
            num_classes=131,
            in_channels=CFG.in_channels)
        
        # Loading the pretraining checkpoint is disabled here, so base_model
        # keeps its freshly initialized weights at this point.
        #ckpt = torch.load(CFG.pretrain_checkpoint)
        #base_model.load_state_dict(ckpt['model_state_dict'])
        
        #print(base_model)
        
        # TimmNFNETGRUSED registers its children in the order:
        # spectrogram_extractor, logmel_extractor, spec_augmenter, bn0,
        # encoder, gru, fc1, att_block. [4:-3] therefore keeps only `encoder`,
        # which is wrapped in another Sequential.
        layers = list(base_model.children())[4:-3]
        self.encoder = nn.Sequential(*layers)

        #if hasattr(base_model, "head"):
        #    in_features = base_model.head.fc.in_features
        #else:
        #    in_features = base_model.classifier.in_features
        # NOTE(review): 2304 is hard-coded; it must equal the channel width of
        # the "eca_nfnet_l0" encoder output - verify when changing the backbone.
        in_features=2304
        self.gru = torch.nn.GRU(input_size=in_features, hidden_size=in_features, 
                        num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
        # The bidirectional GRU doubles the feature size; fc1 maps it back.
        self.fc1 = nn.Linear(in_features*2, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialize ``fc1`` only; ``bn0`` is left as constructed."""
        init_layer(self.fc1)
        #init_bn(self.bn0)

    def forward(self, input):
        """Run the model on a batch of waveforms.

        Returns:
            dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit`` and ``clipwise_output``.
        """
        # (batch_size, 1, time_steps, freq_bins)
        #print("input : ", input, input.shape)
        x = self.spectrogram_extractor(input)
        #print("after spectrogram : ", x, x.shape)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        # bn0 has CFG.n_mels channels: move dim 3 into the channel slot and back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        #print("before encoder : ", x, x.shape)
        x = self.encoder(x)

        # average over dim 2 -> (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # smoothing along the last axis: max-pool plus average-pool, window 3
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        # (batch_size, channels, frames)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        # batch_first GRU and fc1 both want the channel axis last
        x = x.transpose(1, 2)
        (x, _) = self.gru(x)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        # x shape : (batch_size, channels, frames)
        # clipwise_output shape : (batch_size, num_classes)
        # norm_att shape : (batch_size, num_classes, frames)
        # segmentwise_output shape : (batch_size, num_classes, frames) -> sigmoid(self.att_block.cla(x))
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # raw (pre-sigmoid) counterparts taken straight from the cla conv
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)
        #print("frames_num : ", frames_num)
        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output, # applied interpolation at segmentwise_output
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

        return output_dict

In [None]:
class Mixup(nn.Module):
    """Mixup augmentation: blend every sample with a randomly chosen partner.

    One coefficient per sample is drawn from ``Beta(mix_beta, mix_beta)`` and
    used for the inputs, the targets and (optionally) per-sample weights.

    Args:
        mix_beta: Both concentration parameters of the Beta distribution.
    """

    def __init__(self, mix_beta=1):

        super(Mixup, self).__init__()
        self.beta_distribution = torch.distributions.Beta(mix_beta, mix_beta)

    @staticmethod
    def _as_batch_coeffs(coeffs, reference):
        """Reshape ``coeffs`` (bs,) so it broadcasts over ``reference``."""
        return coeffs.view(-1, *([1] * (reference.dim() - 1)))

    def forward(self, X, Y, weight=None):
        """Mix a batch along its first dimension.

        Args:
            X: Inputs of shape (bs, ...); any number of trailing dimensions.
            Y: Targets of shape (bs, ...).
            weight: Optional per-sample weights of shape (bs,).

        Returns:
            ``(X, Y)`` or, when ``weight`` is given, ``(X, Y, weight)``, each
            blended as ``c * a + (1 - c) * a[perm]``.
        """
        bs = X.shape[0]
        perm = torch.randperm(bs)
        coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

        # One coefficient per sample, broadcast over all remaining axes. The
        # earlier rank-specific branches mis-broadcast for rank 1 and rank >= 5.
        x_coeffs = self._as_batch_coeffs(coeffs, X)
        X = x_coeffs * X + (1 - x_coeffs) * X[perm]

        y_coeffs = self._as_batch_coeffs(coeffs, Y)
        Y = y_coeffs * Y + (1 - y_coeffs) * Y[perm]

        if weight is None:
            return X, Y
        else:
            weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm]
            return X, Y, weight

class MixupTimmNFNETSED(nn.Module):
    """Sound event detection model that applies mixup to the log-mel input.

    All front-end settings are read from ``CFG_NFNETMIXUP``. ``forward``
    takes an ``(input, target)`` pair and returns the target (mixed while
    training, untouched otherwise) together with the predictions.

    Args:
        base_model_name: Model name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Number of outputs of the attention head.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()
        cfg = CFG_NFNETMIXUP

        # Waveform -> spectrogram
        self.spectrogram_extractor = Spectrogram(
            n_fft=cfg.n_fft, hop_length=cfg.hop_length, win_length=cfg.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)

        # Spectrogram -> log-mel
        self.logmel_extractor = LogmelFilterBank(
            sr=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
            fmin=cfg.fmin, fmax=cfg.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Constructed but not applied in forward().
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.mixup = Mixup()
        self.bn0 = nn.BatchNorm2d(cfg.n_mels)

        backbone = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Drop the backbone's last two children and keep the rest as encoder.
        self.encoder = nn.Sequential(*list(backbone.children())[:-2])

        # Channel width comes from the backbone's classification layer.
        if hasattr(backbone, "head"):
            classifier = backbone.head.fc
        else:
            classifier = backbone.classifier
        in_features = classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialize ``fc1`` and reset ``bn0``."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, batch):
        """Run the model on an ``(input, target)`` pair.

        Returns:
            dict with keys ``framewise_output``, ``segmentwise_output``,
            ``logit``, ``framewise_logit``, ``clipwise_output`` and
            ``target``; ``target`` holds the mixup-blended targets in
            training mode and the given targets otherwise.
        """
        input, target = batch

        spec = self.spectrogram_extractor(input)
        # (batch_size, 1, time_steps, mel_bins) according to the original notes
        feats = self.logmel_extractor(spec)
        frames_num = feats.shape[2]

        # Mixup is applied to the log-mel features, only while training.
        out_target = target
        if self.training:
            feats, out_target = self.mixup(feats, target)

        # bn0 has n_mels channels: move dim 3 into the channel slot and back.
        feats = self.bn0(feats.transpose(1, 3)).transpose(1, 3)

        # Swap the last two axes before the encoder.
        feats = self.encoder(feats.transpose(2, 3))

        # Average out dim 2, leaving (batch_size, channels, frames).
        feats = torch.mean(feats, dim=2)

        # Smooth along the last axis: max-pool plus average-pool, window 3.
        pooled_max = F.max_pool1d(feats, kernel_size=3, stride=1, padding=1)
        pooled_avg = F.avg_pool1d(feats, kernel_size=3, stride=1, padding=1)
        feats = pooled_max + pooled_avg

        feats = F.dropout(feats, p=0.5, training=self.training)
        # fc1 acts on the channel axis, so it has to be last for the Linear.
        feats = F.relu_(self.fc1(feats.transpose(1, 2))).transpose(1, 2)
        feats = F.dropout(feats, p=0.5, training=self.training)

        # Attention head: clip-level scores, attention map and per-segment
        # (sigmoid) scores.
        clipwise_output, norm_att, segmentwise_output = self.att_block(feats)
        # Raw (pre-sigmoid) counterparts taken straight from the cla conv.
        logit = torch.sum(norm_att * self.att_block.cla(feats), dim=2)
        segmentwise_logit = self.att_block.cla(feats).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Stretch segment-level results back to the input frame count.
        framewise_output = pad_framewise_output(
            interpolate(segmentwise_output, interpolate_ratio), frames_num)
        framewise_logit = pad_framewise_output(
            interpolate(segmentwise_logit, interpolate_ratio), frames_num)

        return {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output,
            "target": out_target
        }

In [None]:
class MixupTimmEFFV2SPECSED_LOW(nn.Module):
    """Sound-event-detection model: log-mel front end, timm encoder, attention head.

    ``forward`` takes a ``(waveform, target)`` pair. In training mode the
    log-mel features and the target go through ``Mixup`` once, and a second
    time with probability 0.5, and SpecAugment is applied; in eval mode the
    target is returned unchanged. Front-end settings are read from
    ``CFG_EFFV2DOUBLEMIXUP``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        """Build the front end, the encoder and the attention head.

        Args:
            base_model_name: Model name passed to ``timm.create_model``.
            pretrained: Forwarded to ``timm.create_model``.
            num_classes: Number of outputs of the attention head.
            in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
        """
        super().__init__()
        cfg = CFG_EFFV2DOUBLEMIXUP

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=cfg.n_fft, hop_length=cfg.hop_length, win_length=cfg.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
            fmin=cfg.fmin, fmax=cfg.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        self.mixup = Mixup()
        self.bn0 = nn.BatchNorm2d(cfg.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Keep everything except the last two child modules of the backbone.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])
        if hasattr(base_model, "head"):
            in_features = base_model.head.fc.in_features
        else:
            in_features = base_model.classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialise ``fc1`` and ``bn0`` with the file's ``init_layer`` / ``init_bn`` helpers."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, batch):
        """Run the model on ``batch = (waveform, target)``.

        Returns:
            dict with keys ``"framewise_output"``, ``"segmentwise_output"``,
            ``"logit"``, ``"framewise_logit"``, ``"clipwise_output"`` and
            ``"target"`` (the mixed target in training mode, otherwise the
            target that was passed in).
        """
        waveform, target = batch

        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(waveform)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]

        if self.training:
            # Starred unpacking tolerates Mixup returning extra values after
            # (features, target), e.g. a mixing weight; they are not used.
            x, target, *_ = self.mixup(x, target)
            if np.random.uniform() < 0.5:
                x, target, *_ = self.mixup(x, target)

        # bn0 has one feature per mel bin, so the mel axis is moved to the
        # channel position for normalisation and then moved back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        if self.training:
            x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        x = self.encoder(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # Smooth along the last axis: width-3 max pooling plus average pooling.
        x = (F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
             + F.avg_pool1d(x, kernel_size=3, stride=1, padding=1))

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Evaluate the classification layer once and reuse it for both logits.
        class_logit = self.att_block.cla(x)
        logit = torch.sum(norm_att * class_logit, dim=2)
        segmentwise_logit = class_logit.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Stretch the segment-level outputs back to frames_num frames.
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        return {
            "framewise_output": framewise_output,  # interpolated segmentwise_output
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output,
            "target": target,
        }

In [None]:
class MixupTimmEFFV2SPECSED_64MEL(nn.Module):
    """Sound-event-detection model: log-mel front end, timm encoder, attention head.

    ``forward`` takes a ``(waveform, target)`` pair. In training mode the
    log-mel features and the target go through ``Mixup`` once, and a second
    time with probability 0.5, and SpecAugment is applied; in eval mode the
    target is returned unchanged. Front-end settings are read from
    ``CFG_EFFV2DOUBLEMIXUP_64MEL``.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
        """Build the front end, the encoder and the attention head.

        Args:
            base_model_name: Model name passed to ``timm.create_model``.
            pretrained: Forwarded to ``timm.create_model``.
            num_classes: Number of outputs of the attention head.
            in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
        """
        super().__init__()
        cfg = CFG_EFFV2DOUBLEMIXUP_64MEL

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=cfg.n_fft, hop_length=cfg.hop_length, win_length=cfg.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
            fmin=cfg.fmin, fmax=cfg.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        self.mixup = Mixup()
        self.bn0 = nn.BatchNorm2d(cfg.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Keep everything except the last two child modules of the backbone.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])
        if hasattr(base_model, "head"):
            in_features = base_model.head.fc.in_features
        else:
            in_features = base_model.classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialise ``fc1`` and ``bn0`` with the file's ``init_layer`` / ``init_bn`` helpers."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, batch):
        """Run the model on ``batch = (waveform, target)``.

        Returns:
            dict with keys ``"framewise_output"``, ``"segmentwise_output"``,
            ``"logit"``, ``"framewise_logit"``, ``"clipwise_output"`` and
            ``"target"`` (the mixed target in training mode, otherwise the
            target that was passed in).
        """
        waveform, target = batch

        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(waveform)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]

        if self.training:
            # Starred unpacking tolerates Mixup returning extra values after
            # (features, target), e.g. a mixing weight; they are not used.
            x, target, *_ = self.mixup(x, target)
            if np.random.uniform() < 0.5:
                x, target, *_ = self.mixup(x, target)

        # bn0 has one feature per mel bin, so the mel axis is moved to the
        # channel position for normalisation and then moved back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        if self.training:
            x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        x = self.encoder(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # Smooth along the last axis: width-3 max pooling plus average pooling.
        x = (F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
             + F.avg_pool1d(x, kernel_size=3, stride=1, padding=1))

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Evaluate the classification layer once and reuse it for both logits.
        class_logit = self.att_block.cla(x)
        logit = torch.sum(norm_att * class_logit, dim=2)
        segmentwise_logit = class_logit.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Stretch the segment-level outputs back to frames_num frames.
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        return {
            "framewise_output": framewise_output,  # interpolated segmentwise_output
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output,
            "target": target,
        }

In [None]:
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/213075
class BCEFocalLoss(nn.Module):
    """Focal variant of binary cross-entropy computed on logits.

    The element-wise BCE is scaled by ``alpha * (1 - p) ** gamma`` where the
    target is 1 and by ``p ** gamma`` where it is 0 (``p = sigmoid(logit)``),
    then averaged over all elements.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, preds, targets):
        """Return the mean focal loss of logits ``preds`` against ``targets``."""
        probas = torch.sigmoid(preds)
        elementwise_bce = F.binary_cross_entropy_with_logits(
            preds, targets, reduction="none")

        positive_part = targets * self.alpha * (1. - probas) ** self.gamma * elementwise_bce
        negative_part = (1. - targets) * probas ** self.gamma * elementwise_bce
        return (positive_part + negative_part).mean()


class BCEFocal2WayLoss(nn.Module):
    """Weighted sum of two focal losses taken from a model output dict.

    The first term scores ``input["logit"]``; the auxiliary term scores the
    maximum of ``input["framewise_logit"]`` over dim 1 (presumably the frame
    axis - confirm against the model that produces it).

    Args:
        weights: Two multipliers ``[main, auxiliary]``; defaults to ``[1, 1]``.
        class_weights: Accepted for interface compatibility; not used.
    """

    def __init__(self, weights=None, class_weights=None):
        super().__init__()

        self.focal = BCEFocalLoss()

        # Build the default per instance so it is never shared between objects.
        self.weights = [1, 1] if weights is None else weights

    def forward(self, input, target):
        """Return ``weights[0] * clip_loss + weights[1] * framewise_max_loss``."""
        target = target.float()

        clip_logit = input["logit"]
        max_framewise_logit, _ = input["framewise_logit"].max(dim=1)

        loss = self.focal(clip_logit, target)
        aux_loss = self.focal(max_framewise_logit, target)

        return self.weights[0] * loss + self.weights[1] * aux_loss

In [None]:
# Custom losses that can be selected by name through CFG.loss_name.
__CRITERIONS__ = {
    "BCEFocalLoss": BCEFocalLoss,
    "BCEFocal2WayLoss": BCEFocal2WayLoss
}


def get_criterion():
    """Build the loss named by ``CFG.loss_name`` using ``CFG.loss_params``.

    ``torch.nn`` is searched first, then the custom losses registered in
    ``__CRITERIONS__``.

    Raises:
        NotImplementedError: If the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        return getattr(nn, loss_name)(**CFG.loss_params)

    criterion_cls = __CRITERIONS__.get(loss_name)
    if criterion_cls is not None:
        return criterion_cls(**CFG.loss_params)

    raise NotImplementedError(f"Unknown loss name: {loss_name!r}")

In [None]:
# Registry of custom optimizers, consulted before torch.optim.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Create the optimizer named by ``CFG.optimizer_name`` for ``model``.

    A name is resolved in ``__OPTIMIZERS__`` first and in ``torch.optim``
    otherwise. For ``"SAM"`` the class named by ``CFG.base_optimizer`` is
    resolved the same way and handed to ``SAM`` as its base optimizer.
    ``CFG.optimizer_params`` is passed as keyword arguments in both cases.
    """
    def _resolve(name):
        registered = __OPTIMIZERS__.get(name)
        if registered is not None:
            return __OPTIMIZERS__[name]
        return getattr(optim, name)

    optimizer_name = CFG.optimizer_name
    if optimizer_name == "SAM":
        base_optimizer = _resolve(CFG.base_optimizer)
        return SAM(model.parameters(), base_optimizer, **CFG.optimizer_params)

    optimizer_cls = _resolve(optimizer_name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)


def get_scheduler(optimizer):
    """Create the LR scheduler named by ``CFG.scheduler_name`` for ``optimizer``.

    Returns ``None`` when no scheduler name is configured; otherwise the class
    is looked up in ``torch.optim.lr_scheduler`` and built with
    ``CFG.scheduler_params``.
    """
    name = CFG.scheduler_name
    if name is None:
        return None

    scheduler_cls = getattr(optim.lr_scheduler, name)
    return scheduler_cls(optimizer, **CFG.scheduler_params)

In [None]:
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Output directory for logs; remove any log file left there by an earlier run
# before the logger is created.
logdir = Path("out")
logdir.mkdir(parents=True, exist_ok=True)
if logdir.joinpath("train.log").exists():
    logdir.joinpath("train.log").unlink()
logger = init_logger(log_file=logdir.joinpath("train.log"))

In [None]:
# environment: seed the RNGs from the config and pick the compute device
set_seed(CFG.seed)
device = get_device()

# validation: instantiate the sklearn.model_selection class named by
# CFG.split with CFG.split_params as keyword arguments
splitter = getattr(model_selection, CFG.split)(**CFG.split_params)

# data: `train` is another name for the metadata object loaded earlier
train = train_meta

In [None]:
# Renumber rows from 0 and discard the old index (assumes `train` is a pandas
# DataFrame - TODO confirm where train_meta is built).
train = train.reset_index(drop=True)

In [None]:
from contextlib import contextmanager
from typing import Optional
import time
@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Report the start and the wall-clock duration of the wrapped block.

    Messages go to ``logger.info`` when a logger is given and to ``print``
    otherwise. The closing message is only emitted when the block finishes
    without raising.
    """
    started_at = time.time()
    emit = print if logger is None else logger.info

    emit(f"[{name}] start")
    yield
    emit(f"[{name}] done in {time.time() - started_at:.2f} s")

# For Test Preparation

In [None]:
# Directory holding the test soundscapes; each entry of file_list is a file
# name cut at its first '.', taken in sorted order.
test_audio_dir = '../input/birdclef-2022/test_soundscapes/'
file_list = [name.partition('.')[0]
             for name in sorted(os.listdir(test_audio_dir))]

print('Number of test soundscapes:', len(file_list))

In [None]:
class TestDataset(Dataset):
    """Dataset over pre-cut waveform chunks that belong to one audio file.

    Item ``i`` is ``(waveform, afile, index_list[i])``. The waveform is
    ``clip[i]`` cast to float32 and passed through ``np.nan_to_num`` before
    and after the optional ``waveform_transforms`` callable.

    Args:
        clip: Indexable collection of waveform chunks.
        afile: Identifier returned unchanged with every item.
        index_list: One index per chunk. When omitted, the chunk positions
            ``0 .. len(clip) - 1`` are used.
        waveform_transforms: Optional callable applied to each waveform.
    """

    def __init__(self, clip: np.ndarray, afile='woong', index_list=None,
                 waveform_transforms=None):
        self.clip = clip
        self.waveform_transforms = waveform_transforms
        self.afile = afile
        # A per-instance default avoids sharing one list between datasets and
        # keeps __getitem__ usable when no indices are supplied.
        if index_list is None:
            index_list = list(range(len(clip)))
        self.index_list = index_list

    def __len__(self):
        return len(self.clip)

    def __getitem__(self, idx: int):
        sample = self.clip[idx]
        index = self.index_list[idx]

        y = np.nan_to_num(sample.astype(np.float32))

        if self.waveform_transforms:
            y = self.waveform_transforms(y)

        # Transforms may introduce non-finite values again.
        y = np.nan_to_num(y)
        return y, self.afile, index

In [None]:
def prepare_model_for_inference(model, path: Path):
    """Load the ``"model_state_dict"`` entry of a checkpoint into ``model``.

    The checkpoint is mapped to the CPU when CUDA is unavailable. The same
    model object is returned, switched to eval mode.
    """
    map_location = None if torch.cuda.is_available() else "cpu"
    checkpoint = torch.load(path, map_location=map_location)

    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    return model

In [None]:
def prepare_model_for_mixup_inference(model, path: Path):
    """Load the ``"model"`` entry of a checkpoint into ``model``.

    The checkpoint is mapped to the CPU when CUDA is unavailable. The same
    model object is returned, switched to eval mode.
    """
    load_kwargs = {} if torch.cuda.is_available() else {"map_location": "cpu"}
    state_dict = torch.load(path, **load_kwargs)["model"]

    model.load_state_dict(state_dict)
    model.eval()
    return model

In [None]:
def prediction_for_clip(clip: np.ndarray,
                        afile: str,
                        model, 
                        threshold=0.5):
    """Threshold the model's clip-level output for every chunk of ``clip``.

    Each chunk is fed to ``model`` on its own (batch size 1) and the
    ``"clipwise_output"`` entry of the result is compared with ``threshold``
    for every bird in ``scored_birds``.

    Args:
        clip: Collection of waveform chunks. Chunk ``i`` is labelled with the
            end time ``(i + 1) * 5`` (assumes 5-second chunks).
        afile: File identifier used as the row-id prefix.
        model: Callable returning a dict with a ``"clipwise_output"`` entry.
        threshold: A bird counts as present when its score is strictly above
            this value.

    Returns:
        dict with parallel lists ``"row_id"`` and ``"target"``.
    """
    # TestDataset yields (waveform, afile, index) and reads index_list for
    # every item, so positional indices are supplied even though this
    # function does not use them.
    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=list(range(len(clip))),
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    bird_array = np.array(scored_birds)
    pred = {'row_id': [], 'target': []}
    for i, (image, afile_batch, _) in enumerate(tqdm(loader)):
        image = image.to(device)
        with torch.no_grad():
            prediction = model(image)
            proba = prediction["clipwise_output"].detach().cpu().numpy().reshape(-1)

        chunk_end_time = (i + 1) * 5

        for bird in scored_birds:
            try:
                score = proba[np.where(bird_array == bird)]
            except IndexError:
                # The bird's position lies beyond the model's outputs.
                score = 0

            row_id = afile_batch[0] + '_' + bird + '_' + str(chunk_end_time)
            pred['row_id'].append(row_id)
            pred['target'].append(bool(score > threshold))

    return pred

In [None]:
def prediction_each_for_framewise(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model, 
                             threshold=0.5):
    """Predict birds per 5-second segment from one model's framewise output.

    Every chunk is run through ``model`` on its own. The sigmoid of
    ``"framewise_logit"`` is cut into eight equal parts along the frame axis.
    The first four parts are scored; the last four are kept and stacked with
    the first four parts of the next chunk before the maximum over frames is
    taken. The arithmetic implies chunks that start every 20 seconds and
    overlap the next chunk by half - confirm against the caller's slicing.

    Args:
        clip: Collection of waveform chunks.
        afile: File identifier used as the row-id prefix.
        index_list: Chunk indices; the start time of a chunk is ``index * 20``.
        model: Callable returning a dict with a ``"framewise_logit"`` entry.
        threshold: A bird counts as present when its score is strictly above
            this value.

    Returns:
        dict with parallel lists ``"row_id"`` and ``"target"``.
    """
    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    bird_array = np.array(scored_birds)
    pred = {'row_id': [], 'target': []}
    previous_tail = None  # second half of the previous chunk's segments
    for image, afile_batch, idx in tqdm(loader):
        image = image.to(device)
        with torch.no_grad():
            prediction = model(image)
            proba = torch.sigmoid(prediction["framewise_logit"])
            proba = proba.detach().cpu().numpy()[0]

        chunk_start_time = idx.detach().cpu().numpy()[0] * 20
        proba_split = np.array_split(proba, 8, axis=0)

        for time_index, each_proba in enumerate(proba_split[:4]):
            if previous_tail is not None:
                stacked = np.vstack([each_proba, previous_tail[time_index]])
            else:
                stacked = each_proba
            # One maximum per segment, shared by every bird below.
            result_proba = np.max(stacked, axis=0)
            segment_end_time = chunk_start_time + (time_index + 1) * 5

            for bird in scored_birds:
                try:
                    score = result_proba[np.where(bird_array == bird)]
                except IndexError:
                    # The bird's position lies beyond the model's outputs.
                    score = 0

                row_id = afile_batch[0] + '_' + bird + '_' + str(segment_end_time)
                pred['row_id'].append(row_id)
                pred['target'].append(bool(score > threshold))

        previous_tail = proba_split[4:]

    return pred

In [None]:
def prediction_each_for_framewise_ensemble(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model_list: list, 
                             threshold=0.5):
    """Predict birds per 5-second segment from the mean of several models.

    For every chunk the sigmoid of each model's ``"framewise_logit"`` is
    averaged over ``model_list`` and cut into four equal parts along the frame
    axis. The first two parts are scored; the last two are kept and stacked
    with the first two parts of the next chunk before the maximum over frames
    is taken. The arithmetic implies chunks that start every 10 seconds and
    overlap the next chunk by half - confirm against the caller's slicing.

    Args:
        clip: Collection of waveform chunks.
        afile: File identifier used as the row-id prefix.
        index_list: Chunk indices; the start time of a chunk is ``index * 10``.
        model_list: Callables returning a dict with ``"framewise_logit"``.
        threshold: A bird counts as present when its score is strictly above
            this value.

    Returns:
        dict with parallel lists ``"row_id"`` and ``"target"``.
    """
    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model in model_list:
        model.eval()

    bird_array = np.array(scored_birds)
    pred = {'row_id': [], 'target': []}
    previous_tail = None  # second half of the previous chunk's segments
    for image, afile_batch, idx in tqdm(loader):
        image = image.to(device)

        # Mean over models of the framewise probabilities of the single item.
        proba = None
        with torch.no_grad():
            for model in model_list:
                prediction = model(image)
                member = torch.sigmoid(prediction["framewise_logit"]) / len(model_list)
                member = member.detach().cpu().numpy()[0]
                proba = member if proba is None else proba + member

        chunk_start_time = idx.detach().cpu().numpy()[0] * 10
        proba_split = np.array_split(proba, 4, axis=0)

        for time_index, each_proba in enumerate(proba_split[:2]):
            if previous_tail is not None:
                stacked = np.vstack([each_proba, previous_tail[time_index]])
            else:
                stacked = each_proba
            # One maximum per segment, shared by every bird below.
            result_proba = np.max(stacked, axis=0)
            segment_end_time = chunk_start_time + (time_index + 1) * 5

            for bird in scored_birds:
                try:
                    score = result_proba[np.where(bird_array == bird)]
                except IndexError:
                    # The bird's position lies beyond the model's outputs.
                    score = 0

                row_id = afile_batch[0] + '_' + bird + '_' + str(segment_end_time)
                pred['row_id'].append(row_id)
                pred['target'].append(bool(score > threshold))

        # Parts 2 and 3 cover the same time span as parts 0 and 1 of the next
        # chunk, so the carry-over must start at part 2 to stay aligned.
        previous_tail = proba_split[2:]

    return pred

In [None]:
def prediction_each_for_framewise_voting_10sec(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model_list: list, 
                             threshold=0.5,
                             model_with_mixup_bool=None):
    """Score every 5-second segment separately with each model.

    For each model the chunks are processed in batches of 64. The sigmoid of
    ``"framewise_logit"`` of a chunk is cut into four equal parts along the
    frame axis. The first two parts are scored; the last two are kept and
    stacked with the first two parts of the next chunk before the maximum
    over frames is taken. The carry-over starts empty for every model. The
    arithmetic implies chunks that start every 10 seconds and overlap the
    next chunk by half - confirm against the caller's slicing.

    Args:
        clip: Collection of waveform chunks.
        afile: File identifier used as the row-id prefix.
        index_list: Chunk indices; the start time of a chunk is ``index * 10``.
        model_list: Models to evaluate.
        threshold: A bird counts as present when its score is strictly above
            this value.
        model_with_mixup_bool: One flag per model. A truthy flag means the
            model is called with an ``(input, target)`` pair. Models without
            a matching flag are skipped, so the default evaluates none.

    Returns:
        One dict per evaluated model with parallel lists ``"row_id"``,
        ``"target"`` and ``"score"``. A bird missing from
        ``bird_label_total`` (or beyond the model's outputs) scores 0.
    """
    batch_size = 64
    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    mixup_flags = [] if model_with_mixup_bool is None else model_with_mixup_bool

    # Output position of every scored bird, or None when the label is unknown.
    label_array = np.array(bird_label_total)
    class_positions = []
    for bird in scored_birds:
        hits = np.flatnonzero(label_array == bird)
        class_positions.append(int(hits[0]) if hits.size else None)

    pred_list = []
    for model, mixup_bool in zip(model_list, mixup_flags):
        pred = {'row_id': [], 'target': [], 'score': []}
        # Reset per model so one model's last chunk never leaks into the
        # first chunk of the next model.
        previous_tail = None
        with torch.no_grad():
            for image, afile_batch, idx in tqdm(loader):
                image = image.to(device)
                if mixup_bool:
                    # Placeholder target: these models take (input, target).
                    target = torch.normal(2, 3, size=(image.shape[0], 1)).to(device)
                    prediction = model((image, target))
                else:
                    prediction = model(image)
                batch_proba = torch.sigmoid(prediction["framewise_logit"]).cpu().numpy()
                chunk_indices = idx.cpu().numpy()

                for proba, chunk_index in zip(batch_proba, chunk_indices):
                    chunk_start_time = chunk_index * 10
                    proba_split = np.array_split(proba, 4, axis=0)

                    for time_index, each_proba in enumerate(proba_split[:2]):
                        if previous_tail is not None:
                            stacked = np.vstack([each_proba, previous_tail[time_index]])
                        else:
                            stacked = each_proba
                        result_proba = np.max(stacked, axis=0)
                        segment_end_time = chunk_start_time + (time_index + 1) * 5

                        for bird, position in zip(scored_birds, class_positions):
                            if position is not None and position < result_proba.shape[0]:
                                score = result_proba[position]
                            else:
                                score = 0.0

                            row_id = afile_batch[0] + '_' + bird + '_' + str(segment_end_time)
                            pred['row_id'].append(row_id)
                            pred['score'].append(score)
                            pred['target'].append(bool(score > threshold))

                    previous_tail = proba_split[2:]
        pred_list.append(pred)

    return pred_list

In [None]:
def prediction_each_for_framewise_voting_15sec(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model_list: list, 
                             threshold=0.5,
                             model_with_mixup_bool=None):
    """Score every 5-second segment separately with each model.

    For each model the chunks are processed in batches of 64. The sigmoid of
    ``"framewise_logit"`` of a chunk is cut into six equal parts along the
    frame axis and every part is scored by its maximum over frames. All six
    parts of a chunk are emitted, so nothing is carried over between chunks.

    NOTE(review): despite the name, chunk start times advance by 30 and six
    segments are emitted per chunk - confirm against the caller's slicing.

    Args:
        clip: Collection of waveform chunks.
        afile: File identifier used as the row-id prefix.
        index_list: Chunk indices; the start time of a chunk is ``index * 30``.
        model_list: Models to evaluate.
        threshold: A bird counts as present when its score is strictly above
            this value.
        model_with_mixup_bool: One flag per model. A truthy flag means the
            model is called with an ``(input, target)`` pair. Models without
            a matching flag are skipped, so the default evaluates none.

    Returns:
        One dict per evaluated model with parallel lists ``"row_id"``,
        ``"target"`` and ``"score"``. A bird missing from
        ``bird_label_total`` (or beyond the model's outputs) scores 0.
    """
    batch_size = 64
    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    mixup_flags = [] if model_with_mixup_bool is None else model_with_mixup_bool

    # Output position of every scored bird, or None when the label is unknown.
    label_array = np.array(bird_label_total)
    class_positions = []
    for bird in scored_birds:
        hits = np.flatnonzero(label_array == bird)
        class_positions.append(int(hits[0]) if hits.size else None)

    pred_list = []
    for model, mixup_bool in zip(model_list, mixup_flags):
        pred = {'row_id': [], 'target': [], 'score': []}
        with torch.no_grad():
            for image, afile_batch, idx in tqdm(loader):
                image = image.to(device)
                if mixup_bool:
                    # Placeholder target: these models take (input, target).
                    target = torch.normal(2, 3, size=(image.shape[0], 1)).to(device)
                    prediction = model((image, target))
                else:
                    prediction = model(image)
                batch_proba = torch.sigmoid(prediction["framewise_logit"]).cpu().numpy()
                chunk_indices = idx.cpu().numpy()

                for proba, chunk_index in zip(batch_proba, chunk_indices):
                    chunk_start_time = chunk_index * 30
                    # Indexing an empty carry-over list used to raise
                    # IndexError on the second chunk; each part now stands
                    # alone.
                    for time_index, each_proba in enumerate(np.array_split(proba, 6, axis=0)):
                        result_proba = np.max(each_proba, axis=0)
                        segment_end_time = chunk_start_time + (time_index + 1) * 5

                        for bird, position in zip(scored_birds, class_positions):
                            if position is not None and position < result_proba.shape[0]:
                                score = result_proba[position]
                            else:
                                score = 0.0

                            row_id = afile_batch[0] + '_' + bird + '_' + str(segment_end_time)
                            pred['row_id'].append(row_id)
                            pred['score'].append(score)
                            pred['target'].append(bool(score > threshold))
        pred_list.append(pred)

    return pred_list


In [None]:
import gc
def prediction_framewise_ensemble(test_audios,
               weights_path: list,
               threshold):
    """Load the ensemble checkpoints and predict every recording in ``test_audios``.

    Each recording is read with ``soundfile`` and reduced to its first channel
    when it has more than one.  It is then cut into windows that start every
    5 seconds and span 10 seconds, so consecutive windows overlap by half.
    All windows of one recording are passed to
    ``prediction_each_for_framewise_ensemble`` and the ``row_id`` / ``target``
    entries it returns are collected.

    Args:
        test_audios: iterable of path-like objects exposing ``.name``.
        weights_path: checkpoint paths.  Indices 0-14 are loaded into
            ``PretrainedTimmNFNETGRUSED`` through
            ``prepare_model_for_mixup_inference``; any later index is loaded
            into ``TimmSED`` through ``prepare_model_for_inference``.
        threshold: forwarded unchanged to
            ``prediction_each_for_framewise_ensemble``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``row_id`` and ``target``
        holding the rows of all recordings in input order, with a fresh
        0..n-1 index.

    Raises:
        ValueError: raised by ``pd.concat`` when ``test_audios`` is empty.
    """
    # The sample rate reported by sf.read is ignored; every offset below
    # assumes 32 kHz audio.
    sample_rate = 32000
    cal_frame_for_interval = 5
    hop_samples = sample_rate * cal_frame_for_interval
    window_samples = 2 * hop_samples

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_list = []
    for i, weight_path in enumerate(weights_path):
        if i <= 14:
            model = PretrainedTimmNFNETGRUSED(base_model_name="nfnet_eca_l0",
                            pretrained=False,
                            num_classes=21,
                            in_channels=CFG.in_channels)
            model = prepare_model_for_mixup_inference(model, weight_path)
        else:
            model = TimmSED(base_model_name=CFG.base_model_name,
                        pretrained=False,
                        num_classes=CFG.num_classes,
                        in_channels=CFG.in_channels)
            model = prepare_model_for_inference(model, weight_path)
        model_list.append(model.to(device).eval())
        gc.collect()

    warnings.filterwarnings("ignore")
    prediction_dfs = []

    for audio_path in test_audios:
        afile = audio_path.name.split(".")[0]
        with timer(f"Loading {str(audio_path)}", logger):
            clip, _ = sf.read(audio_path)
        if clip.ndim != 1:
            # Multi-channel recording: keep the first channel only.
            clip = clip[:, 0]
        intervals = round(clip.shape[0] / sample_rate / cal_frame_for_interval)

        clip_list = []
        index_list = []
        for index in range(intervals):
            start = index * hop_samples
            added_clip = clip[start:start + window_samples]
            # The trailing window(s) run past the end of the recording.
            # Zero-pad them so that every window has the same length and
            # np.array() builds a regular 2-D array instead of a ragged one.
            if len(added_clip) < window_samples:
                added_clip = np.pad(added_clip, (0, window_samples - len(added_clip)))
            clip_list.append(added_clip)
            index_list.append(index)
        clip_list = np.array(clip_list)

        with timer(f"Prediction on {audio_path}", logger):
            prediction_dict = prediction_each_for_framewise_ensemble(clip=clip_list,
                                                       afile=afile,
                                                       index_list=index_list,
                                                       model_list=model_list,
                                                       threshold=threshold)
        prediction_dfs.append(pd.DataFrame({
            "row_id": list(prediction_dict['row_id']),
            "target": list(prediction_dict['target']),
        }))

    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [None]:
from collections import OrderedDict
def prediction_each_for_framewise_voting_15sec_speedup(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model_list: list,
                             threshold=0.5,
                             model_with_mixup_bool=None):
    """Score every window of one recording with every model, batch by batch.

    The windows are wrapped in ``TestDataset`` and read in batches of 64.
    Each batch is moved to the device once and then fed to every model.  For
    each window the framewise probabilities are cut into 12 equal parts along
    axis 0: the first 6 parts each produce one 5-second row per scored bird,
    and the last 6 are kept and stacked onto the first 6 parts of the next
    window in the same batch, so the overlapping halves are scored together.
    A row's score is the maximum probability over the stacked frames.

    Args:
        clip: windows of one recording, passed to ``TestDataset``.
        afile: recording identifier, passed to ``TestDataset``; the value the
            loader hands back is used as the ``row_id`` prefix.
        index_list: window numbers, passed to ``TestDataset``; window ``k``
            is taken to start at ``k * 30`` seconds.
        model_list: models already on the device and in eval mode.
        threshold: a row's ``target`` is ``True`` when its score is strictly
            greater than this value.
        model_with_mixup_bool: one boolean per model; ``True`` means the
            model is called with an ``(image, target)`` tuple, ``False`` with
            the image alone.  When omitted, every model is treated as
            ``False``.  If it is shorter than ``model_list`` the surplus
            models are skipped (``zip`` semantics).

    Returns:
        ``OrderedDict`` mapping the model position to a list with one dict
        per batch; each dict has the keys ``row_id``, ``target`` and
        ``score`` holding parallel lists.
    """
    batch_size = 64
    seconds_per_window_step = 30
    slices_per_window = 12
    reported_slices = 6
    seconds_per_row = 5

    # The former default was an empty list, which made zip() below skip every
    # model silently; default to "no mixup" for all models instead.
    if model_with_mixup_bool is None:
        model_with_mixup_bool = [False] * len(model_list)

    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Output column of every scored bird, resolved once.  A bird that does
    # not appear in bird_label_total maps to None.
    first_position = {}
    for position, label in enumerate(bird_label_total):
        first_position.setdefault(label, position)
    scored_columns = [(bird, first_position.get(bird)) for bird in scored_birds]

    pred_dict = OrderedDict((model_idx, []) for model_idx in range(len(model_list)))

    with torch.no_grad():
        for image, batch_afile, batch_index in tqdm(loader):
            image = image.to(device)
            window_numbers = batch_index.cpu().numpy()
            # All windows of a batch come from the same recording.
            file_prefix = batch_afile[0]

            for model_idx, (model, mixup_bool) in enumerate(zip(model_list, model_with_mixup_bool)):
                # The carried-over half is reset for every (batch, model) pair.
                before_proba_split = None
                each_pred = {'row_id': [], 'target': [], 'score': []}

                # Random placeholder target; it is only passed to models
                # flagged as mixup models.
                target = torch.normal(2, 3, size=(image.shape[0], 1)).to(device)
                prediction = model((image, target)) if mixup_bool else model(image)
                proba_after = torch.sigmoid(prediction["framewise_logit"]).cpu().numpy()

                for interval_idx, proba in enumerate(proba_after):
                    chunk_start_time = window_numbers[interval_idx] * seconds_per_window_step
                    proba_split = np.array_split(proba, slices_per_window, axis=0)

                    for time_index, each_proba in enumerate(proba_split[:reported_slices]):
                        if before_proba_split is not None:
                            each_proba = np.vstack([each_proba, before_proba_split[time_index]])
                        # Per-class maximum over the frames of this slice;
                        # identical for every bird, so computed once.
                        result_proba = np.max(each_proba, axis=0)
                        end_time = str(chunk_start_time + (time_index + 1) * seconds_per_row)

                        for bird, column in scored_columns:
                            # Birds the model has no output column for score
                            # 0 instead of crashing on the lookup.
                            if column is None or column >= result_proba.shape[0]:
                                score = 0.0
                            else:
                                score = result_proba[column]
                            each_pred['row_id'].append(file_prefix + '_' + bird + '_' + end_time)
                            each_pred['score'].append(score)
                            each_pred['target'].append(bool(score > threshold))

                    before_proba_split = proba_split[reported_slices:]
                pred_dict[model_idx].append(each_pred)

    return pred_dict


In [None]:
from collections import OrderedDict
def prediction_each_for_framewise_voting_10sec_speedup(clip: np.ndarray,
                             afile: str,
                             index_list: list,
                             model_list: list,
                             threshold=0.5,
                             model_with_mixup_bool=None):
    """Score every window of one recording with every model, batch by batch.

    The windows are wrapped in ``TestDataset`` and read in batches of 64.
    Each batch is moved to the device once and then fed to every model.  For
    each window the framewise probabilities are cut into 4 equal parts along
    axis 0: the first 2 parts each produce one 5-second row per scored bird,
    and the last 2 are kept and stacked onto the first 2 parts of the next
    window in the same batch, so the overlapping halves are scored together.
    A row's score is the maximum probability over the stacked frames.

    Args:
        clip: windows of one recording, passed to ``TestDataset``.
        afile: recording identifier, passed to ``TestDataset``; the value the
            loader hands back is used as the ``row_id`` prefix.
        index_list: window numbers, passed to ``TestDataset``; window ``k``
            is taken to start at ``k * 10`` seconds.
        model_list: models already on the device and in eval mode.
        threshold: a row's ``target`` is ``True`` when its score is strictly
            greater than this value.
        model_with_mixup_bool: one boolean per model; ``True`` means the
            model is called with an ``(image, target)`` tuple, ``False`` with
            the image alone.  When omitted, every model is treated as
            ``False``.  If it is shorter than ``model_list`` the surplus
            models are skipped (``zip`` semantics).

    Returns:
        ``OrderedDict`` mapping the model position to a list with one dict
        per batch; each dict has the keys ``row_id``, ``target`` and
        ``score`` holding parallel lists.
    """
    batch_size = 64
    seconds_per_window_step = 10
    slices_per_window = 4
    reported_slices = 2
    seconds_per_row = 5

    # The former default was an empty list, which made zip() below skip every
    # model silently; default to "no mixup" for all models instead.
    if model_with_mixup_bool is None:
        model_with_mixup_bool = [False] * len(model_list)

    dataset = TestDataset(clip=clip,
                          afile=afile,
                          index_list=index_list,
                          waveform_transforms=get_transforms(phase="test"))
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Output column of every scored bird, resolved once.  A bird that does
    # not appear in bird_label_total maps to None.
    first_position = {}
    for position, label in enumerate(bird_label_total):
        first_position.setdefault(label, position)
    scored_columns = [(bird, first_position.get(bird)) for bird in scored_birds]

    pred_dict = OrderedDict((model_idx, []) for model_idx in range(len(model_list)))

    with torch.no_grad():
        for image, batch_afile, batch_index in tqdm(loader):
            image = image.to(device)
            window_numbers = batch_index.cpu().numpy()
            # All windows of a batch come from the same recording.
            file_prefix = batch_afile[0]

            for model_idx, (model, mixup_bool) in enumerate(zip(model_list, model_with_mixup_bool)):
                # The carried-over half is reset for every (batch, model) pair.
                before_proba_split = None
                each_pred = {'row_id': [], 'target': [], 'score': []}

                # Random placeholder target; it is only passed to models
                # flagged as mixup models.
                target = torch.normal(2, 3, size=(image.shape[0], 1)).to(device)
                prediction = model((image, target)) if mixup_bool else model(image)
                proba_after = torch.sigmoid(prediction["framewise_logit"]).cpu().numpy()

                for interval_idx, proba in enumerate(proba_after):
                    chunk_start_time = window_numbers[interval_idx] * seconds_per_window_step
                    proba_split = np.array_split(proba, slices_per_window, axis=0)

                    for time_index, each_proba in enumerate(proba_split[:reported_slices]):
                        if before_proba_split is not None:
                            each_proba = np.vstack([each_proba, before_proba_split[time_index]])
                        # Per-class maximum over the frames of this slice;
                        # identical for every bird, so computed once.
                        result_proba = np.max(each_proba, axis=0)
                        end_time = str(chunk_start_time + (time_index + 1) * seconds_per_row)

                        for bird, column in scored_columns:
                            # Birds the model has no output column for score
                            # 0 instead of crashing on the lookup.
                            if column is None or column >= result_proba.shape[0]:
                                score = 0.0
                            else:
                                score = result_proba[column]
                            each_pred['row_id'].append(file_prefix + '_' + bird + '_' + end_time)
                            each_pred['score'].append(score)
                            each_pred['target'].append(bool(score > threshold))

                    before_proba_split = proba_split[reported_slices:]
                pred_dict[model_idx].append(each_pred)

    return pred_dict


In [None]:
import gc
def prediction_framewise_voting_10sec_speedup(test_audios,
               weights_path: list,
               threshold):
    """Predict every recording with the 10-second voting ensemble.

    Each recording is read with ``soundfile`` and reduced to its first channel
    when it has more than one.  It is then cut into windows that start every
    10 seconds and span 20 seconds; windows that run past the end are
    zero-padded.  The windows are scored by
    ``prediction_each_for_framewise_voting_10sec_speedup`` and the per-model
    results are laid out side by side.

    Args:
        test_audios: iterable of path-like objects exposing ``.name``.
        weights_path: checkpoint paths; the position selects the
            architecture (0-3, 4-7 and 8+ use three different setups, see the
            loop below).  Every checkpoint is loaded through
            ``prepare_model_for_mixup_inference``.
        threshold: forwarded unchanged to the per-recording prediction.

    Returns:
        A ``pandas.DataFrame`` with ``row_id`` followed by ``target_<k>`` and
        ``score_<k>`` for every model position ``k``, holding the rows of all
        recordings in input order with a fresh 0..n-1 index.

    Raises:
        ValueError: raised by ``pd.concat`` when ``test_audios`` is empty.
    """
    # The sample rate reported by sf.read is ignored; every offset below
    # assumes 32 kHz audio.
    sample_rate = 32000
    cal_frame_for_interval = 10
    hop_samples = sample_rate * cal_frame_for_interval
    window_samples = 2 * hop_samples  # 640000 samples

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_list = []
    # All models of this ensemble are flagged as mixup models.
    model_with_mixup_bool = [True] * len(weights_path)
    for i, weight_path in enumerate(weights_path):
        if i <= 3:
            model = MixupTimmEFFV2SED(
                                    base_model_name=CFG_EFFV2DOUBLEMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_EFFV2DOUBLEMIXUP.num_classes,
                                    in_channels=CFG_EFFV2DOUBLEMIXUP.in_channels)
        elif i <= 7:
            model = MixupTimmEFFV2SED(
                                    base_model_name=CFG_EFFV2MDOUBLEMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_EFFV2MDOUBLEMIXUP.num_classes,
                                    in_channels=CFG_EFFV2MDOUBLEMIXUP.in_channels)
        else:
            # NOTE(review): this branch used to sit behind an `elif i >= 4`
            # that matched every index from 4 upwards, so it never ran and
            # checkpoints 8+ were loaded into MixupTimmEFFV2SED.  Bounding
            # the previous branch at 7 makes it reachable; confirm that
            # checkpoints 8+ were trained with MixupTimmEFFV2SPECSED_LOW.
            model = MixupTimmEFFV2SPECSED_LOW(
                                    base_model_name=CFG_EFFV2MDOUBLEMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_EFFV2MDOUBLEMIXUP.num_classes,
                                    in_channels=CFG_EFFV2MDOUBLEMIXUP.in_channels)
        model_list.append(prepare_model_for_mixup_inference(model, weight_path).to(device).eval())
        gc.collect()

    warnings.filterwarnings("ignore")
    prediction_dfs = []

    for audio_path in test_audios:
        afile = audio_path.name.split(".")[0]
        with timer(f"Loading {str(audio_path)}", logger):
            clip, _ = sf.read(audio_path)
        if clip.ndim != 1:
            # Multi-channel recording: keep the first channel only.
            clip = clip[:, 0]
        intervals = round(clip.shape[0] / sample_rate / cal_frame_for_interval)

        clip_list = []
        index_list = []
        for index in range(intervals):
            start = index * hop_samples
            added_clip = clip[start:start + window_samples]
            if len(added_clip) < window_samples:
                added_clip = np.pad(added_clip, (0, window_samples - len(added_clip)))
            clip_list.append(added_clip)
            index_list.append(index)
        clip_list = np.array(clip_list)

        with timer(f"Prediction on {audio_path}", logger):
            prediction_dict = prediction_each_for_framewise_voting_10sec_speedup(clip=clip_list,
                                                       afile=afile,
                                                       index_list=index_list,
                                                       model_list=model_list,
                                                       threshold=threshold,
                                                       model_with_mixup_bool=model_with_mixup_bool)

        # prediction_dict maps model position -> list of per-batch dicts.
        # Flatten the batches and give every model its own column pair; the
        # row ids are identical for all models, so they are taken from the
        # first one only.
        for idx, per_batch in enumerate(prediction_dict.values()):
            score = [value for batch in per_batch for value in batch['score']]
            target = [value for batch in per_batch for value in batch['target']]
            if idx == 0:
                row = [value for batch in per_batch for value in batch['row_id']]
                prediction_df = pd.DataFrame({
                    "row_id": row,
                    f"target_{idx}": target,
                    f"score_{idx}": score
                })
            else:
                prediction_df[f"target_{idx}"] = target
                prediction_df[f"score_{idx}"] = score
        prediction_dfs.append(prediction_df)

    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [None]:
def prediction_framewise_voting_15sec_low_speedup(test_audios,
               weights_path: list,
               threshold):
    """Predict every recording with the 15-second "low" voting ensemble.

    Recordings are read with ``soundfile``, reduced to their first channel
    when they have more than one, and cut into windows that start every
    30 seconds and span 60 seconds (zero-padded at the end of the file).
    The windows are scored by
    ``prediction_each_for_framewise_voting_15sec_speedup`` and the per-model
    results are placed side by side.

    Args:
        test_audios: iterable of path-like objects exposing ``.name``.
        weights_path: checkpoint paths; the position selects the
            architecture (see the loop below).
        threshold: forwarded unchanged to the per-recording prediction.

    Returns:
        A ``pandas.DataFrame`` with ``row_id`` followed by ``target_<k>`` and
        ``score_<k>`` for every model position ``k``; rows of all recordings
        are concatenated in input order and re-indexed from 0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_with_mixup_bool = [True]*len(weights_path)

    model_list = []
    for position, weight_file in enumerate(weights_path):
        if position <= 4:
            network = MixupTimmEFFV2SPECSED_LOW(
                                    base_model_name=CFG_EFFV2DOUBLEMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_EFFV2DOUBLEMIXUP.num_classes,
                                    in_channels=CFG_EFFV2DOUBLEMIXUP.in_channels)
            restore = prepare_model_for_mixup_inference
        elif position == 5:
            network = HighMixupTimmEFFV2SED(
                                    base_model_name=CFG_HIGH_EFFV2DOUBLEMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_HIGH_EFFV2DOUBLEMIXUP.num_classes,
                                    in_channels=CFG_HIGH_EFFV2DOUBLEMIXUP.in_channels)
            restore = prepare_model_for_mixup_inference
        elif position in (6, 7):
            # NOTE(review): positions 6 and 7 are restored without the mixup
            # loader although model_with_mixup_bool marks every model as a
            # mixup model -- confirm this is intended.
            network = PretrainedNFNETGRUTimmSED(base_model_name=CFG.base_model_name,
                            pretrained=False,
                            num_classes=21,
                            in_channels=CFG.in_channels)
            restore = prepare_model_for_inference
        else:
            # Every position from 8 upwards.
            network = MixupTimmNFNETSED(
                                    base_model_name=CFG_NFNETMIXUP.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_NFNETMIXUP.num_classes,
                                    in_channels=CFG_NFNETMIXUP.in_channels)
            restore = prepare_model_for_mixup_inference
        model_list.append(restore(network, weight_file).to(device).eval())
        gc.collect()

    warnings.filterwarnings("ignore")

    cal_frame_for_interval = 30
    hop_samples = 32000 * cal_frame_for_interval
    window_samples = 320000 * 6  # two hops, i.e. 60 s of samples

    prediction_dfs = []
    for audio_path in test_audios:
        afile = audio_path.name.split(".")[0]
        with timer(f"Loading {str(audio_path)}", logger):
            clip, _ = sf.read(audio_path)
        if len(clip.shape) != 1:
            # More than one channel: continue with the first one.
            clip = clip[:, 0]
        intervals = round(clip.shape[0]/32000/cal_frame_for_interval)

        index_list = list(range(intervals))
        windows = []
        for window_no in index_list:
            first_sample = window_no * hop_samples
            window = clip[first_sample:first_sample + window_samples]
            missing = window_samples - len(window)
            if missing > 0:
                window = np.pad(window, (0, missing))
            windows.append(window)
        clip_list = np.array(windows)

        with timer(f"Prediction on {audio_path}", logger):
            prediction_dict = prediction_each_for_framewise_voting_15sec_speedup(clip=clip_list,
                                                       afile=afile,
                                                       index_list=index_list,
                                                       model_list=model_list,
                                                       threshold=threshold,
                                                       model_with_mixup_bool=model_with_mixup_bool)

        # One list of per-batch dicts per model: flatten the batches and give
        # each model its own target/score columns.  Row ids come from the
        # first model only.
        for idx, per_batch in enumerate(prediction_dict.values()):
            score = [value for batch in per_batch for value in batch['score']]
            target = [value for batch in per_batch for value in batch['target']]
            if idx == 0:
                row = [value for batch in per_batch for value in batch['row_id']]
                prediction_df = pd.DataFrame({
                    "row_id": row,
                    f"target_{idx}": target,
                    f"score_{idx}": score
                })
            else:
                prediction_df[f"target_{idx}"] = target
                prediction_df[f"score_{idx}"] = score
        prediction_dfs.append(prediction_df)

    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [None]:
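# Note: prediction_framewise_voting_15sec_64mel_speedup (next cell) builds part of
# its ensemble from the MixupTimmNFNETSPECSED model class configured with
# CFG_NFNET0_SPEC_64MEL_15SEC.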

In [None]:
def prediction_framewise_voting_15sec_64mel_speedup(test_audios,
               weights_path: list,
               threshold):
    """Predict every recording with the 15-second, 64-mel voting ensemble.

    Recordings are read with ``soundfile``, reduced to their first channel
    when they have more than one, and cut into windows that start every
    30 seconds and span 60 seconds (zero-padded at the end of the file).
    The windows are scored by
    ``prediction_each_for_framewise_voting_15sec_speedup`` and the per-model
    results are placed side by side.

    Args:
        test_audios: iterable of path-like objects exposing ``.name``.
        weights_path: checkpoint paths; the position selects the
            architecture (see the loop below).
        threshold: forwarded unchanged to the per-recording prediction.

    Returns:
        A ``pandas.DataFrame`` with ``row_id`` followed by ``target_<k>`` and
        ``score_<k>`` for every model position ``k``; rows of all recordings
        are concatenated in input order and re-indexed from 0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_with_mixup_bool = [True]*len(weights_path)

    model_list = []
    for position, weight_file in enumerate(weights_path):
        if position <= 2:
            network = MixupTimmEFFV2SPECSED_64MEL(
                                    base_model_name=CFG_EFFV2DOUBLEMIXUP_64MEL.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_EFFV2DOUBLEMIXUP_64MEL.num_classes,
                                    in_channels=CFG_EFFV2DOUBLEMIXUP_64MEL.in_channels)
            restore = prepare_model_for_mixup_inference
        elif position <= 6:
            network = MixupTimmNFNETSPECSED(
                                    base_model_name=CFG_NFNET0_SPEC_64MEL_15SEC.base_model_name,
                                    pretrained=False,
                                    num_classes=CFG_NFNET0_SPEC_64MEL_15SEC.num_classes,
                                    in_channels=CFG_NFNET0_SPEC_64MEL_15SEC.in_channels)
            restore = prepare_model_for_mixup_inference
        elif position <= 10:
            # Same class as above but with a different backbone name.
            network = MixupTimmNFNETSPECSED(
                                    base_model_name="eca_nfnet_l1",
                                    pretrained=False,
                                    num_classes=CFG_NFNET0_SPEC_64MEL_15SEC.num_classes,
                                    in_channels=CFG_NFNET0_SPEC_64MEL_15SEC.in_channels)
            restore = prepare_model_for_mixup_inference
        elif position <= 12:
            network = PretrainedNFNETGRUTimmSED(base_model_name=CFG.base_model_name,
                            pretrained=False,
                            num_classes=21,
                            in_channels=CFG.in_channels)
            restore = prepare_model_for_inference
        else:
            network = TimmSED(base_model_name=CFG.base_model_name,
                        pretrained=False,
                        num_classes=CFG.num_classes,
                        in_channels=CFG.in_channels)
            restore = prepare_model_for_inference
        model_list.append(restore(network, weight_file).to(device).eval())
        gc.collect()

    warnings.filterwarnings("ignore")

    cal_frame_for_interval = 30
    hop_samples = 32000 * cal_frame_for_interval
    window_samples = 320000 * 6  # two hops, i.e. 60 s of samples

    prediction_dfs = []
    for audio_path in test_audios:
        afile = audio_path.name.split(".")[0]
        with timer(f"Loading {str(audio_path)}", logger):
            clip, _ = sf.read(audio_path)
        if len(clip.shape) != 1:
            # More than one channel: continue with the first one.
            clip = clip[:, 0]
        intervals = round(clip.shape[0]/32000/cal_frame_for_interval)

        index_list = list(range(intervals))
        windows = []
        for window_no in index_list:
            first_sample = window_no * hop_samples
            window = clip[first_sample:first_sample + window_samples]
            missing = window_samples - len(window)
            if missing > 0:
                window = np.pad(window, (0, missing))
            windows.append(window)
        clip_list = np.array(windows)

        with timer(f"Prediction on {audio_path}", logger):
            prediction_dict = prediction_each_for_framewise_voting_15sec_speedup(clip=clip_list,
                                                       afile=afile,
                                                       index_list=index_list,
                                                       model_list=model_list,
                                                       threshold=threshold,
                                                       model_with_mixup_bool=model_with_mixup_bool)

        # One list of per-batch dicts per model: flatten the batches and give
        # each model its own target/score columns.  Row ids come from the
        # first model only.
        for idx, per_batch in enumerate(prediction_dict.values()):
            score = [value for batch in per_batch for value in batch['score']]
            target = [value for batch in per_batch for value in batch['target']]
            if idx == 0:
                row = [value for batch in per_batch for value in batch['row_id']]
                prediction_df = pd.DataFrame({
                    "row_id": row,
                    f"target_{idx}": target,
                    f"score_{idx}": score
                })
            else:
                prediction_df[f"target_{idx}"] = target
                prediction_df[f"score_{idx}"] = score
        prediction_dfs.append(prediction_df)

    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [None]:
# Presumably the audio sample rate in Hz used by the models — confirm against
# the loading code.
TARGET_SR = 32000

# Folder that holds the test soundscape recordings.
DATADIR = Path("../input/birdclef-2022/test_soundscapes/")

# Every .ogg file found directly inside DATADIR.
all_audios = list(DATADIR.glob("*.ogg"))

# An audio id is the first two "_"-separated pieces of the file name.
all_audio_ids = [
    "_".join(audio_path.name.split("_", 2)[:2]) for audio_path in all_audios
]

submission_df = pd.DataFrame({"row_id": all_audio_ids})
submission_df

In [None]:
# Checkpoint files handed to prediction_framewise_voting_10sec_speedup below.
# NOTE(review): the variable is named "10sec" while the active file names say
# "5sec" — confirm which window length these checkpoints expect.
# NOTE(review): the specdiffseed group lists fold0, fold1 and fold3 only (no
# fold2) — confirm the omission is deliberate.
# Entries that are commented out are currently excluded from the list.
weights_path_10sec = [Path("../input/fulleffv2320hop1024nft/totalclass-doublemixup-5sec-length1024-128mel-320hop-weightmixed-tf-efficientnet-v2s-pretrain-fold-0.pth"),
                Path("../input/fulleffv2320hop1024nft/totalclass-doublemixup-5sec-length1024-128mel-320hop-weightmixed-tf-efficientnet-v2s-pretrain-fold-1.pth"),
                Path("../input/fulleffv2320hop1024nft/totalclass-doublemixup-5sec-length1024-128mel-320hop-weightmixed-tf-efficientnet-v2s-pretrain-fold-2.pth"),
                Path("../input/fulleffv2320hop1024nft/totalclass-doublemixup-5sec-length1024-128mel-320hop-weightmixed-tf-efficientnet-v2s-pretrain-fold-3.pth"),
                Path("../input/effv2m128mel1024nfftdoublemixup/128mel-5sec-effv2m-fold0.pth"),
                Path("../input/effv2m128mel1024nfftdoublemixup/128mel-5sec-effv2m-fold1.pth"),
                Path("../input/effv2m128mel1024nfftdoublemixup/128mel-5sec-effv2m-fold2.pth"),
                Path("../input/effv2m128mel1024nfftdoublemixup/128mel-5sec-effv2m-fold3.pth"),
                Path("../input/specdiffseed128meleffv2m/fold0-320hop-spec-diffseed-tf-efficientnet-v2m.pth"),
                Path("../input/specdiffseed128meleffv2m/fold1-320hop-spec-diffseed-tf-efficientnet-v2m.pth"),
                Path("../input/specdiffseed128meleffv2m/fold3-320hop-spec-diffseed-tf-efficientnet-v2m.pth"),
                #Path("../input/fullnfnetmixup320hop5sec/totalclass-5sec-320hop-weightmixed-eca-nfnet-l0-pretrain-fold-0.pth"),
                #Path("../input/nfnetgrufromfinetune/nfnetl0grufromfinetune-10sec-fold0.pth"),
                #Path("../input/nfnetgrufromfinetune/nfnetl0grufromfinetune-10sec-fold1.pth"),
                #Path("../input/nfnetgrufromfinetune/nfnetl0grufromfinetune-10sec-fold2.pth"),
                #Path("../input/nfnetgrufromfinetune/nfnetl0grufromfinetune-10sec-fold3.pth"),
                #Path("../input/nfnetgrufromfinetune/nfnetl0grufromfinetune-10sec-fold4.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0-frompretrain-5sec-fold2-train.49_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0-frompretrain-5sec-train.35_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0-frompretrain-5sec-train.38_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0-frompretrain-5sec-train.42_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0-frompretrain-5sec-train.49_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_10sec_fold0_train.31_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_10sec_fold1_train.47_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_10sec_fold2_train.50_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_10sec_fold3_train.48_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_10sec_fold4_train.49_full.pth"),
                #Path("../input/birdclefclass21effb0aug/fold0-frompretrain-effb0-15sec-train.50_full.pth"),
                #Path("../input/birdclefclass21effb0aug/fold1-frompretrain-effb0-15sec-train.50_full.pth"),
                #Path("../input/birdclefclass21effb0aug/fold2-frompretrain-effb0-15sec-train.28_full.pth"),
                #Path("../input/birdclefclass21effb0aug/fold3-frompretrain-effb0-15sec-train.34_full.pth"),
                #Path("../input/birdclefclass21effb0aug/fold4-frompretrain-effb0-15sec-train.46_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_aug_fold4_train.88_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_aug_fold0_train.94_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_aug_fold1_train.91_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_aug_fold2_train.75_full.pth"),
                #Path("../input/birdclefclass21effb0aug/effb0_aug_fold3_train.91_full.pth")
               ]
# Checkpoint files handed to prediction_framewise_voting_15sec_low_speedup
# below: one "fold0-spec-15sec" file plus four "diffseed" folds (fold0-fold3).
weights_path_15sec = [Path("../input/fulleffv2320hop1024nft/fold0-spec-15sec-effv2-320hop.pth"),
                      Path("../input/speceffv2diffseed1281024/fold0_spec_320hop_diffseed_128mel_effv2s.pth"),
                      Path("../input/speceffv2diffseed1281024/fold1_spec_320hop_diffseed_128mel_effv2s.pth"),
                      Path("../input/speceffv2diffseed1281024/fold2_spec_320hop_diffseed_128mel_effv2s.pth"),
                      Path("../input/speceffv2diffseed1281024/fold3_spec_320hop_diffseed_128mel_effv2s.pth")
                     ]
# Checkpoint files handed to prediction_framewise_voting_15sec_64mel_speedup
# below.
# NOTE(review): the effv2s group starts at fold1 (no fold0 entry) — confirm.
# NOTE(review): the nfnetecal1 fold0/fold1 names end in ".pth.pth" while
# fold2/fold3 end in ".pth" — verify these match the files on disk.
weights_path_15sec_64mel = [Path("../input/15seceffv2sspec64mel2048nfftdoublemixup/spec-64mel-15sec-effv2s-fold1.pth"),
                      Path("../input/15seceffv2sspec64mel2048nfftdoublemixup/spec-64mel-15sec-effv2s-fold2.pth"),
                      Path("../input/15seceffv2sspec64mel2048nfftdoublemixup/spec-64mel-15sec-effv2s-fold3.pth"),
                      Path("../input/nfnet64melspec/fold0-64mel-spec-diffseed-nfnetecal0.pth"),
                      Path("../input/nfnet64melspec/fold1-64mel-spec-diffseed-nfnetecal0.pth"),
                      Path("../input/nfnet64melspec/fold2-64mel-spec-diffseed-nfnetecal0.pth"),
                      Path("../input/nfnet64melspec/fold3-64mel-spec-diffseed-nfnetecal0.pth"),
                      Path("../input/nfnet64melspec/fold0-64mel-spec-diffseed-nfnetecal1.pth.pth"),
                      Path("../input/nfnet64melspec/fold1-64mel-spec-diffseed-nfnetecal1.pth.pth"),
                      Path("../input/nfnet64melspec/fold2-64mel-spec-diffseed-nfnetecal1.pth"),
                      Path("../input/nfnet64melspec/fold3-64mel-spec-diffseed-nfnetecal1.pth"),
                     ]
# Run the three model groups over every test soundscape. Each call gets its
# own checkpoint list and the same 0.065 threshold; the results are kept in
# separate variables. Nothing is written to disk in this cell.
temp_submission = prediction_framewise_voting_10sec_speedup(
    test_audios=all_audios,
    weights_path=weights_path_10sec,
    threshold=0.065,
)

temp_submission1 = prediction_framewise_voting_15sec_low_speedup(
    test_audios=all_audios,
    weights_path=weights_path_15sec,
    threshold=0.065,
)

temp_submission2 = prediction_framewise_voting_15sec_64mel_speedup(
    test_audios=all_audios,
    weights_path=weights_path_15sec_64mel,
    threshold=0.065,
)

In [None]:
# Inspect the frame returned by prediction_framewise_voting_10sec_speedup.
temp_submission

In [None]:
# Inspect the frame returned by prediction_framewise_voting_15sec_low_speedup.
temp_submission1

In [None]:
# Inspect the frame returned by prediction_framewise_voting_15sec_64mel_speedup.
temp_submission2

In [None]:
# Shift this frame's numbered columns by 11 (target_0..4 -> target_11..15,
# score_0..4 -> score_11..15) and move its row_id to row_id_backup, so the
# names stay distinct when frames are placed side by side.
temp_submission1 = temp_submission1.rename(
    columns={
        "row_id": "row_id_backup",
        **{
            f"{prefix}_{model_idx}": f"{prefix}_{model_idx + 11}"
            for model_idx in range(5)
            for prefix in ("target", "score")
        },
    }
)

In [None]:
# Shift the first seven numbered columns by 16 (target_0..6 -> target_16..22,
# score_0..6 -> score_16..22) and move row_id to row_id_backup.
temp_submission2 = temp_submission2.rename(
    columns={
        "row_id": "row_id_backup",
        **{
            f"{prefix}_{model_idx}": f"{prefix}_{model_idx + 16}"
            for model_idx in range(7)
            for prefix in ("target", "score")
        },
    }
)

In [None]:
# Shift the remaining numbered columns by 16 (target_7..10 -> target_23..26,
# score_7..10 -> score_23..26). The row_id entry only has an effect if a
# column with that name is still present.
temp_submission2 = temp_submission2.rename(
    columns={
        "row_id": "row_id_backup",
        **{
            f"{prefix}_{model_idx}": f"{prefix}_{model_idx + 16}"
            for model_idx in range(7, 11)
            for prefix in ("target", "score")
        },
    }
)

In [None]:
# Inspect temp_submission before the other two frames are concatenated onto it.
temp_submission

In [None]:
# Put the three frames' columns side by side (pandas aligns rows on the index).
temp_submission = pd.concat(
    [temp_submission, temp_submission1, temp_submission2],
    axis="columns",
)

In [None]:
# Inspect the combined frame produced by the column-wise concat.
temp_submission

In [None]:
# Inspect temp_submission1 with its renamed columns.
temp_submission1

In [None]:
# Inspect temp_submission2 with its renamed columns.
temp_submission2

In [None]:
# Count the columns whose name begins with "target".
len_target = sum(
    1 for column in temp_submission.columns if column.startswith("target")
)

In [None]:
# Report how many "target*" columns were counted.
print(len_target)

In [None]:
# Names of the numbered vote columns: target_0 .. target_{len_target - 1}.
target_columns = [f"target_{i}" for i in range(len_target)]

In [None]:
# Names of the numbered score columns: score_0 .. score_{len_target - 1}.
score_columns = [f"score_{i}" for i in range(len_target)]

In [None]:
# Do not truncate cell contents when frames are displayed. ``None`` is the
# supported "no limit" value; the legacy ``-1`` sentinel has been deprecated
# since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [None]:
# Keep row_id plus the numbered target_* and score_* columns, in that order;
# every other column (e.g. row_id_backup) is dropped.
# .copy() makes the result an independent frame, so the column assignments
# made on it later do not trigger pandas' SettingWithCopyWarning.
temp_submission = temp_submission[
    ['row_id']
    + [f"target_{i}" for i in range(len_target)]
    + [f"score_{i}" for i in range(len_target)]
].copy()

In [None]:
#temp_submission[[f"score_{i}" for i in range(11)]][-20:]

In [None]:
def get_max_score(df):
    """Return the largest value among the row's score columns.

    ``df`` is a single row (anything indexable by column name); the columns
    read are those named in the module-level ``score_columns``.
    """
    return max([df[name] for name in score_columns])

In [None]:
def get_mean_score(df):
    """Return the arithmetic mean of the row's score columns.

    ``df`` is a single row (anything indexable by column name); the columns
    read are those named in the module-level ``score_columns``.
    """
    row_scores = [df[name] for name in score_columns]
    return np.mean(row_scores)

In [None]:
def get_second_score(df):
    """Return the second-highest value among the row's score columns.

    ``df`` is a single row (anything indexable by column name); the columns
    read are those named in the module-level ``score_columns``. At least two
    score columns are required, otherwise the index lookup fails.
    """
    row_scores = [df[name] for name in score_columns]
    # Negating before argsort orders positions from highest score to lowest.
    descending = np.argsort(-np.array(row_scores))
    runner_up = descending[1]
    return row_scores[runner_up]

In [None]:
# Preview the first rows of the combined frame.
temp_submission.head()

In [None]:
# Preview the last rows of the combined frame.
temp_submission.tail()

In [None]:
def get_real_target(df):
    """Return whether a strict majority of the row's vote columns are set.

    ``df`` is a single row (anything indexable by column name); the columns
    read are those named in the module-level ``target_columns``. The votes
    are summed and compared against ``len(target_columns) // 2 + 1``.
    """
    votes = sum(df[name] for name in target_columns)
    votes_needed = len(target_columns) // 2 + 1
    return votes >= votes_needed

In [None]:
# Derive one summary column per helper by applying it to every row.
# Majority vote over the numbered target columns:
temp_submission["target"] = temp_submission.apply(get_real_target, axis="columns")
# Highest, average and second-highest of the numbered score columns:
temp_submission["max_score"] = temp_submission.apply(get_max_score, axis="columns")
temp_submission["mean_score"] = temp_submission.apply(get_mean_score, axis="columns")
temp_submission["second_score"] = temp_submission.apply(get_second_score, axis="columns")
temp_submission.head()

In [None]:
#temp_submission[[f"target_{x}" for x in range(23)]]

In [None]:
# Number of rows whose max_score exceeds 0.200.
(temp_submission['max_score']>0.200).sum()

In [None]:
# Number of rows whose mean_score exceeds 0.0855.
(temp_submission['mean_score']>0.0855).sum()

In [None]:
# Number of rows whose second_score exceeds 0.20.
(temp_submission['second_score']>0.20).sum()

In [None]:
# Final decision rule: a row is positive when its second-highest score is
# above 0.20. This overwrites any existing 'target' column. (A max_score > 0.2
# rule was the alternative tried earlier.)
temp_submission["target"] = temp_submission["second_score"].gt(0.20)

In [None]:
# The submission keeps only the row identifier and the final decision.
submission = temp_submission.loc[:, ["row_id", "target"]]

In [None]:
# Write the row_id/target table to submission.csv, omitting the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the table that was written.
submission.head()

In [None]:
# Sum of the target column; for boolean values this is the number of True rows.
submission['target'].sum()