In [None]:
# Shell escape: print the NVIDIA GPU status of the current runtime.
!nvidia-smi

In [None]:
# Set environment
# The hosting platform is inferred from which modules are already present in
# sys.modules; when neither marker module is loaded the run is treated as LOCAL.
import sys, os
IN_COLAB  = 'google.colab' in sys.modules
IN_KAGGLE = 'kaggle_web_client' in sys.modules
LOCAL     = not (IN_KAGGLE or IN_COLAB)
print(f'IN_COLAB:{IN_COLAB}, IN_KAGGLE:{IN_KAGGLE}, LOCAL:{LOCAL}')

In [None]:
# For Colab
# ==================
# Colab-only bootstrap: mount Google Drive, install the Kaggle CLI with the
# credentials stored on Drive, then download and unpack the competition data.
if IN_COLAB:
    # Mount Google Drive so the Kaggle API token stored there is reachable.
    from google.colab import drive
    drive.mount('/content/drive')
    # Install the Kaggle CLI and copy kaggle.json from Google Drive into ~/.kaggle.
    ! pip install --upgrade --force-reinstall --no-deps  kaggle > /dev/null
    ! mkdir ~/.kaggle
    ! cp "/content/drive/MyDrive/kaggle/kaggle.json" ~/.kaggle/
    # Restrict the token file to owner read/write.
    ! chmod 600 ~/.kaggle/kaggle.json
    
    # Download only when the extracted audio folder is not already present.
    # NOTE(review): the check uses the absolute path /content/input while the
    # shell commands use the relative path `input`; this assumes the working
    # directory is /content -- confirm.
    if not os.path.exists("/content/input/train_short_audio"):
        !mkdir input
        !kaggle competitions download -c birdclef-2021
        !unzip /content/birdclef-2021.zip -d input

## Libraries

In [None]:
# Install the extra packages each hosted platform needs. Kaggle installs
# torchlibrosa from a wheel under ../input; Colab installs everything from PyPI.
# No branch runs for LOCAL, so a local environment must already provide them.
# NOTE(review): the PyPI installs are unpinned, so results may drift between runs.
if IN_KAGGLE:
    !pip install ../input/torchlibrosa/torchlibrosa-0.0.5-py3-none-any.whl
    !pip install colorednoise
elif IN_COLAB:
    !pip install -q pysndfx SoundFile audiomentations timm torchlibrosa colorednoise

In [None]:
import sys
if IN_KAGGLE:
    sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
from os.path import join as pjoin
from glob import glob

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import librosa
import soundfile as sf
from pathlib import Path
import colorednoise as cn
from IPython.display import Audio, IFrame, display # jupyterで再生につかう

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation

import timm
import warnings 
from tqdm.notebook import tqdm
import joblib

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## CFG

In [None]:
class CFG:
    """Notebook-wide experiment configuration, used as a plain namespace.

    Attributes are read directly as ``CFG.<name>`` and some are overwritten by
    later cells (platform overrides, debug mode, noise folder path).
    """
    debug = True
    inference_soundscape = True
    exp_name = "test"
    seed = 29
    n_fold = 5
    trn_fold = [0]
    target_col = 'primary_label'
    train_datadir = Path("../input/birdclef-2021/train_short_audio")
    period = 5
    img_size = 224
    criterion = 'BCEWithLogitsLoss'
    model_name = 'rexnet_200'
    # Candidate timm backbones:
    #   'rexnet_200', 'tf_efficientnet_b3', 'resnet18', 'densenet121',
    #   'vit_deit_tiny_patch16_224', 'vit_deit_small_patch16_224',
    #   'vit_deit_base_patch16_224'
    target_size = 397
    label_smoothing = 0.0
    mixup_proba = 0.5
    # Waveform augmentations, resolved by class name in get_transforms().
    # Keep "SpecifiedNoise" at index 2 of "train": a later cell writes its
    # noise_folder_path through that position.
    transforms = {
            "train": [{"name": "NoiseInjection", "params":{"max_noise_level": 0.04}},
                      {"name": "PinkNoise", "params":{"min_snr": 10.0}},
                      {"name": "SpecifiedNoise", "params":{"noise_folder_path": "", "min_snr":0.5, "max_snr":0.8 }},
                      #{"name": "PitchShift", "params":{"max_range": 3}},
                      {"name": "RandomVolume", "params":{"limit": 4}},
                      {"name": "Normalize"}],
            "valid": [{"name": "Normalize"}]
        }
    # Audio Params
    sample_rate = 32000
    n_mels = 128
    fmin = 20
    fmax = 16000
    n_fft = 2048
    hop_length = n_fft//4
    spec_aggreagation = 'deltas' #['repeat3', 'deltas']

    epochs = 30
    # scheduler/optimizer
    scheduler = 'CosineAnnealingWarmRestarts' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    # The next four values are read by get_scheduler() for the non-default
    # schedulers; they must exist or switching `scheduler` raises AttributeError.
    factor = 0.2 # ReduceLROnPlateau
    patience = 4 # ReduceLROnPlateau
    eps = 1e-6 # ReduceLROnPlateau
    T_max = 10 # CosineAnnealingLR
    T_0 = 30 # CosineAnnealingWarmRestarts
    lr = 1e-4
    min_lr = 1e-6
    weight_decay = 1e-6
    # train
    gradient_accumulation_steps = 1
    apex = False
    max_grad_norm = 1000
    print_freq = 100
    # model
    pretrained = True
    in_channels = 3
    # Split -- tied to n_fold / seed so the values cannot drift apart.
    split = "StratifiedKFold"
    split_params = {
        "n_splits": n_fold,
        "shuffle": True,
        "random_state": seed
    }
    # DataLoader
    loader = {
        "train": {
            "batch_size": 32,
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
            "drop_last": True
        },
        "valid": {
            "batch_size": 64,
            "num_workers": 4,
            "shuffle": False,
            "pin_memory": True,
            "drop_last": False
        }
    }

In [None]:
# Per-platform overrides of CFG. These mutate the class attributes in place,
# so the values persist for the rest of the kernel session.
if LOCAL:
    CFG.train_datadir = Path("F:/Kaggle/BirdCLEF2021/data/input/resample")
    CFG.loader["train"]["batch_size"] = 32
    # Load data in the main process when running locally.
    CFG.loader["train"]["num_workers"] = 0
    CFG.loader["valid"]["num_workers"] = 0
elif IN_COLAB:
    CFG.train_datadir = Path("/content/input/train_short_audio")

In [None]:
# Class vocabulary: a label's position in this list is its index in the
# one-hot target vector built by the dataset.
# NOTE(review): the length is expected to equal CFG.target_size (397) -- confirm.
TARGETS = [
        'acafly', 'acowoo', 'aldfly', 'ameavo', 'amecro',
        'amegfi', 'amekes', 'amepip', 'amered', 'amerob',
        'amewig', 'amtspa', 'andsol1', 'annhum', 'astfly',
        'azaspi1', 'babwar', 'baleag', 'balori', 'banana',
        'banswa', 'banwre1', 'barant1', 'barswa', 'batpig1',
        'bawswa1', 'bawwar', 'baywre1', 'bbwduc', 'bcnher',
        'belkin1', 'belvir', 'bewwre', 'bkbmag1', 'bkbplo',
        'bkbwar', 'bkcchi', 'bkhgro', 'bkmtou1', 'bknsti', 'blbgra1',
        'blbthr1', 'blcjay1', 'blctan1', 'blhpar1', 'blkpho',
        'blsspa1', 'blugrb1', 'blujay', 'bncfly', 'bnhcow', 'bobfly1',
        'bongul', 'botgra', 'brbmot1', 'brbsol1', 'brcvir1', 'brebla',
        'brncre', 'brnjay', 'brnthr', 'brratt1', 'brwhaw', 'brwpar1',
        'btbwar', 'btnwar', 'btywar', 'bucmot2', 'buggna', 'bugtan',
        'buhvir', 'bulori', 'burwar1', 'bushti', 'butsal1', 'buwtea',
        'cacgoo1', 'cacwre', 'calqua', 'caltow', 'cangoo', 'canwar',
        'carchi', 'carwre', 'casfin', 'caskin', 'caster1', 'casvir',
        'categr', 'ccbfin', 'cedwax', 'chbant1', 'chbchi', 'chbwre1',
        'chcant2', 'chispa', 'chswar', 'cinfly2', 'clanut', 'clcrob',
        'cliswa', 'cobtan1', 'cocwoo1', 'cogdov', 'colcha1', 'coltro1',
        'comgol', 'comgra', 'comloo', 'commer', 'compau', 'compot1',
        'comrav', 'comyel', 'coohaw', 'cotfly1', 'cowscj1', 'cregua1',
        'creoro1', 'crfpar', 'cubthr', 'daejun', 'dowwoo', 'ducfly', 'dusfly',
        'easblu', 'easkin', 'easmea', 'easpho', 'eastow', 'eawpew', 'eletro',
        'eucdov', 'eursta', 'fepowl', 'fiespa', 'flrtan1', 'foxspa', 'gadwal',
        'gamqua', 'gartro1', 'gbbgul', 'gbwwre1', 'gcrwar', 'gilwoo',
        'gnttow', 'gnwtea', 'gocfly1', 'gockin', 'gocspa', 'goftyr1',
        'gohque1', 'goowoo1', 'grasal1', 'grbani', 'grbher3', 'grcfly',
        'greegr', 'grekis', 'grepew', 'grethr1', 'gretin1', 'greyel',
        'grhcha1', 'grhowl', 'grnher', 'grnjay', 'grtgra', 'grycat',
        'gryhaw2', 'gwfgoo', 'haiwoo', 'heptan', 'hergul', 'herthr',
        'herwar', 'higmot1', 'hofwoo1', 'houfin', 'houspa', 'houwre',
        'hutvir', 'incdov', 'indbun', 'kebtou1', 'killde', 'labwoo', 'larspa',
        'laufal1', 'laugul', 'lazbun', 'leafly', 'leasan', 'lesgol', 'lesgre1',
        'lesvio1', 'linspa', 'linwoo1', 'littin1', 'lobdow', 'lobgna5', 'logshr',
        'lotduc', 'lotman1', 'lucwar', 'macwar', 'magwar', 'mallar3', 'marwre',
        'mastro1', 'meapar', 'melbla1', 'monoro1', 'mouchi', 'moudov', 'mouela1',
        'mouqua', 'mouwar', 'mutswa', 'naswar', 'norcar', 'norfli', 'normoc', 'norpar',
        'norsho', 'norwat', 'nrwswa', 'nutwoo', 'oaktit', 'obnthr1', 'ocbfly1',
        'oliwoo1', 'olsfly', 'orbeup1', 'orbspa1', 'orcpar', 'orcwar', 'orfpar',
        'osprey', 'ovenbi1', 'pabspi1', 'paltan1', 'palwar', 'pasfly', 'pavpig2',
        'phivir', 'pibgre', 'pilwoo', 'pinsis', 'pirfly1', 'plawre1', 'plaxen1',
        'plsvir', 'plupig2', 'prowar', 'purfin', 'purgal2', 'putfru1', 'pygnut',
        'rawwre1', 'rcatan1', 'rebnut', 'rebsap', 'rebwoo', 'redcro', 'reevir1',
        'rehbar1', 'relpar', 'reshaw', 'rethaw', 'rewbla', 'ribgul', 'rinkin1',
        'roahaw', 'robgro', 'rocpig', 'rotbec', 'royter1', 'rthhum', 'rtlhum',
        'ruboro1', 'rubpep1', 'rubrob', 'rubwre1', 'ruckin', 'rucspa1', 'rucwar',
        'rucwar1', 'rudpig', 'rudtur', 'rufhum', 'rugdov', 'rumfly1', 'runwre1',
        'rutjac1', 'saffin', 'sancra', 'sander', 'savspa', 'saypho', 'scamac1',
        'scatan', 'scbwre1', 'scptyr1', 'scrtan1', 'semplo', 'shicow', 'sibtan2',
        'sinwre1', 'sltred', 'smbani', 'snogoo', 'sobtyr1', 'socfly1', 'solsan',
        'sonspa', 'soulap1', 'sposan', 'spotow', 'spvear1', 'squcuc1', 'stbori',
        'stejay', 'sthant1', 'sthwoo1', 'strcuc1', 'strfly1', 'strsal1', 'stvhum2',
        'subfly', 'sumtan', 'swaspa', 'swathr', 'tenwar', 'thbeup1', 'thbkin',
        'thswar1', 'towsol', 'treswa', 'trogna1', 'trokin', 'tromoc', 'tropar',
        'tropew1', 'tuftit', 'tunswa', 'veery', 'verdin', 'vigswa', 'warvir',
        'wbwwre1', 'webwoo1', 'wegspa1', 'wesant1', 'wesblu', 'weskin', 'wesmea',
        'westan', 'wewpew', 'whbman1', 'whbnut', 'whcpar', 'whcsee1', 'whcspa',
        'whevir', 'whfpar1', 'whimbr', 'whiwre1', 'whtdov', 'whtspa', 'whwbec1',
        'whwdov', 'wilfly', 'willet1', 'wilsni1', 'wiltur', 'wlswar', 'wooduc',
        'woothr', 'wrenti', 'y00475', 'yebcha', 'yebela1', 'yebfly', 'yebori1',
        'yebsap', 'yebsee1', 'yefgra1', 'yegvir', 'yehbla', 'yehcar1', 'yelgro',
        'yelwar', 'yeofly1', 'yerwar', 'yeteup1', 'yetvir']

## Directory & LoadData

In [None]:
# Resolve input/output locations and load the metadata tables for the current
# platform. Every branch defines the same names:
#   INPUT_DIR, OUTPUT_DIR, train, test, train_HL, BGN_DIR.
# `train` is reduced to the columns ['primary_label', 'filename'];
# `train_HL` supplies per-file "begin"/"end" values used by the dataset.
if IN_KAGGLE:
    INPUT_DIR = Path('../input/birdclef-2021/')
    OUTPUT_DIR = './'
    train = pd.read_csv('../input/birdclef-2021/train_metadata.csv')
    train = train[['primary_label', 'filename']]
    test = pd.read_csv('../input/birdclef-2021/test.csv')
    train_HL = pd.read_csv( '../input/bird2-hl-list/30HL_146classes.csv')
    train_HL = train_HL[["primary_label", "secondary_labels", "filename", "begin", "end"]]
    BGN_DIR = '../input/birdclef-bgn'
elif IN_COLAB:
    INPUT_DIR = Path('/content/input/')
    OUTPUT_DIR = f'/content/drive/MyDrive/kaggle/BirdClef2021/data/output/{CFG.exp_name}/'
    train = pd.read_csv('/content/input/train_metadata.csv')
    train = train[['primary_label', 'filename']]
    test = pd.read_csv('/content/input/test.csv')
    train_HL = pd.read_csv( '/content/drive/MyDrive/kaggle/BirdClef2021/data/input/30HL_146classes.csv')
    train_HL = train_HL[["primary_label", "secondary_labels", "filename", "begin", "end"]]
    BGN_DIR = '/content/drive/MyDrive/kaggle/BirdClef2021/data/input/bgn'
# NOTE(review): this is a separate `if`, not `elif`; it is equivalent here only
# because LOCAL is defined as "neither Kaggle nor Colab".
if LOCAL:
    INPUT_DIR = Path("F:/Kaggle/BirdCLEF2021/data/input/")
    OUTPUT_DIR = f'F:/Kaggle/BirdCLEF2021/data/output/{CFG.exp_name}/'
    train = pd.read_csv('F:/Kaggle/BirdCLEF2021/data/input/resample/train_mod.csv')
    # The local metadata stores the file name under 'resampled_filename';
    # rename it so downstream code can always use 'filename'.
    train = train[['primary_label', 'resampled_filename']]
    train.columns = ['primary_label', 'filename']
    test = pd.read_csv('F:/Kaggle/BirdCLEF2021/data/input/test.csv')
    train_HL = pd.read_csv( 'F:/Kaggle/BirdCLEF2021/data/input/30HL_146classes.csv')
    train_HL = train_HL[["primary_label", "secondary_labels", "filename", "begin", "end"]]
    BGN_DIR = 'F:/Kaggle/BirdCLEF2021/data/input/bgn'

# Point the SpecifiedNoise augmentation (index 2 of the train transforms) at
# the background-noise folder chosen above.
CFG.transforms["train"][2]["params"]["noise_folder_path"] = BGN_DIR
# get train soundscapes
# Every .ogg under train_soundscapes is decoded and kept in memory.
# NOTE(review): TS_clips is not used in the visible part of the notebook, and
# holding every soundscape in RAM may be expensive -- confirm it is needed.
TS_clips = []
DATADIR = INPUT_DIR / "train_soundscapes/"
all_audios = list(DATADIR.glob("*.ogg"))
for audio in all_audios:
    clip, _ = sf.read(audio)
    TS_clips.append(clip)
    
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Debug mode: shorten training and work on a fixed random subset of 1000 rows.
# The index is reset so later positional/label lookups agree.
# NOTE(review): sample(n=1000) raises if `train` has fewer than 1000 rows.
if CFG.debug:
    CFG.epochs = 2
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Utils

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Sets PYTHONHASHSEED and seeds Python's ``random``, NumPy and PyTorch
    (CPU and CUDA), then switches cuDNN to deterministic mode with
    benchmarking disabled.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def init_logger(log_file = f'{OUTPUT_DIR}train.log'):
    """Create the notebook logger that writes to both the console and a file.

    Parameters
    ----------
    log_file : str
        Path of the log file. The default is built from OUTPUT_DIR when this
        function is defined, not when it is called.

    Returns
    -------
    logging.Logger
        Logger named after the current module, at INFO level, with exactly one
        stream handler and one file handler, both using a message-only format.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running this cell
    # would stack another pair of handlers and print every message twice.
    # Drop (and close) whatever is attached before adding fresh handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Create the notebook-wide logger (default log file) and seed all RNGs from CFG.seed.
LOGGER = init_logger()
set_seed(seed=CFG.seed)

In [None]:
# Log config
# Record the key hyper-parameters of this run in the log, one per line.
LOGGER.info(f"exp_name:{CFG.exp_name}")
LOGGER.info(f"train_period:{CFG.period}")
LOGGER.info(f"model_name:{CFG.model_name}")
LOGGER.info(f"spec_aggreagation:{CFG.spec_aggreagation}")
LOGGER.info(f"epochs:{CFG.epochs}")
LOGGER.info(f"lr:{CFG.lr}")
LOGGER.info(f"min_lr:{CFG.min_lr}")
# The batch size lives in the nested loader config, so read it into a local first.
bs=CFG.loader["train"]["batch_size"]
LOGGER.info(f"batch_size:{bs}")

## CV split

In [None]:
# Assign each training row to a validation fold, stratified by the target label.
# The splitter is configured from CFG.split_params (CFG.n_fold is not read here).
folds = train.copy()
Fold = StratifiedKFold(**CFG.split_params)
for n, (tr_idx, val_idx) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # val_idx holds positional indices but .loc is label-based.
    # NOTE(review): this assumes `folds` has a default 0..N-1 index -- confirm.
    folds.loc[val_idx, 'fold'] = int(n)
# Cast the fold column to integers once every row has been assigned.
folds['fold'] = folds['fold'].astype(int)
# Check the proportion: table of row counts per (label, fold).
fold_proportion = pd.pivot_table(folds, index=CFG.target_col, columns="fold", aggfunc=len)
print(fold_proportion.shape)

In [None]:
fold_proportion

## Dataset

In [None]:
class WaveformDataset(Dataset):
    """Dataset yielding fixed-length waveform clips with one-hot targets.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per audio file; must provide 'filename' and CFG.target_col.
    df_HL : pandas.DataFrame
        Per-file ranges with columns 'filename', 'begin' and 'end' (seconds).
    datadir : pathlib.Path
        Root folder; files are read from ``datadir / label / filename``.
    img_size : int
        Stored on the instance; not used by this class.
    waveform_transforms : callable or None
        Applied to the float32 clip when given.
    period : int
        Clip length in seconds.
    validation : bool
        When True, the ``df_HL`` ranges and random offsets are ignored and the
        clip always starts at the beginning of the file.
    """
    def __init__(self,
                 df: pd.DataFrame,
                 df_HL: pd.DataFrame,
                 datadir: Path,
                 img_size=224,
                 waveform_transforms=None,
                 period=20,
                 validation=False):
        self.df = df
        self.df_HL = df_HL
        self.datadir = datadir
        self.img_size = img_size
        self.waveform_transforms = waveform_transforms
        self.period = period
        self.validation = validation
        # Integer class index (position in TARGETS) for every row, in row order.
        self.y = np.array([TARGETS.index(c) for c in df[CFG.target_col]])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """
        Return ``{'waveforms': float32 clip, 'targets': one-hot vector}``.

        If the file has a range in ``df_HL`` (and this is not validation), a
        ``period``-second clip is taken at a random offset inside that range.
        Otherwise, and always during validation, the first ``period`` seconds
        are used. Clips shorter than ``period`` are zero-padded.
        """
        # Positional lookup: the DataLoader passes 0..len-1 and self.y is
        # positional as well, so this stays correct even when `df` does not
        # carry a default RangeIndex (e.g. a fold subset that was not reset).
        sample = self.df.iloc[idx]
        wav_name = sample['filename']
        ebird_code = sample[CFG.target_col]

        y, sr = sf.read(self.datadir / ebird_code / wav_name)
        # Multi-channel files are returned as (frames, channels); down-mix to
        # mono so the 1-D padding/cropping below works.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Look up the range recorded for this file, if any.
        row = self.df_HL[self.df_HL['filename'] == wav_name]
        use_hl = len(row) > 0 and not self.validation
        if use_hl:
            begin = row["begin"].values[0].astype(int)
            end = row["end"].values[0].astype(int)
            if end - begin < self.period:
                # The range is shorter than one clip: widen it on both sides by
                # the missing duration, clamped to the file boundaries.
                y_sec = len(y)//sr
                missing = self.period - (end - begin)
                new_begin = max(begin - missing, 0)
                new_end = min(end + missing, y_sec)
                y = y[new_begin*sr: new_end*sr]
            else:
                y = y[begin*sr: end*sr]

        len_y = len(y)
        effective_length = sr * self.period
        if len_y < effective_length:
            # Too short: place the audio inside a zero buffer (random offset
            # only for range-based training clips).
            start = np.random.randint(effective_length - len_y) if use_hl else 0
            new_y = np.zeros(effective_length, dtype=y.dtype)
            new_y[start:start + len_y] = y
            y = new_y.astype(np.float32)
        elif len_y > effective_length:
            # Too long: crop one clip (random offset only for range-based
            # training clips).
            start = np.random.randint(len_y - effective_length) if use_hl else 0
            y = y[start:start + effective_length].astype(np.float32)
        else:
            y = y.astype(np.float32)
        y = np.nan_to_num(y)

        if self.waveform_transforms:
            y = self.waveform_transforms(y)

        # Transforms may divide by zero on silent clips; scrub NaN/inf again.
        y = np.nan_to_num(y)

        labels = np.zeros(len(TARGETS), dtype=float)
        labels[TARGETS.index(ebird_code)] = 1.0

        return{
            'waveforms': y,
            'targets': labels
        }


In [None]:
# train_dataset = WaveformDataset(train,
#                                 CFG.train_datadir,
#                                 img_size=CFG.img_size,
#                                 waveform_transforms=None,
#                                 period=CFG.period,
#                                 validation=True)
# 
# data = train_dataset[0]
# print(data['waveforms'].shape, data['targets'].shape)
# plt.plot(data['waveforms'])
# plt.show()
# Audio(data=data['waveforms'], rate=32000)

## WaveformTransforms

In [None]:
# https://www.kaggle.com/hidehisaarai1213/pytorch-training-birdclef2021-starter
def get_transforms(phase: str):
    """Build the waveform transform pipeline configured for ``phase``.

    Each entry of ``CFG.transforms[phase]`` names a transform class that is
    looked up in the module globals and instantiated with its "params"
    (no arguments when "params" is absent or None). Names that do not resolve
    to a global are skipped without error.

    Returns
    -------
    Compose or None
        None when no transforms are configured or none could be resolved.
    """
    all_conf = CFG.transforms
    phase_conf = None if all_conf is None else all_conf[phase]
    if phase_conf is None:
        return None

    namespace = globals()
    built = []
    for entry in phase_conf:
        cls_name = entry["name"]
        kwargs = entry["params"] if entry.get("params") is not None else {}
        transform_cls = namespace.get(cls_name)
        if transform_cls is None:
            continue
        built.append(transform_cls(**kwargs))

    return Compose(built) if len(built) > 0 else None

# Base Class
# -------------------------------------------------
class AudioTransform:
    """Base class for waveform augmentations that are applied at random.

    Subclasses implement ``apply``. Calling the instance runs ``apply`` always
    when ``always_apply`` is set, otherwise with probability ``p``; in the
    remaining cases the input is returned untouched.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply, self.p = always_apply, p

    def __call__(self, y: np.ndarray):
        # Short-circuiting keeps the RNG untouched when always_apply is set.
        if self.always_apply or np.random.rand() < self.p:
            return self.apply(y)
        return y

    def apply(self, y: np.ndarray):
        """Transform ``y``; must be provided by subclasses."""
        raise NotImplementedError
        
class Normalize:
    """Peak-normalise a waveform so that its largest absolute sample is 1."""

    def __call__(self, y: np.ndarray):
        """Return ``y`` scaled by ``1 / max(|y|)`` as a Fortran-ordered array.

        Empty or all-zero (silent) input is returned unscaled: dividing by a
        zero peak would fill the clip with NaN.
        """
        if y.size == 0:
            return np.asfortranarray(y)
        max_vol = np.abs(y).max()
        if max_vol == 0:
            return np.asfortranarray(y)
        y_vol = y * 1 / max_vol
        return np.asfortranarray(y_vol)

class Compose:
    """Chain several waveform transforms into a single callable."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Feed ``y`` through every transform in order and return the result."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result
    
class NoiseInjection(AudioTransform):
    """Add white Gaussian noise scaled by a random level in ``[0, max_noise_level)``."""

    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5, sr=32000):
        super().__init__(always_apply, p)

        self.sr = sr
        # (low, high) bounds of the uniform noise-level draw.
        self.noise_level = (0.0, max_noise_level)

    def apply(self, y: np.ndarray, **params):
        """Return ``y`` plus scaled standard-normal noise, in ``y``'s dtype."""
        low, high = self.noise_level
        level = np.random.uniform(low, high)
        white = np.random.randn(len(y))
        return (y + white * level).astype(y.dtype)

# Also known as GaussianNoiseSNR.
class GaussianNoise(AudioTransform):
    """Add white noise whose peak is a random SNR (in dB) below the signal peak."""

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.sr = sr
        self.min_snr, self.max_snr = min_snr, max_snr

    def apply(self, y: np.ndarray, **params):
        """Return ``y`` plus peak-scaled white noise, in ``y``'s dtype."""
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        # Peak amplitude of the signal, and the noise peak that yields snr_db.
        signal_peak = np.sqrt(y ** 2).max()
        noise_peak = signal_peak / (10 ** (snr_db / 20))

        white = np.random.randn(len(y))
        white_peak = np.sqrt(white ** 2).max()
        # Bring the noise to unit peak, then to the target peak.
        scaled_noise = white * 1 / white_peak * noise_peak
        return (y + scaled_noise).astype(y.dtype)

class PinkNoise(AudioTransform):
    """Add power-law (exponent 1) noise at a random SNR (in dB) below the signal peak."""

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.sr = sr
        self.min_snr, self.max_snr = min_snr, max_snr

    def apply(self, y: np.ndarray, **params):
        """Return ``y`` plus peak-scaled pink noise, in ``y``'s dtype."""
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        # Peak amplitude of the signal, and the noise peak that yields snr_db.
        signal_peak = np.sqrt(y ** 2).max()
        noise_peak = signal_peak / (10 ** (snr_db / 20))

        pink = cn.powerlaw_psd_gaussian(1, len(y))
        pink_peak = np.sqrt(pink ** 2).max()
        # Bring the noise to unit peak, then to the target peak.
        scaled_noise = pink * 1 / pink_peak * noise_peak
        return (y + scaled_noise).astype(y.dtype)
    
def _db2float(db: float, amplitude=True):
    if amplitude:
        return 10**(db / 20)
    else:
        return 10 ** (db / 10)


def volume_down(y: np.ndarray, db: float):
    """
    Low level API for lowering the volume by ``db`` decibels.

    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        attenuation in decibels

    Returns
    -------
    numpy.ndarray
        ``y`` multiplied by the amplitude gain for ``-db`` dB
    """
    gain = _db2float(-db)
    return y * gain


def volume_up(y: np.ndarray, db: float):
    """
    Low level API for raising the volume by ``db`` decibels.

    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        gain in decibels

    Returns
    -------
    numpy.ndarray
        ``y`` multiplied by the amplitude gain for ``db`` dB
    """
    gain = _db2float(db)
    return y * gain


class RandomVolume(AudioTransform):
    """Scale the waveform by a random gain drawn uniformly from ``[-limit, limit)`` dB."""

    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        """Return ``y`` amplified (positive draw) or attenuated (negative draw)."""
        db = np.random.uniform(-self.limit, self.limit)
        if db >= 0:
            return volume_up(y, db)
        # volume_down expects a positive number of decibels to remove. Passing
        # the negative draw unchanged would raise the volume instead, so the
        # transform could never make a clip quieter.
        return volume_down(y, -db)
        
# https://www.kaggle.com/vladimirsydor/4-th-place-solution-inference-and-training-tips/data?scriptVersionId=42796948
class SpecifiedNoise(AudioTransform):
    """Blend the waveform with a randomly chosen recorded background noise.

    Parameters
    ----------
    noise_folder_path : str
        Folder searched for ``*/*.wav`` noise files (one sub-folder level).
    always_apply, p
        Forwarded to AudioTransform.
    min_snr, max_snr : float
        Bounds of the blend weight ``alpha``; despite the names they are used
        as a mixing ratio, not as decibels.
    sr : int
        Sample rate the noise files are loaded at.
    """

    def __init__(self, noise_folder_path, always_apply=False, p=0.5, min_snr=0.0, max_snr=1.0, sr=32000):
        super().__init__(always_apply, p)
        # Sorted so the noise picked for a given seed does not depend on the
        # file-system listing order.
        filenames = sorted(glob(pjoin(noise_folder_path, '*/*.wav')))
        max_samples = CFG.period * sr
        self.noises = []
        for noise_path in filenames:
            noise = librosa.load(noise_path, sr=sr)[0]
            # Normalise on the whole file first, then keep one training period.
            noise = librosa.util.normalize(noise)
            self.noises.append(noise[:max_samples])
        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y:np.ndarray, **params):
        """Return ``y * (1 - alpha) + noise * alpha`` for a random noise clip.

        Returns ``y`` unchanged when no noise files were found. A noise clip
        whose length differs from ``y`` is tiled or truncated to match.
        """
        if len(self.noises) == 0:
            # Nothing to mix in; randint(0, 0) below would raise.
            return y
        alpha = np.random.uniform(low=self.min_snr, high=self.max_snr)
        noise = self.noises[np.random.randint(low=0, high=len(self.noises))]
        if len(noise) != len(y):
            # np.resize repeats the clip when it is too short and cuts it when
            # it is too long, so the blend below always broadcasts.
            noise = np.resize(noise, len(y))
        augmented = y*(1 - alpha) + noise * alpha

        return augmented

## Criterion

In [None]:
def get_criterion():
    """Return the loss module selected by ``CFG.criterion``, moved to ``device``.

    Raises
    ------
    NotImplementedError
        If ``CFG.criterion`` is anything other than 'BCEWithLogitsLoss'.
    """
    if CFG.criterion != 'BCEWithLogitsLoss':
        raise NotImplementedError
    return nn.BCEWithLogitsLoss(reduction="mean").to(device)

## Scheduler

In [None]:
def get_scheduler(optimizer):
    """Build the learning-rate scheduler selected by ``CFG.scheduler``.

    Scheduler-specific settings (``factor``, ``patience``, ``eps``, ``T_max``)
    are read from CFG when present and otherwise fall back to the defaults
    below, so switching scheduler does not fail on a missing attribute.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer
        Optimizer whose learning rate is scheduled.

    Raises
    ------
    NotImplementedError
        If ``CFG.scheduler`` is not one of the supported names.
    """
    name = CFG.scheduler
    if name == 'ReduceLROnPlateau':
        return ReduceLROnPlateau(optimizer, mode='min',
                                 factor=getattr(CFG, 'factor', 0.2),
                                 patience=getattr(CFG, 'patience', 4),
                                 verbose=True,
                                 eps=getattr(CFG, 'eps', 1e-6))
    if name == 'CosineAnnealingLR':
        return CosineAnnealingLR(optimizer, T_max=getattr(CFG, 'T_max', 10),
                                 eta_min=CFG.min_lr, last_epoch=-1)
    if name == 'CosineAnnealingWarmRestarts':
        return CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1,
                                           eta_min=CFG.min_lr, last_epoch=-1)
    # Previously an unknown name fell through to `return scheduler` and failed
    # with an UnboundLocalError; fail with a clear message instead.
    raise NotImplementedError(f"Unsupported scheduler: {name!r}")

In [None]:
# check scheduler
# Dry-run the configured scheduler on a throw-away two-layer model and plot
# the learning rate it produces for each epoch.
# NOTE(review): this binds the global names `model`, `optimizer` and
# `scheduler`; later cells must not rely on them being the real training objects.
model = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
scheduler = get_scheduler(optimizer)

from pylab import rcParams
lrs = []
for epoch in range(1, CFG.epochs+1):
    # The epoch index is passed explicitly so the schedule can be evaluated
    # without running any optimizer steps.
    scheduler.step(epoch-1)
    lrs.append(optimizer.param_groups[0]["lr"])
# Wide, short figure for the learning-rate curve.
rcParams['figure.figsize'] = 20,3
plt.plot(lrs)

## Torchaudio utils
https://www.kaggle.com/vladimirsydor/4-th-place-solution-inference-and-training-tips

In [None]:
def compute_deltas(
        specgram: torch.Tensor,
        win_length: int = 5,
        mode: str = "replicate"
) -> torch.Tensor:
    r"""Compute delta coefficients of a tensor, usually a spectrogram:

    .. math::
       d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}

    where :math:`d_t` is the delta at time :math:`t`,
    :math:`c_t` is the spectrogram coefficient at time :math:`t` and
    :math:`N` is ``(win_length-1)//2``.

    Args:
        specgram (Tensor): Tensor of audio of dimension (..., freq, time)
        win_length (int, optional): Window length used for the delta (Default: ``5``)
        mode (str, optional): Padding mode applied along time (Default: ``"replicate"``)

    Returns:
        Tensor: Tensor of deltas of dimension (..., freq, time)

    Example
        >>> specgram = torch.randn(1, 40, 1000)
        >>> delta = compute_deltas(specgram)
        >>> delta2 = compute_deltas(delta)
    """
    original_shape = specgram.size()

    # Fold every leading dimension into the channel axis: (1, rows, time).
    rows = specgram.reshape(1, -1, original_shape[-1])
    n_rows = rows.shape[1]

    assert win_length >= 3

    half = (win_length - 1) // 2

    # Twice the sum of squared integers 1..half.
    norm = half * (half + 1) * (2 * half + 1) / 3

    # Pad the time axis so the output keeps the input length.
    padded = torch.nn.functional.pad(rows, (half, half), mode=mode)

    # One copy of the ramp [-half, ..., half] per row (depthwise convolution).
    ramp = torch.arange(-half, half + 1, 1, device=specgram.device, dtype=specgram.dtype)
    weight = ramp.repeat(n_rows, 1, 1)

    deltas = torch.nn.functional.conv1d(padded, weight, groups=n_rows) / norm

    # Restore the caller's shape.
    return deltas.reshape(original_shape)

def make_delta(input_tensor: torch.Tensor):
    """Apply ``compute_deltas`` along dimension 2 of a 4-D tensor.

    Dimensions 2 and 3 are swapped so that dimension 2 becomes the last axis
    ``compute_deltas`` works on, then swapped back.
    """
    swapped = input_tensor.transpose(3, 2)
    return compute_deltas(swapped).transpose(3, 2)

In [None]:
def mixup_data(x, y, alpha=0.4):
    """
    https://github.com/TheoViel/kaggle_birdcall_identification/blob/2de708b9871cf388f91b9b0a33e738a24cca565d/src/training/train.py#L15
    Blend every sample of a batch with another sample of the same batch (mixup).

    Arguments:
        x {torch tensor} -- Input batch
        y {torch tensor} -- Labels
    Keyword Arguments:
        alpha {float} -- Parameter of the beta distribution; a value <= 0
                         disables mixing (default: {0.4})
    Returns:
        torch tensor  -- Mixed input
        torch tensor  -- Labels of the original batch
        torch tensor  -- Labels of the shuffled batch
        float  -- Mixing weight sampled from the beta distribution
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    # Random pairing of the samples within the batch.
    perm = torch.randperm(x.size()[0])#.cuda()
    partner_x = x[perm, :]

    mixed_x = lam * x + (1 - lam) * partner_x
    return mixed_x, y, y[perm], lam

## Model

In [None]:
class CustomModel(nn.Module):
    """Waveform classifier: log-mel front end followed by a timm image backbone.

    The raw waveform is turned into a log-mel spectrogram inside the model,
    batch-normalised, optionally SpecAugment-ed (training only), expanded to
    three channels and passed to the timm model named ``model_name``.

    Note: the default of ``model_name`` is taken from CFG when the class is
    defined, not when it is instantiated.
    """
    def __init__(self, model_name=CFG.model_name, pretrained=False, in_channels=1,spec_aggreagation: str='repeat3'):
        super().__init__()
        self.model_name = model_name
        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=CFG.n_fft, hop_length=CFG.hop_length,
                                                 win_length=CFG.n_fft, window="hann", center=True, pad_mode="reflect",
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft,
                                                 n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)
        # Batch norm with one feature per mel bin (applied after a transpose in forward).
        self.bn0 = nn.BatchNorm2d(CFG.n_mels)
        
        # How the single-channel spectrogram is expanded to three channels in
        # forward: 'repeat3' or 'deltas'.
        self.spec_aggreagation = spec_aggreagation

        # NOTE(review): forward only produces 3 channels for 'repeat3'/'deltas',
        # so in_channels presumably has to be 3 for those modes -- confirm.
        self.model = timm.create_model(model_name,
                                       num_classes=CFG.target_size,
                                       pretrained=pretrained,
                                       in_chans=in_channels)

    def forward(self, input):
        """
        Input: (batch_size, data_length)
        Returns the output of the timm backbone (created with
        num_classes=CFG.target_size).
        """
        x = self.spectrogram_extractor(input)# output:(batch_size, 1(channel), time_steps, freq_bins)
        x = self.logmel_extractor(x)# output:(batch_size, 1(channel), time_steps, mel_bins)
        # For model names containing 'vit', pool the spectrogram to a square
        # img_size x img_size input.
        if 'vit' in self.model_name:
            x = F.adaptive_avg_pool2d(x, (CFG.img_size,CFG.img_size)) # (batch_size,channel(1),224,224)
        # Swap dims 1 and 3 so bn0 sees the last axis as its channel axis, then swap back.
        # NOTE(review): bn0 has CFG.n_mels features, but the 'vit' pooling above
        # makes the last axis CFG.img_size long -- confirm the two are compatible.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        # SpecAugment is applied only in training mode.
        if self.training:
            x = self.spec_augmenter(x)
        # 1channel => 3channel
        if self.spec_aggreagation == 'repeat3':
            x = torch.cat([x,x,x], dim=1)
        elif self.spec_aggreagation == 'deltas':
            # Channels: spectrogram, its delta, and the delta of the delta.
            delta_1 = make_delta(x)
            delta_2 = make_delta(delta_1)
            x = torch.cat([x,delta_1,delta_2], dim=1)
        # NOTE(review): any other spec_aggreagation value leaves x single-channel.

        #x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)

        x = self.model(x)
        return x

In [None]:
def get_model(model_name, pretrained=False, in_channels=3, spec_aggreagation='repeat3'):
    """Instantiate a CustomModel, move it to the GPU when one is available
    (CPU otherwise) and return it in evaluation mode.

    Callers that want to train must switch the model back with ``.train()``.
    """
    net = CustomModel(model_name=model_name,
                      pretrained=pretrained,
                      in_channels=in_channels,
                      spec_aggreagation=spec_aggreagation)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(target_device)
    net.eval()

    return net

## Helper functions

In [None]:
class AverageMeter(object):
    """Track the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the mean over ``n`` items (e.g. a batch)."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return elapsed and estimated remaining time as a formatted string.

    ``since`` is a ``time.time()`` timestamp and ``percent`` the completed
    fraction (must be non-zero); the total time is extrapolated linearly.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def save(model, optimizer, scheduler, epoch, preds, path):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    The checkpoint is a dict with the state dicts of the model, optimizer and
    scheduler under 'model', 'optimizer' and 'scheduler', plus 'epoch' and
    'preds' stored as given.
    """
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'epoch': epoch,
        'preds': preds,
    }
    torch.save(checkpoint, path)

# https://www.kaggle.com/theoviel/training-a-winning-model/notebook?scriptVersionId=42814701
# Identity matrix used as a lookup table: row i is the one-hot vector of class i.
ONE_HOT = np.eye(CFG.target_size)
def f1(truth, pred, threshold=0.5, avg="samples"):
    """F1 score of thresholded predictions.

    ``truth`` may be a 1-D array of class indices (expanded to one-hot rows)
    or an already multi-hot 2-D array. ``pred`` is binarised with
    ``pred > threshold`` and ``avg`` is passed to sklearn's ``f1_score``.
    """
    targets = ONE_HOT[truth] if len(truth.shape) == 1 else truth
    binary_pred = (pred > threshold).astype(int)
    return f1_score(targets, binary_pred, average=avg)

https://www.kaggle.com/yasufuminakama/cassava-resnext50-32x4d-starter-training

In [None]:
def train_one_epoch(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Each batch is a dict with 'waveforms' and 'targets'. With probability
    CFG.mixup_proba the batch is mixed with a shuffled copy of itself and the
    targets become the clamped union of both label sets. Gradients are clipped
    on every step and the optimizer is stepped every
    CFG.gradient_accumulation_steps batches.

    NOTE(review): `scheduler` is accepted but never stepped here; presumably
    the caller steps it once per epoch -- confirm.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()  # not used in this function
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0  # counts optimizer steps; not read afterwards
    
    for step, data in enumerate(train_loader):
        waveforms = data['waveforms']
        labels = data['targets']
        # measure data loading time
        data_time.update(time.time() - end)
        
        waveforms = waveforms.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        
        # Mixup on raw waveforms; the mixing weight is discarded and the two
        # label sets are merged into one multi-hot target.
        if np.random.rand() < CFG.mixup_proba:
            waveforms, y_a, y_b, _ = mixup_data(waveforms, labels, alpha=0.4)
            labels = torch.clamp(y_a + y_b, 0, 1)
        
        y_preds = model(waveforms)
        loss = criterion(y_preds, labels)
        # record loss
        losses.update(loss.item(), batch_size)
        # Scale so the accumulated gradient matches one large batch.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        # NOTE(review): `amp` is not imported in the visible part of the
        # notebook, so CFG.apex=True would presumably raise NameError -- confirm.
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # Clip gradients in place; the returned total norm is only printed.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        # NOTE(review): gradients left over from a final, incomplete
        # accumulation window are never applied.
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # Progress line every CFG.print_freq steps and on the last step.
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg

def valid_fn(valid_loader, model, criterion, device):
    """Run one evaluation pass over ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: iterable of batches; each batch is indexed with the keys
            'waveforms' and 'targets'.
        model: network invoked as ``model(waveforms)``; switched to eval mode here.
        criterion: loss callable invoked as ``criterion(y_preds, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        tuple ``(avg_loss, preds)`` where ``avg_loss`` is ``losses.avg`` (each
        batch loss is recorded with its batch size) and ``preds`` is a float64
        ndarray of sigmoid outputs, rows in loader order.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    # Seed with an empty float64 array so the result keeps the
    # (0, CFG.target_size) shape for an empty loader and the float64 dtype.
    pred_chunks = [np.empty((0, CFG.target_size))]
    start = end = time.time()
    for step, data in enumerate(valid_loader):
        waveforms = data['waveforms']
        labels = data['targets']
        # measure data loading time
        data_time.update(time.time() - end)
        waveforms = waveforms.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute predictions and loss; nothing here is back-propagated
        with torch.no_grad():
            y_preds = model(waveforms)
            loss = criterion(y_preds, labels)
            pred_chunks.append(torch.sigmoid(y_preds).cpu().numpy())
        losses.update(loss.item(), batch_size)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    # Single concatenation instead of re-copying the growing array every batch.
    preds = np.concatenate(pred_chunks)
    return losses.avg, preds

In [None]:
class TestDataset(Dataset):
    """Dataset that cuts fixed-length windows out of one long audio clip.

    Each row of ``df`` describes one window through its ``row_id`` and its end
    time ``seconds``; the window covers ``[seconds - period, seconds)`` seconds
    of ``clip``.

    Args:
        df: table with ``row_id`` and ``seconds`` columns, one row per window.
        clip: 1-D waveform the windows are sliced from.
        waveform_transforms: optional callable applied to each window.
        sr: sampling rate used to convert seconds to sample indices.
        period: window length in seconds.
    """

    def __init__(self, df: pd.DataFrame, clip: np.ndarray,
                 waveform_transforms=None, sr: int = 32000, period: int = 5):
        self.df = df
        self.clip = clip
        self.waveform_transforms=waveform_transforms
        self.sr = sr
        self.period = period
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        """Return ``(window, row_id)`` for the ``idx``-th row of ``df``."""
        # Positional lookup: samplers hand out 0..len-1, which need not match
        # the dataframe's index labels.
        sample = self.df.iloc[idx]
        row_id = sample.row_id
        
        end_seconds = int(sample.seconds)
        # Clamp at 0 so a window ending before `period` seconds does not turn
        # into a negative (wrap-around) slice start.
        start_seconds = max(end_seconds - self.period, 0)
        
        start_index = self.sr * start_seconds
        end_index = self.sr * end_seconds
        
        y = self.clip[start_index:end_index].astype(np.float32)
        
        # Replace NaN/inf before and after the transform.
        y = np.nan_to_num(y)

        if self.waveform_transforms:
            y = self.waveform_transforms(y)

        y = np.nan_to_num(y)
        
        return y, row_id

def prediction_for_clip(test_df: pd.DataFrame,
                        clip: np.ndarray,
                        model,
                        threshold=0.5):
    """Predict sigmoid scores for every window of a single audio clip.

    Args:
        test_df: dataframe describing the windows of one audio file.
        clip: audio data of that file.
        model: model instance; put in eval mode and moved to the global ``device``.
        threshold: not used here; kept so existing callers keep working.

    Returns:
        float64 ndarray with one row per row of ``test_df``
        (the original note gives the expected shape as (120, 397)).
    """
    model.eval()
    # Moving the model is loop-invariant, so do it once up front.
    model = model.to(device)
    dataset = TestDataset(df=test_df,
                          clip=clip,
                          waveform_transforms=None)
    loader = DataLoader(dataset, batch_size=120, shuffle=False)

    # Seed with an empty float64 array to keep the (0, len(TARGETS)) shape for
    # an empty loader and the float64 result dtype.
    pred_chunks = [np.empty((0, len(TARGETS)))]
    with torch.no_grad():
        for waveforms, _row_ids in loader:
            y_pred = model(waveforms.to(device))
            pred_chunks.append(torch.sigmoid(y_pred).cpu().numpy())

    return np.concatenate(pred_chunks)

def post_process_site_12(preds, threshold=0.5, maxpreds=3):
    """Turn per-window scores into space-separated class-name strings.

    Scores below ``threshold`` are zeroed, each window is then boosted by half
    of its previous and next window's scores, and for every window at most
    ``maxpreds`` classes whose aggregated score reaches ``threshold`` are kept,
    highest score first.

    Args:
        preds: 2-D array of scores, one row per window (original note: (120, 397)).
        threshold: cut-off applied before and after the neighbour aggregation.
        maxpreds: upper bound on the number of classes reported per window.

    Returns:
        list of strings, one per window; an empty string when nothing is kept.
    """
    keep_mask = preds >= threshold
    confident = preds * keep_mask  # scores under the threshold become 0

    zero_row = np.zeros((1, preds.shape[-1]))
    following = np.concatenate([confident[1:], zero_row])    # scores of the next window
    preceding = np.concatenate([zero_row, confident[:-1]])   # scores of the previous window

    score = confident + 0.5 * following + 0.5 * preceding

    # number of classes to report per window, capped at maxpreds
    counts = np.clip((score >= threshold).sum(-1), 0, maxpreds)

    class_labels = []
    for row_scores, count in zip(score, counts):
        top_indices = np.argsort(- row_scores)[:count]
        class_labels.append(" ".join([TARGETS[j] for j in top_indices]))

    return class_labels

def reformat_preds(preds, df):
    """Pair each prediction string with its row id.

    Args:
        preds: sequence of prediction strings, aligned with the rows of ``df``.
        df: dataframe providing the ``row_id`` column.

    Returns:
        DataFrame with columns ``row_id`` and ``birds``; empty prediction
        strings are replaced by 'nocall'.
    """
    row_ids = df['row_id'].values
    table = pd.DataFrame({'row_id': row_ids, 'birds': preds})
    return table.assign(birds=table['birds'].replace([''], 'nocall'))

def get_metrics(s_true, s_pred):
    """Compute set-based precision, recall and F1 for one pair of label strings.

    Args:
        s_true: whitespace-separated ground-truth labels.
        s_pred: whitespace-separated predicted labels.

    Returns:
        dict with keys ``f1``, ``prec``, ``rec``, ``n_true``, ``n_pred`` and
        ``n`` (size of the intersection). Precision is 0 when no label is
        predicted and recall is 0 when there is no ground-truth label.
    """
    true_set = set(s_true.split())
    pred_set = set(s_pred.split())
    n, n_true, n_pred = len(true_set.intersection(pred_set)), len(true_set), len(pred_set)

    # Guard the denominators: an empty label string used to raise ZeroDivisionError.
    prec = n/n_pred if n_pred else 0.0
    rec = n/n_true if n_true else 0.0
    f1 = 2*prec*rec/(prec + rec) if prec + rec else 0

    return {"f1": f1, "prec": prec, "rec": rec, "n_true": n_true, "n_pred": n_pred, "n": n}
    
def inference_voting(test_audios, clips, model, threshold=0.5):
    """Predict and post-process every clip, returning one stacked table.

    For each ``(audio_path, clip)`` pair a dataframe of 120 windows is built
    (window ends 5, 10, ..., 600 seconds). Row ids are formed from the first
    two underscore-separated parts of the file name plus the window end.

    Args:
        test_audios: iterable of path objects exposing ``.name``.
        clips: iterable of audio arrays, aligned with ``test_audios``.
        model: model passed through to ``prediction_for_clip``.
        threshold: passed to both prediction and post-processing.

    Returns:
        DataFrame produced by concatenating the per-clip results of
        ``reformat_preds`` with a fresh 0..n-1 index.
    """
    window_ends = list(range(5, 605, 5))
    per_clip_frames = []
    for audio_path, clip in zip(test_audios, clips):
        # build the dataframe for one clip
        prefix = "_".join(audio_path.name.split("_")[:2])
        test_df = pd.DataFrame({
            "row_id": [prefix + f"_{second}" for second in window_ends],
            "seconds": list(window_ends),
        })

        # prediction (original note: ndarray(120, 397))
        raw_preds = prediction_for_clip(test_df, clip, model, threshold)
        # postprocess
        bird_strings = post_process_site_12(raw_preds, threshold=threshold)
        per_clip_frames.append(reformat_preds(bird_strings, test_df))

    stacked = pd.concat(per_clip_frames, axis=0, sort=False)
    return stacked.reset_index(drop=True)

def test_fn(all_audios,TS_clips,model):
    """Score the model on the train soundscapes.

    Runs ``inference_voting`` on the given clips, writes the result to
    ``OUTPUT_DIR + "submission.csv"``, left-joins it onto
    ``train_soundscape_labels.csv`` by ``row_id`` and averages the per-row
    metrics from ``get_metrics``.

    Args:
        all_audios: audio paths forwarded to ``inference_voting``.
        TS_clips: audio arrays aligned with ``all_audios``.
        model: model used for inference.

    Returns:
        tuple ``(TS_f1, TS_prec, TS_rec)`` of mean F1, precision and recall.
    """
    # inference for train soundscape
    submission = inference_voting(test_audios=all_audios,clips=TS_clips,model=model)
    submission.to_csv(OUTPUT_DIR+"submission.csv", index=False)
    sub_target = pd.read_csv(INPUT_DIR / 'train_soundscape_labels.csv')
    # After the merge, birds_x holds the labels and birds_y the predictions.
    sub_target = sub_target.merge(submission, how="left", on="row_id")
    # get metrics
    df_metrics = pd.DataFrame([get_metrics(s_true, s_pred) for s_true, s_pred in zip(sub_target.birds_x, sub_target.birds_y)])
    # Average once and read by label; integer keys on a string-labelled Series
    # are deprecated as positional access.
    mean_metrics = df_metrics[["f1", "prec", "rec"]].mean()
    TS_f1 = mean_metrics["f1"]
    TS_prec = mean_metrics["prec"]
    TS_rec = mean_metrics["rec"]
    return TS_f1, TS_prec, TS_rec

## Train loop

In [None]:
def train_loop(folds: pd.DataFrame, fold_num: int = 0):
    """Train and evaluate one cross-validation fold.

    Rows whose ``fold`` column equals ``fold_num`` form the validation set and
    all other rows the training set. Each epoch runs training, validation and
    the train-soundscape test, steps the scheduler, appends a row to
    ``OUTPUT_DIR + "log.csv"`` and saves checkpoints for the best validation
    loss and the best train-soundscape F1. A final checkpoint is saved after
    the last epoch.

    Args:
        folds: dataframe with a ``fold`` column assigning every row to a fold.
        fold_num: fold held out for validation.

    Returns:
        The validation rows of ``folds`` with a reset index.
    """
    LOGGER.info(f"========== fold: {fold_num} training ==========")
    ### dataset
    tr_index = folds[folds["fold"] != fold_num].index
    vl_index = folds[folds["fold"] == fold_num].index
    
    train_folds = folds.loc[tr_index].reset_index(drop=True)
    valid_folds = folds.loc[vl_index].reset_index(drop=True)
    
    train_dataset = WaveformDataset(train_folds,
                                    train_HL,
                                    CFG.train_datadir,
                                    img_size=CFG.img_size,
                                    waveform_transforms=get_transforms("train"),
                                    period=CFG.period,
                                    validation=False)
    valid_dataset = WaveformDataset(valid_folds,
                                    train_HL,
                                    CFG.train_datadir,
                                    img_size=CFG.img_size,
                                    waveform_transforms=get_transforms("valid"),
                                    period=CFG.period,
                                    validation=True)
    ### dataloader
    train_loader = DataLoader(train_dataset, **CFG.loader['train'])
    valid_loader = DataLoader(valid_dataset, **CFG.loader['valid'])
    
    ### model
    model = get_model(model_name=CFG.model_name, pretrained=CFG.pretrained, in_channels=CFG.in_channels, spec_aggreagation=CFG.spec_aggreagation)
    model.to(device)
    ### optimizer
    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    ### get scheduler
    scheduler = get_scheduler(optimizer)
    if CFG.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
    ### criterion
    criterion = get_criterion()
    
    # ====================================================
    # loop
    # ====================================================
    cols = ['epoch','avg_train_loss','avg_val_loss','micro_f1','samples_f1','TS_f1', 'TS_prec', 'TS_rec']
    # One Series per epoch; the log frame is rebuilt from this list because
    # DataFrame.append no longer exists in pandas 2.x.
    log_records = []

    best_f1_train_soundscape = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_one_epoch(train_loader, model, criterion, optimizer, epoch, scheduler, device)
        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        #test
        TS_f1, TS_prec, TS_rec = test_fn(all_audios, TS_clips, model)
        # scoring
        print(f'pred max{np.amax(preds)}')
        micro_f1 = f1(valid_dataset.y, preds, avg="micro")
        samples_f1 = f1(valid_dataset.y, preds)
        
        # ReduceLROnPlateau needs the monitored value; the cosine schedulers do not.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        elapsed = time.time() - start_time
        
        # Send a LINE notification (disabled)
        #send_line_notification(f'[GPU]Epoch {epoch+1} - val_loss: {avg_val_loss}')
        LOGGER.info(f'Epoch {epoch+1} - micro_f1:{micro_f1:.4f},samples_f1{samples_f1:.4f}')
        LOGGER.info(f'Epoch {epoch+1} - TS_f1:{TS_f1:.4f},TS_prec:{TS_prec:.4f},TS_rec:{TS_rec:.4f}')
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        record = pd.Series([epoch+1, avg_loss,avg_val_loss,micro_f1,samples_f1,TS_f1, TS_prec, TS_rec], index=cols)
        log_records.append(record)
        # Rewrite the whole log every epoch so it is usable if training stops early.
        df_log = pd.DataFrame(log_records, columns=cols)
        df_log.to_csv(OUTPUT_DIR+"log.csv", index=False)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_loss:.4f} Model')
            save(model, optimizer, scheduler, epoch+1, preds,
                 path=OUTPUT_DIR+f'{CFG.model_name}_fold{fold_num}_best_loss.pth')
        if best_f1_train_soundscape < TS_f1:
            best_f1_train_soundscape = TS_f1
            save(model, optimizer, scheduler, epoch+1, preds,
                 path=OUTPUT_DIR+f'{CFG.model_name}_fold{fold_num}_best_f1.pth')

    # Checkpoint of the final epoch, regardless of its scores.
    save(model, optimizer, scheduler, epoch+1, preds,
         path=OUTPUT_DIR+f'{CFG.model_name}_fold{fold_num}_last.pth')

    return valid_folds

In [None]:
def main():
    """Run ``train_loop`` for every fold index listed in ``CFG.trn_fold``.

    Fold indices are visited in ascending order from 0 to ``CFG.n_fold - 1``;
    indices not present in ``CFG.trn_fold`` are skipped.
    """
    for fold_idx in range(CFG.n_fold):
        if fold_idx not in CFG.trn_fold:
            continue
        train_loop(folds, fold_idx)

# Entry point: start training only when this code runs as the main program.
if __name__ == '__main__':
    main()