## About

This code trains a 5-fold ResNeXt-50 in Kaggle kernels. Unfortunately, there are some timeout issues.

The 5-fold blend with our [post-processing method](https://www.kaggle.com/theoviel/inference-theo) achieves a private LB score of 0.675 (3rd place).


The code is a bit dirty; a clean version is available on [GitHub](https://github.com/TheoViel/kaggle_birdcall_identification).

## Initialization

In [None]:
!pip install audiomentations pysndfx

In [None]:
import os
import sys

# Put the uploaded source folder first on the import path so that the
# `training.*` modules imported later in the notebook can be resolved.
sys.path = [
    '../input/bird-outputs/src/src/',
] + sys.path

import numpy as np
import pandas as pd
from pathlib import Path
# NOTE(review): the star import is what brings StratifiedKFold (used by k_fold) into scope.
from sklearn.model_selection import *

# Show wide / long DataFrames in full when displaying them in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## Data

In [None]:
# All inputs are resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"
RAW_DATA = INPUT_ROOT / "birdsong-recognition"
TRAIN_AUDIO_DIR = RAW_DATA / "train_audio"
TEST_AUDIO_DIR = RAW_DATA / "test_audio"

# Five dataset folders with a zero-padded two-digit suffix ("-00" ... "-04").
TRAIN_RESAMPLED_AUDIO_DIRS = [
  INPUT_ROOT / "birdsong-resampled-train-audio-{:0>2}".format(i)  for i in range(5)
]

# Metadata table read from the first resampled folder.
# NOTE: the name `train` is rebound to a function in the "Main" section further down.
train = pd.read_csv(TRAIN_RESAMPLED_AUDIO_DIRS[0] / "train_mod.csv")

In [None]:
# Walk every available resampled folder and record, for each file found in a
# sub-directory, [sub-directory name, file name, full posix path].
resampled_infos = []
for audio_d in TRAIN_RESAMPLED_AUDIO_DIRS:
    # Missing folders are skipped silently.
    if not audio_d.exists():
        continue
    for ebird_d in audio_d.iterdir():
        # Plain files at the top level of a folder are not audio directories.
        if ebird_d.is_file():
            continue
        for wav_f in ebird_d.iterdir():
            resampled_infos.append([ebird_d.name, wav_f.name, wav_f.as_posix()])
            
train_resampled_infos = pd.DataFrame(resampled_infos, columns=["ebird_code", "resampled_filename", "file_path"])

# Inner join: only metadata rows whose audio file was actually found are kept.
train_all = pd.merge(train, train_resampled_infos, on=["ebird_code", "resampled_filename"], how="inner")
df_train = train_all.copy()

print(f"Successfully loaded {len(train_all)} resampled audios out of {len(train)}")

In [None]:
# Metadata table for the additional (external) recordings.
df_extra = pd.read_csv(INPUT_ROOT / "xenoexternalwav0/train_extended.csv")

In [None]:
# Folders external-xeno-wav-0..2 live under xenoexternalwav0, 3..4 under xenoexternalwav1 (i // 3).
EXTRA_RESAMPLED_AUDIO_DIRS = [INPUT_ROOT / f"xenoexternalwav{i // 3}/external-xeno-wav-{i}"  for i in range(5)]

# Record [entry name, full posix path] for every entry two levels below each folder.
# NOTE(review): the entry name is stored in the "ebird_code" column, so the
# second-level entries are presumably per-species directories rather than
# wav files despite the variable name `wav_f` -- confirm against the dataset layout.
resampled_infos = []
for audio_d in EXTRA_RESAMPLED_AUDIO_DIRS:
    # Missing folders are skipped silently.
    if not audio_d.exists():
        continue
    for ebird_d in audio_d.iterdir():
        if ebird_d.is_file():
            continue
        for wav_f in ebird_d.iterdir():
            resampled_infos.append([wav_f.name, wav_f.as_posix()])
            
extra_resampled_infos = pd.DataFrame(resampled_infos, columns=["ebird_code", "file_path"]).sort_values("ebird_code").reset_index(drop=True)

In [None]:
# Left join: every row of df_extra is kept; rows without a match get a missing file_path.
df_extra_ = pd.merge(df_extra, extra_resampled_infos, on=["ebird_code"], how="left")

In [None]:
# Build "<matched path>/<filename without its last 4 characters>.wav" for every merged row.
# NOTE(review): the result is assigned to df_extra (not df_extra_), which only
# works if the left merge kept exactly one row per df_extra row -- confirm that
# extra_resampled_infos has no duplicated ebird_code.
paths = []
for c, file in df_extra_[["file_path", "filename"]].values:
    # file[:-4] drops the last four characters (presumably the original extension).
    path = f"{c}/{file[:-4]}.wav"
    paths.append(path)
df_extra["file_path"] = paths

In [None]:
# df_extra = df_extra[df_extra['duration'] < 200]  # remove long samples to save time

## Params

In [None]:
import os
import torch
import warnings
import numpy as np


# Silence UserWarning and FutureWarning for the rest of the notebook.
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

# Not referenced by the visible code (Config.seed is what train() uses).
SEED = 2020

DATA_PATH = "../input/birdsong-recognition/"
AUDIO_PATH = "../input/birdsong-recognition/train_audio/"

# Folder of background sounds handed to AddBackgroundNoise.
BACKGROUND_PATH = "../input/bird-backgrounds/"

# Per-channel statistics (presumably the ImageNet ones); the visible dataset
# code calls normalize() with mean=None and std=None, so they are not applied.
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])

# Number of DataLoader worker processes.
NUM_WORKERS = 4
VAL_BS = 32

# Not referenced by the visible training code, which calls .cuda() directly.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# One class per entry of the train_audio folder, sorted so indices are stable.
CLASSES = sorted(os.listdir(AUDIO_PATH))
NUM_CLASSES = len(CLASSES)

# Checkpoint folder; the empty string makes checkpoints land in the working directory.
CP_TODAY = ""

## Utils

In [None]:
import os
import torch
import random
import numpy as np
import torch.nn as nn
from sklearn.metrics import f1_score


def seed_everything(seed):
    """
    Seeds the Python, NumPy and PyTorch random generators.

    Note that cudnn.benchmark is left enabled, which trades strict
    run-to-run reproducibility on GPU for speed.

    Arguments:
        seed {int} -- Value used to seed every generator
    """
    # Python-level sources of randomness
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch (CPU and current CUDA device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN behaviour
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = True  # False


def save_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Writes the state dict of a PyTorch model to disk

    Arguments:
        model {torch module} -- Model whose weights are saved
        filename {str} -- Name of the checkpoint file

    Keyword Arguments:
        verbose {int} -- Whether to print the destination (default: {1})
        cp_folder {str} -- Folder to write the checkpoint in (default: {''})
    """
    destination = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Saving weights to {destination}\n")

    weights = model.state_dict()
    torch.save(weights, destination)


def load_model_weights(model, filename, verbose=1, cp_folder="", strict=True):
    """
    Loads the weights of a PyTorch model from a checkpoint file.

    The checkpoint is always deserialized on the CPU, which avoids cpu/gpu
    incompatibilities; load_state_dict then copies the values into the
    model's existing parameters.

    Arguments:
        model {torch module} -- Model to load the weights to
        filename {str} -- Name of the checkpoint

    Keyword Arguments:
        verbose {int} -- Whether to display infos (default: {1})
        cp_folder {str} -- Folder to load from (default: {''})
        strict {bool} -- Whether checkpoint keys must match the model exactly (default: {True})

    Returns:
        torch module -- Model with loaded weights
    """
    path = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # The previous implementation passed the path itself to load_state_dict and
    # referenced an undefined `strict`, so it always fell through to a
    # catch-all handler. Load explicitly instead so real errors surface.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=strict)

    return model


def count_parameters(model, all=False):
    """
    Sums the number of elements of a model's parameters

    Arguments:
        model {torch module} -- Model whose parameters are counted

    Keyword Arguments:
        all {bool} -- If set, frozen (non-trainable) parameters are counted too (default: {False})

    Returns:
        int -- Number of parameters
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters only count when explicitly requested
        if all or param.requires_grad:
            total += param.numel()
    return total


# Identity matrix: row i is the one-hot encoding of class index i.
ONE_HOT = np.eye(NUM_CLASSES)


def f1(truth, pred, threshold=0.5, avg="samples"):
    """
    Computes the F1 score of thresholded predictions

    Arguments:
        truth {numpy array} -- Targets, either class indices (1D) or an indicator matrix
        pred {numpy array} -- Predicted scores, binarized with a strict > threshold

    Keyword Arguments:
        threshold {float} -- Binarization threshold (default: {0.5})
        avg {str} -- Averaging mode forwarded to sklearn's f1_score (default: {'samples'})

    Returns:
        float -- F1 score
    """
    # Class indices are expanded to one-hot rows, anything else is used as is
    targets = ONE_HOT[truth] if len(truth.shape) == 1 else truth

    binarized = (pred > threshold).astype(int)

    return f1_score(targets, binarized, average=avg)


## Transforms

In [None]:
import cv2
import pysndfx
import numpy as np
from audiomentations import *


def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a single-channel array to a 3-channel uint8 image.

    The input is replicated on a new last axis, standardized, then min-max
    rescaled to [0, 255]. A (near-)constant input gives an all-zero image.

    Arguments:
        X {numpy array} -- Single-channel input (e.g. a spectrogram)

    Keyword Arguments:
        eps {float} -- Stabilizer for the division, also the minimum range to rescale (default: {1e-6})
        mean {float or numpy array or None} -- Mean to subtract, computed from X if None (default: {None})
        std {float or numpy array or None} -- Std to divide by, computed from X if None (default: {None})

    Returns:
        numpy array -- uint8 image with a trailing channel axis of size 3
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Explicit `is None` checks so that a provided 0 is honoured
    # and array-valued statistics do not fail on an ambiguous truth value.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def resize(image, size=None):
    """
    Resizes an image to a given height, keeping its aspect ratio

    Arguments:
        image {numpy array} -- Image with three axes, height and width first

    Keyword Arguments:
        size {int or None} -- Target height, the image is returned untouched if None (default: {None})

    Returns:
        numpy array -- Resized image
    """
    if size is None:
        return image

    height, width, _ = image.shape
    target_width = int(width * size / height)

    # cv2 expects the target size as (width, height)
    return cv2.resize(image, (target_width, size))


def normalize(image, mean=None, std=None):
    """
    Scales an image to [0, 1], optionally standardizes it, and puts axis 2 first

    Arguments:
        image {numpy array} -- Image with values in [0, 255] and three axes

    Keyword Arguments:
        mean {numpy array or None} -- Mean to subtract, only used if std is given too (default: {None})
        std {numpy array or None} -- Std to divide by, only used if mean is given too (default: {None})

    Returns:
        numpy array -- float32 array with the former axis 2 moved to position 0
    """
    scaled = image / 255.0

    # Standardization needs both statistics
    if not (mean is None or std is None):
        scaled = (scaled - mean) / std

    channels_first = np.moveaxis(scaled, 2, 0)
    return channels_first.astype(np.float32)


def crop_or_pad(y, length, sr, train=True, probs=None):
    """
    Crops or zero-pads a waveform so that it has exactly `length` samples.

    Short signals are padded with zeros at the end. Long signals are cropped:
    from the start when not training, from a uniformly random position when
    training without probs, and otherwise from a position drawn from probs.

    Arguments:
        y {numpy array} -- Waveform
        length {int} -- Number of samples of the output
        sr {int} -- Sampling rate, converts the start sampled from probs (in units of 1 / sr samples) to a sample index

    Keyword Arguments:
        train {bool} -- Whether to crop randomly (True) or from the start (False) (default: {True})
        probs {numpy array or None} -- Probabilities over integer start offsets; a uniform
            fraction in [0, 1) is added to the drawn offset before scaling by sr (default: {None})

    Returns:
        numpy array -- float32 waveform of `length` samples
    """
    n_samples = len(y)

    if n_samples <= length:
        y = np.concatenate([y, np.zeros(length - n_samples)])
    else:
        # Last start index that still leaves `length` samples to read
        max_start = n_samples - length

        if not train:
            start = 0
        elif probs is None:
            start = np.random.randint(max_start)
        else:
            start = (
                np.random.choice(np.arange(len(probs)), p=probs) + np.random.random()
            )
            # A start sampled near the end of the recording used to give a
            # clip shorter than `length`; clamp it so the output size is fixed.
            start = min(int(sr * start), max_start)

        y = y[start : start + length]

    return y.astype(np.float32)


def get_wav_transforms():
    """
    Builds the waveform augmentation pipeline used for training

    Returns:
        audiomentations Compose -- Gaussian noise followed by background noise, each applied with probability 0.5
    """
    gaussian_noise = AddGaussianSNR(max_SNR=0.5, p=0.5)
    background_noise = AddBackgroundNoise(
        sounds_path=BACKGROUND_PATH, min_snr_in_db=0, max_snr_in_db=2, p=0.5
    )

    return Compose([gaussian_noise, background_noise])


class AudioAugmentation:
    """
    Waveform augmentation: noise transforms, then optionally a random
    reverb / pitch / overdrive effects chain.
    """

    def __init__(self, p_effects=0.5, p_noise=0.5):
        """
        Keyword Arguments:
            p_effects {float} -- Probability to apply the effects chain (default: {0.5})
            p_noise {float} -- Probability of each of the two noise transforms (default: {0.5})
        """
        self.p_effects = p_effects

        gaussian_noise = AddGaussianSNR(max_SNR=0.5, p=p_noise)
        background_noise = AddBackgroundNoise(
            sounds_path=BACKGROUND_PATH, min_snr_in_db=0, max_snr_in_db=2, p=p_noise
        )
        self.noise_transfos = Compose([gaussian_noise, background_noise])

    def __call__(self, y, sr):
        """
        Arguments:
            y {numpy array} -- Waveform
            sr {int} -- Sampling rate, forwarded to the noise transforms

        Returns:
            numpy array -- Augmented waveform
        """
        y = self.noise_transfos(y, sr)

        # Skip the effects chain most simply with a guard clause
        if np.random.uniform() >= self.p_effects:
            return y

        # Effect parameters are drawn in chain order: reverb, pitch, overdrive
        chain = pysndfx.AudioEffectsChain()
        chain = chain.reverb(
            reverberance=random.randrange(50),
            room_scale=random.randrange(50),
            stereo_depth=random.randrange(50),
        )
        chain = chain.pitch(shift=random.randrange(-300, 300))
        chain = chain.overdrive(gain=random.randrange(2, 20))

        return chain(y)


## Dataset

In [None]:
import os
import pickle
import librosa
import soundfile
import numpy as np
from torch.utils.data import Dataset


# Identity matrix: row i is the one-hot target of class index i.
ONE_HOT = np.eye(len(CLASSES))
# Pickle read by BirdDataset when use_conf=True.
CONF_PATH = "../input/bird-outputs/preds_oof_2.pkl"
# Checked unconditionally, i.e. even when no dataset is built with use_conf=True.
assert os.path.isfile(CONF_PATH)


def compute_melspec(y, params):
    """
    Computes the mel spectrogram of a waveform, converted from power to dB

    Arguments:
        y {numpy array} -- Waveform
        params {object} -- Audio parameters, must expose sr, n_mels, fmin and fmax

    Returns:
        numpy array -- float32 mel spectrogram in dB
    """
    # `y` is passed by keyword: recent librosa versions reject it positionally,
    # and the keyword form is accepted by older versions as well.
    melspec = librosa.feature.melspectrogram(
        y=y,
        sr=params.sr,
        n_mels=params.n_mels,
        fmin=params.fmin,
        fmax=params.fmax,
    )

    melspec = librosa.power_to_db(melspec).astype(np.float32)
    return melspec


class BirdDataset(Dataset):
    """
    Torch dataset that reads an audio file, crops or pads it to a fixed
    duration and returns its mel spectrogram as a 3-channel image together
    with a one-hot target.
    """

    def __init__(self, df, params, audio_path="", train=True, use_conf=False):
        """
        Arguments:
            df {pandas dataframe} -- Metadata, must have "ebird_code" and "file_path" columns
            params {object} -- Audio parameters (sr, duration, img_size and melspectrogram settings)

        Keyword Arguments:
            audio_path {str} -- Prefix prepended to every file path (default: {''})
            train {bool} -- Enables random cropping and waveform augmentation (default: {True})
            use_conf {bool} -- Whether to sample crops from the confidences stored at CONF_PATH (default: {False})
        """
        self.train = train
        self.params = params
        self.audio_path = audio_path
        self.use_conf = use_conf

        # Waveform augmentations are only applied at training time
        self.wav_transfos = get_wav_transforms() if train else None
        # self.wav_transfos = AudioAugmentation(p_effects=0.5, p_noise=0.5) if train else None
        self.spec_transfos = None

        # Targets as class indices, and the matching audio files
        self.y = np.array([CLASSES.index(c) for c in df["ebird_code"]])
        self.paths = df["file_path"].values

        # Number of samples of every clip
        self.sample_len = params.duration * params.sr

        if use_conf:
            with open(CONF_PATH, "rb") as file:
                self.confidences = pickle.load(file)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx: int):
        path = self.paths[idx]
        label = self.y[idx]

        y, sr = soundfile.read(self.audio_path + path)

        # Crop probabilities: the normalized confidences of the target class,
        # looked up by "<parent folder>/<file name>". None means uniform cropping.
        confs = None
        if self.use_conf:
            name = "/".join(path.split('/')[-2:])
            class_confs = self.confidences[name][:, label]
            if len(class_confs):
                confs = class_confs / np.sum(class_confs)

        y = crop_or_pad(
            y, self.sample_len, sr=self.params.sr, train=self.train, probs=confs
        )

        if self.wav_transfos is not None:
            y = self.wav_transfos(y, self.params.sr)

        melspec = compute_melspec(y, self.params)

        if self.spec_transfos is not None:
            melspec = self.spec_transfos(melspec)

        # Spectrogram -> 3-channel image -> channel-first float32 in [0, 1]
        image = mono_to_color(melspec)
        image = resize(image, self.params.img_size)
        image = normalize(image, mean=None, std=None)

        return image, ONE_HOT[label]


## Model

In [None]:
import torch


def get_model(name, use_msd=False, num_classes=1):
    """
    Loads a pretrained torchvision model from torch hub and replaces its
    `fc` layer with a fresh linear classifier

    Arguments:
        name {str} -- Name of the model entry point in pytorch/vision v0.6.0

    Keyword Arguments:
        use_msd {bool} -- Accepted but not used by this function (default: {False})
        num_classes {int} -- Number of outputs of the new classifier (default: {1})

    Returns:
        torch module -- Model with the new classification layer
    """
    backbone = torch.hub.load('pytorch/vision:v0.6.0', name, pretrained=True)

    # Swap the pretrained head for one with the requested number of outputs
    in_features = backbone.fc.in_features
    del backbone.fc
    backbone.fc = nn.Linear(in_features, num_classes)

    return backbone

## Training

In [None]:
import gc
import time
import torch
import numpy as np
import torch.nn as nn

from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler
from transformers import get_linear_schedule_with_warmup
from torchvision.models.inception import InceptionOutputs

# from util import f1
from training.mixup import mixup_data
# from params import NUM_WORKERS, NUM_CLASSES
from training.specaugment import SpecAugmentation


def smooth_label(y, alpha=0.01):
    """
    Applies label smoothing to a target array

    Values are scaled by (1 - alpha), then every entry equal to zero is set
    to alpha. The input is left untouched.

    Arguments:
        y {array or tensor} -- Targets

    Keyword Arguments:
        alpha {float} -- Smoothing factor (default: {0.01})

    Returns:
        array or tensor -- Smoothed targets
    """
    smoothed = y * (1 - alpha)
    zero_mask = smoothed == 0
    smoothed[zero_mask] = alpha
    return smoothed

    
def fit(
    model,
    train_dataset,
    val_dataset,
    epochs=50,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    alpha=0.4,
    mixup_proba=0.0,
    specaugment_proba=0.0,
    label_smoothing=0.0,
    verbose=1,
    verbose_eval=1,
):
    """
    Usual torch fit function

    Trains with Adam, a linear warmup / linear decay schedule stepped after
    every batch, and a BCE-with-logits loss. The model and the batches are
    moved to the GPU with .cuda(), so a CUDA device is required.

    Arguments:
        model {torch model} -- Model to train
        train_dataset {torch dataset} -- Dataset to train with
        val_dataset {torch dataset} -- Dataset to validate with, must expose its targets as `.y`

    Keyword Arguments:
        epochs {int} -- Number of epochs (default: {50})
        batch_size {int} -- Training batch size (default: {32})
        val_bs {int} -- Validation batch size (default: {32})
        warmup_prop {float} -- Warmup proportion (default: {0.1})
        lr {float} -- Start (or maximum) learning rate (default: {1e-3})
        alpha {float} -- alpha value for mixup (default: {0.4})
        mixup_proba {float} -- Probability to apply mixup (default: {0.})
        specaugment_proba {float} -- Probability to apply specaugment (default: {0.})
        label_smoothing {float} -- Currently unused, the smoothing step is commented out (default: {0.})
        verbose {int} -- Period (in epochs) to display logs at (default: {1})
        verbose_eval {int} -- Period (in epochs) to perform evaluation at (default: {1})

    Returns:
        numpy array -- Predictions at the last epoch (empty if epochs is 0)
    """

    avg_val_loss = 0.
    avg_loss = 0.
    micro_f1 = 0.
    samples_f1 = 0.

    # Defined up front so that the function still returns when epochs == 0
    preds = np.empty((0, NUM_CLASSES))

    optimizer = Adam(model.parameters(), lr=lr)

    loss_fct = nn.BCEWithLogitsLoss(reduction="mean").cuda()

    spec_augmenter = SpecAugmentation(
        time_drop_width=16, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        num_workers=NUM_WORKERS,
    )
    val_loader = DataLoader(
        val_dataset, batch_size=val_bs, shuffle=False, pin_memory=True, num_workers=NUM_WORKERS
    )

    # The scheduler is stepped once per batch, hence steps are counted in batches
    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = int(epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        optimizer.zero_grad()

        avg_loss = 0
        for step, (x, y_batch) in enumerate(train_loader):
            if specaugment_proba:
                if np.random.rand() < specaugment_proba:
                    x = spec_augmenter(x)

            if np.random.rand() < mixup_proba:
                x, y_a, y_b, _ = mixup_data(x.cuda(), y_batch.cuda(), alpha=alpha)
                # Multi-label target: union of the labels of both mixed samples
                y_batch = torch.clamp(y_a + y_b, 0, 1)

            # if label_smoothing:
            #     y_batch = smooth_label(y_batch, alpha=label_smoothing)

            y_pred = model(x.cuda())

            # if type(y_pred) == InceptionOutputs:
            #     y_pred = y_pred.logits

            loss = loss_fct(y_pred, y_batch.cuda().float())

            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # Evaluation always runs on the last epoch so that preds can be returned
        is_eval_epoch = (epoch + 1) % verbose_eval == 0 or (epoch + 1 == epochs)

        if is_eval_epoch:
            model.eval()

            avg_val_loss = 0.0
            # Batches are collected in a list and concatenated once, instead of
            # re-copying the whole prediction array at every batch
            batch_preds = []
            with torch.no_grad():
                for x, y_batch in val_loader:
                    y_pred = model(x.cuda()).detach()
                    loss = loss_fct(y_pred, y_batch.cuda().float())
                    avg_val_loss += loss.item() / len(val_loader)

                    batch_preds.append(torch.sigmoid(y_pred).cpu().numpy())

            # The leading empty float64 array keeps the output dtype and shape
            # unchanged, and covers an empty validation loader
            preds = np.concatenate([np.empty((0, NUM_CLASSES))] + batch_preds)

            micro_f1 = f1(val_dataset.y, preds, avg="micro")
            samples_f1 = f1(val_dataset.y, preds)

        elapsed_time = time.time() - start_time
        if (epoch + 1) % verbose == 0:
            # Approximates the time spent since the previous log line
            elapsed_time = elapsed_time * verbose
            # get_last_lr() is the supported accessor; get_lr() is deprecated
            # outside of the scheduler's own step()
            current_lr = scheduler.get_last_lr()[0]
            print(
                f"Epoch {epoch + 1}/{epochs} \t lr={current_lr:.1e} \t t={elapsed_time:.0f}s  \t loss={avg_loss:.4f} \t ",
                end="",
            )
            if is_eval_epoch:
                print(
                    f"val_loss={avg_val_loss:.4f} \t micro_f1={micro_f1:.3f} \t samples_f1={samples_f1:.3f}"
                )
            else:
                print("")

    torch.cuda.empty_cache()
    return preds


def predict(model, dataset, batch_size=64):
    """
    Usual torch predict function

    The model is put in eval mode and batches are moved to the GPU with
    .cuda(), so a CUDA device is required.

    Arguments:
        model {torch model} -- Model to predict with
        dataset {torch dataset} -- Dataset to predict on

    Keyword Arguments:
        batch_size {int} -- Batch size (default: {64})

    Returns:
        numpy array -- Sigmoid of the model outputs, one row per sample
    """
    model.eval()

    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=NUM_WORKERS
    )

    # Batches are collected in a list and concatenated once, instead of
    # re-copying the whole prediction array at every batch
    batch_preds = []
    with torch.no_grad():
        for x, _ in loader:
            y_pred = model(x.cuda()).detach()
            batch_preds.append(torch.sigmoid(y_pred).cpu().numpy())

    # The leading empty float64 array keeps the output dtype and shape
    # unchanged, and covers an empty dataset
    return np.concatenate([np.empty((0, NUM_CLASSES))] + batch_preds)


## Main

In [None]:
def train(config, df_train, df_val, fold):
    """
    Trains a model on one fold and optionally saves its weights

    Arguments:
        config {Config} -- Parameters (model, training hyperparameters, save flag, ...)
        df_train {pandas dataframe} -- Training metadata
        df_val {pandas dataframe} -- Validation metadata
        fold {int} -- Fold number, only used in the checkpoint name

    Returns:
        numpy array -- Validation predictions returned by fit
    """
    print(f"    -> {len(df_train)} training birds")
    print(f"    -> {len(df_val)} validation birds")

    # Seed before the model and datasets are created
    seed_everything(config.seed)

    model = get_model(
        config.selected_model, use_msd=config.use_msd, num_classes=NUM_CLASSES
    )
    model = model.cuda()
    model.zero_grad()

    train_dataset = BirdDataset(
        df_train, AudioParams, audio_path="", use_conf=config.use_conf
    )
    val_dataset = BirdDataset(df_val, AudioParams, audio_path="", train=False)

    print(f"    -> {count_parameters(model)} trainable parameters\n")

    # Hyperparameters forwarded to fit, all taken from the config
    fit_kwargs = dict(
        epochs=config.epochs,
        batch_size=config.batch_size,
        val_bs=config.val_bs,
        lr=config.lr,
        warmup_prop=config.warmup_prop,
        alpha=config.alpha,
        mixup_proba=config.mixup_proba,
        specaugment_proba=config.specaugment_proba,
        label_smoothing=config.label_smoothing,
        verbose_eval=config.verbose_eval,
    )
    pred_val = fit(model, train_dataset, val_dataset, **fit_kwargs)

    if config.save:
        checkpoint_name = f"{config.selected_model}_{config.name}_{fold}.pt"
        save_model_weights(model, checkpoint_name, cp_folder=CP_TODAY)

    return pred_val


def k_fold(config, df, df_extra=None):
    """
    Performs a stratified k-fold training and gathers out-of-fold predictions

    Arguments:
        config {Config} -- Parameters, must expose k, random_state and selected_folds.
            An optional boolean `shuffle` attribute enables shuffled folds.
        df {pandas dataframe} -- Metadata to split, stratified on its "ebird_code" column

    Keyword Arguments:
        df_extra {pandas dataframe or None} -- Extra data appended to the training part of every fold (default: {None})

    Returns:
        numpy array -- Out-of-fold predictions, rows of non-selected folds stay at zero
    """
    # Folds are not shuffled by default, as before. random_state is only
    # forwarded when shuffling: recent scikit-learn versions raise if it is
    # set while shuffle is False (older ones silently ignored it).
    shuffle = getattr(config, "shuffle", False)
    skf = StratifiedKFold(
        n_splits=config.k,
        shuffle=shuffle,
        random_state=config.random_state if shuffle else None,
    )
    splits = list(skf.split(X=df, y=df["ebird_code"]))

    pred_oof = np.zeros((len(df), NUM_CLASSES))

    for i, (train_idx, val_idx) in enumerate(splits):
        if i in config.selected_folds:
            print(f"\n-------------   Fold {i + 1} / {config.k}  -------------\n")

            df_train = df.iloc[train_idx].copy()
            df_val = df.iloc[val_idx].copy()

            if df_extra is not None:
                # axis is passed by keyword, positional use is no longer accepted by recent pandas
                df_train = pd.concat((df_train, df_extra), axis=0).reset_index(drop=True)

            pred_val = train(config, df_train, df_val, i)
            pred_oof[val_idx] = pred_val

    return pred_oof

In [None]:
class Config:
    # Experiment parameters, read as class attributes by train() and k_fold().

    # General
    seed = 2020
    verbose = 1  # not forwarded to fit() by train(), which therefore uses fit's own default
    verbose_eval = 31  # larger than `epochs`, so evaluation only runs on the last epoch
    save = True  # whether train() saves the weights after fitting

    # k-fold
    k = 5
    random_state = 42
    selected_folds = [0]  # 0-based indices of the folds that are actually trained

    # Model
    selected_model = 'resnext50_32x4d'
    
    use_msd = False  # passed to get_model(), which does not use it
    use_conf = False  # whether the training dataset samples crops from stored confidences
    
    img_size = None  # not read by the visible code (BirdDataset uses AudioParams.img_size)
    batch_size = 64
    epochs = 30
    lr = 1e-3
    warmup_prop = 0.05
    val_bs = 64

    label_smoothing = 0.  # accepted by fit() but the smoothing step is commented out there
    specaugment_proba = 0.
    mixup_proba = 0.5
    alpha = 5  # alpha value forwarded to mixup_data

    name = "extra"  # used in the checkpoint file name

In [None]:
class AudioParams:
    # Audio parameters, read as class attributes by BirdDataset and compute_melspec.
    sr = 32000  # sampling rate handed to librosa and the waveform transforms
    duration = 5  # clip length; BirdDataset uses duration * sr samples per item
    img_size = None  # target image height for resize(); None disables resizing

    # Melspectrogram
    n_mels = 128
    fmin = 20
    fmax = 16000

In [None]:
# Train the folds listed in Config.selected_folds; df_extra is appended to every training split.
pred_oof = k_fold(Config, df_train, df_extra)