
Note: CV (cross-validation) and LB (leaderboard) scores are unstable relative to each other.

Work done in this notebook:
- Multi-label training + LWLRAP metric
- Multi-fold
- Modified loss
- Augmentations

Thanks to @koukyo1994; credit: https://github.com/koukyo1994/kaggle-birdcall-6th-place

# Library

In [None]:
!pip install pytorch_lightning efficientnet_pytorch colorednoise torchlibrosa

In [None]:
import sys
sys.path.append('../input/kagglebirdcall/src')

import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import os, math
import librosa
import librosa.display as display
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import label_ranking_average_precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.core import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from models import *
import transforms as kbt

# Setting

In [None]:
# Global configuration shared by the dataset, model, training and inference cells.
# PERIOD is multiplied by sample_rate to obtain the clip length in samples.
PERIOD = 10
# Passed to the Trainer as max_epochs.
EPOCH = 80
DEVICE= "cuda:0"

# NOTE(review): batch_size is defined here but the visible code never passes it
# to LightModel, which therefore falls back to its own default — confirm intent.
batch_size = 24
sample_rate = 48000
window_size = 2048
hop_size = 512
mel_bins = 128
# fmin/fmax start at sentinel extremes; a later cell narrows them to the
# frequency range found in the train_tp annotations.
fmin = sample_rate/2.0
fmax = 0
classes_num = 24
is_inference = False  # if True, update the checkpoint paths in the inference cell below

data_dir = '../input/rfcx-species-audio-detection'
train_tp_df = pd.read_csv(os.path.join(data_dir, 'train_tp.csv'))
train_fp_df = pd.read_csv(os.path.join(data_dir, 'train_fp.csv'))
data_test = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

OUTPUT_DIR = './'

# audiomentations is only needed by the training dataloader, so it is installed
# and imported only when training.
if not is_inference:
    !pip install audiomentations

    import audiomentations as aaug

# Dataloader

In [None]:
class MelSpecDataset(torch.utils.data.Dataset):
    """Dataset returning fixed-length waveform crops around an annotated event.

    Each row of ``df`` must provide ``recording_id``, ``t_min`` and ``t_max``
    (and ``species_id`` when ``is_train`` is true). The audio file is read
    from ``<data_dir>/train`` or ``<data_dir>/test`` depending on ``is_train``.

    Args:
        df: DataFrame with one annotation per row.
        num_classes: Length of the one-hot target vector.
        effective_length: Crop length in samples.
        waveform_transforms: Optional callable ``f(y, sr)`` applied to the crop.
        is_train: When true, items are ``(waveform, target)``; otherwise only
            the waveform is returned.
        data_dir: Root directory that contains the ``train``/``test`` folders.
    """

    def __init__(
        self,
        df=None,
        num_classes=24,
        effective_length=sample_rate * PERIOD,
        waveform_transforms=None,
        is_train=False,
        data_dir=None
    ):
        self.data_dir = data_dir
        self.is_train = is_train
        self.num_classes = num_classes
        self.df = df
        self.effective_length = effective_length
        self.waveform_transforms = waveform_transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Load recording ``idx`` and return a crop of ``effective_length`` samples."""
        recording_id = self.df.recording_id.values[idx]

        prefix = 'train' if self.is_train else 'test'

        # sr=None keeps the file's native sampling rate (no resampling).
        y, sr = librosa.load(
            os.path.join(self.data_dir, prefix, recording_id + '.flac'),
            sr=None, mono=True, res_type="kaiser_fast"
        )
        len_y = len(y)

        # Annotation bounds converted from seconds to sample indices.
        t_min = float(self.df.t_min.values[idx]) * sr
        t_max = float(self.df.t_max.values[idx]) * sr

        # Centre the window on the annotation, then shift it back inside the
        # recording if it overhangs either end.
        center = np.round((t_min + t_max) / 2)
        beginning = max(0, center - self.effective_length / 2)
        ending = beginning + self.effective_length
        if ending > len_y:
            ending = len_y
            # Clamp at 0: a recording shorter than the window would otherwise
            # produce a negative start and a wrong (wrapped-around) slice.
            beginning = max(0, ending - self.effective_length)

        y = y[int(beginning):int(ending)]

        # Zero-pad short recordings so every item has the same length and the
        # default DataLoader collation can stack them.
        missing = int(self.effective_length) - len(y)
        if missing > 0:
            y = np.pad(y, (0, missing), mode="constant")

        # Do the augmentations
        if self.waveform_transforms:
            y = self.waveform_transforms(y, sr)

        if not self.is_train:
            return y

        # One-hot target for the annotated species.
        target = np.zeros(self.num_classes, dtype=np.single)
        target[self.df.species_id.values[idx]] = 1
        return y, target

In [None]:
# Train on the true-positive annotations only.
train_full_df = train_tp_df

# Narrow the sentinel fmin/fmax to the extremes of the annotated frequency
# bounds. NaN results (e.g. an empty frame) compare False and leave the
# current value untouched.
lowest_annotated = train_full_df['f_min'].astype(float).min()
highest_annotated = train_full_df['f_max'].astype(float).max()
if fmin > lowest_annotated:
    fmin = float(lowest_annotated)
if fmax < highest_annotated:
    fmax = float(highest_annotated)

# Leave a 10% safety margin on both sides and truncate to whole numbers.
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)

# Model

In [None]:
# Keyword arguments forwarded unchanged to the PANNsCNN14Att constructor.
args = dict(
    sample_rate=sample_rate,
    window_size=window_size,
    hop_size=hop_size,
    mel_bins=mel_bins,
    fmin=fmin,
    fmax=fmax,
    classes_num=classes_num,
)
base_model = PANNsCNN14Att(**args)
# Print the module structure for a quick visual check.
print(base_model)

# Training Utils

In [None]:
def _one_sample_positive_class_precisions(scores, truth):
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)

    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)

    retrieved_classes = np.argsort(scores)[::-1]

    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)

    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True

    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)

    precision_at_hits = retrieved_cumulative_hits[class_rankings[pos_class_indices]] / (
        1 + class_rankings[pos_class_indices].astype(np.float)
    )
    return pos_class_indices, precision_at_hits


def lwlrap(truth, scores):
    """Compute per-class label-weighted label-ranking average precision.

    Args:
        truth: Array of shape ``(num_samples, num_classes)``; entries > 0 mark
            positive labels.
        scores: Array of the same shape holding the predicted scores.

    Returns:
        Tuple ``(per_class_lwlrap, weight_per_class)``. The overall metric is
        ``(per_class_lwlrap * weight_per_class).sum()``. When ``truth`` has no
        positive label at all, every weight is 0 (rather than NaN).
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape

    # precisions[s, c] is the precision-at-hit of class c in sample s,
    # left at 0 where c is not a positive label of s.
    precisions_for_samples_by_classes = np.zeros((num_samples, num_classes))
    for sample_num in range(num_samples):
        pos_class_indices, precision_at_hits = _one_sample_positive_class_precisions(
            scores[sample_num, :], truth[sample_num, :]
        )
        precisions_for_samples_by_classes[
            sample_num, pos_class_indices
        ] = precision_at_hits

    labels_per_class = np.sum(truth > 0, axis=0)
    # Guard the denominator so an input without any positive label yields
    # zero weights instead of a 0/0 division.
    total_labels = max(int(np.sum(labels_per_class)), 1)
    weight_per_class = labels_per_class / float(total_labels)

    # Average precision per class; classes without labels divide by 1 and stay 0.
    per_class_lwlrap = np.sum(precisions_for_samples_by_classes, axis=0) / np.maximum(
        1, labels_per_class
    )
    return per_class_lwlrap, weight_per_class

class ImprovedPANNsLoss(nn.Module):
    """Clip-level BCE plus an auxiliary BCE on the time-max of frame outputs.

    Args:
        output_key: Key of the model-output dict used for the main loss.
            ``"logit"`` selects ``BCEWithLogitsLoss``; any other key selects
            ``BCELoss`` (so that entry must already hold probabilities).
        weights: Two-element sequence ``(main_weight, auxiliary_weight)``.
    """

    def __init__(self, output_key="logit", weights=(1, 0.5)):
        super().__init__()

        self.output_key = output_key
        if output_key == "logit":
            self.normal_loss = nn.BCEWithLogitsLoss()
        else:
            self.normal_loss = nn.BCELoss()

        self.bce = nn.BCELoss()
        # Stored as an immutable copy so neither the default nor a
        # caller-owned list can be mutated through this instance.
        self.weights = tuple(weights)

    def forward(self, input, target):
        """Return the weighted sum of the main and auxiliary losses.

        Args:
            input: Dict containing ``self.output_key`` and ``"framewise_output"``.
            target: Tensor of targets; cast to float before use.
        """
        input_ = input[self.output_key]
        target = target.float()

        # Reduce the frame-wise output over dim 1 with a max to get one value
        # per class. BCELoss is applied to it, so the values must lie in [0, 1].
        framewise_output = input["framewise_output"]
        clipwise_output_with_max, _ = framewise_output.max(dim=1)

        normal_loss = self.normal_loss(input_, target)
        auxiliary_loss = self.bce(clipwise_output_with_max, target)

        return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss

In [None]:
class LightModel(LightningModule):
    """Lightning module that trains and validates ``model`` on a split of ``df``.

    ``df`` is split 80/20 (stratified on ``species_id``, fixed seed) into
    training and validation frames at construction time.

    Args:
        model: The network to train; called with a batch of waveforms and
            expected to return a dict consumed by ``ImprovedPANNsLoss``.
        df: Annotation DataFrame; must contain a ``species_id`` column.
        num_fold: Stored as ``use_fold``; not used elsewhere in this class.
        data_dir: Dataset root passed on to ``MelSpecDataset``.
        batch_size: Batch size for both dataloaders.
        num_workers: Worker count for both dataloaders.
        DEVICE: Device string the batches are moved to in each step.
    """

    def __init__(self, model, df, num_fold=0, data_dir=data_dir,
                 batch_size=6, num_workers=4, DEVICE="cuda:0"):
        super().__init__()
        self.model = model
        self.use_fold = num_fold
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.DEVICE = DEVICE
        self.loss_fn = ImprovedPANNsLoss()
        self.train_df, self.val_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df['species_id']
        )

    def forward(self, batch):
        return self.model(batch)

    def train_dataloader(self):
        """Return the shuffled training loader (incomplete last batch dropped)."""
        # All augmentations are currently disabled, so the Compose is empty.
        train_aug = aaug.Compose([
            # aaug.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.3),
            # aaug.AddGaussianSNR(p=0.3),
            # aaug.PitchShift(min_semitones=-4, max_semitones=4, p=0.3),
        ])

        return DataLoader(
            MelSpecDataset(
                data_dir=self.data_dir,
                df=self.train_df, is_train=True,
                waveform_transforms=train_aug
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True,
            pin_memory=True,
            drop_last=True
        )

    def val_dataloader(self):
        """Return the validation loader (no shuffling, no augmentation)."""
        # is_train=True here selects the 'train' audio folder and makes the
        # dataset return targets; the validation rows come from the train set.
        return DataLoader(
            MelSpecDataset(
                data_dir=self.data_dir,
                df=self.val_df, is_train=True
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        y, target = [x.to(self.DEVICE) for x in batch]
        # Call this module itself rather than a notebook-level global so the
        # step works for any instance, regardless of what `model` is bound to.
        output = self(y)
        return self.loss_fn(output, target)

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and the batch-level LWLRAP."""
        y, target = [x.to(self.DEVICE) for x in batch]
        output = self(y)
        loss = self.loss_fn(output, target)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)

        # Sigmoid is monotonic, so it does not change the ranking-based score
        # whether or not 'clipwise_output' is already a probability.
        p = torch.sigmoid(output['clipwise_output'])
        score_class, weight = lwlrap(target.cpu().numpy(), p.cpu().numpy())
        score = (score_class * weight).sum()
        # NOTE(review): the score is computed per batch; the epoch value is
        # whatever aggregation self.log applies — confirm this is intended.
        self.log("Lwlrap_epoch", score, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Return Adam with a cosine-annealing learning-rate schedule."""
        # Optimise this module's own parameters, not those of a global.
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
        lr_scheduler = {"scheduler": scheduler}
        return [optimizer], [lr_scheduler]

# Training

In [None]:
if not is_inference:
    # NOTE(review): LightModel is built with its default batch_size; the
    # notebook-level `batch_size` setting is not forwarded — confirm intent.
    model = LightModel(base_model, train_full_df)

    # TensorBoard event files are written below OUTPUT_DIR/lightning_logs.
    logger = TensorBoardLogger(save_dir=OUTPUT_DIR, name='lightning_logs')

    # Keep the three checkpoints with the highest logged 'Lwlrap_epoch'.
    checkpoint_name_template = OUTPUT_DIR + '{epoch:02d}-{val_loss:.2f}-{Lwlrap_epoch:.3f}'
    checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_name_template,
        prefix='Cnn14_DecisionLevelAtt_',
        monitor='Lwlrap_epoch',
        mode='max',
        save_top_k=3,
        verbose=1,
    )

    # Request one GPU when CUDA is available, otherwise none.
    use_gpu = torch.cuda.is_available()
    trainer = Trainer(
        gpus=int(use_gpu),
        max_epochs=EPOCH,
        gradient_clip_val=1,
        limit_val_batches=0.2,
        checkpoint_callback=checkpoint_callback,
        logger=logger,
    )

    trainer.fit(model)

# Inference

In [None]:
def read_audio_test(recording_id):
    """Load a test recording and split it into fixed-length segments.

    The recording is cut into consecutive windows of ``sample_rate * PERIOD``
    samples. If the last window would run past the end, it is taken from the
    end of the recording instead (overlapping the previous window). A
    recording shorter than one window is zero-padded to one full window.

    Args:
        recording_id: File stem of the recording inside ``<data_dir>/test``.

    Returns:
        Tensor of shape ``(num_segments, sample_rate * PERIOD)``.
    """
    # sr=None keeps the native sampling rate. The window length below uses
    # the configured sample_rate — assumes the files match it; TODO confirm.
    y, sr = librosa.load(
        os.path.join(data_dir, 'test', recording_id + '.flac'),
        sr=None, mono=True, res_type="kaiser_fast"
    )

    effective_length = sample_rate * PERIOD

    # Guarantee at least one full window so slicing never uses a negative start.
    if len(y) < effective_length:
        y = np.pad(y, (0, effective_length - len(y)), mode="constant")

    total = len(y)
    # Split for enough segments to not miss anything
    segments = int(np.ceil(total / effective_length))

    features = []
    for i in range(segments):
        start = i * effective_length
        end = start + effective_length
        # Last segment going from the end
        if end > total:
            start, end = total - effective_length, total
        # Slice into a separate name: reassigning `y` here would truncate the
        # recording and make every later segment a copy of the first one.
        features.append(y[start:end])

    # Stack first; building a tensor from a list of arrays is very slow.
    return torch.from_numpy(np.stack(features))

In [None]:
if is_inference:
    import copy

    # Checkpoints to ensemble and their (un-normalised) blending weights.
    paths = [
        os.path.join('../input/dddddd/Cnn14_DecisionLevelAtt_-epoch65-val_loss0.14-LwAP_epoch0.797.ckpt')
    ]
    weights = np.array([1.0])
    assert len(weights) == len(paths)
    weights /= weights.sum()

    # load model
    nets = []
    for path in paths:
        # Each net gets its own copy of the backbone; sharing one base_model
        # would let every checkpoint load overwrite the previous one.
        net = LightModel(copy.deepcopy(base_model), train_full_df).to(DEVICE)
        net.load_state_dict(torch.load(path, map_location=DEVICE)["state_dict"])
        net.eval()
        nets.append(net)

    # load test data
    test_file_list = data_test["recording_id"].to_list()

    # predict
    preds = []
    with torch.no_grad():
        for recording_id in tqdm(test_file_list):
            inputs = read_audio_test(recording_id).to(DEVICE)
            # For every net, take the per-class maximum logit over segments.
            per_net = np.stack([
                torch.max(net(inputs)["logit"], dim=0).values.detach().cpu().numpy()
                for net in nets
            ])
            # Blend across nets: (n_nets, n_classes) * (n_nets, 1), summed
            # over nets, gives one row of class scores per recording.
            preds.append(np.sum(per_net * weights[:, None], axis=0))
    preds = np.vstack(preds)

    # to output csv
    score_columns = [f"s{i}" for i in range(24)]
    sub = pd.DataFrame(preds, columns=score_columns)
    sub["recording_id"] = data_test["recording_id"].values[:len(sub)]
    sub = sub[["recording_id"] + score_columns]

    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False)