In [None]:
# Install the `resnest` package that provides the `resnest50` backbone imported
# below; the installer's stdout is redirected to /dev/null.
!pip install resnest > /dev/null

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import glob
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchvision import models
import torch.utils.data as data
from torch.utils.data import DataLoader

from pytorch_lightning.core import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer, seed_everything
from resnest.torch import resnest50

from scipy.io import wavfile
from skimage.transform import resize
import librosa
import cv2
from pathlib import Path
import warnings

import soundfile as sf
from sklearn.model_selection import StratifiedKFold

In [None]:
# Root folder of the competition data (the trailing slash is part of the value).
data_dir = "../input/rfcx-species-audio-detection/"

# Annotation table; the dataset class below reads its `recording_id`, `t_min`,
# `t_max` and `species_id` columns.
_train_tp_csv = os.path.join(data_dir, "train_tp.csv")
train_tp_df = pd.read_csv(_train_tp_csv)

## Data

In [None]:
def create_folds(df, k=5, random_state=None):
    """Return a shuffled copy of ``df`` with a ``kfold`` column of stratified fold ids.

    Rows are shuffled, then split into ``k`` folds stratified on the
    ``species_id`` column. Each row's ``kfold`` value is the index of the fold
    in which that row belongs to the validation split.

    Args:
        df: DataFrame with a ``species_id`` column. It is NOT modified.
        k: Number of folds.
        random_state: Optional seed forwarded to both the shuffle and the
            splitter. With the default ``None`` the global NumPy random state
            is used, so seeding NumPy beforehand still makes the result
            repeatable.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the extra ``kfold`` column.
    """
    # `sample` + `reset_index` build a new frame, so the `kfold` column is
    # added to the copy rather than to the caller's DataFrame.
    folds_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    folds_df["kfold"] = -1

    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    targets = folds_df.species_id.values
    for fold_idx, (_, val_idx) in enumerate(splitter.split(X=folds_df, y=targets)):
        # Positional indices equal labels here because the index was reset.
        folds_df.loc[val_idx, "kfold"] = fold_idx

    return folds_df

In [None]:
class MelSpecSlicesDataset(data.Dataset):
    """Dataset producing 3-channel mel-spectrogram images from audio recordings.

    Two modes are supported:

    * Train/validation (``is_testset=False``): each row of ``df`` (columns
      ``recording_id``, ``t_min``, ``t_max``, ``species_id``) yields one
      ``sample_size``-long audio window around the annotated event and a
      multi-hot label vector of length ``num_classes``.
    * Test (``is_testset=True``): each entry of ``testset_recording_ids``
      (file names inside ``<data_dir>/test``) yields the mel spectrograms of
      the whole recording split into equal chunks, plus the recording id.

    Args:
        data_dir: Root folder containing the ``train`` and ``test`` folders.
        df: Annotation table (train/validation mode only).
        num_classes: Length of the multi-hot label vector.
        sample_rate: Sample rate passed to ``librosa.load``.
        sample_size: Window length in samples; must equal ``sample_rate * 10``.
        melspectrogram_parameters: Extra keyword arguments for
            ``librosa.feature.melspectrogram``.
        img_height: Height the spectrogram is resized to.
        img_width: Width the spectrogram is resized to.
        is_train: Stored but not otherwise used by this class.
        is_validation: When True the window is placed deterministically;
            otherwise its start is drawn at random.
        is_testset: Selects test mode.
        testset_recording_ids: File names to serve in test mode.
    """

    def __init__(
        self,
        data_dir=None,
        df=None,
        num_classes=24,
        sample_rate=48000,
        sample_size=48000 * 10,
        melspectrogram_parameters=None,
        img_height=224,
        img_width=512,
        is_train=False,
        is_validation=False,
        is_testset=False,
        testset_recording_ids=None,
    ):
        assert sample_rate * 10 == sample_size

        self.data_dir = data_dir
        self.df = df
        self.num_classes = num_classes
        self.sample_rate = sample_rate
        self.sample_size = sample_size
        # `None` defaults + copies avoid sharing one mutable default object
        # between every instance of the class.
        self.melspectrogram_parameters = (
            dict(melspectrogram_parameters) if melspectrogram_parameters is not None else {}
        )
        self.img_height = img_height
        self.img_width = img_width
        self.is_train = is_train
        self.is_validation = is_validation
        self.is_testset = is_testset
        self.testset_recording_ids = (
            list(testset_recording_ids) if testset_recording_ids is not None else []
        )

    def __len__(self):
        """Number of items: test recordings in test mode, annotation rows otherwise."""
        if self.is_testset:
            # `df` is None in test mode, so its length cannot be used.
            return len(self.testset_recording_ids)
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(mel_spec, label)`` or, in test mode, ``(chunks, recording_id)``."""
        if self.is_testset:
            return self.get_testset_batch(self.testset_recording_ids[idx])

        recording_id = self.df.recording_id.values[idx]
        t_min = self.df.t_min.values[idx]
        t_max = self.df.t_max.values[idx]

        # The label comes from every annotated event overlapping the chosen
        # window, not only from the row at `idx`.
        wav_slice, label = self.get_wav_slice(recording_id, t_min, t_max)
        mel_spec = self.get_mel_spec(wav_slice)

        return mel_spec, label

    def get_labels(self, recording_id, beginning_time, ending_time):
        """Build the multi-hot label of a window given in SAMPLE offsets.

        Every species with an annotated event in ``recording_id`` that overlaps
        the window ``(beginning_time, ending_time)`` is set to 1.0.

        Returns:
            ``float32`` array of shape ``(num_classes,)``.
        """
        # Convert sample offsets to seconds, the unit used by t_min / t_max.
        beginning_time = beginning_time / self.sample_rate
        ending_time = ending_time / self.sample_rate

        # The window is required to lie within the first 60 seconds.
        assert beginning_time >= 0 and beginning_time <= 60
        assert ending_time >= 0 and ending_time <= 60

        events = self.df
        overlapping = events[
            (events["recording_id"] == recording_id)
            & (events["t_min"] < ending_time)
            & (events["t_max"] > beginning_time)
        ]

        label = np.zeros(self.num_classes, dtype=np.float32)
        for species_id in overlapping["species_id"].unique():
            label[int(species_id)] = 1.0

        return label

    def load_wav(self, recording_id):
        """Load one recording at ``sample_rate`` and return the waveform array.

        Test ids are used as file names as-is; train ids get ``.flac`` appended.
        """
        if self.is_testset:
            file_path = os.path.join(self.data_dir, "test", recording_id)
        else:
            file_path = os.path.join(self.data_dir, "train", recording_id + ".flac")

        wav, _ = librosa.load(file_path, sr=self.sample_rate)
        return wav

    def get_wav_slice(self, recording_id, t_min, t_max):
        """Cut a ``sample_size`` window around the event ``[t_min, t_max]`` (seconds).

        In validation mode the window is centred on the event (clamped to the
        start of the recording). Otherwise its start is drawn uniformly from
        ``[centre - sample_size / 2, centre)``. The window is then shifted back
        if it would run past the end of the recording.

        Returns:
            ``(wav_slice, labels)`` where ``labels`` covers the final window.
        """
        wav = self.load_wav(recording_id)

        t_min = t_min * self.sample_rate
        t_max = t_max * self.sample_rate

        # Work with integer sample offsets from here on.
        center = int(np.round((t_min + t_max) / 2))
        beginning = max(int(center - self.sample_size / 2), 0)

        # randint needs low < high; an event centred at sample 0 leaves no room
        # to jitter, so the deterministic position is kept in that case.
        if not self.is_validation and center > beginning:
            beginning = np.random.randint(beginning, center)

        ending = beginning + self.sample_size
        if ending > len(wav):
            ending = len(wav)
            # Clamp so a recording shorter than the window cannot produce a
            # negative start index.
            beginning = max(ending - self.sample_size, 0)

        wav_slice = wav[int(beginning) : int(ending)]

        labels = self.get_labels(recording_id, beginning, ending)
        return wav_slice, labels

    def get_mel_spec(self, wav_slice):
        """Convert a waveform into a ``(3, img_height, img_width)`` float32 image.

        The mel spectrogram is resized, min-max scaled to ``[0, 1]`` and
        repeated over three channels.
        """
        # `y` is passed by keyword: recent librosa releases reject positional audio.
        mel_spec = librosa.feature.melspectrogram(
            y=wav_slice, sr=self.sample_rate, **self.melspectrogram_parameters
        )
        mel_spec = resize(mel_spec, (self.img_height, self.img_width))

        # Normalize to 0...1 - this is what goes into neural net
        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            # A constant (e.g. silent) input has peak 0; dividing would yield NaN.
            mel_spec = mel_spec / peak

        # float32 keeps the array compatible with float32 model weights
        # (the dtype returned by `resize` may vary by scikit-image version —
        # TODO confirm).
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec)).astype(np.float32)
        return mel_spec

    def get_testset_batch(self, recording_id, num_chunks=6):
        """Return the mel spectrograms of a whole test recording, split in chunks.

        Args:
            recording_id: File name inside the ``test`` folder.
            num_chunks: Number of (near-)equal consecutive chunks.

        Returns:
            ``(list_of_mel_specs, recording_id_without_flac_suffix)``.
        """
        wav = self.load_wav(recording_id)

        # Split into n chunks
        return (
            list(map(self.get_mel_spec, np.array_split(wav, num_chunks))),
            recording_id.replace(".flac", ""),
        )

In [None]:
# Smoke-test the dataset in test-set mode on one recording.
# The listing is sorted so the same file is checked on every run.
test_dataset = MelSpecSlicesDataset(
    data_dir=data_dir,
    testset_recording_ids=sorted(os.listdir(os.path.join(data_dir, "test"))),
    df=None,
    num_classes=24,
    sample_rate=48000,
    sample_size=48000 * 10,
    melspectrogram_parameters = {
        "n_fft": 2048,
        "hop_length": 512,
        "n_mels": 256,
        "fmin": 24,
        "fmax": 24000,
        "power": 2,
    },
    img_height=224,
    img_width=512,
    is_validation=False,
    is_testset=True,
)

test_batch, recording_id = test_dataset[0]
print(len(test_batch))
print(test_batch[0].shape)
print(recording_id)

# Each file should be split into 6 chunks
assert len(test_batch) == 6

# The image input dimensions
assert test_batch[0].shape == (3, 224, 512)

# The first chunk must contain only finite values. (Comparing the array with
# itself, as done previously, could only ever fail when it contained NaN.)
assert np.isfinite(test_batch[0]).all()

# Test that other chunks are different from the first
assert not np.array_equal(test_batch[0], test_batch[1])
assert not np.array_equal(test_batch[0], test_batch[5])
del test_dataset, test_batch, recording_id

## Metrics

In [None]:
def _one_sample_positive_class_precisions(scores, truth):
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)

    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)

    retrieved_classes = np.argsort(scores)[::-1]

    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)

    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True

    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)

    precision_at_hits = retrieved_cumulative_hits[class_rankings[pos_class_indices]] / (
        1 + class_rankings[pos_class_indices].astype(np.float)
    )
    return pos_class_indices, precision_at_hits


def lwlrap(truth, scores):
    """Per-class label-ranking average precision and per-class label weights.

    Args:
        truth: ``(num_samples, num_classes)`` array; entries ``> 0`` are positives.
        scores: Array of the same shape holding the predicted scores.

    Returns:
        ``(per_class_lwlrap, weight_per_class)``. The first is the mean
        precision-at-hit of each class over the samples where it is positive;
        the second is each class's share of all positive labels. The overall
        score is ``(per_class_lwlrap * weight_per_class).sum()``.
    """
    assert truth.shape == scores.shape
    n_samples, n_classes = scores.shape

    # Row i holds the precision-at-hit of each positive class of sample i,
    # and zeros everywhere else.
    precision_matrix = np.zeros((n_samples, n_classes))
    for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
        positives, hits = _one_sample_positive_class_precisions(
            sample_scores, sample_truth
        )
        precision_matrix[row, positives] = hits

    positives_per_class = np.sum(truth > 0, axis=0)
    total_positives = float(np.sum(positives_per_class))
    weight_per_class = positives_per_class / total_positives

    # Classes that never occur divide by 1 instead of 0.
    denominators = np.maximum(1, positives_per_class)
    per_class_lwlrap = np.sum(precision_matrix, axis=0) / denominators
    return per_class_lwlrap, weight_per_class

## Loss Function

In [None]:
class FocalLoss(nn.Module):
    """Focal loss on raw logits for binary / multi-label targets.

    The element-wise binary cross-entropy is scaled by
    ``sigmoid(-logit * (2 * target - 1)) ** gamma``, which shrinks the
    contribution of elements that are already predicted well.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target):
        """Return the scalar loss.

        For 2-D inputs the element losses are summed over dim 1 before the
        mean is taken; otherwise the mean runs over all elements.
        """
        target = target.float()

        # Binary cross-entropy with logits, written in the log-sum-exp form
        # that subtracts max(-logit, 0) before exponentiating.
        shift = (-logit).clamp(min=0)
        log_term = ((-shift).exp() + (-logit - shift).exp()).log()
        bce = logit - logit * target + shift + log_term

        # log(sigmoid(-logit * s)) with s = +1 for positives and -1 for negatives.
        signed_target = target * 2.0 - 1.0
        log_modulator = F.logsigmoid(-logit * signed_target)
        focal = (log_modulator * self.gamma).exp() * bce

        if focal.dim() == 2:
            focal = focal.sum(dim=1)
        return focal.mean()

## Model

In [None]:
class RainforestModelPL(LightningModule):
    """Lightning module: ResNeSt-50 classifier trained on mel-spectrogram images.

    The pretrained ``resnest50`` backbone gets a new three-layer fully
    connected head with ``num_classes`` outputs and is trained with
    ``FocalLoss``. The module also builds its own train/validation datasets
    from ``train_folds_df`` (a table with a ``kfold`` column): rows whose
    ``kfold`` equals ``fold`` are used for validation, all others for training.

    Args:
        random_seed: Stored as a hyperparameter; not otherwise used here.
        fold: Index of the validation fold.
        lr: Initial learning rate of the SGD optimizer.
        num_classes: Number of output classes.
        batch_size: Batch size of both data loaders.
        num_workers: Worker processes of both data loaders.
        n_mels, fmin, fmax: Mel-spectrogram settings.
        img_height, img_width: Size of the network input image.
        sample_rate: Audio sample rate.
        sample_size: Audio window length in samples (``sample_rate * 10``).
        train_folds_df: Annotation table with the ``kfold`` column. Needed for
            training only; may be ``None`` when the module is loaded for
            inference.
    """

    def __init__(
        self,
        random_seed: int = 1234,
        fold: int = 0,
        lr: float = 0.001,
        num_classes: int = 24,
        batch_size: int = 16,
        num_workers: int = 6,
        n_mels: int = 224,
        fmin: int = 40,
        fmax: int = 24000,
        img_height: int = 224,
        img_width: int = 512,
        sample_rate: int = 48000,
        sample_size: int = 48000 * 10,
        train_folds_df = None
    ):
        super().__init__()

        assert sample_rate * 10 == sample_size

        # Only the scalar settings are recorded as hyperparameters. The fold
        # table is kept as a plain attribute so a whole DataFrame is not
        # written into the checkpoint / hparams file.
        self.save_hyperparameters(
            "random_seed",
            "fold",
            "lr",
            "num_classes",
            "batch_size",
            "num_workers",
            "n_mels",
            "fmin",
            "fmax",
            "img_height",
            "img_width",
            "sample_rate",
            "sample_size",
        )
        self.train_folds_df = train_folds_df

        self.melspectrogram_parameters = {
            "n_fft": 2048,
            "hop_length": 512,
            "n_mels": n_mels,
            "fmin": fmin,
            "fmax": fmax,
            "power": 2,
        }

        model = resnest50(pretrained=True)

        # Replace the backbone's classifier with a new head.
        model.fc = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, num_classes),
        )

        self.net = model
        self.criterion = FocalLoss()

    def forward(self, x):
        """Return the raw class logits for a batch of images."""
        return self.net(x)

    def training_step(self, batch, batch_idx):
        """Return the focal loss of one training batch."""
        data, target = batch
        output = self(data)
        loss = self.criterion(output, target)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the focal loss and the weighted LWLRAP of one validation batch."""
        data, target = batch
        output = self(data)
        val_loss = self.criterion(output, target)

        p = torch.sigmoid(output)

        # The metric is computed per batch; Lightning aggregates it per epoch.
        score_class, weight = lwlrap(target.cpu().numpy(), p.cpu().numpy())
        score = (score_class * weight).sum()

        self.log("val_lrap", score, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_loss', val_loss, on_step=False, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """SGD with momentum plus a step learning-rate schedule."""
        # Parameters from https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners/notebook
        # Try finding better params using something like Optuna
        # https://github.com/optuna/optuna/blob/master/examples/pytorch_lightning_simple.py
        optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr, weight_decay=0.0001, momentum=0.9)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

        lr_scheduler = {"scheduler": scheduler }
        return [optimizer], [lr_scheduler]

    def setup(self, stage=None):
        """Build the train and validation datasets for the configured fold.

        Raises:
            ValueError: If the module was created without ``train_folds_df``.
        """
        train_folds_df = self.train_folds_df
        if train_folds_df is None:
            raise ValueError(
                "train_folds_df is required to build the train/validation datasets"
            )

        is_val_row = train_folds_df.kfold == self.hparams.fold
        train_df = train_folds_df[~is_val_row].reset_index(drop=True)
        val_df = train_folds_df[is_val_row].reset_index(drop=True)

        # `data_dir` is the notebook-level path to the competition data.
        self.train_dataset = MelSpecSlicesDataset(
            data_dir=data_dir,
            df=train_df,
            num_classes=self.hparams.num_classes,
            sample_rate=self.hparams.sample_rate,
            sample_size=self.hparams.sample_size,
            img_height=self.hparams.img_height,
            img_width=self.hparams.img_width,
            melspectrogram_parameters=self.melspectrogram_parameters,
        )

        self.validation_dataset = MelSpecSlicesDataset(
            data_dir=data_dir,
            df=val_df,
            num_classes=self.hparams.num_classes,
            sample_rate=self.hparams.sample_rate,
            sample_size=self.hparams.sample_size,
            img_height=self.hparams.img_height,
            img_width=self.hparams.img_width,
            melspectrogram_parameters=self.melspectrogram_parameters,
            is_validation=True,
        )

    @staticmethod
    def _seed_worker(worker_id):
        """Give each DataLoader worker its own NumPy / ``random`` seed.

        The training dataset draws its crop position with ``np.random``;
        without this, forked workers start from identical NumPy state and
        produce the same "random" crops.
        """
        worker_seed = torch.initial_seed() % 2 ** 32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            # Present the samples in a new order every epoch.
            shuffle=True,
            pin_memory=False,
            worker_init_fn=self._seed_worker,

            # Drop the last batch if it's smaller than
            # the set batch size. Some implementations
            # of batch norm throw an error if there is
            # only one sample in the batch.
            drop_last=True,
        )

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return DataLoader(
            self.validation_dataset,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            shuffle=False,
            pin_memory=False,
            # Keep the final partial batch so every validation sample counts
            # towards the logged metrics.
            drop_last=False,
        )

In [None]:
# Fix every random number generator used in this notebook to one seed.
random_seed = 1234

# Python-level sources.
os.environ["PYTHONHASHSEED"] = str(random_seed)
random.seed(random_seed)

# NumPy.
np.random.seed(random_seed)

# PyTorch (CPU and CUDA) and Lightning's own helper.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
seed_everything(seed=random_seed)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
def train_fold(experiment_version, fold, max_epochs, train_folds_df):
    """Train one cross-validation fold and save its best checkpoint.

    Checkpoints go to
    ``./lightning_logs/melspec/version_<experiment_version>/fold_<fold>/`` and
    the one with the highest ``val_lrap`` is kept. TensorBoard logs are
    written under the same ``version_<experiment_version>`` directory.

    Args:
        experiment_version: Name of the experiment (without ``version_``).
        fold: Index of the fold held out for validation.
        max_epochs: Number of training epochs.
        train_folds_df: Annotation table with a ``kfold`` column.
    """
    log_dir = "./lightning_logs"
    run_name = "melspec"
    version_name = "version_" + experiment_version

    full_log_dir_path = os.path.join(
        log_dir, run_name, version_name, "fold_" + str(fold)
    )
    checkpoint_callback = ModelCheckpoint(
        monitor="val_lrap",
        dirpath=full_log_dir_path,
        # Only metrics that the model actually logs appear in the file name.
        filename="{epoch:02d}-{val_lrap:.3f}",
        mode="max",
    )

    logger = TensorBoardLogger(log_dir, name=run_name, version=version_name)

    trainer = Trainer(
        gpus=1,
        max_epochs=max_epochs,
        callbacks=[checkpoint_callback],
        logger=logger,
    )

    # `fold` must be forwarded: it selects which rows are used for validation.
    # `random_seed` is the notebook-level seed.
    model = RainforestModelPL(
        random_seed=random_seed,
        fold=fold,
        lr=0.01,
        train_folds_df=train_folds_df,
    )
    trainer.fit(model)

    print(
        "Completed training for experiment version: ",
        "version_" + experiment_version,
        "fold: " + str(fold),
    )

## Train Folds

In [None]:
# Experiment settings.
experiment_id = "resnest04"
max_epochs = 1

# Assign every annotation row to a fold, then train the selected folds.
train_folds_df = create_folds(train_tp_df)

# Only fold 0 is trained here; add 1, 2, 3, 4 to the tuple to train the rest.
for fold_to_train in (0,):
    train_fold(experiment_id, fold_to_train, max_epochs, train_folds_df)

## Test Set Inference & Create Submission

In [None]:
# Show what the training run above wrote. The directory is derived from
# `experiment_id` so this cell always points at the current experiment.
os.listdir(os.path.join("./lightning_logs/melspec", "version_" + experiment_id))

In [None]:
# List the checkpoints of every trained fold of the current experiment
# (same directory layout that `train_fold` writes to).
glob.glob(
    os.path.join(
        "./lightning_logs/melspec", "version_" + experiment_id, "fold_*", "*.ckpt"
    )
)

In [None]:
def create_submission(experiment, avg_score=None):
    """Run every fold checkpoint of ``experiment`` on the test set and save a CSV.

    For each test recording, every model scores all chunks of the recording;
    the per-class maximum over the chunks is taken for each model and these
    maxima are summed over the models. The values written are raw logits.

    Args:
        experiment: Directory name under ``./lightning_logs/melspec``
            (e.g. ``"version_resnest04"``).
        avg_score: Optional score that is appended to the file name.

    Returns:
        Path of the written CSV, inside ``<experiment dir>/subs``.

    Raises:
        FileNotFoundError: If the experiment has no ``fold_*/*.ckpt`` files.
    """
    print('Creating submission file for experiment: ', experiment)

    input_dir = "../input/rfcx-species-audio-detection"
    sub_df = pd.read_csv(os.path.join(input_dir, "sample_submission.csv"))
    pred_cols = sub_df.columns.tolist()[1:]
    sub_df[pred_cols] = None

    experiment_dir = os.path.join('./lightning_logs/melspec', experiment)
    subs_output_dir = os.path.join('./lightning_logs/melspec', experiment, 'subs')

    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _load_model(ckpt_fold_path):
        """Load one checkpoint in eval mode on the selected device."""
        rainforest_model = RainforestModelPL.load_from_checkpoint(
            ckpt_fold_path,
            map_location=device,
            hparams_file=os.path.join(experiment_dir, "hparams.yaml"),
        )
        rainforest_model.to(device)
        rainforest_model.eval()
        return rainforest_model

    checkpoints_paths = sorted(glob.glob(experiment_dir + "/fold_*/*.ckpt"))
    if not checkpoints_paths:
        raise FileNotFoundError(
            "No checkpoints found under " + experiment_dir + "/fold_*/*.ckpt"
        )
    nets = list(map(_load_model, checkpoints_paths))

    # All folds share one configuration; the first model provides it.
    reference_net = nets[0]
    test_recording_ids = os.listdir(os.path.join(input_dir, "test"))

    test_dataset = MelSpecSlicesDataset(
        data_dir=input_dir,
        testset_recording_ids=test_recording_ids,
        df=None,
        num_classes=reference_net.hparams.num_classes,
        sample_rate=reference_net.hparams.sample_rate,
        sample_size=reference_net.hparams.sample_size,
        # Reuse the exact spectrogram settings the model was built with.
        melspectrogram_parameters=dict(reference_net.melspectrogram_parameters),
        img_height=reference_net.hparams.img_height,
        img_width=reference_net.hparams.img_width,
        is_validation=False,
        is_testset=True,
    )

    with torch.no_grad():
        for index in tqdm(range(len(test_recording_ids))):
            chunks, recording_id = test_dataset[index]

            # One batch holding all chunks of the recording.
            batch = torch.from_numpy(np.stack(chunks)).float().to(device)

            # Aggregate predictions for all nets: one forward pass per net,
            # then the per-class maximum over the chunks.
            nets_preds = [
                torch.max(net(batch), dim=0).values.detach().cpu().numpy()
                for net in nets
            ]

            sub_df.loc[sub_df.recording_id == recording_id, pred_cols] = np.sum(
                np.stack(nets_preds), axis=0
            )

    print(sub_df.head())

    os.makedirs(subs_output_dir, exist_ok=True)
    file_name = experiment + "__agg"
    if avg_score is not None:
        file_name += "_avg_score_" + str(avg_score)
    output_path = os.path.join(subs_output_dir, file_name + ".csv")
    sub_df.to_csv(output_path, index=False)
    print("Created agg submission for experiment: ", experiment)
    return output_path
    


In [None]:
# Build the submission for the experiment id set in the training cell;
# the "version_" prefix matches the directory name used by `train_fold`.
create_submission("version_" + experiment_id)

### References
- https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners/notebook
- https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418