In [None]:
!pip install  timm > /dev/null
!pip install pydicom > /dev/null
!pip install catalyst > /dev/null
!pip install colorednoise > /dev/null

In [None]:
import numpy as np
import librosa as lb
import pandas as pd
from pathlib import Path
import torch
from  torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
import sys    
import colorednoise as cn
from skimage.transform import resize
import os
from glob import glob
from sklearn.model_selection import GroupKFold, StratifiedKFold
import cv2
from skimage import io
from torch import nn
from datetime import datetime
import time
import random
import cv2
import torchvision
from torchvision import transforms
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from  torch.cuda.amp import autocast, GradScaler
import timm
import sklearn
import warnings
import joblib
from sklearn.metrics import roc_auc_score, log_loss
from sklearn import metrics
import warnings
import librosa
import pydicom
from scipy.ndimage.interpolation import zoom

In [None]:
# Central configuration shared by the dataset, model and training cells below.
CFG = {
    'fold_num': 5,  # n_splits for StratifiedKFold
    'seed': 2020,  # passed to seed_everything and used as the fold-split random_state
    'model_arch': 'resnest50d',  # architecture name handed to timm.create_model
    'epochs': 30,  # epochs trained per fold
    'train_bs': 16,  # training DataLoader batch size
    'valid_bs': 32,  # validation DataLoader batch size
    'T_0': 10,  # T_0 of CosineAnnealingWarmRestarts
    'lr': 1e-4,  # Adam learning rate
    'min_lr': 1e-6,  # eta_min of CosineAnnealingWarmRestarts
    'weight_decay':1e-6,  # Adam weight decay
    'num_workers': 4,  # DataLoader worker processes
    'accum_iter': 1, # gradient accumulation: the optimizer steps once every `accum_iter` batches (larger effective batch size)
    'verbose_step': 1,  # the progress-bar description is refreshed every `verbose_step` batches
    'device': 'cuda:0',
    'num_classes': 25,  # length of the target vector; RFCXDataset sets index 24 when the crop carries no label
    "sr": 48000,  # sampling rate given to the mel computation and used to size the crop (presumably Hz -- audio files are loaded at their native rate)
    "duration": 10,  # crop length is duration * sr samples
    "fft": 2048,  # n_fft for the mel spectrogram
    "hop": 512,  # hop_length for the mel spectrogram
    "data_root": Path("../input/rfcx-species-audio-detection"),
    "train_root": Path("../input/rfcx-species-audio-detection/train"),  # directory the training .flac files are read from
    "test_root": Path("../input/rfcx-species-audio-detection/test")  # default root of get_duration
}

In [None]:
# import timm
# from pprint import pprint
# model_names = timm.list_models(pretrained=True)
# pprint(model_names)

In [None]:
# Load the two annotation tables shipped with the dataset. The directory comes
# from CFG["data_root"] so the dataset location is defined in exactly one place.
# (File names suggest true-positive / false-positive annotations -- confirm
# against the dataset description.)
traint = pd.read_csv(CFG["data_root"] / "train_tp.csv")
trainf = pd.read_csv(CFG["data_root"] / "train_fp.csv")
traint.shape, trainf.shape

In [None]:
# Frequency band spanned by the annotations in `traint`.
# The sentinels (24000 / 0) are kept as bounds so the result matches a scan that
# starts from them and only ever tightens.
fmin = min(24000, float(traint["f_min"].min()))
fmax = max(0, float(traint["f_max"].max()))

# Widen the band by 10% on each side as a safety margin.
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)
print(f'Minimum frequency: {fmin}, maximum frequency: {fmax}')

# Publish the band through the shared configuration.
CFG["f_min"] = fmin
CFG["f_max"] = fmax

# Data Augmentation

In [None]:
class AudioTransform:
    """Base class for stochastic waveform augmentations.

    Subclasses implement `apply`. Calling the instance runs `apply` either
    unconditionally (`always_apply`) or with probability `p`; otherwise the
    input is returned untouched.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray):
        # Short-circuit: no random number is drawn when always_apply is set.
        if self.always_apply or np.random.rand() < self.p:
            return self.apply(y)
        return y

    def apply(self, y: np.ndarray):
        """Transform `y`; must be overridden by subclasses."""
        raise NotImplementedError


class wave_Compose:
    """Chain several waveform transforms, feeding each output into the next."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        result = y
        for step in self.transforms:
            result = step(result)
        return result


class wave_OneOf:
    """Apply exactly one transform, picked uniformly at random on every call."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        chosen = self.transforms[np.random.choice(len(self.transforms))]
        return chosen(y)

In [None]:
class AddGaussianNoise(AudioTransform):
    """Add white Gaussian noise whose scale is drawn from [0, max_noise_amplitude)."""

    def __init__(self, always_apply=False, p=0.5, max_noise_amplitude=0.5, **kwargs):
        super().__init__(always_apply, p)

        # (low, high) range the noise scale is sampled from; extra kwargs are ignored.
        self.noise_amplitude = (0.0, max_noise_amplitude)

    def apply(self, y: np.ndarray, **params):
        low, high = self.noise_amplitude
        scale = np.random.uniform(low, high)
        noise = np.random.randn(len(y))
        noisy = y + noise * scale
        # Cast back so the output keeps the input dtype.
        return noisy.astype(y.dtype)
    
class GaussianNoiseSNR(AudioTransform):
    """Add white noise at a random peak signal-to-noise ratio (in dB).

    The ratio is drawn uniformly from [min_snr, max_snr) and is measured between
    the peak absolute amplitude of the signal and that of the noise.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        # Peak amplitude the noise must have to reach the sampled ratio.
        target_peak = signal_peak / (10 ** (snr_db / 20))

        noise = np.random.randn(len(y))
        noise_peak = np.sqrt(noise ** 2).max()
        mixed = y + noise * 1 / noise_peak * target_peak
        return mixed.astype(y.dtype)

class PinkNoiseSNR(AudioTransform):
    """Add power-law noise from `colorednoise` (exponent 1) at a random peak SNR in dB.

    The ratio is drawn uniformly from [min_snr, max_snr) and is measured between
    the peak absolute amplitude of the signal and that of the noise.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        # Peak amplitude the noise must have to reach the sampled ratio.
        target_peak = signal_peak / (10 ** (snr_db / 20))

        noise = cn.powerlaw_psd_gaussian(1, len(y))
        noise_peak = np.sqrt(noise ** 2).max()
        mixed = y + noise * 1 / noise_peak * target_peak
        return mixed.astype(y.dtype)

class PitchShift(AudioTransform):
    """Shift the pitch by a random whole number of steps in [-max_steps, max_steps].

    Args:
        always_apply: apply on every call instead of with probability `p`.
        p: probability of applying the transform.
        max_steps: largest shift magnitude, in the `n_steps` unit of
            `librosa.effects.pitch_shift`.
        sr: sampling rate of the waveform handed to `apply`. The default of
            32000 differs from CFG["sr"]; callers should pass the real rate.
    """

    def __init__(self, always_apply=False, p=0.3, max_steps=5, sr=32000):
        super().__init__(always_apply, p)

        self.max_steps = max_steps
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        # np.random.randint excludes its upper bound; without the +1 the range
        # would be lopsided (+max_steps never drawn) and max_steps=0 would raise.
        n_steps = np.random.randint(-self.max_steps, self.max_steps + 1)
        augmented = librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)
        return augmented
    
class TimeStretch(AudioTransform):
    """Stretch or compress the waveform in time by a random rate.

    Args:
        always_apply: apply on every call instead of with probability `p`.
        p: probability of applying the transform.
        max_rate: upper bound of the sampled rate (must be > 0).
        min_rate: lower bound of the sampled rate (must be > 0). Defaults to
            1 / max_rate so that speed-ups and slow-downs are symmetric.

    Note:
        The output length differs from the input length, so time annotations
        expressed against the original signal no longer line up afterwards.
    """

    def __init__(self, always_apply=False, p=0.5, max_rate=1.2, min_rate=None):
        super().__init__(always_apply, p)

        if max_rate <= 0:
            raise ValueError("`max_rate` must be positive")
        self.max_rate = max_rate
        self.min_rate = 1.0 / max_rate if min_rate is None else min_rate
        if self.min_rate <= 0:
            raise ValueError("`min_rate` must be positive")

    def apply(self, y: np.ndarray, **params):
        # A lower bound of 0 (as previously sampled) allows rates arbitrarily
        # close to zero, which are invalid or blow the output up to huge lengths.
        low, high = sorted((self.min_rate, self.max_rate))
        rate = np.random.uniform(low, high)
        # `rate` is keyword-only in recent librosa; the keyword form works in older releases too.
        augmented = librosa.effects.time_stretch(y, rate=rate)
        return augmented

class TimeShift(AudioTransform):
    """Shift the waveform in time by up to `max_shift_second` seconds either way.

    Args:
        always_apply: apply on every call instead of with probability `p`.
        p: probability of applying the transform.
        max_shift_second: largest shift magnitude in seconds.
        sr: sampling rate used to convert seconds to samples. The default of
            32000 differs from CFG["sr"]; callers should pass the real rate.
        padding_mode: "replace" wraps the samples around (np.roll);
            "zero" silences the region that wrapped around.
    """

    def __init__(self, always_apply=False, p=0.5, max_shift_second=2, sr=32000, padding_mode="replace"):
        super().__init__(always_apply, p)

        assert padding_mode in ["replace", "zero"], "`padding_mode` must be either 'replace' or 'zero'"
        self.max_shift_second = max_shift_second
        self.sr = sr
        self.padding_mode = padding_mode

    def apply(self, y: np.ndarray, **params):
        # int() tolerates fractional seconds; +1 makes the exclusive upper
        # bound of randint symmetric and keeps a zero-width range valid.
        max_shift = int(self.sr * self.max_shift_second)
        shift = np.random.randint(-max_shift, max_shift + 1)
        augmented = np.roll(y, shift)  # np.roll returns a new array; `y` is not modified
        if self.padding_mode == "zero":
            if shift > 0:
                augmented[:shift] = 0
            elif shift < 0:
                augmented[shift:] = 0
            # shift == 0: nothing wrapped around. `augmented[0:] = 0` would
            # silence the entire signal, so it is deliberately skipped.
        return augmented

class VolumeControl(AudioTransform):
    """Scale the waveform by a random gain of up to +/- `db_limit` decibels.

    Args:
        always_apply: apply on every call instead of with probability `p`.
        p: probability of applying the transform.
        db_limit: the gain in dB is drawn uniformly from [-db_limit, db_limit).
        mode: "uniform" applies one constant gain; "fade" ramps the dB value
            linearly from full at the first sample to 0 at the last; "cosine"
            and "sine" modulate the dB value with one full period over the clip.
    """

    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
        super().__init__(always_apply, p)

        assert mode in ["uniform", "fade", "cosine", "sine"], \
            "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit= db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.db_limit, self.db_limit)
        n_samples = len(y)
        if self.mode == "uniform":
            gain = 10 ** (db / 20)
        elif self.mode == "fade":
            # max(..., 1) avoids 0/0 for a single-sample input.
            ramp = np.arange(n_samples)[::-1] / max(n_samples - 1, 1)
            gain = 10 ** (db * ramp / 20)
        elif self.mode == "cosine":
            cosine = np.cos(np.arange(n_samples) / n_samples * np.pi * 2)
            gain = 10 ** (db * cosine / 20)
        else:
            sine = np.sin(np.arange(n_samples) / n_samples * np.pi * 2)
            gain = 10 ** (db * sine / 20)
        augmented = y * gain
        # The per-sample gain curves are float64 and would silently upcast a
        # float32 waveform; keep the input dtype like the noise transforms do.
        if np.issubdtype(y.dtype, np.floating):
            augmented = augmented.astype(y.dtype, copy=False)
        return augmented

# Utils

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Covers Python's `random`, the hash seed environment variable, NumPy, and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

In [None]:
def get_duration(audio_name, root=CFG["test_root"]):
    """Return the duration reported by librosa for `<root>/<audio_name>.flac`.

    Args:
        audio_name: recording identifier (file stem).
        root: directory containing the file; the default is bound to
            CFG["test_root"] at the time this function is defined.
    """
    flac_path = root.joinpath(audio_name).with_suffix(".flac")
    return lb.get_duration(filename=flac_path)

# Dataset

In [None]:
# Work on a copy of the `traint` annotation table and attach the duration of every
# recording, as reported by get_duration for the matching training file.
train_tab = traint.copy()
train_tab["duration"] = train_tab["recording_id"].map(
    lambda rec_id: get_duration(rec_id, CFG["train_root"])
)
# A negative argument to head() shows every row except the last 10.
train_tab.head(-10)

In [None]:
class MelSpecComputer:
    """Callable that turns a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: sampling rate passed to librosa.
        fft: FFT window size (`n_fft`).
        hop: hop length between frames (`hop_length`).
        fmin: lowest frequency of the mel filter bank.
        fmax: highest frequency of the mel filter bank (None lets librosa choose).
    """

    def __init__(self, sr, fft, hop, fmin, fmax):
        self.sr = sr
        self.fft = fft
        self.hop = hop
        self.fmin = fmin
        self.fmax = fmax

    def __call__(self, y):
        """Return the float32 mel spectrogram of waveform `y`."""
        # `y` is keyword-only in recent librosa releases; the keyword form is
        # accepted by older releases as well.
        melspec = librosa.feature.melspectrogram(y=y, sr=self.sr, n_fft=self.fft,
                                                 hop_length=self.hop, fmin=self.fmin,
                                                 fmax=self.fmax, power=1.5)
        # NOTE(review): the spectrogram is computed with power=1.5 but converted
        # with power_to_db -- kept as in the original pipeline; confirm intent.
        melspec = lb.power_to_db(melspec).astype(np.float32)
        return melspec

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Convert a 2-D array into a 3-channel uint8 image.

    The array is replicated over three channels, standardised, then min-max
    scaled to [0, 255].

    Args:
        X: 2-D array (e.g. a spectrogram).
        eps: numerical floor for the std division and the flatness check.
        mean: value(s) subtracted during standardisation; a scalar or an array
            broadcastable to (H, W, 3). Defaults to the mean of the stacked array.
        std: value(s) divided by during standardisation; same conventions as
            `mean`. Defaults to the std of the stacked array.

    Returns:
        uint8 array of shape (H, W, 3); all zeros when the standardised input
        is (numerically) constant.
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` rather than `mean or ...`: a legitimate 0.0 must
    # not be replaced, and truth-testing an array of statistics would raise.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def normalize(image, mean=None, std=None):
    """Scale an (H, W, C) image to [0, 1] and return it channels-first as float32.

    `mean`/`std` standardisation is applied only when both are provided.
    """
    scaled = image / 255.0
    standardise = mean is not None and std is not None
    if standardise:
        scaled = (scaled - mean) / std
    channels_first = np.moveaxis(scaled, 2, 0)
    return channels_first.astype(np.float32)

def crop(y, length, t_min, t_max, sr):
    """Cut a random window out of `y` and report whether it overlaps an event.

    A centre sample is drawn uniformly from [0, len(y)] and a window of
    `length` samples is placed around it. The window is shifted (not
    truncated) so it stays inside the signal; it is shorter than `length`
    only when the signal itself is shorter.

    Args:
        y: 1-D waveform.
        length: window length in samples.
        t_min: event start in seconds.
        t_max: event end in seconds.
        sr: sampling rate used to convert the event times to samples.

    Returns:
        (segment, is_label): `segment` is the float32 window; `is_label` is
        True when the window overlaps [t_min, t_max] and None otherwise
        (None rather than False is kept for backward compatibility).
    """
    total = len(y)
    length = int(length)

    center = random.randint(0, total)
    # Clamp on both sides so windows near either edge keep the full length.
    start = max(0, min(center - length // 2, total - length))
    end = min(total, start + length)

    segment = y[start:end].astype(np.float32, copy=False)

    # Standard interval-overlap test; also covers the boundary-equality cases
    # that a chain of strict start/end comparisons misses.
    overlaps = start < t_max * sr and end > t_min * sr
    is_label = True if overlaps else None

    return segment, is_label

In [None]:
def get_wave_transforms():
    """Build the waveform augmentation pipeline (currently pitch shifting only)."""
    # Disabled candidates, kept for reference:
    #   wave_OneOf([GaussianNoiseSNR(min_snr=10), PinkNoiseSNR(min_snr=10)])
    #   TimeStretch()
    #   TimeShift(sr=CFG["sr"])
    #   VolumeControl(mode="sine")
    active_steps = [
        PitchShift(max_steps=2, sr=CFG["sr"]),
    ]
    return wave_Compose(active_steps)

In [None]:
import albumentations
from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, 
    ShiftScaleRotate, CenterCrop, Resize
)

from albumentations.pytorch import ToTensorV2

In [None]:
# def get_img_transforms():
#     return Compose([
# #             Resize(CFG['img_size'], CFG['img_size']),
#             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
#             ToTensorV2(p=1.0),
#         ], p=1.)

In [None]:
class RFCXDataset(Dataset):
    """Dataset yielding mel-spectrogram images (and one-hot targets) per annotation row.

    Each item loads the recording of one row of `df`, optionally augments the
    waveform, takes a random crop, and converts it to a (3, 224, 400) float32
    image. When `output_label` is set, a one-hot target of length
    `num_classes` is returned as well: the row's `species_id` when the crop
    overlaps the annotated event, otherwise the last class
    (`num_classes - 1`), which serves as the "no event in crop" class.

    Args:
        df: annotation table with `recording_id`, `t_min`, `t_max` and
            (for labels) `species_id` columns.
        data_root: directory (pathlib.Path) containing the .flac files.
        sr: sampling rate used for the mel computation and to size the crop.
        fft, hop: STFT window size and hop length.
        duration: crop length in seconds.
        num_classes: length of the target vector.
        fmin, fmax: frequency band of the mel filter bank.
        wave_transforms: optional callable applied to the raw waveform.
        img_transforms: stored but not applied anywhere in this class.
        output_label: whether `__getitem__` returns `(img, target)` or `img`.
    """

    def __init__(self, df, data_root, sr, fft, hop, duration, num_classes, fmin=20, 
                 wave_transforms=None, img_transforms=None, output_label=True, fmax=None):
        
        super().__init__()
        self.df = df.reset_index(drop=True).copy()
        self.wave_transforms = wave_transforms
        self.img_transforms = img_transforms
        self.data_root = data_root
        self.output_label = output_label        
        self.sr = sr
        self.fft = fft
        self.hop = hop
        self.fmin = fmin
        self.fmax = fmax 
        self.num_classes = num_classes
        self.duration = duration
        # Crop length in samples.
        self.audio_length = self.duration*self.sr
        self.mel_spec_computer = MelSpecComputer(sr=self.sr, fft=self.fft, hop=self.hop, fmin=self.fmin, fmax=self.fmax)

    def __len__(self):
        return self.df.shape[0]

    def read_index(self, index):
        """Load, augment and crop the recording of row `index`.

        Returns:
            (waveform, is_label) exactly as produced by `crop`.
        """
        d = self.df.iloc[index]
        record = d["recording_id"]
        # sr=None keeps the file's native sampling rate.
        y, y_sr = lb.load(self.data_root.joinpath(record).with_suffix(".flac").as_posix(),sr=None)
        if self.wave_transforms is not None:
            y = self.wave_transforms(y)
        # NOTE(review): the window length is derived from self.sr while the
        # event times are converted with the file's rate y_sr -- these agree
        # only if every file is sampled at self.sr; confirm for the dataset.
        y, is_label = crop(y, self.audio_length, d["t_min"], d["t_max"], y_sr) 
        return y, is_label

    def _make_target(self, index, is_label):
        """Build the one-hot target for row `index`.

        Uses the instance's own `num_classes` (not the global configuration),
        so the dataset stays consistent with how it was constructed.
        """
        target = np.zeros(self.num_classes)
        if is_label:
            target[self.df.iloc[index]['species_id']] = 1
        else:
            # Last class index marks a crop without the annotated event.
            target[self.num_classes - 1] = 1
        return target
    
    def __getitem__(self, index: int):
        y, is_label = self.read_index(index)

        if self.output_label:
            target = self._make_target(index, is_label)

        melspec = self.mel_spec_computer(y) 
        img = resize(melspec, (224, 400))
        img = mono_to_color(img)
        img = normalize(img)
        
        if self.output_label:
            return img, target
        else:
            return img

In [None]:
# Dataset instance used only for the visual sanity check in the next cells.
# The mel band is taken from CFG["f_min"] / CFG["f_max"], which were derived
# from the annotations above and were otherwise never consumed.
ds = RFCXDataset(train_tab, data_root=CFG["train_root"], sr=CFG["sr"], fft=CFG["fft"], hop=CFG["hop"], 
                 duration=CFG["duration"], num_classes=CFG["num_classes"], 
                 fmin=CFG["f_min"], fmax=CFG["f_max"],
                 wave_transforms=get_wave_transforms(), img_transforms=None)

In [None]:
# Fetch the first sample (image, target) and inspect it.
x, y = ds[0]
x.shape, y

In [None]:
# The sample is channels-first (C, H, W); imshow expects channels-last (H, W, C).
# `x` is deliberately not reassigned so the cell gives the same picture when re-run.
plt.imshow(np.moveaxis(x, 0, -1))

In [None]:
class ImgClassifier(nn.Module):
    """timm backbone followed by a small fully connected classification head.

    The head consumes a 1000-dimensional backbone output (assumes the timm
    model keeps a 1000-way output -- TODO confirm for other architectures) and
    returns raw logits; no sigmoid is applied here.

    Args:
        model_arch: architecture name passed to `timm.create_model`.
        n_classes: number of output logits.
        pretrained: forwarded to `timm.create_model`.
    """

    def __init__(self, model_arch, n_classes, pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_arch, pretrained=pretrained)
        # Layer order is significant: it fixes the state_dict keys of the head.
        head_layers = [
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1000, 1000),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(500, n_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        backbone_out = self.model(x)
        return self.classifier(backbone_out)

In [None]:
# model = ImgClassifier(CFG['model_arch'], train_tab.species_id.nunique(), pretrained=True)
# print(model)

In [None]:
def prepare_dataloader(df, trn_idx, val_idx):
    """Build the training and validation DataLoaders for one fold.

    Args:
        df: full annotation table.
        trn_idx: positional row indices of the training split.
        val_idx: positional row indices of the validation split.

    Returns:
        (train_loader, val_loader). Both datasets are built without waveform
        or image transforms; the training loader shuffles, the validation
        loader does not.
    """

    def _build_dataset(frame):
        # One construction path for both splits so they cannot drift apart.
        # The mel band comes from CFG["f_min"] / CFG["f_max"], derived from the
        # annotations earlier in the notebook.
        return RFCXDataset(frame, data_root=CFG["train_root"], sr=CFG["sr"], fft=CFG["fft"],
                           hop=CFG["hop"], duration=CFG["duration"], num_classes=CFG["num_classes"],
                           fmin=CFG["f_min"], fmax=CFG["f_max"],
                           wave_transforms=None, img_transforms=None, output_label=True)

    train_ = df.iloc[trn_idx].reset_index(drop=True)
    valid_ = df.iloc[val_idx].reset_index(drop=True)
    train_ds = _build_dataset(train_)
    valid_ds = _build_dataset(valid_)

    # No class-balancing sampler is used. (The previously imported, unused
    # catalyst BalanceClassSampler was dropped; re-import it if balancing is wanted.)
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=CFG['train_bs'],
        pin_memory=False,
        drop_last=False,
        shuffle=True,        
        num_workers=CFG['num_workers'],
    )
    val_loader = torch.utils.data.DataLoader(
        valid_ds, 
        batch_size=CFG['valid_bs'],
        num_workers=CFG['num_workers'],
        shuffle=False,
        pin_memory=False,
    )
    return train_loader, val_loader

def train_one_epoch(epoch, model, loss_fn, optimizer, train_loader, device, scheduler=None, schd_batch_update=False, scaler=None):
    """Train `model` for one epoch with mixed precision and gradient accumulation.

    Args:
        epoch: epoch number, used only in the progress-bar text.
        model: network to train (switched to train mode).
        loss_fn: callable `(predictions, targets) -> loss`.
        optimizer: optimizer stepped every CFG['accum_iter'] batches and on
            the last batch of the epoch.
        train_loader: iterable of `(images, labels)` batches.
        device: device the batches are moved to.
        scheduler: optional LR scheduler.
        schd_batch_update: step the scheduler after every optimizer step
            (True) or once at the end of the epoch (False).
        scaler: GradScaler to use. When omitted, the notebook-level global
            named `scaler` is used, as before.

    Raises:
        RuntimeError: if no scaler is passed and no global `scaler` exists.
    """
    grad_scaler = scaler if scaler is not None else globals().get('scaler')
    if grad_scaler is None:
        raise RuntimeError("train_one_epoch needs a GradScaler: pass `scaler=` or define a global `scaler`")

    accum_iter = CFG['accum_iter']
    n_batches = len(train_loader)

    model.train()

    running_loss = None

    pbar = tqdm(enumerate(train_loader), total=n_batches)
    for step, (imgs, image_labels) in pbar:
        imgs = imgs.to(device)
        image_labels = image_labels.to(device).float()

        # Only the forward pass and the loss belong under autocast.
        with autocast():
            image_preds = model(imgs).float()   
            loss = loss_fn(image_preds, image_labels)

        # Dividing by accum_iter makes the accumulated gradient an average, so
        # the effective step size does not grow with the accumulation count.
        grad_scaler.scale(loss / accum_iter).backward()

        # Exponential moving average of the (undivided) batch loss for display.
        loss_value = loss.item()
        if running_loss is None:
            running_loss = loss_value
        else:
            running_loss = running_loss * .99 + loss_value * .01

        is_last_batch = (step + 1) == n_batches
        if ((step + 1) % accum_iter == 0) or is_last_batch:
            grad_scaler.step(optimizer)
            grad_scaler.update()
            optimizer.zero_grad() 

            if scheduler is not None and schd_batch_update:
                scheduler.step()

        if ((step + 1) % CFG['verbose_step'] == 0) or is_last_batch:
            description = f'epoch {epoch} loss: {running_loss:.4f}'
            pbar.set_description(description)

    if scheduler is not None and not schd_batch_update:
        scheduler.step()
        
def valid_one_epoch(epoch, model, loss_fn, val_loader, device, scheduler=None, schd_loss_update=False):
    """Evaluate `model` on the validation loader.

    Prints the fraction of samples whose arg-max prediction equals the arg-max
    of the target vector and returns the sample-weighted mean loss.

    Args:
        epoch: epoch number, used only in the progress-bar text.
        model: network to evaluate (switched to eval mode).
        loss_fn: callable `(predictions, targets) -> loss`.
        val_loader: iterable of `(images, labels)` batches.
        device: device the batches are moved to.
        scheduler: optional LR scheduler stepped once after evaluation.
        schd_loss_update: step the scheduler with the mean loss (True) or
            without an argument (False).

    Returns:
        Mean validation loss over all samples.
    """
    model.eval()

    loss_sum = 0
    sample_num = 0
    image_preds_all = []
    image_targets_all = []

    # Gradients are never needed here; disabling them inside the function
    # keeps it safe even if a caller forgets the no_grad context.
    with torch.no_grad():
        pbar = tqdm(enumerate(val_loader), total=len(val_loader))
        for step, (imgs, image_labels) in pbar:
            imgs = imgs.to(device).float()
            image_labels = image_labels.to(device).float()

            image_preds = model(imgs).float()   
            image_preds_all += [torch.argmax(image_preds, 1).detach().cpu().numpy()]
            image_targets_all += [torch.argmax(image_labels, 1).cpu().numpy()]

            loss = loss_fn(image_preds, image_labels)

            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_sum += loss.item()*image_labels.shape[0]
            sample_num += image_labels.shape[0]  

            if ((step + 1) % CFG['verbose_step'] == 0) or ((step + 1) == len(val_loader)):
                description = f'epoch {epoch} loss: {loss_sum/sample_num:.4f}'
                pbar.set_description(description)

    image_preds_all = np.concatenate(image_preds_all)
    image_targets_all = np.concatenate(image_targets_all)
    print('validation multi-label accuracy = {:.4f}'.format((image_preds_all==image_targets_all).mean()))

    if scheduler is not None:
        if schd_loss_update:
            scheduler.step(loss_sum/sample_num)
        else:
            scheduler.step()

    return loss_sum/sample_num

In [None]:
if __name__ == '__main__':
    # Cross-validated training: one freshly initialised model per fold, a
    # checkpoint written after every epoch.

    seed_everything(CFG['seed'])

    # Checkpoints are written through an explicit path. Changing the working
    # directory back and forth would leave the process in a different
    # directory than it started in and silently re-anchor every relative path.
    checkpoint_dir = Path('/kaggle/working')

    folds = StratifiedKFold(n_splits=CFG['fold_num'], shuffle=True, random_state=CFG['seed']).split(np.arange(train_tab.shape[0]), train_tab.species_id.values)

    for fold, (trn_idx, val_idx) in enumerate(folds):
        # To train a single fold only, `break` here when fold > 0.

        print('Training with {} started'.format(fold))

        print(len(trn_idx), len(val_idx))
        train_loader, val_loader = prepare_dataloader(train_tab, trn_idx, val_idx)

        device = torch.device(CFG['device'])

        model = ImgClassifier(CFG['model_arch'], CFG["num_classes"], pretrained=True).to(device)
        # `scaler` must stay a notebook-level name: train_one_epoch picks it up from there.
        scaler = GradScaler()   
        optimizer = torch.optim.Adam(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CFG['T_0'], T_mult=1, eta_min=CFG['min_lr'], last_epoch=-1)
        loss_fn = nn.BCEWithLogitsLoss().to(device)
        best_loss = 1e6

        for epoch in range(CFG['epochs']):
            # The scheduler is stepped once per epoch inside train_one_epoch.
            train_one_epoch(epoch, model, loss_fn, optimizer, train_loader, device, scheduler=scheduler, schd_batch_update=False)

            with torch.no_grad():
                loss = valid_one_epoch(epoch, model, loss_fn, val_loader, device, scheduler=None, schd_loss_update=False)

            checkpoint_name = '{}_fold_{}_{}'.format(CFG['model_arch'], fold, epoch)
            torch.save(model.state_dict(), checkpoint_dir / checkpoint_name)

            # Report the checkpoint whenever the validation loss matches or beats the best so far.
            if loss <= best_loss:
                best_loss = loss
                print(checkpoint_name)

        # Release GPU memory before the next fold builds a new model.
        del model, optimizer, train_loader, val_loader, scaler, scheduler
        torch.cuda.empty_cache()