In [None]:
!pip install  resnest > /dev/null

In [None]:
import numpy as np
import librosa as lb
import pandas as pd
from pathlib import Path

import torch
from  torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

from resnest.torch import resnest50

from pytorch_lightning import Trainer
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import torch.nn.functional as F
from sklearn.metrics import f1_score

import pickle, copy, re,time, datetime, random, warnings, gc
import albumentations
from albumentations.core.transforms_interface import DualTransform, BasicTransform
from albumentations import Compose

In [None]:
class config:
    """Central place for the notebook's tunable settings and dataset paths."""

    # --- optimisation ---
    lr = 1e-3
    weight_decay = 1e-8
    batch_size = 12
    EPOCHS = 4
    num_workers = 2

    # --- task / audio ---
    NUM_CLASSES = 24
    SR = 16_000
    DURATION = 60

    # --- runtime ---
    ROOT = "."
    # Use the first CUDA device when one is visible, otherwise stay on the CPU.
    DEVICE = "cuda:0" if torch.cuda.is_available() else 'cpu'

    # --- data locations (both audio folders live under the competition root) ---
    DATA_ROOT = Path("../input/rfcx-species-audio-detection")
    TRAIN_AUDIO_ROOT = DATA_ROOT / "train"
    TEST_AUDIO_ROOT = DATA_ROOT / "test"


In [None]:
class MelSpecComputer:
    """Callable that converts a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: sample rate handed to librosa.
        n_mels: number of mel bands.
        fmin: lowest frequency of the mel filter bank.
        fmax: highest frequency of the mel filter bank.
    """

    def __init__(self, sr, n_mels, fmin, fmax):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax

    def __call__(self, y):
        """Return the mel spectrogram of ``y`` in decibels as a float32 array."""
        # The signal is passed by keyword: librosa >= 0.10 made every argument
        # of melspectrogram keyword-only, while older releases accept ``y=`` too.
        melspec = lb.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax,
        )

        melspec = lb.power_to_db(melspec).astype(np.float32)
        return melspec


In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Turn a 2-D array into a 3-channel uint8 image.

    The input is repeated along a new last axis, standardized, and then
    min-max scaled to the ``[0, 255]`` range.

    Args:
        X: array to convert; it is stacked three times on a new last axis.
        eps: guard added to ``std`` and used as the minimum value range below
            which the image is treated as constant.
        mean: value (scalar or broadcastable array) subtracted before scaling;
            defaults to the mean of the stacked array when ``None``.
        std: value (scalar or broadcastable array) divided by before scaling;
            defaults to the standard deviation of the stacked array when ``None``.

    Returns:
        ``uint8`` array with the shape of ``X`` plus a trailing axis of size 3.
        All zeros when the standardized values span less than ``eps``.
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. ``is None`` (rather than ``or``) keeps an explicit 0 and
    # allows array-valued statistics, whose truth value is ambiguous.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # Constant (or NaN-containing) input: nothing meaningful to scale.
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def normalize(image, mean=None, std=None):
    """Scale an image from ``[0, 255]`` to ``[0, 1]`` and make it channel-first.

    Standardization with ``mean`` and ``std`` is applied only when both are
    given. Axis 2 of the input is moved to the front and the result is
    returned as float32.
    """
    scaled = image / 255.0
    if not (mean is None or std is None):
        scaled = (scaled - mean) / std
    channel_first = np.moveaxis(scaled, 2, 0)
    return channel_first.astype(np.float32)


def crop_or_pad(y, length, sr, is_train=True):
    """Force a 1-D signal to exactly ``length`` samples.

    Args:
        y: 1-D array of samples.
        length: desired number of samples.
        sr: accepted for call-site compatibility; not used here.
        is_train: when True a random window is cropped from longer signals,
            otherwise the crop always starts at sample 0.

    Returns:
        float32 array of ``length`` samples. Shorter inputs are zero-padded at
        the end, longer inputs are cropped, equal-length inputs are only cast.
    """
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length - len(y))])
    elif len(y) > length:
        if not is_train:
            start = 0
        else:
            # randint's upper bound is exclusive, so "+ 1" is needed for the
            # last valid window (start == len(y) - length) to be reachable.
            start = np.random.randint(len(y) - length + 1)

        y = y[start:start + length]

    y = y.astype(np.float32, copy=False)

    return y

In [None]:
class RFCXDataset(Dataset):
    """Dataset that serves recordings as mel-spectrogram images with targets.

    Each row of ``data`` must provide ``recording_id`` (file stem of a
    ``.flac`` file under ``AUDIO_ROOT``), ``species_id`` (index or list of
    indices set in the target vector) and ``duration``.

    Args:
        data: DataFrame with the columns described above.
        sr: sample rate used when loading audio and computing spectrograms.
        n_mels, fmin, fmax: mel filter-bank settings; ``fmax`` falls back to
            ``sr // 2`` when not given.
        AUDIO_ROOT: directory containing the ``.flac`` files (str or Path).
        is_train: forwarded to ``crop_or_pad`` to choose random vs. fixed crops.
        waveform_transforms: when truthy, waveform augmentations are obtained
            from ``get_wav_transforms()``.
        num_classes: length of the target vector.
        root: unused; kept so existing calls keep working.
        duration: clip length in seconds loaded for every item.
    """

    def __init__(self, data, sr, n_mels=128, fmin=0, fmax=None,  AUDIO_ROOT=".",is_train=False,waveform_transforms=None,
                 num_classes=24, root=None, duration=10):

        self.data = data

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.is_train = is_train
        self.num_classes = num_classes
        self.duration = duration
        # Sample counts must be integers even when ``duration`` is fractional.
        self.audio_length = int(self.duration*self.sr)

        # Accept plain strings too; read_index relies on Path.joinpath.
        self.root = Path(AUDIO_ROOT)

        # NOTE(review): get_wav_transforms is not defined in the visible part of
        # this notebook - confirm it exists before enabling waveform_transforms.
        self.wav_transfos = get_wav_transforms() if waveform_transforms else None

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax)


    def __len__(self):
        """Number of rows in the recordings table."""
        return len(self.data)

    def read_index(self, idx, fill_val=1.0, offset=None, use_offset=True):
        """Load the waveform and target vector for row ``idx``.

        Args:
            idx: positional row index into ``self.data``.
            fill_val: value written into the target at the row's species index.
            offset: start position in seconds; an explicit value (including 0)
                is always honoured.
            use_offset: when True and no ``offset`` is given, a random offset
                is drawn for recordings longer than ``duration - 1`` seconds.

        Returns:
            ``(y, t)``: float32 waveform of ``audio_length`` samples and a
            target vector of ``num_classes`` entries.

        Raises:
            ValueError: if loading or processing the recording fails; the
                original exception is chained as the cause.
        """
        d = self.data.iloc[idx]
        record, species = d["recording_id"], d["species_id"]
        try:
            # ``is None`` instead of ``offset or ...`` so that offset=0 is kept.
            if offset is None and use_offset and (self.duration < d["duration"]+1):
                offset = np.random.uniform(1, int(d["duration"]-self.duration))

            y, _ = lb.load(self.root.joinpath(record).with_suffix(".flac").as_posix(),
                           sr=self.sr, duration=self.duration,
                           # librosa documents a float here; fall back to the file start.
                           offset=0.0 if offset is None else offset)

            if self.wav_transfos is not None:
                # NOTE(review): the transform is called positionally and its
                # result is used as the waveform - confirm this matches the
                # object returned by get_wav_transforms().
                y = self.wav_transfos(y, self.sr)
            y = crop_or_pad(y, self.audio_length, sr=self.sr, is_train=self.is_train)
            t = np.zeros(self.num_classes)
            t[species] = fill_val
        except Exception as e:
            # Fail loudly: silently substituting silence would hide broken files.
            raise ValueError(f"Could not read recording {record!r}") from e

        return y,t

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row ``idx``.

        The waveform is converted to a dB mel spectrogram, expanded to three
        channels, scaled to ``[0, 1]`` and returned channel-first as float32.
        """
        y, t = self.read_index(idx)

        melspec = self.mel_spec_computer(y)
        image = mono_to_color(melspec)
        image = normalize(image, mean=None, std=None)

        return image, t

In [None]:
def get_duration(audio_name, root=config.TRAIN_AUDIO_ROOT):
    """Return the duration librosa reports for ``<root>/<audio_name>.flac``."""
    flac_path = root.joinpath(audio_name).with_suffix(".flac")
    return lb.get_duration(filename=flac_path)

In [None]:
# One row per training recording, identified by the stem of its .flac file.
data = pd.DataFrame(
    {"recording_id": [flac.stem for flac in Path(config.TRAIN_AUDIO_ROOT).glob("*.flac")]}
)
# Each recording gets its own (initially empty) species list.
data["species_id"] = [list() for _ in data.index]

print(data.shape)
# Query every file's duration; this opens each recording once.
data["duration"] = data["recording_id"].apply(get_duration)

In [None]:
# Preview the first rows of the recordings table.
data.head()

In [None]:
# Count how many recordings share each duration value.
data["duration"].value_counts()

In [None]:
# Dataset over the training recordings that loads 10-second clips.
ds = RFCXDataset(data=data, sr=config.SR, duration=10, AUDIO_ROOT=config.TRAIN_AUDIO_ROOT, is_train=True)

In [None]:
# Fetch the first item and display the shapes of its image and target arrays.
x, y = ds[0]
x.shape, y.shape

In [None]:
import matplotlib.pyplot as plt
# The dataset returns channel-first images; imshow needs the channel axis last.
plt.imshow(np.transpose(x,(1,2,0)))

In [None]:
# Show the target vector of the sample fetched from the dataset.
print(y)

## Audio Transform

In [None]:

class AudioTransform(BasicTransform):
    """Base class for audio augmentations built on albumentations.

    Subclasses implement ``apply``; the payload is passed under the ``data`` key.
    """

    @property
    def targets(self):
        # Route the "data" target to this transform's apply method.
        return {"data": self.apply}

    def update_params(self, params, **kwargs):
        # Copy the optional attributes into params when the instance has them.
        for attr_name in ("interpolation", "fill_value"):
            if hasattr(self, attr_name):
                params[attr_name] = getattr(self, attr_name)
        return params
    


In [None]:
class NoiseInjection(AudioTransform):
    """Add randomly scaled noise to a rescaled copy of the waveform."""
    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, noise_levels=(0, 0.05), **params):
        """Return ``(augmented_sound, sr)`` for ``data == (sound, sr)``.

        The input pair is returned untouched whenever rescaling the noise or
        the sound yields NaN or infinite values.
        """
        sound, sr = data
        level = np.random.uniform(*noise_levels)
        raw_noise = np.random.randn(len(sound))

        # Rescale the noise by its smallest and largest absolute values.
        noise_mag = np.abs(raw_noise)
        noise = (raw_noise - np.amin(noise_mag)) / np.amax(noise_mag)
        if not np.isfinite(noise).all():
            return sound, sr

        # Apply the same rescaling to the sound itself.
        sound_mag = np.abs(sound)
        rescaled = (sound - np.amin(sound_mag)) / np.amax(sound_mag)
        if not np.isfinite(rescaled).all():
            return sound, sr

        mixed = rescaled + level * noise
        # Cast back to the element type of the input samples.
        sample_type = type(sound[0])
        return mixed.astype(sample_type), sr

class PitchShift(AudioTransform):
    """Shift the pitch of the waveform by a random number of steps."""
    def __init__(self, always_apply=False, p=0.5):
        super(PitchShift, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``(pitch_shifted_sound, sr)`` for ``data == (sound, sr)``."""
        sound, sr = data

        # randint's upper bound is exclusive, so n_steps lies in [-10, 9].
        n_steps = np.random.randint(-10, 10)
        # librosa is imported as ``lb`` in this notebook. Keyword arguments are
        # used because recent librosa releases no longer accept sr/n_steps
        # positionally, and older releases accept the keywords as well.
        augmented_sound = lb.effects.pitch_shift(y=sound, sr=sr, n_steps=n_steps)

        return augmented_sound, sr

class RandomAudio(AudioTransform):
    """Force the waveform to a fixed length of ``seconds`` seconds.

    Shorter inputs are zero-padded on both sides; longer inputs keep only
    their leading samples.
    """
    def __init__(self,  seconds=5, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.seconds = seconds

    def apply(self, data, **params):
        """Return ``(fixed_length_sound, sr)`` for ``data == (sound, sr)``."""
        sound, sr = data
        target_len = int(sr * self.seconds)
        shortfall = target_len - len(sound)

        if shortfall > 0:
            # Split the padding between both ends; the extra sample (if any)
            # goes to the end.
            left = shortfall // 2
            return np.pad(sound, (left, shortfall - left), "constant"), sr

        return sound[:target_len], sr

In [None]:
class AddGaussianNoise(AudioTransform):
    """Add Gaussian noise whose scale depends on the waveform's maximum."""
    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``(noisy_sound, sr)``; the sound is returned as float64."""
        sound, sr = data
        # Noise scale: a random fraction (at most 0.5 %) of the largest sample.
        scale = 0.005*np.random.uniform()*np.amax(sound)
        jitter = np.random.normal(size=sound.shape[0])
        return np.array(sound).astype('float64') + scale * jitter, sr

In [None]:
# Waveform augmentation pipeline: Gaussian noise applied with probability 0.5.
audio_augs = Compose([AddGaussianNoise(p=0.5)])

## Training process

In [None]:
import torch
from torchvision.models import mobilenet_v2
from torch import nn

# Fall back to the CPU when no CUDA device is available instead of always
# pointing at "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class MobileNetV2(nn.Module):
    """MobileNetV2 feature extractor followed by a linear classification head.

    Args:
        pretrained: forwarded to torchvision's ``mobilenet_v2``.
        num_classes: number of outputs of the linear head.
    """

    def __init__(self, pretrained, num_classes):
        super().__init__()
        self.model = mobilenet_v2(pretrained=pretrained)
        # Linear head fed by the 1280 pooled feature channels.
        self.classifier = nn.Linear(1280, num_classes)

    def forward(self, x):
        """Return raw (un-activated) class scores for a 4-D input batch."""
        n_samples, _channels, _height, _width = x.shape
        feature_maps = self.model.features(x)
        # Global average pooling, then flatten to (batch, features).
        pooled = F.adaptive_avg_pool2d(feature_maps, 1)
        flat = pooled.reshape(n_samples, -1)
        return self.classifier(flat)

def get_model(pretrained=True, n_class=24, weight_path=""):
    """Build a ``MobileNetV2`` classifier and optionally load saved weights.

    Args:
        pretrained: forwarded to ``MobileNetV2``.
        n_class: number of output classes.
        weight_path: path to a file saved with ``torch.save``. It may hold
            either a bare state dict or a dict with a ``"state_dict"`` entry.
            Empty or ``None`` skips loading.

    Returns:
        The constructed model.
    """
    model = MobileNetV2(pretrained=pretrained, num_classes=n_class)

    if weight_path:
        # Load the file first, then pick the state dict out of it (the
        # original indexed the path string itself, which raises TypeError).
        # SECURITY: torch.load unpickles the file - only load trusted files.
        checkpoint = torch.load(weight_path, map_location="cpu")
        if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
            state_dict = checkpoint["state_dict"]
        else:
            state_dict = checkpoint
        # NOTE(review): keys must match MobileNetV2's parameter names; a
        # checkpoint saved from a wrapping module may carry a prefix - confirm.
        model.load_state_dict(state_dict)
    return model

In [None]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Mix every sample of a batch with a randomly chosen partner.

    For each sample ``i`` a partner index is drawn uniformly from the batch.
    When the partner is a different sample, the input becomes
    ``lam * x[i] + (1 - lam) * x[partner]`` and the target becomes
    ``y[i] + y[partner]``. A sample that draws itself is left unchanged.

    Args:
        x: batch tensor; modified in place.
        y: batch targets (tensor or array); modified in place.
        alpha: Beta-distribution parameter used to draw ``lam``;
            ``alpha <= 0`` fixes ``lam`` to 1.
        use_cuda: unused; kept for backward compatibility.

    Returns:
        ``(x, y)``: the same objects that were passed in, after mixing.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]

    # Read partners from untouched snapshots so a sample that was already
    # mixed earlier in the loop does not leak into later samples.
    x_src = x.clone()
    y_src = y.clone() if hasattr(y, "clone") else y.copy()

    for i in range(batch_size):
        index = np.random.randint(0, batch_size)
        if index == i:
            # Skip only this sample; returning here would leave the rest of
            # the batch unmixed.
            continue
        x[i] = lam * x_src[i] + (1 - lam) * x_src[index]
        y[i] = y_src[i] + y_src[index]
    return x, y

In [None]:
class BaseNet(LightningModule):
    """Shared Lightning scaffolding: data table, train loader and optimizer.

    Args:
        batch_size: batch size of the training DataLoader.
        lr: learning rate of the AdamW optimizer.
        weight_decay: weight decay of the AdamW optimizer.
        num_workers: worker processes of the training DataLoader.
        AUDIO_ROOT: directory scanned for ``.flac`` recordings (str or Path).
        epochs: ``T_max`` of the cosine learning-rate schedule.
        DEVICE: device string stored on the module for subclasses to use.
        SR: sample rate handed to the dataset.
    """
    def __init__(self, batch_size=32, lr=5e-4, weight_decay=1e-8, num_workers=0, 
                  AUDIO_ROOT=".",  epochs=1, DEVICE="cuda:0", SR=config.SR):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.DEVICE = DEVICE

        self.lr = lr
        self.epochs=epochs

        self.weight_decay = weight_decay
        self.loss_fn = torch.nn.BCEWithLogitsLoss()
        # Keep a Path so both globbing and the dataset's joinpath work even
        # when a plain string is passed.
        self.AUDIO_ROOT = Path(AUDIO_ROOT)
        self.data = pd.DataFrame({
            "recording_id": [path.stem for path in self.AUDIO_ROOT.glob("*.flac")],
        })
        self.data["species_id"] = [[] for _ in range(len(self.data))]
        # Durations must be read from the same directory the recordings were
        # listed from, not from get_duration's default root.
        audio_root = self.AUDIO_ROOT
        self.data["duration"] = self.data["recording_id"].apply(
            lambda recording_id: get_duration(recording_id, root=audio_root)
        )
        self.SR= SR

    def train_dataloader(self):
        """Return a shuffled DataLoader over 10-second training clips."""
        ds = RFCXDataset(data=self.data, sr=self.SR, duration=10,
                         AUDIO_ROOT=self.AUDIO_ROOT, is_train=True)
        train_loader= DataLoader(ds, batch_size=self.batch_size,
                                 shuffle=True, num_workers=self.num_workers,
                                 pin_memory=True)
        return train_loader

    # TODO: no val_dataloader is defined, so no validation metrics are produced.

    def configure_optimizers(self):
        """Return AdamW with a cosine-annealing schedule over ``self.epochs``."""
        # Use the learning rate given to this instance rather than the global
        # config value, so the ``lr`` constructor argument takes effect.
        optim = torch.optim.AdamW(self.parameters(),lr=self.lr, betas= (0.9,0.999), 
                                              weight_decay= self.weight_decay, amsgrad=False)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim,T_max=self.epochs, eta_min=1e-5)
        self.optimizer = optim
        self.scheduler = scheduler

        return [optim], [scheduler]

In [None]:
# Use the GPU only when one is actually available.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
class RFCXNet(BaseNet):
    """Lightning module that trains a MobileNetV2 classifier with 24 outputs."""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.net = get_model(pretrained=True, n_class=24)
    def forward(self, x):
        """Return the raw logits of the wrapped network."""
        return self.net(x)

    def training_step(self, batch, batch_idx):
        """Compute the BCE loss on the logits and log the batch F1 score."""
        x, y = [b.to(self.DEVICE) for b in batch]
        logits = self(x)
        # BCEWithLogitsLoss applies the sigmoid itself, so it must receive the
        # raw logits; feeding probabilities would apply the sigmoid twice.
        loss = self.loss_fn(logits.float(), y.float())
        with torch.no_grad():
            preds = torch.sigmoid(logits).cpu() > 0.5
            # sklearn expects (y_true, y_pred) in that order.
            f1 = f1_score(y.cpu(), preds, average='samples')
        self.log("f1_loss", f1, prog_bar=True)
        return loss
    
#     @torch.no_grad()
#     def validation_step(self, batch, batch_idx):
#         x, y =  [b.to(self.DEVICE) for b in batch]
#         preds = torch.sigmoid(self(x))
#         valid_loss = self.loss_fn( preds.float(), y.float()) 
#         self.log("val_loss", valid_loss, on_epoch=True, on_step=True)
#         return valid_loss
    
#     def validation_epoch_end(self, outputs) -> None:
#         torch.stack([x['val_loss'] for x in outputs]).mean()

In [None]:
# Pass the epoch count too: it sets T_max of the cosine LR schedule, which
# otherwise falls back to the constructor default of 1.
model = RFCXNet(
    batch_size=config.batch_size,
    lr=config.lr,
    AUDIO_ROOT=config.TRAIN_AUDIO_ROOT,
    weight_decay=config.weight_decay,
    num_workers=config.num_workers,
    epochs=config.EPOCHS,
    DEVICE=config.DEVICE,
)

# No validation loop is defined in this notebook, so 'val_loss' is never
# logged. Monitor the metric the training step does log ("f1_loss", an F1
# score, hence mode='max').
# NOTE(review): this is a per-batch training metric; switch back to a
# validation metric once a validation loader exists.
checkpoint_callback = ModelCheckpoint(
    filepath=config.ROOT,
    save_top_k=5,
    verbose=0,
    monitor='f1_loss',
    mode='max',
    prefix='hubmap_',
)


# TensorBoard logger writing under <config.ROOT>/lightning_logs.
logger = TensorBoardLogger(
    save_dir=config.ROOT,
    name='lightning_logs'
)

# Print the module structure as a quick sanity check.
print(model)

In [None]:
# Gather the Trainer options in one place so they are easy to scan and tweak.
trainer_options = dict(
    max_epochs=config.EPOCHS,
    gradient_clip_val=1,
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    limit_val_batches=1,
    # 1 GPU when CUDA is available, otherwise 0 (CPU training).
    gpus=int(torch.cuda.is_available()),
)
trainer = Trainer(**trainer_options)

trainer.fit(model)