In [None]:
!pip install ../input/torchlibrosa/torchlibrosa-0.0.5-py3-none-any.whl > /dev/null

In [None]:
import os
import ast
import numpy as np
import warnings
from sklearn import metrics
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import glob
from sklearn.model_selection import StratifiedKFold, GroupKFold
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from sklearn.preprocessing import MultiLabelBinarizer
from fastai.vision.all import *
from typing import Optional,Tuple,List
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
from timm import create_model
#new
import audioread
import librosa
import soundfile as sf
from albumentations.core.transforms_interface import ImageOnlyTransform
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation

import albumentations as A
import albumentations.pytorch.transforms as T
import matplotlib.pyplot as plt

In [None]:
class AudioParams:
    """Notebook-wide configuration, read directly as class attributes."""

    # --- waveform ---
    sr = 32000
    duration = 5
    n_samples = duration * sr   # length of one clip in samples

    # --- mel-spectrogram ---
    n_fft = 2048
    hop_length = 512
    n_mels = 224
    fmin = 20
    fmax = 16000
    window = 'hann'

    # --- run state / training ---
    mode = 'train'   # key used to pick an entry of `albu_transforms`; rewritten by `train_val_clb`
    bs = 16


# The class object itself (no instance) serves as the config handle.
params = AudioParams
# Separate module-level switch consulted by `cut_if_necessary`.
MODE = 'train'

# Loading training data

In [None]:
# Species codes used to filter secondary labels, build targets and mask the metrics.
scored_classes = [
    "akiapo", "aniani", "apapan", "barpet", "crehon", "elepai", "ercfra",
    "hawama", "hawcre", "hawgoo", "hawhaw", "hawpet1", "houfin", "iiwi",
    "jabwar", "maupar", "omao", "puaioh", "skylar", "warwhe1", "yefcan",
]

In [None]:
# Read the competition metadata and derive, per recording:
#   new_target      primary label plus those secondary labels found in `scored_classes`
#   len_new_target  number of labels in `new_target`
#   full_path       location of the pre-computed .npy image for the recording
train = pd.read_csv('../input/birdclef-2022/train_metadata.csv')
train['new_target'] = (
    train['primary_label'].map(lambda x: [x])
    + train['secondary_labels'].map(
        # The column is parsed as a Python literal (a list of label strings is
        # what the comprehension expects). `ast.literal_eval` only accepts
        # literals, so text coming from the CSV can never execute code the way
        # the previous `eval` call could.
        lambda x: [i for i in ast.literal_eval(x) if i in scored_classes]
    )
)
train['len_new_target'] = train['new_target'].map(len)
train['full_path'] = train.filename.map(lambda x: '../input/birdclef2022-audio-image-dataset/' + str(x) + '.npy')
train.head()

In [None]:
#train = train[train.primary_label.isin(scored_classes)]
#train.reset_index(inplace=True)

In [None]:
# Tag every row of `train` with a fold number (0-4), stratified on the primary label.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = Fold.split(train, train['primary_label'])
for fold_id, (_trn_rows, val_rows) in enumerate(fold_splits):
    # only the validation side of each split is recorded
    train.loc[val_rows, 'kfold'] = int(fold_id)
# the column is filled piecewise above; cast it to int once complete
train['kfold'] = train['kfold'].astype(int)

# Creating dataset

In [None]:
class get_audio_sample_path(Transform):
    """Pipeline step that pulls the `full_path` attribute out of an item."""
    def encodes(self, x):
        path = x.full_path
        return path
    
class get_audio_sample_label(Transform):
    """Pipeline step that keeps only the scored labels of an item.

    Parameters
    ----------
    scored_classes : collection of label names to keep; defaults to the
        module-level `scored_classes`.
    """
    def __init__(self,scored_classes=scored_classes):
        self.scored_classes = scored_classes

    def encodes(self,x):
        # A fresh list is built for every item. Previously the result was
        # appended to a list created once in __init__, so each call also
        # returned the labels of every item processed before it.
        return [_class for _class in x.new_target if _class in self.scored_classes]

class load_signal(Transform):
    """Pipeline step that reads an audio file with `torchaudio.load`."""
    def __init__(self, device='cpu'):
        self.device = device

    def encodes(self, x:str):
        waveform, sample_rate = torchaudio.load(x)
        # the device name is only passed along; the waveform is not moved here
        return [waveform, sample_rate, self.device]

class resample_if_necessary(Transform):
    """Pipeline step that resamples a signal whose rate differs from the target rate."""
    def __init__(self, target_sample_rate):
        self.target_sample_rate = target_sample_rate

    def encodes(self, x):
        signal, sr, device = x
        if sr == self.target_sample_rate:
            # already at the requested rate: pass through untouched
            return [signal, device]
        to_target_rate = torchaudio.transforms.Resample(sr, self.target_sample_rate)
        return [to_target_rate(signal), device]

class mix_down_if_necessary(Transform):
    """Pipeline step that averages over the first axis when it holds more than one entry."""
    def encodes(self, x):
        signal, device = x
        if signal.shape[0] <= 1:
            return [signal, device]
        # keepdim=True preserves the leading axis with size 1
        averaged = torch.mean(signal, dim=0, keepdim=True)
        return [averaged, device]

class cut_if_necessary(Transform):
    """Pipeline step that trims a signal to at most `num_samples` along axis 1.

    When the module-level `MODE` is 'train' the kept window starts at a random
    offset; otherwise the beginning of the signal is kept.
    """
    def __init__(self,num_samples=params.n_samples):
        self.num_samples = num_samples

    def encodes(self, x):
        signal, device = x
        total = signal.shape[1]
        if total > self.num_samples:
            if MODE == 'train':
                # randint is inclusive on both ends, so the last valid start is reachable
                first = random.randint(0, total - self.num_samples)
            else:
                first = 0
            signal = signal[:, first:first + self.num_samples]
        # the target length is forwarded for the padding step
        return [signal, self.num_samples, device]

class right_pad_if_necessary(Transform):
    """Pipeline step that zero-pads the end of the last axis up to `num_samples`."""
    def encodes(self, x):
        signal, num_samples, device = x
        shortfall = num_samples - signal.shape[1]
        if shortfall > 0:
            # (0, shortfall): nothing added before, `shortfall` values added after
            signal = torch.nn.functional.pad(signal, (0, shortfall))
        # from here on only the signal is passed down the pipeline
        return signal

class mel_spec(Transform):
    """Pipeline step: torchaudio `MelSpectrogram` followed by `AmplitudeToDB`.

    Defaults come from `params`. The `window` argument is accepted but is not
    forwarded to torchaudio.
    """
    def __init__(self,sample_rate=params.sr,
                      n_fft = params.n_fft,
                      hop_length=params.hop_length,
                      n_mels=params.n_mels,
                      fmin = params.fmin,
                      fmax = params.fmax,
                      window = params.window):
        mel_kwargs = dict(
            sample_rate=sample_rate,
            n_fft=n_fft,
            f_min=fmin,
            f_max=fmax,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(**mel_kwargs)
        self.db = torchaudio.transforms.AmplitudeToDB()

    def encodes(self, x):
        return self.db(self.mel_spectrogram(x))

class mono_to_color(Transform):
    """Pipeline step that turns a spectrogram into a 3-channel image scaled to [0, 255].

    Parameters
    ----------
    mean, std : optional fixed statistics used for standardisation. When left
        as None the statistics of the input itself are used.
    eps : added to `std` before dividing, and used as the threshold below
        which the value range counts as constant.
    """
    def __init__(self,mean=None, std=None,  eps=1e-6):
        self.mean = mean
        self.std = std
        self.eps = eps

    def encodes(self,x):
        # assumes x is 3-D with a leading axis of size 1, so the repeat yields 3 channels -- TODO confirm
        X = x.repeat([3,1,1])
        # `is None` (not `or`) so that an explicit 0.0 is honoured instead of
        # silently falling back to the input statistics.
        mean = X.mean() if self.mean is None else self.mean
        std = X.std() if self.std is None else self.std
        X = (X - mean) / (std + self.eps)
        _min, _max = X.min(), X.max()
        if (_max - _min) > self.eps:
            # min-max scale to [0, 255]; the values already lie inside
            # [_min, _max], so no clipping is needed
            V = 255 * (X - _min) / (_max - _min)
        else:
            # (near-)constant input: return an all-zero image. The previous
            # `dtype=torch.FloatTensor` raised a TypeError here, because
            # `zeros_like` expects a torch.dtype, not a tensor type.
            V = torch.zeros_like(X)
        V = V.type(torch.FloatTensor)
        # swap the two trailing axes
        V = V.transpose(1,2)
        return V
    
class ohe(Transform):
    """Pipeline step that multi-hot encodes a list of labels.

    A `MultiLabelBinarizer` is fitted on `targets`; its classes are exposed as `vocab`.
    """
    def __init__(self,targets):
        binarizer = MultiLabelBinarizer()
        binarizer.fit(targets.tolist())
        self.mlb = binarizer
        self.vocab = binarizer.classes_

    def encodes(self, x):
        # transform works on a batch, so wrap the single label list and flatten the result
        encoded = self.mlb.transform([x])
        return encoded.reshape(-1).astype(np.float32)

class get_sound_images(Transform):
    """Pipeline step that loads a saved .npy image and applies the augmentations for the current mode."""
    def encodes(self,x):
        spec_image = np.load(x) # (224, 313, 3)
        # which pipeline runs is decided at call time through `params.mode`
        augment = albu_transforms[params.mode]
        spec_image = augment(image=spec_image)['image']
        # .T reverses the axis order
        return spec_image.T

In [None]:
# Per-channel statistics handed to A.Normalize.
mean = (0.485, 0.456, 0.406) # RGB
std = (0.229, 0.224, 0.225) # RGB

# Albumentations pipelines, looked up by `params.mode` in `get_sound_images`.
albu_transforms = dict(
    # training: random flip, one of two occlusion augmentations half of the time, then normalise
    train=A.Compose([
        A.HorizontalFlip(p=0.5),
        A.OneOf(
            [
                A.Cutout(max_h_size=5, max_w_size=16),
                A.CoarseDropout(max_holes=4),
            ],
            p=0.5,
        ),
        A.Normalize(mean, std),
    ]),
    # validation: normalisation only
    valid=A.Compose([
        A.Normalize(mean, std),
    ]),
)

In [None]:
# Use CUDA when torch reports it as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

In [None]:
val_fold = 0
def get_dls(val_fold=0,bs=64):
    """Build fastai DataLoaders with fold `val_fold` held out for validation.

    Parameters
    ----------
    val_fold : value of `train.kfold` that forms the validation split.
    bs : batch size passed to `dataloaders`.
    """
    in_valid = train.kfold == val_fold
    # the index labels of `train` are handed to Datasets as the split definition
    splits = [train[~in_valid].index.tolist(), train[in_valid].index.tolist()]

    # Alternative input pipeline starting from raw audio (kept for reference, disabled):
    #x_tfms = [get_audio_sample_path,load_signal(device),resample_if_necessary(32000),
    #          mix_down_if_necessary,cut_if_necessary(),right_pad_if_necessary,
    #          mel_spec,mono_to_color]
    image_pipeline = [get_audio_sample_path,get_sound_images,ToTensor]
    label_pipeline = [get_audio_sample_label,ohe(train['new_target']),ToTensor]

    datasets = Datasets(items = train ,tfms=[image_pipeline, label_pipeline],splits=splits)
    return datasets.dataloaders(bs=bs)

In [None]:
# Smoke test: build loaders, pull a single batch and display the shapes of inputs and targets.
xb, yb = get_dls(val_fold=0,bs=64).one_batch()
xb.shape, yb.shape

In [None]:
# Loaders used by the rest of the notebook: fold 0 held out, batch size from `params.bs`.
dls = get_dls(val_fold=0,bs=params.bs)

# Creating the model

In [None]:
def interpolate(x: torch.Tensor, ratio: int):
    """Repeat every step along axis 1 `ratio` times.

    `x` must be 3-D, (batch_size, time_steps, classes_num); the result has
    shape (batch_size, time_steps * ratio, classes_num).
    """
    n_batch, n_steps, n_classes = x.shape
    # add a repeat axis right after the time axis, tile it, then fold it back into time
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.reshape(n_batch, n_steps * ratio, n_classes)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Resize axis 1 of a 3-D tensor to `frames_num` with bilinear interpolation.

    The size of the last axis is kept; `align_corners=True` is used.
    """
    n_classes = framewise_output.size(2)
    # F.interpolate in bilinear mode wants a 4-D input, so add a channel axis temporarily
    as_image = framewise_output.unsqueeze(1)
    resized = F.interpolate(
        as_image,
        size=(frames_num, n_classes),
        mode="bilinear",
        align_corners=True,
    )
    return resized.squeeze(1)

In [None]:
def init_layer(layer):
    """Xavier-uniform initialise `layer.weight` and zero its bias when one exists."""
    nn.init.xavier_uniform_(layer.weight)

    # covers both "no bias attribute" and "bias is None"
    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a normalisation layer in place: weight filled with 1, bias filled with 0."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.)

class AttBlockV2(nn.Module):
    """Attention pooling head built from two parallel 1x1 `Conv1d` layers.

    `att` produces attention scores and `cla` produces per-step class scores;
    `forward` pools the class scores over the last axis using the attention
    weights.

    Parameters
    ----------
    in_features : number of input channels.
    out_features : number of output channels (classes).
    activation : 'linear' (identity) or 'sigmoid', applied to the `cla` output.
    """
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Return `(pooled, norm_att, cla)` for an input of shape (n_samples, n_in, n_time)."""
        # x: (n_samples, n_in, n_time)
        # tanh bounds the scores before the softmax over the last axis
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        # attention-weighted sum over the last axis
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation.

        Raises
        ------
        ValueError
            If `self.activation` is neither 'linear' nor 'sigmoid'. Previously
            this case returned None, which only surfaced later as an opaque
            TypeError inside `forward`.
        """
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            f"Unsupported activation {self.activation!r}; expected 'linear' or 'sigmoid'")

In [None]:
import random
class building_model(Module):
    """Sound-event-detection style network: timm backbone followed by an attention head.

    Parameters
    ----------
    num_classes : number of output classes.
    arch : timm architecture name passed to `create_model`.
    pretrained : forwarded to `create_model`.

    `forward` returns a dict with the keys 'framewise_output', 'clipwise_output',
    'logit' and 'framewise_logit'.

    NOTE(review): `__init__` never calls `super().__init__()`; presumably the
    `Module` base class from the fastai star import takes care of that -- confirm.
    """
    def __init__(self,num_classes:int,arch:str ='efficientnet_b0', pretrained:bool=False):
        self.spec_augmenter = SpecAugmentation(time_drop_width=64//2, time_stripes_num=2,
                                                   freq_drop_width=8//2, freq_stripes_num=2)
        self.bn0 = nn.BatchNorm2d(params.n_mels)
        self.base_model = create_model(arch, pretrained=pretrained)
        # `linear` and `fc2` are created but not used in `forward`; they are kept so that
        # parameter names and initialisation order stay unchanged.
        self.linear = nn.Linear(self.base_model.get_classifier().out_features, num_classes)
        # the backbone without its last two child modules is used as the encoder
        self.layers = list(self.base_model.children())[:-2]
        self.encoder = nn.Sequential(*self.layers)
        self.in_features = self.base_model.classifier.in_features
        self.fc1 = nn.Linear(self.in_features, self.in_features, bias=True)
        self.fc2 = nn.Linear(num_classes,num_classes, bias=False)

        self.drop_1 = nn.Dropout(0.5)
        self.drop_2 = nn.Dropout(0.5)
        self.att_block = AttBlockV2(self.in_features, num_classes, activation="sigmoid")
        self.init_weight()
        # custom flag (separate from nn.Module's train/eval state) that gates SpecAugment
        self.mode = 'train'

    def init_weight(self):
        init_bn(self.bn0)
        init_layer(self.fc1)

    def change_mode(self,mode='train'):
        """Set the flag that decides whether SpecAugment may run in `forward`."""
        self.mode = mode

    def forward(self,input_data):
        frames_num = input_data.shape[2]
        x = input_data # (batch_size, 3, time_steps, mel_bins)
        # move the last axis into the channel position so bn0 normalises over it, then move it back
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.mode == 'train':
            # SpecAugment is applied to roughly 75% of the training batches
            if random.random() < 0.75:
                x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        x = self.encoder(x)
        # collapse the last axis of the feature map, leaving a 3-D tensor
        x = torch.mean(x, dim=3)
        # length-preserving smoothing: max-pool plus average-pool
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        x = self.drop_1(x)
        # fc1 acts on the channel axis, which therefore has to be last
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = self.drop_2(x)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Pre-activation class scores. The attention block only returns the activated
        # scores, so `cla` is applied once more here; the result is reused for both
        # logit outputs instead of evaluating the same layer on the same input twice.
        segment_logit = self.att_block.cla(x)
        logit = torch.sum(norm_att * segment_logit, dim=2)
        segmentwise_logit = segment_logit.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            'framewise_output': framewise_output,
            'clipwise_output': clipwise_output,
            'logit': logit,
            'framewise_logit': framewise_logit,
        }

        return output_dict

# Loss

In [None]:
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/213075
class BCEFocalLoss(nn.Module):
    """Focal variant of binary cross-entropy computed from logits.

    `alpha` scales the positive-target term only; `gamma` is the focusing
    exponent applied to both terms. The result is the mean over all elements.
    """
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, preds, targets):
        per_element_bce = nn.BCEWithLogitsLoss(reduction='none')(preds, targets)
        probas = torch.sigmoid(preds)
        # positives: down-weighted when the predicted probability is already high
        positive_term = targets * self.alpha * (1. - probas)**self.gamma * per_element_bce
        # negatives: down-weighted when the predicted probability is already low
        negative_term = (1. - targets) * probas**self.gamma * per_element_bce
        return (positive_term + negative_term).mean()


class BCEFocal2WayLoss(nn.Module):
    """Weighted sum of two focal losses over a model output dict.

    The first term uses `input["logit"]`; the second uses the maximum of
    `input["framewise_logit"]` over axis 1. `weights[0]` and `weights[1]` scale
    the two terms. `class_weights` is accepted but not used.
    """
    def __init__(self, weights=[1, 1], class_weights=None):
        super().__init__()

        self.focal = BCEFocalLoss()

        self.weights = weights

    def forward(self, input, target):
        target = target.float()
        clip_logit = input["logit"]
        # strongest frame-level score per class
        frame_max_logit, _ = input["framewise_logit"].max(dim=1)

        main_term = self.focal(clip_logit, target)
        aux_term = self.focal(frame_max_logit, target)
        return self.weights[0] * main_term + self.weights[1] * aux_term

In [None]:
def loss_fn(logits, targets):
    """Loss used by the Learner: a default `BCEFocal2WayLoss` applied to the model output dict."""
    criterion = BCEFocal2WayLoss()
    return criterion(logits, targets)

# Metric

In [None]:
# Positions within `dls.vocab` whose entry is one of the scored classes.
label_mask = [idx for idx, name in enumerate(dls.vocab) if name in scored_classes]

In [None]:
# Display the collected positions.
label_mask

In [None]:
def MetricMeter(y_pred, y_true):
    """Micro-averaged F1 on the `label_mask` columns, thresholding `clipwise_output` at 0.3."""
    with torch.no_grad():
        truth = y_true.cpu().detach().numpy()[:, label_mask]
        scores = y_pred["clipwise_output"].cpu().detach().numpy()[:, label_mask]
        # strictly greater than 0.3 counts as a positive prediction
        decisions = np.where(scores > 0.3, 1.0, 0.0)
        return metrics.f1_score(truth, decisions, average="micro")

In [None]:
import numpy as np
import sklearn.metrics

def comp_metric(y_pred, y_true, epsilon=1e-9):
    """Average of two per-label confusion-matrix ratios on the `label_mask` columns.

    `clipwise_output` is thresholded at 0.3. For every label the 2x2 confusion
    matrix `cm` contributes `cm[1, 1] / cm[:, 1].sum()` and
    `cm[0, 0] / cm[:, 0].sum()` (with `epsilon` added to each denominator).
    Both ratios are averaged over labels, and the mean of the two averages is
    returned rounded to 8 decimals.
    """
    with torch.no_grad():
        truth = y_true.cpu().numpy()[:, label_mask]
        scores = y_pred["clipwise_output"].cpu().numpy()[:, label_mask]
        decisions = np.where(scores > 0.3, 1.0, 0.0)

    # one 2x2 confusion matrix per label, stacked along axis 0
    cms = sklearn.metrics.multilabel_confusion_matrix(truth, decisions)

    # column sums of every matrix: [:, 0] for column 0, [:, 1] for column 1
    column_totals = cms.sum(axis=1)
    tp_scores = cms[:, 1, 1] / (epsilon + column_totals[:, 1])
    tn_scores = cms[:, 0, 0] / (epsilon + column_totals[:, 0])

    return round((tp_scores.mean() + tn_scores.mean()) / 2, 8)


In [None]:
# Keep the loaders' label vocabulary under a second name.
target_cols = dls.vocab

In [None]:
# Display the label vocabulary.
target_cols

In [None]:
class train_val_clb(Callback):
    """Callback that switches the model flag and `params.mode` to 'valid' around validation."""

    def _set_mode(self, mode):
        # both the model flag and the shared config are switched together
        self.learn.model.change_mode(mode)
        params.mode = mode

    def before_validate(self):
        self._set_mode('valid')

    def after_validate(self):
        self._set_mode('train')

In [None]:
# Build the network with one output per vocabulary entry; `pretrained=False` is passed through.
model = building_model(num_classes=dls.vocab.shape[0],pretrained=False)
#model = resnet18(num_classes=152)
# The Learner combines the loaders, the model, `loss_fn` and `comp_metric`;
# the callback is handed over as a class, not an instance.
learn = Learner(dls,model,loss_func=loss_fn,metrics=comp_metric,cbs=[train_val_clb])
#learn.summary()

In [None]:
#learn.lr_find()

In [None]:
# Train with fit_one_cycle: 20 epochs, lr_max=1e-3, plus a MixUp callback constructed with 0.7.
learn.fit_one_cycle(20,lr_max=1e-3,cbs=MixUp(0.7))

In [None]:
# Export the Learner under the file name 'model.pkl'.
learn.export(fname='model.pkl')

# References

1. https://www.kaggle.com/code/kaerunantoka/birdclef2022-audio-to-numpy-1-4/notebook

2. https://www.kaggle.com/code/kaerunantoka/birdclef2022-n001-training/notebook