In [None]:
import os
import ast
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import glob
from sklearn.model_selection import StratifiedKFold, GroupKFold
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from sklearn.preprocessing import MultiLabelBinarizer
from fastai.vision.all import *
from typing import Optional,Tuple,List
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
from timm import create_model

# Loading training data

In [None]:
# Load the competition metadata: one row per training recording.
train = pd.read_csv('../input/birdclef-2022/train_metadata.csv')

# `secondary_labels` is read from the CSV as text and parsed into a Python list.
# ast.literal_eval only accepts literals, so a malformed or malicious cell
# cannot execute code the way a bare eval() would.
train['new_target'] = (
    train['primary_label'].map(lambda label: [label])
    + train['secondary_labels'].map(ast.literal_eval)
)

# Number of labels attached to each recording (primary + secondary).
train['len_new_target'] = train['new_target'].map(len)

# Location of each audio file, built from the `filename` column.
train['full_path'] = train.filename.map(lambda x: '../input/birdclef-2022/train_audio/' + str(x))

In [None]:
# Tag every row with the index of the fold in which it is held out.
# Folds are stratified on `primary_label`; the split is shuffled with a fixed seed.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_iter = Fold.split(train, train['primary_label'])
for n, (trn_index, val_index) in enumerate(fold_iter):
    # Only the held-out indices are needed; they receive this fold's number.
    train.loc[val_index, 'kfold'] = int(n)

# Cast the freshly filled column to an integer dtype.
train['kfold'] = train['kfold'].astype(int)

# Creating dataset

In [None]:
# Audio / training configuration.
SAMPLE_RATE = 32_000   # sample rate used for the clip length and the mel transform
NUM_SAMPLES = 22_050   # NOTE(review): not referenced by any cell visible in this notebook
N_SEC = 5              # clip length in seconds; clips are cut/padded to SAMPLE_RATE * N_SEC samples
VAL_FOLD = 0           # NOTE(review): not referenced by any cell visible in this notebook

In [None]:
class get_audio_sample_path(Transform):
    """Transform that returns the `full_path` attribute of the item it is given."""

    def encodes(self, x):
        audio_path = x.full_path
        return audio_path
    
class get_audio_sample_label(Transform):
    """Transform that returns the `new_target` attribute of the item it is given."""

    def encodes(self, x):
        labels = x.new_target
        return labels

class load_signal(Transform):
    """Load an audio file with torchaudio.

    Returns `[signal, sample_rate, device]`. The device string is only carried
    along; none of the transforms visible in this notebook move tensors to it.

    NOTE(review): the bundle is a list, as in the original. Presumably this is
    deliberate, since fastai Transforms handle tuples element-wise; confirm
    before changing the container type.
    """

    def __init__(self, device='cpu'):
        self.device = device

    def encodes(self, x:str):
        waveform, sample_rate = torchaudio.load(x)
        return [waveform, sample_rate, self.device]

class resample_if_necessary(Transform):
    """Resample a loaded signal to `target_sample_rate` when its rate differs.

    Takes the `[signal, sample_rate, device]` list and returns `[signal, device]`;
    the sample rate is dropped from the bundle after this step.
    """

    def __init__(self, target_sample_rate):
        self.target_sample_rate = target_sample_rate

    def encodes(self, x):
        signal, sr, device = x
        if sr != self.target_sample_rate:
            # A resampler is built per call because the source rate can vary per file.
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return [signal, device]

class mix_down_if_necessary(Transform):
    """Average over the first dimension when it has more than one entry.

    Takes and returns a `[signal, device]` list; `keepdim=True` keeps the
    averaged dimension with size 1.
    """

    def encodes(self, x):
        signal, device = x
        n_channels = signal.shape[0]
        if n_channels > 1:
            signal = signal.mean(dim=0, keepdim=True)
        return [signal, device]

class cut_if_necessary(Transform):
    """Truncate the signal to at most `num_samples` along dimension 1.

    Takes `[signal, device]` and returns `[signal, num_samples, device]`, so the
    target length is available to the next transform in the pipeline.
    """

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def encodes(self, x):
        signal, device = x
        limit = self.num_samples
        too_long = signal.shape[1] > limit
        clipped = signal[:, :limit] if too_long else signal
        return [clipped, limit, device]

class right_pad_if_necessary(Transform):
    """Pad the end of the signal's last dimension so it reaches `num_samples`.

    Takes `[signal, num_samples, device]` and returns only the signal tensor.
    """

    def encodes(self, x):
        signal, num_samples, device = x
        shortfall = num_samples - signal.shape[1]
        if shortfall > 0:
            # (0, shortfall): nothing added on the left, `shortfall` values on the right.
            signal = torch.nn.functional.pad(signal, (0, shortfall))
        return signal

class mel_spec(Transform):
    """Apply a torchaudio MelSpectrogram and tile the result three times.

    The MelSpectrogram module is built once in `__init__` from the given
    parameters and reused for every item.
    """

    def __init__(self, sample_rate=32000, n_fft=1024, hop_length=512, n_mels=64):
        self.sample_rate, self.n_fft = sample_rate, n_fft
        self.hop_length, self.n_mels = hop_length, n_mels
        mel_kwargs = dict(
            sample_rate=self.sample_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
        )
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(**mel_kwargs)

    def encodes(self, x):
        spectrogram = self.mel_spectrogram(x)
        # Repeat 3x along the leading dimension (presumably to mimic a 3-channel image; confirm).
        return spectrogram.repeat(3, 1, 1)

class ohe(Transform):
    """Multi-hot encode a list of labels with a MultiLabelBinarizer.

    The binarizer is fitted once on `targets.tolist()` at construction time.
    """

    def __init__(self, targets):
        # fit() returns the fitted binarizer itself.
        self.mlb = MultiLabelBinarizer().fit(targets.tolist())

    def encodes(self, x):
        # transform() expects a collection of samples, so wrap the single label list.
        indicator = self.mlb.transform([x])
        return tensor(indicator).view(-1)

In [None]:
def get_dls(val_fold):
    """Unfinished draft: computes the train/validation index lists and returns None.

    NOTE(review): the two-argument `get_dls` defined further down this notebook
    rebinds the name, so this version is never the one that gets called.
    It is a candidate for removal.
    """
    in_val = train.kfold == val_fold
    splits = [train[~in_val].index.tolist(), train[in_val].index.tolist()]
    

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

In [None]:
val_fold = 0
def get_dls(val_fold=0,bs=64):
    """Build the fastai DataLoaders for one cross-validation fold.

    Args:
        val_fold: value of `train.kfold` whose rows form the validation split;
            every other row goes to the training split.
        bs: batch size passed to `dataloaders`.

    Returns:
        The DataLoaders created from a `Datasets` over the `train` frame.
    """
    in_val = train.kfold == val_fold
    splits = [train[~in_val].index.tolist(), train[in_val].index.tolist()]

    # Input pipeline: path -> waveform -> resample -> mono -> fixed length -> mel.
    # The resample target uses SAMPLE_RATE (it was a literal 32000) so it cannot drift
    # away from the clip length and mel settings, which are derived from the same constant.
    x_tfms = [
        get_audio_sample_path,
        load_signal(device),
        resample_if_necessary(SAMPLE_RATE),
        mix_down_if_necessary,
        cut_if_necessary(SAMPLE_RATE * N_SEC),
        right_pad_if_necessary,
        mel_spec(SAMPLE_RATE),
    ]

    # Target pipeline: label list -> multi-hot vector (binarizer fitted on all rows).
    y_tfms = [get_audio_sample_label, ohe(train['new_target'])]

    dsets = Datasets(items=train, tfms=[x_tfms, y_tfms], splits=splits)
    return dsets.dataloaders(bs=bs)

In [None]:
# Sanity check: draw one batch from fold 0 and display the input / target shapes.
xb, yb = get_dls(bs=64, val_fold=0).one_batch()
(xb.shape, yb.shape)

In [None]:
class building_model(Module):
    """timm backbone followed by a linear classification head.

    Args:
        num_classes: size of the output layer.
        arch: timm architecture name passed to `create_model`.
        pretrained: whether timm should load pretrained backbone weights.

    The backbone is created with `num_classes=0`, so it returns pooled features
    rather than the logits of its own classifier. The head is then sized from
    `num_features`. Previously the head was stacked on top of the backbone's
    built-in classifier output, which added a redundant linear layer.
    """

    def __init__(self,num_classes:int,arch:str ='efficientnet_b0', pretrained:bool=True):
        self.model = create_model(arch, pretrained=pretrained, num_classes=0)
        self.linear = nn.Linear(self.model.num_features, num_classes)

    def forward(self,x):
        features = self.model(x)
        return self.linear(features)

# Metric

In [None]:
import numpy as np
import sklearn.metrics

def comp_metric(y_pred, y_true, epsilon=1e-9):
    """Average of two per-label confusion-matrix ratios, macro-averaged over labels.

    Args:
        y_pred: raw model outputs (logits); a sigmoid and a 0.5 threshold are applied.
        y_true: binary indicator targets with the same layout as `y_pred`.
        epsilon: added to each denominator to avoid division by zero.

    Returns:
        `(tp_mean + tn_mean) / 2`, rounded to 8 decimals.

    NOTE(review): the denominators are column sums of each 2x2 matrix. In
    scikit-learn's layout, columns index the predicted class, so these ratios
    are taken over predicted positives / predicted negatives. Confirm this is
    the intended definition rather than a row-based (per true class) ratio.
    """
    with torch.no_grad():
        y_true = y_true.cpu().numpy()
        # torch.sigmoid replaces the deprecated nn.functional.sigmoid.
        y_pred = torch.sigmoid(y_pred).cpu().numpy()
        y_pred = np.where(y_pred > 0.5, 1, 0)

    # One 2x2 confusion matrix per label, stacked as (n_labels, 2, 2).
    mlbl_cms = sklearn.metrics.multilabel_confusion_matrix(y_true, y_pred)

    # Same arithmetic as a per-label loop, expressed on the stacked array.
    tp_scores = mlbl_cms[:, 1, 1] / (epsilon + mlbl_cms[:, :, 1].sum(axis=1))
    tn_scores = mlbl_cms[:, 0, 0] / (epsilon + mlbl_cms[:, :, 0].sum(axis=1))

    tp_mean = tp_scores.mean()
    tn_mean = tn_scores.mean()

    return round((tp_mean + tn_mean) / 2, 8)


In [None]:
# Build the DataLoaders for the configured validation fold (VAL_FOLD) instead of a literal 0.
dls = get_dls(val_fold=VAL_FOLD,bs=64)

In [None]:
# Size the output layer from the label vocabulary rather than a hard-coded count:
# the target encoder is fitted on the same `new_target` lists, so the two stay in sync.
num_classes = train['new_target'].explode().nunique()
model = building_model(num_classes=num_classes,pretrained=False)
learn = Learner(dls,model,loss_func=BCEWithLogitsLossFlat(),metrics=comp_metric)
learn.summary()

In [None]:
# Run the Learner's learning-rate finder before training.
# NOTE(review): the learning rate used for training below is set by hand, not read from this result.
learn.lr_find()

In [None]:
# Train with the one-cycle schedule: 10 epochs, maximum learning rate 1e-3.
learn.fit_one_cycle(10,lr_max=1e-3)

In [None]:
# Export the trained Learner to a file named 'model.pkl'.
learn.export(fname='model.pkl')