# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import soundfile as sf
import librosa

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Root directory of the competition data. The trailing slash matters: later
# cells build file paths by plain string concatenation (path + 'name').
path = '/kaggle/input/birdclef-2021/'
# List the directory contents (shown as the cell output).
os.listdir(path)

# Functions
We define some helper functions.

In [None]:
def read_ogg_file(full_path):
    """Load an audio file through ``soundfile``.

    Args:
        full_path: Location of the audio file to read.

    Returns:
        The ``(data, samplerate)`` pair produced by ``sf.read``: the decoded
        samples as a NumPy array and the sample rate of the file.
    """
    samples, rate = sf.read(full_path)
    return samples, rate

# Load Data

In [None]:
# Soundscape label table and recording metadata table of the competition.
train_labels = pd.read_csv(f'{path}train_soundscape_labels.csv')
train_meta = pd.read_csv(f'{path}train_metadata.csv')

In [None]:
# Preview the first rows of the soundscape label table.
train_labels.head()

In [None]:
# Preview the first rows of the recording metadata table.
train_meta.head()

In [None]:
# Collect every distinct label found in the space-separated `birds` column.
unique_birds = set()
for birds in train_labels['birds']:
    unique_birds.update(birds.split(' '))

# Sort the result: iteration order of a set of strings changes from one
# interpreter run to the next (hash randomisation), and the position of each
# label defines the one-hot column order and the model-output order used by
# the rest of the notebook.
labels = sorted(unique_birds)

print('Number of unique bird labels:', len(labels))

In [None]:
# Display the full list of label names.
labels

In [None]:
import pickle

# Write the label list to 'labels.pkl' in the current working directory,
# presumably so the exact label order can be reloaded at inference time.
with open('labels.pkl', 'wb') as fp:
    pickle.dump(labels, fp)

We encode the labels and write them into a data frame:

In [None]:
# One-hot encode the labels: one integer column per entry of `labels`, one row
# per row of `train_labels`. The frame starts as integer zeros rather than as
# an all-NaN object frame patched with `fillna`, so the columns are numeric
# from the start and do not depend on pandas' deprecated silent downcasting.
df_labels_train = pd.DataFrame(0, index=train_labels.index, columns=labels, dtype=int)
for row, birds in train_labels['birds'].str.split(' ').items():
    # Flag all birds of this row with a single assignment.
    df_labels_train.loc[row, birds] = 1

In [None]:
# Append the one-hot label columns to the right of the existing columns
# (rows are aligned on the index). NOTE: re-running this cell appends the
# label columns a second time.
train_labels = pd.concat([train_labels, df_labels_train], axis=1)

In [None]:
# Display the label table together with the appended one-hot columns.
train_labels

# Parameters
Based on the EDA we define some parameters:

In [None]:
import torch

# Number of samples in one audio segment and the segment duration in seconds
# (the dataset slices recordings in multiples of `data_lenght`).
data_lenght = 160000
audio_lenght = 5
# Width of the model output: one unit per distinct bird label.
num_labels = len(labels)
batch_size = 4

# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Train And Validation Data

In [None]:
# Identifier columns of every labelled segment and the matching one-hot
# label matrix (one column per entry of `labels`).
X = train_labels[['row_id', 'site', 'audio_id', 'seconds']]
y = train_labels[labels]

from skmultilearn.model_selection import IterativeStratification

# Two folds sized 25% / 75% of the samples.
# NOTE(review): which of the two fractions ends up as the first or the second
# array returned by `split` depends on skmultilearn's conventions - confirm
# that the train part is the large one.
stratifier = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=[0.25, 0.75])

# Only the first split is consumed; its two index arrays become the train and
# validation IDs. They are later used with `df.loc[...]`, which presumably
# relies on `train_labels` having the default 0..n-1 index.
list_IDs_train, list_IDs_val = next(stratifier.split(X, y))

# Audio Data Generator
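We use a PyTorch `Dataset` that precomputes the mel-spectrogram images (and their augmented variants) when it is created and then serves them from memory.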

In [None]:
from skimage.transform import resize
from skimage.filters import gaussian
from skimage.color import rgb2gray
from skimage import exposure, util
import cv2
import numpy as np
import random

def addNoisy(img):
    """Return a 3-channel copy of ``img`` with random noise added.

    ``util.random_noise`` converts its input to floating point in ``[0, 1]``.
    A ``uint8`` input is therefore converted back to ``uint8`` afterwards, so
    the noisy copy stays on the same 0-255 intensity scale as the
    un-augmented ``uint8`` images it is mixed with in the dataset.

    Args:
        img: Single-channel image array.

    Returns:
        Array with a new leading axis of size 3 holding the noisy image.
    """
    noise_img = util.random_noise(img)
    if np.asarray(img).dtype == np.uint8:
        # Undo the implicit 0-255 -> 0-1 rescaling done by random_noise.
        noise_img = util.img_as_ubyte(noise_img)
    return addChannels(noise_img)

def vertical_flip(img):
    """Return a 3-channel copy of ``img`` with its rows in reverse order.

    Args:
        img: Image array with at least two dimensions.

    Returns:
        Array with a new leading axis of size 3 holding the flipped image.
    """
    flipped = img[::-1, :]
    return addChannels(flipped)

def contrast_stretching(img):
    """Return a 3-channel copy of ``img`` after percentile contrast stretching.

    The 2nd and 98th percentiles of the image are used as the input range
    passed to ``exposure.rescale_intensity``.

    Args:
        img: Single-channel image array.

    Returns:
        Array with a new leading axis of size 3 holding the stretched image.
    """
    low, high = np.percentile(img, (2, 98))
    stretched = exposure.rescale_intensity(img, in_range=(low, high))
    return addChannels(stretched)

def randomGaussian(img):
    """Return a 3-channel, Gaussian-blurred copy of ``img``.

    The blur strength ``sigma`` is drawn uniformly from the integers 0..5
    (both ends included, so a draw of 0 leaves the image unblurred).

    ``preserve_range=True`` keeps the filter from rescaling a ``uint8`` image
    to floats in ``[0, 1]``; integer inputs are rounded back to their original
    dtype so the blurred copy stays on the same intensity scale as the
    un-augmented images it is mixed with in the dataset.

    Args:
        img: Single-channel image array.

    Returns:
        Array with a new leading axis of size 3 holding the blurred image.
    """
    sigma = random.randint(0, 5)
    blurred = gaussian(img, sigma=sigma, preserve_range=True)
    source_dtype = np.asarray(img).dtype
    if np.issubdtype(source_dtype, np.integer):
        # The filter output is floating point; a blur never leaves the input
        # value range, so rounding back to the integer dtype cannot overflow.
        blurred = np.rint(blurred).astype(source_dtype)
    return addChannels(blurred)

def grayScale(img):
    """Return a 3-channel copy of the grayscale version of ``img``.

    A 2-D input is already single-channel and is passed through unchanged.
    Older scikit-image releases did exactly that inside ``rgb2gray``, while
    newer ones reject 2-D arrays with a ``ValueError``; the explicit check
    keeps this helper working on both.

    Args:
        img: Image array; either 2-D (already grayscale) or with RGB channels.

    Returns:
        Array with a new leading axis of size 3 holding the grayscale image.
    """
    if np.ndim(img) == 2:
        gray_img = img
    else:
        gray_img = rgb2gray(img)
    return addChannels(gray_img)

def randomGamma(img):
    """Return a 3-channel copy of ``img`` after a random gamma correction.

    The gamma value is drawn from 0.5, 0.6, ..., 1.4.

    Args:
        img: Single-channel image array.

    Returns:
        Array with a new leading axis of size 3 holding the corrected image.
    """
    gamma_value = random.randrange(5, 15, 1) / 10
    return addChannels(exposure.adjust_gamma(img, gamma=gamma_value))

def addChannels(img):
    """Replicate ``img`` three times along a new leading axis.

    Args:
        img: Array to replicate.

    Returns:
        Array of shape ``(3, *img.shape)`` whose three slices equal ``img``.
    """
    return np.stack([img] * 3)

def spec_to_image(spec):
    """Convert a spectrogram into a fixed-size 8-bit image.

    The spectrogram is resized to 224 x 400, standardised (zero mean, unit
    standard deviation) and then min-max scaled to the range 0..255.

    A spectrogram without any value spread (for example the one computed from
    the all-zero dummy signal) would make the min-max step divide zero by
    zero; such input is mapped to an all-zero image instead of producing NaN
    values with an undefined ``uint8`` cast.

    Args:
        spec: 2-D spectrogram array.

    Returns:
        ``uint8`` array of shape ``(224, 400)``.
    """
    spec = resize(spec, (224, 400))
    eps = 1e-6
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if value_range > 0:
        spec_scaled = 255 * (spec_norm - spec_min) / value_range
    else:
        # Constant (or NaN) input: nothing to stretch.
        spec_scaled = np.zeros_like(spec_norm)
    return spec_scaled.astype(np.uint8)

In [None]:
import librosa
from torch.utils.data import Dataset, DataLoader

class AudioData(Dataset):
    """In-memory dataset of 3-channel spectrogram images and label vectors.

    Every ID in ``list_IDs`` selects one row of ``df``. For that row the
    matching soundscape file is located in ``path``, the audio segment ending
    at the row's ``seconds`` value is cut out, turned into a dB-scaled mel
    spectrogram image and stored together with the row's label vector. All of
    this happens once, in ``__init__``; ``__getitem__`` only indexes the
    prepared lists.

    For ``data_type == "train"``, rows that have an audio file and whose
    ``birds`` value is not ``"nocall"`` additionally contribute one augmented
    image per augmentation function, each with the same label.

    Args:
        path: Directory holding the audio files; must end with a path
            separator because file names are appended by concatenation.
        list_IDs: Index labels of ``df`` to include.
        df: Label table. Must provide the columns ``audio_id``, ``site``,
            ``seconds`` and ``birds``; every column from position 5 onwards is
            read as a 0/1 label.
        data_type: ``"train"`` enables augmentation; any other value
            disables it.
    """

    def __init__(self, path, list_IDs, df, data_type):
        self.data_type = data_type
        self.path = path
        self.df = df
        self.data = []
        self.labels = []

        # Loop invariants: list the directory once instead of once per ID.
        file_names = os.listdir(self.path)
        label_columns = self.df.columns[5:]
        augmentation_functions = [
            addNoisy, contrast_stretching,
            randomGaussian, grayScale,
            randomGamma, vertical_flip
        ]

        for ID in list_IDs:
            prefix = f"{self.df.loc[ID, 'audio_id']}_{self.df.loc[ID, 'site']}"
            # Match at the start of the name so that, e.g., prefix "534_XYZ"
            # cannot pick up "10534_XYZ_...". Assumes file names begin with
            # "<audio_id>_<site>", as the name `prefix` implies.
            file_list = [s for s in file_names if s.startswith(prefix)]

            img = spec_to_image(self._segment_spec_db(ID, file_list))

            label_values = self.df.loc[ID, label_columns].values
            label = torch.tensor([int(w) for w in label_values])

            self.data.append(addChannels(img))
            self.labels.append(label)

            augment = (
                data_type == "train"
                and len(file_list) > 0
                and str(self.df.loc[ID, 'birds']) != "nocall"
            )
            if augment:
                for fun in augmentation_functions:
                    self.data.append(fun(img))
                    self.labels.append(label)

    def _segment_spec_db(self, ID, file_list):
        """Return the dB-scaled mel spectrogram for the segment of row ``ID``.

        ``file_list`` holds the names of the audio files matching the row;
        only the first one is used. When it is empty, a spectrogram of an
        all-zero signal is returned as a placeholder.
        """
        if not file_list:
            # Dummy for missing test audio files.
            audio_file_fft = np.zeros((data_lenght // 2))
            # The signal is passed by keyword: newer librosa releases no
            # longer accept it positionally.
            spectrogram = librosa.feature.melspectrogram(y=audio_file_fft)
            return librosa.power_to_db(spectrogram, top_db=80)

        audio_file, sr = read_ogg_file(self.path + file_list[0])

        # `seconds` marks the END of the segment; every segment spans
        # `audio_lenght` seconds, i.e. `data_lenght` samples.
        seconds = self.df.loc[ID, 'seconds']
        start = int((seconds - audio_lenght) / audio_lenght) * data_lenght
        stop = int(seconds / audio_lenght) * data_lenght
        audio_file = audio_file[start:stop]

        # NOTE(review): the mel spectrogram below is computed from the FFT
        # magnitude of the segment, not from the waveform itself. Kept as-is
        # because changing it alters the model's input features - confirm
        # that this is intended.
        audio_file_fft = np.abs(np.fft.fft(audio_file)[: len(audio_file) // 2])

        spectrogram = librosa.feature.melspectrogram(
            y=audio_file_fft,
            sr=sr,
            n_mels=128,
            fmin=0,
            fmax=sr // 2,
            n_fft=sr // 10,
            hop_length=sr // (10 * 4),
        )
        return librosa.power_to_db(spectrogram, top_db=80)

    def __len__(self):
        """Return the number of stored samples, augmented copies included."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

Test the Data Generator

In [None]:
# Build both datasets from the train soundscapes. Only the "train" dataset
# gets augmented copies; all spectrograms are computed here, up front.
train_data = AudioData(path+'train_soundscapes/', list_IDs_train, train_labels, "train")
val_data = AudioData(path+'train_soundscapes/', list_IDs_val, train_labels, "val")

In [None]:
# Mini-batch iterators over the two datasets; both reshuffle the sample order
# on every pass.
# NOTE(review): shuffling the validation loader does not look necessary -
# confirm whether it is intentional.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

# Define Model

In [None]:
from tqdm import tqdm
import copy
from torch import nn

# Initial learning rate (also the base value that lr_decay scales down),
# number of training epochs, and the training loss: mean squared error
# between the model outputs and the 0/1 label vectors.
learning_rate = 1e-3
epochs = 10
loss_fn = nn.MSELoss()

def setlr(optimizer, lr):
    """Set the learning rate of every parameter group of ``optimizer``.

    Args:
        optimizer: Object exposing ``param_groups``, a sequence of dicts.
        lr: Learning rate to store under the ``'lr'`` key of each group.

    Returns:
        The same ``optimizer`` object, modified in place.
    """
    for group in optimizer.param_groups:
        group.update(lr=lr)
    return optimizer

def lr_decay(optimizer, epoch):
    """Divide the learning rate by ten every tenth epoch.

    On epochs that are a multiple of 10 the learning rate is set to the
    module-level ``learning_rate`` divided by ``10 ** (epoch // 10)`` and the
    new value is printed; on all other epochs nothing changes.

    Args:
        optimizer: Optimizer whose parameter groups are updated.
        epoch: Number of the epoch about to run.

    Returns:
        The optimizer that was passed in.
    """
    if epoch % 10 != 0:
        return optimizer

    new_lr = learning_rate / (10 ** (epoch // 10))
    optimizer = setlr(optimizer, new_lr)
    print(f'Changed learning rate to {new_lr}')
    return optimizer

def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, change_lr=None):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs one pass over ``train_loader`` with gradient updates and
    one pass over ``valid_loader`` without gradient tracking. Validation
    accuracy is the percentage of individual label entries for which the
    prediction thresholded at 0.5 equals the target. Whenever the accuracy
    improves, the weights are remembered and also written to
    ``best_model_state.pt`` in the working directory.

    Tensors are moved to the module-level ``device`` before use.

    Args:
        model: Network to train; its outputs are compared to the labels with
            ``loss_fn`` and thresholded at 0.5 for the accuracy.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of epochs; they are numbered from 1.
        optimizer: Optimizer that updates the parameters of ``model``.
        change_lr: Optional callable ``change_lr(optimizer, epoch)`` invoked
            at the start of every epoch; it must return the optimizer to use.

    Returns:
        ``model`` with the weights of the epoch that reached the highest
        validation accuracy (the initial weights if no epoch exceeded 0).
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    train_losses = []
    valid_losses = []

    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        batch_losses = []
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.float)
            loss = loss_fn(model(x), y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()

        train_losses.append(batch_losses)
        print(f'Epoch: {epoch} - Train Loss : {np.mean(train_losses[-1])}')

        model.eval()
        batch_losses = []
        correct = 0
        total = 0

        # No gradients are needed for validation; disabling autograd avoids
        # building and keeping a graph for every validation batch.
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.to(device, dtype=torch.float32)
                y = y.to(device, dtype=torch.float)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)

                target = y.cpu().numpy()
                predicted = y_hat.cpu().numpy()

                result_target = np.round(target)
                result_predicted = np.where(predicted > 0.5, 1, 0)
                # Count the entries actually present in this batch; the last
                # batch can hold fewer samples than the nominal batch size.
                total += target.size
                correct += int((result_predicted == result_target).sum())

                batch_losses.append(loss.item())

        valid_losses.append(batch_losses)

        # Guard against an empty validation loader.
        accuracy = 100 * correct / total if total else 0.0
        print(f'Epoch: {epoch} - Valid Loss: {np.mean(valid_losses[-1])} - Valid Accuracy: {accuracy}')

        # Keep (and persist) a copy of the best weights seen so far.
        if accuracy > best_acc:
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), "best_model_state.pt")

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
from torchvision.models import resnet50

class BirdCLEFModel(nn.Module):
    """Pretrained ResNet-50 with a multi-label classification head.

    The final fully-connected layer of the ResNet is replaced by dropout
    (p=0.2) followed by a linear layer with ``n_classes`` outputs; a sigmoid
    is applied to those outputs in ``forward``.

    Args:
        n_classes: Number of output units.
    """

    def __init__(self, n_classes):
        super().__init__()
        backbone = resnet50(pretrained=True)
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=head_inputs, out_features=n_classes),
        )
        self.base_model = backbone
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the backbone output for the batch ``x``."""
        logits = self.base_model(x)
        return self.sigm(logits)

In [None]:
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet50-19c8e357.pth /root/.cache/torch/hub/checkpoints/

In [None]:
def create_model_and_train():
    """Build the classifier, train it and return the trained model.

    Uses the module-level configuration: ``num_labels`` outputs, ``device``,
    an Adam optimizer starting at ``learning_rate``, ``loss_fn``, the two data
    loaders, ``epochs`` and the ``lr_decay`` schedule.

    Returns:
        The model returned by ``train`` (loaded with its best validation
        weights). Earlier the result was assigned to a local and dropped, so
        the trained model was only reachable through the saved state file.
    """
    resnet_model = BirdCLEFModel(num_labels).to(device)
    optimizer = torch.optim.Adam(resnet_model.parameters(), lr=learning_rate)
    return train(resnet_model, loss_fn, train_loader, valid_loader, epochs, optimizer, lr_decay)


# Keep a reference to the trained model. Assigning the result also keeps the
# notebook from echoing the full model description as cell output.
trained_model = create_model_and_train()