In [None]:
!pip install pretrainedmodels
!pip install pydub

In [None]:
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import math
from collections import OrderedDict

from PIL import Image
import albumentations
from pydub import AudioSegment

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import pretrainedmodels

import warnings
warnings.filterwarnings('ignore')

## Preprocessing <a id="3"></a>

In [None]:
# Read the three competition CSV files from the birdsong-recognition input folder.
# `train` carries the per-recording metadata used below (e.g. `ebird_code`, `filename`).
train = pd.read_csv("../input/birdsong-recognition/train.csv")
# `test` and `submission` are loaded here but not referenced again in the cells shown.
test = pd.read_csv("../input/birdsong-recognition/test.csv")
submission = pd.read_csv("../input/birdsong-recognition/sample_submission.csv")

### e-bird code

A code identifying the bird species. We need to predict `ebird_code` using the metadata and the audio data.


In [None]:
# Count the distinct species codes present in the training metadata.
print("Number of Unique birds : ", train.ebird_code.nunique())

### top10 Birds

We keep only the 10 most frequent birds to build a starter model.

In [None]:
# The ten species with the most recordings; everything else is dropped so the
# starter model only has to separate ten classes.
top10_birds = list(train.ebird_code.value_counts().index[:10])

# .copy() gives an independent frame, so adding a column below writes to `train`
# itself instead of to a view of the unfiltered table (SettingWithCopy hazard).
train = train[train.ebird_code.isin(top10_birds)].copy()

# label encoding for target values: species codes -> integers 0..9
train["ebird_label"] = LabelEncoder().fit_transform(train.ebird_code.values)

### K-Fold

In [None]:
# Shuffle once with a fixed seed so that fold membership is identical on every
# Restart & Run All; reset_index makes the positional indices returned by
# StratifiedKFold.split usable directly as .loc labels.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# -1 marks "not yet assigned"; every row is overwritten in the loop below.
train["kfold"] = -1

X = train.filename.values
y = train.ebird_code.values

# Stratify on the species code so each fold keeps the class proportions.
kfold = StratifiedKFold(n_splits=5)

for fold, (t_idx, v_idx) in enumerate(kfold.split(X, y)):
    # Each row is part of the validation split of exactly one fold.
    train.loc[v_idx, "kfold"] = fold

print(train.kfold.value_counts())

### Arguments

In [None]:
class args:
    """Notebook-wide settings for data loading, the spectrogram front end and training."""

    # --- data ---
    ROOT_PATH = "../input/birdsong-recognition/train_audio"
    num_classes = 10      # matches the ten species kept in the training frame
    max_duration = 5      # seconds
    sample_rate = 32000

    # --- loader / schedule ---
    batch_size = 16
    num_workers = 4
    epochs = 10

    # --- optimiser ---
    lr = 9e-4
    wd = 1e-5
    momentum = 0.9        # not read by the AdamW set-up in the visible cells
    eps = 1e-8
    betas = (0.9, 0.999)

    # Keyword arguments forwarded to the mel-spectrogram transform.
    melspectrogram_parameters = dict(n_mels=128, fmin=20, fmax=16000)
    

### Loading Audio Files

In [None]:
def load_audio(path):
    """Decode an mp3 file into a mono float32 waveform at ``args.sample_rate``.

    Parameters
    ----------
    path : str
        Location of the mp3 file.

    Returns
    -------
    tuple
        ``(samples, sample_rate)``. ``samples`` is a 1-D float32 array of the
        decoded sample values as pydub reports them (they are not rescaled
        here). When decoding fails, an all-zero buffer of
        ``args.sample_rate * args.max_duration`` samples is returned instead,
        so one unreadable file does not abort a training run.
    """
    import logging

    try:
        sound = AudioSegment.from_mp3(path)
        sound = sound.set_frame_rate(args.sample_rate)
        # get_array_of_samples() interleaves the channels of multi-channel audio;
        # down-mix to mono so the array really is one sample per time step.
        sound = sound.set_channels(1)
        sound_array = np.array(sound.get_array_of_samples(), dtype=np.float32)
    except Exception as exc:
        # Best-effort fallback is kept, but the failure is reported and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        logging.getLogger(__name__).warning(
            "load_audio: could not decode %s (%s); returning silence", path, exc
        )
        sound_array = np.zeros(args.sample_rate * args.max_duration, dtype=np.float32)

    return sound_array, args.sample_rate

### Audio Albumentations

- check my other notebook [Audio Albumentations](https://www.kaggle.com/gopidurgaprasad/audio-albumentations)

In [None]:
from albumentations.core.transforms_interface import DualTransform, BasicTransform

class AudioTransform(BasicTransform):
    """Base class for transforms that operate on the ``data`` target of a pipeline call."""

    @property
    def targets(self):
        # The "data" keyword passed to the composed pipeline is routed to apply().
        return dict(data=self.apply)

    def update_params(self, params, **kwargs):
        # Copy the optional attributes into params only when the instance defines them.
        for attr in ("interpolation", "fill_value"):
            if hasattr(self, attr):
                params[attr] = getattr(self, attr)
        return params

class NoiseInjection(AudioTransform):
    """Add Gaussian noise with a randomly drawn amplitude to the waveform.

    Parameters
    ----------
    noise_levels : tuple
        ``(low, high)`` bounds of the uniform distribution the noise amplitude
        is drawn from on every call.
    always_apply, p
        Forwarded to the albumentations base class.
    """
    def __init__(self, noise_levels=(0, 0.5), always_apply=False, p=0.5):
        super(NoiseInjection, self).__init__(always_apply, p)

        self.noise_levels = noise_levels

    def apply(self, data, **params):
        """Return ``(noisy_sound, sr)`` for ``data = (sound, sr)``."""
        sound, sr = data
        sound = np.asarray(sound)
        noise_level = np.random.uniform(*self.noise_levels)
        noise = np.random.randn(len(sound))
        # randn yields float64, so cast back to the input dtype. Reading the
        # dtype from the array (rather than from sound[0]) also works for an
        # empty signal.
        augmented_sound = (sound + noise_level * noise).astype(sound.dtype)

        return augmented_sound, sr

class ShiftingTime(AudioTransform):
    """Shift the waveform along the time axis and silence the vacated part."""
    def __init__(self, always_apply=False, p=0.5):
        super(ShiftingTime, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``(shifted_sound, sr)`` for ``data = (sound, sr)``.

        The shift is a random non-zero number of samples strictly smaller than
        the signal length, applied forwards or backwards with equal
        probability. Signals shorter than two samples are returned unchanged.
        """
        sound, sr = data

        n_samples = len(sound)
        if n_samples < 2:
            # Nothing can be shifted without erasing the whole signal.
            return sound, sr

        # Bound the shift by the signal length. Multiplying by sr, as before,
        # made the shift exceed the length almost always, so the slice
        # assignment below wiped the entire signal.
        shift = np.random.randint(1, n_samples)
        if np.random.randint(0, 2) == 1:
            shift = -shift

        augmented_sound = np.roll(sound, shift)
        # Set to silence for heading/ tailing (the samples that wrapped around).
        # shift is never 0 here, so `augmented_sound[shift:]` cannot select
        # the whole array.
        if shift > 0:
            augmented_sound[:shift] = 0
        else:
            augmented_sound[shift:] = 0

        return augmented_sound, sr

class PitchShift(AudioTransform):
    """Shift the pitch of the waveform by a random number of semitone steps."""

    def __init__(self, always_apply=False, p=0.5):
        super(PitchShift, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``(pitch_shifted_sound, sr)`` for ``data = (sound, sr)``."""
        sound, sr = data

        # Integer step count drawn from [-10, 10) (upper bound excluded by randint).
        n_steps = np.random.randint(-10, 10)
        # sr and n_steps are passed by keyword: newer librosa releases accept
        # them only as keywords, and older releases accept keywords as well.
        augmented_sound = librosa.effects.pitch_shift(sound, sr=sr, n_steps=n_steps)

        return augmented_sound, sr

class TimeStretch(AudioTransform):
    """Speed the waveform up or slow it down by a random factor.

    Parameters
    ----------
    always_apply, p
        Forwarded to the albumentations base class.
    min_rate, max_rate : float
        Bounds of the uniform distribution the stretch rate is drawn from.
        ``min_rate`` must be strictly positive: a rate of (nearly) zero is
        invalid for a time stretch and would otherwise produce an error or an
        enormous output array.
    """

    def __init__(self, always_apply=False, p=0.5, min_rate=0.5, max_rate=2.0):
        super(TimeStretch, self).__init__(always_apply, p)

        if min_rate <= 0 or max_rate < min_rate:
            raise ValueError("TimeStretch requires 0 < min_rate <= max_rate")
        self.min_rate = min_rate
        self.max_rate = max_rate

    def apply(self, data, **params):
        """Return ``(stretched_sound, sr)`` for ``data = (sound, sr)``."""
        sound, sr = data

        rate = np.random.uniform(self.min_rate, self.max_rate)
        # rate is passed by keyword: newer librosa releases accept it only as
        # a keyword, and older releases accept the keyword form as well.
        augmented_sound = librosa.effects.time_stretch(sound, rate=rate)

        return augmented_sound, sr

class RandomAudio(AudioTransform):
    """Produce a clip of exactly ``seconds`` seconds from a waveform of any length."""

    def __init__(self, seconds=5, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.seconds = seconds

    def apply(self, data, **params):
        """Return ``(clip, sr)`` where ``clip`` has ``int(sr * seconds)`` samples."""
        sound, sr = data
        target_len = int(sr * self.seconds)

        # Rotate the signal circularly by a random offset so the kept window
        # starts at a random position of the recording.
        rotated = np.roll(sound, np.random.randint(len(sound)))

        missing = target_len - len(rotated)
        if missing <= 0:
            # Long enough: keep the first target_len samples.
            return rotated[:target_len], sr

        # Too short: zero-pad on both sides, the extra sample (if any) on the right.
        left = missing // 2
        return np.pad(rotated, (left, missing - left), "constant"), sr

class MelSpectrogram(AudioTransform):
    """Convert a waveform into a float32 mel spectrogram on a decibel scale.

    Parameters
    ----------
    parameters : dict
        Extra keyword arguments forwarded to ``librosa.feature.melspectrogram``.
    always_apply, p
        Forwarded to the albumentations base class.
    """

    def __init__(self, parameters, always_apply=False, p=0.5):
        super(MelSpectrogram, self).__init__(always_apply, p)

        self.parameters = parameters

    def apply(self, data, **params):
        """Return ``(melspec_db, sr)`` for ``data = (sound, sr)``."""
        sound, sr = data

        # The waveform is passed as y=...: newer librosa releases accept the
        # arguments of melspectrogram only as keywords.
        melspec = librosa.feature.melspectrogram(y=sound, sr=sr, **self.parameters)
        melspec = librosa.power_to_db(melspec)
        melspec = melspec.astype(np.float32)

        return melspec, sr

class SpecAugment(AudioTransform):
    """Blank out random frequency bands and time spans of a 2-D spectrogram."""

    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.num_mask = num_mask
        self.freq_masking = freq_masking
        self.time_masking = time_masking

    def apply(self, data, **params):
        """Return ``(masked_spectrogram, sr)`` for ``data = (melspec, sr)``."""
        melspec, sr = data
        # Masked cells are filled with the smallest value of the spectrogram.
        masked = self.spec_augment(
            melspec,
            num_mask=self.num_mask,
            freq_masking=self.freq_masking,
            time_masking=self.time_masking,
            value=melspec.min(),
        )
        return masked, sr

    # Source: https://www.kaggle.com/davids1992/specaugment-quick-implementation
    def spec_augment(self,
                     spec: np.ndarray,
                     num_mask=2,
                     freq_masking=0.15,
                     time_masking=0.20,
                     value=0):
        """Return a masked copy of ``spec``; the input array is left untouched.

        Between 1 and ``num_mask`` rounds are run. Each round overwrites one
        band of rows (at most ``freq_masking`` of all rows) and one span of
        columns (at most ``time_masking`` of all columns) with ``value``.
        """
        out = spec.copy()
        n_freqs, n_frames = out.shape

        for _ in range(random.randint(1, num_mask)):
            # Rows: draw the band height first, then its start position.
            band = int(random.uniform(0.0, freq_masking) * n_freqs)
            f_start = int(np.random.uniform(low=0.0, high=n_freqs - band))
            out[f_start:f_start + band, :] = value

            # Columns: same procedure along the second axis.
            span = int(random.uniform(0.0, time_masking) * n_frames)
            t_start = int(np.random.uniform(low=0.0, high=n_frames - span))
            out[:, t_start:t_start + span] = value

        return out

class SpectToImage(AudioTransform):
    """Turn a spectrogram into a 3-channel float32 array."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Return the stacked array only; the sample rate in ``data`` is dropped.

        Channel 0 is the spectrogram, channel 1 its first-order delta and
        channel 2 its second-order delta; the stack is divided by 100.
        """
        spect, _sr = data
        channels = [
            spect,
            librosa.feature.delta(spect),
            librosa.feature.delta(spect, order=2),
        ]
        return np.stack(channels, axis=0).astype(np.float32) / 100.0

In [None]:
### Example: build the two augmentation pipelines and preview one training sample

# Training pipeline: fixed-length clip -> optional noise -> mel spectrogram (dB)
# -> optional masking -> 3-channel array. Each step receives the output of the
# previous one through the "data" target.
train_audio_augmentation = albumentations.Compose([
     RandomAudio(seconds=args.max_duration, always_apply=True),
     NoiseInjection(p=0.33),
     MelSpectrogram(parameters=args.melspectrogram_parameters,always_apply=True),
     SpecAugment(p=0.33),
     SpectToImage(always_apply=True)
])

# Validation pipeline: same conversion steps without noise injection or masking.
# RandomAudio still picks its clip offset at random.
valid_audio_augmentation = albumentations.Compose([
     RandomAudio(seconds=args.max_duration, always_apply=True),
     MelSpectrogram(parameters=args.melspectrogram_parameters,always_apply=True),
     SpectToImage(always_apply=True)
])

# Run the training pipeline on a single recording; load_audio returns (samples, sample_rate).
path = f"{args.ROOT_PATH}/aldfly/XC134874.mp3"
data = load_audio(path)
image = train_audio_augmentation(data=data)['data']

# SpectToImage stacks the channels on axis 0; move them last for imshow.
plt.imshow(image.transpose(1,2,0))
plt.show()

### Pytorch DataLoader

In [None]:
class BirdDataset:
    """Map-style dataset yielding one augmented spectrogram and its label per recording.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``filename``, ``ebird_label`` and ``ebird_code``.
    valid : bool
        Selects ``valid_audio_augmentation`` when true, otherwise
        ``train_audio_augmentation``.
    """

    def __init__(self, df, valid=False):
        self.filename = df.filename.values
        self.ebird_label = df.ebird_label.values
        self.ebird_code = df.ebird_code.values
        self.aug = valid_audio_augmentation if valid else train_audio_augmentation

    def __len__(self):
        return len(self.filename)

    def __getitem__(self, item):
        """Load, augment and tensorise the recording at position ``item``."""
        # Recordings are stored as <ROOT_PATH>/<species code>/<file name>.
        code = self.ebird_code[item]
        audio_path = f"{args.ROOT_PATH}/{code}/{self.filename[item]}"

        spect = self.aug(data=load_audio(audio_path))["data"]

        return {
            "spect": torch.tensor(spect, dtype=torch.float),
            "target": torch.tensor(self.ebird_label[item], dtype=torch.long),
        }
    

In [None]:
# Example: fetch one item from the training dataset and inspect it
dataset = BirdDataset(train)
d = dataset.__getitem__(10)

# d is a dict with the spectrogram tensor under "spect" and the label under "target".
print(d["spect"].shape, d["target"])

# Channels are on axis 0 of the tensor; move them last for imshow.
plt.imshow(d["spect"].permute(1,2,0))
plt.show()

### ResNet18 Model

In [None]:
class ResNet18(nn.Module):
    """ResNet-18 feature extractor followed by a linear layer with ``args.num_classes`` outputs.

    Parameters
    ----------
    pretrained : bool
        ``True`` requests the "imagenet" weights from ``pretrainedmodels``;
        any other value builds the network without pretrained weights.
    """

    def __init__(self, pretrained):
        super().__init__()
        weights = "imagenet" if pretrained is True else None
        self.model = pretrainedmodels.__dict__["resnet18"](pretrained=weights)
        self.l0 = nn.Linear(512, args.num_classes)

    def forward(self, x):
        """Map a 4-D input batch to one row of class logits per sample."""
        batch_size, _, _, _ = x.shape
        feats = self.model.features(x)
        # Global average pooling collapses the spatial axes before the classifier.
        pooled = F.adaptive_avg_pool2d(feats, 1).reshape(batch_size, -1)
        return self.l0(pooled)
    

### Utility functions

In [None]:
def to_list(tensor):
    """Return the tensor's values as (nested) Python lists, detached and on the CPU."""
    detached = tensor.detach()
    return detached.cpu().tolist()

class AverageMeter(object):
    """Computes and stores the average and current values

    Attributes set by ``reset``/``update``: ``val`` (last value passed in),
    ``sum`` (weighted sum of all values), ``count`` (total weight) and
    ``avg`` (``sum / count``).
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics so the meter can be reused."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. update(x, 0) on a fresh meter).
        self.avg = self.sum / self.count if self.count else 0

def get_position_accuracy(logits, labels):
    """Accuracy of the arg-max prediction over the rows whose label is non-negative.

    Parameters
    ----------
    logits : torch.Tensor
        Scores indexed as ``[row, class]``; the prediction is the arg-max over dim 1.
    labels : torch.Tensor
        One integer label per row; rows with a negative label are skipped.

    Returns
    -------
    tuple
        ``(accuracy, total_num)``. ``total_num`` is the number of rows that
        were counted, or ``1e-7`` when there were none (accuracy is then 0),
        so the value can be used as a weight without dividing by zero.
    """
    # Softmax is monotonic, so taking the arg-max of the raw logits gives the
    # same prediction without the extra pass over the batch.
    predictions = logits.detach().argmax(dim=1).cpu().numpy()
    labels = labels.detach().cpu().numpy()

    valid = labels >= 0
    total_num = int(valid.sum())
    sum_correct = int((predictions[valid] == labels[valid]).sum())

    if total_num == 0:
        total_num = 1e-7
    return np.float32(sum_correct) / total_num, total_num

### Loss function

In [None]:
def loss_fn(preds, labels):
    """Mean cross-entropy between ``preds`` (logits) and ``labels``; labels of -1 are ignored."""
    return F.cross_entropy(preds, labels, ignore_index=-1)

### train & validation functions

In [None]:
def train_fn(train_loader, model, optimizer, epoch):
    """Run one training epoch and return the running average of the loss.

    Parameters
    ----------
    train_loader
        Iterable of batches; each batch is a dict with "spect" and "target" tensors.
    model
        Network to optimise; switched to training mode here.
    optimizer
        Optimiser stepped once per batch.
    epoch : int
        Zero-based epoch index, used only in the progress-bar text.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.train()

    progress = tqdm(train_loader)
    for batch in progress:
        spect = batch["spect"].to(args.device)
        targets = batch["target"].to(args.device)

        outputs = model(spect)
        loss = loss_fn(outputs, targets)

        # Weight both meters by the number of rows that actually carry a label.
        acc, n_position = get_position_accuracy(outputs, targets)
        loss_meter.update(loss.item(), n_position)
        acc_meter.update(acc, n_position)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress.set_description(f"Train E:{epoch+1} - Loss:{loss_meter.avg:0.4f} - Acc:{acc_meter.avg:0.4f}")

    return loss_meter.avg

def valid_fn(valid_loader, model, epoch):
    """Evaluate the model on ``valid_loader`` without tracking gradients.

    Parameters
    ----------
    valid_loader
        Iterable of batches; each batch is a dict with "spect" and "target" tensors.
    model
        Network to evaluate; switched to evaluation mode here.
    epoch : int
        Zero-based epoch index, used only in the progress-bar text.

    Returns
    -------
    tuple
        ``(average loss, average accuracy)`` over the loader.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.eval()

    progress = tqdm(valid_loader)
    with torch.no_grad():
        for batch in progress:
            spect = batch["spect"].to(args.device)
            targets = batch["target"].to(args.device)

            outputs = model(spect)
            loss = loss_fn(outputs, targets)

            # Weight both meters by the number of rows that actually carry a label.
            acc, n_position = get_position_accuracy(outputs, targets)
            loss_meter.update(loss.item(), n_position)
            acc_meter.update(acc, n_position)

            progress.set_description(f"Eval E:{epoch+1} - Loss:{loss_meter.avg:0.4f} - Acc:{acc_meter.avg:0.4f}")

    return loss_meter.avg, acc_meter.avg

In [None]:
def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run; switch it off.
    torch.backends.cudnn.benchmark = False


def _seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: give each worker its own NumPy/random seed.

    The augmentations draw from ``np.random`` and ``random``; deriving their
    seeds from the per-worker torch seed keeps workers from replaying the same
    random stream.
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


def main(fold_index, seed=42):
    """Train on every fold except ``fold_index`` and validate on ``fold_index``.

    Parameters
    ----------
    fold_index : int
        Value of the ``kfold`` column that is held out for validation.
    seed : int
        Seed applied to all RNGs before the model is created.

    Side effects
    ------------
    Sets ``args.device`` and writes the weights of the best-scoring epoch
    (highest validation accuracy) to ``fold_{fold_index}.bin``.
    """
    # Seed first: the network is randomly initialised (pretrained=False), so
    # seeding after construction would leave the starting weights unseeded.
    _seed_everything(seed)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ResNet18(pretrained=False)
    model.to(args.device)

    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.lr,
                                  betas=args.betas,
                                  eps=args.eps,
                                  weight_decay=args.wd)

    # Training rows: every fold but the held-out one.
    train_df = train[~train.kfold.isin([fold_index])]
    train_dataset = BirdDataset(df=train_df)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
        worker_init_fn=_seed_worker,
    )

    # Validation rows: the held-out fold, processed without augmentation noise/masking.
    valid_df = train[train.kfold.isin([fold_index])]
    valid_dataset = BirdDataset(df=valid_df, valid=True)
    valid_loader = DataLoader(
        dataset=valid_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
        worker_init_fn=_seed_worker,
    )

    best_acc = 0

    for epoch in range(args.epochs):
        train_loss = train_fn(train_loader, model, optimizer, epoch)
        valid_loss, valid_acc = valid_fn(valid_loader, model, epoch)

        print(f"**** Epoch {epoch+1} **==>** Accuracy = {valid_acc}")

        # Keep only the checkpoint with the best validation accuracy so far.
        if valid_acc > best_acc:
            print("**** Model Improved !!!! Saving Model")
            torch.save(model.state_dict(), f"fold_{fold_index}.bin")
            best_acc = valid_acc

### 5 Folds

In [None]:
# Train with fold 0 held out for validation; the best checkpoint is saved as fold_0.bin.
main(0)

<h2 style="color:red;"> Please upvote if you like it. It motivates me. Thank you ☺️ .</h2>