# What are Transformers?

Transformers are neural networks built around "attention layers", a concept that originated in NLP: the network learns to pay attention only to those parts of the input that are important. By stacking multiple attention layers, Transformers learn which parts of the input matter and, in the case of text, are able to learn better contextual embeddings.

In recent times, Transformers are being heavily applied to Computer vision tasks. In this notebook I will try to experiment by applying it to audio data and see if we get results that are good enough.

My framework of choice is PyTorch Lightning, since it lets me set up a baseline and keep modifying it with ease.

In [None]:
!pip install pytorch_lightning

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import IPython.display as ipd
import torchaudio.transforms as a_trans
import soundfile as sf
import torch
import torchaudio
import pytorch_lightning as pl
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnext50_32x4d
from torch.utils.data import Dataset,DataLoader
import sklearn
from torchvision.transforms import RandomResizedCrop
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the dataset layout is visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two annotation tables. `tp_train` must come from train_tp.csv;
# it previously read train_fp.csv as well, which made both frames identical
# and fed the wrong annotations into everything derived from `tp_train`.
tp_train = pd.read_csv("../input/rfcx-species-audio-detection/train_tp.csv")
fp_train = pd.read_csv("../input/rfcx-species-audio-detection/train_fp.csv")

In [None]:
tp_train.head()

In [None]:
### Describe train dataframe ####
tp_train.describe()

Multiple species are recorded in a single audio clip. I convert them into a one hot vector with ones at indices based on which sound is detected

In [None]:
### Collect every annotated species per recording (multi-label targets) ###
ids = tp_train.recording_id.unique()
# One array of species ids per recording, in the same order as `ids`.
multilabel_list = [
    tp_train.loc[tp_train.recording_id == rec_id, "species_id"].values
    for rec_id in ids
]
new_df = pd.DataFrame()
new_df["rec_id"] = ids
new_df["labels"] = multilabel_list

In [None]:
new_df.head()

In [None]:
#### Check species distribution ###
fig = plt.figure(figsize=(10,8))
# Pass the column by keyword: the first positional parameter of countplot is
# `x` in older seaborn releases but `data` in newer ones, so a positional
# Series is interpreted differently depending on the installed version.
sns.countplot(x=tp_train.species_id)

In [None]:
def onehot(df,num_classes):
    """Turn each entry of ``df.labels`` into an indicator vector.

    Args:
        df: object exposing an iterable ``labels`` attribute; every element
            is an integer array of label ids for one sample.
        num_classes: length of each produced vector.

    Returns:
        A list with one float array of length ``num_classes`` per sample,
        holding 1 at position ``label - 1`` for every label of that sample
        and 0 elsewhere.

    NOTE(review): labels are shifted down by one before indexing, so a label
    of 0 becomes index -1, which NumPy resolves to the LAST slot. Confirm the
    label ids really start at 1 and never exceed ``num_classes``.
    """
    def _encode(label_ids):
        indicator = np.zeros(num_classes)
        indicator[label_ids-1] = 1
        return indicator

    return [_encode(label_ids) for label_ids in df.labels]
# Encode every recording's label array as a 23-element indicator vector and
# store it next to the raw labels.
# NOTE(review): 23 has to match the classifier's output size; confirm it also
# matches the number of distinct species ids present in the annotations.
labs = onehot(new_df,23)
new_df["onehot"] = labs

In [None]:
new_df.head()

In [None]:
## Check distribution of time and frequency limits ###
fig,ax = plt.subplots(2,2,figsize=(8,8))
# One panel per annotation bound, filled row by row:
# top row t_min / t_max, bottom row f_min / f_max.
for panel, column in zip(ax.flat, ("t_min", "t_max", "f_min", "f_max")):
    sns.distplot(tp_train[column],ax=panel)



In [None]:
import librosa.display

The metric is lwlrap (label-weighted label-ranking average precision), a label-weighted version of sklearn's LRAP. More details can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.label_ranking_average_precision_score.html

In [None]:
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
    scores: np.array of (num_classes,) giving the individual classifier scores.
    truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
    pos_class_indices: np.array of indices of the true classes for this sample.
    pos_class_precisions: np.array of precisions corresponding to each of those
      classes. Empty when the sample has no true class.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample, best score first.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    # Builtin int/bool/float are used as dtypes: the np.int / np.bool /
    # np.float aliases no longer exist in current NumPy releases.
    class_rankings = np.zeros(num_classes, dtype=int)
    class_rankings[retrieved_classes] = np.arange(num_classes)
    # Which positions of the retrieval list hold a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
        retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
        (1 + class_rankings[pos_class_indices].astype(float)))
    return pos_class_indices, precision_at_hits


# All-in-one calculation of per-class lwlrap.

def calculate_per_class_lwlrap(truth, scores):
    """Compute label-weighted label-ranking average precision per class.

    Arguments:
    truth: np.array of (num_samples, num_classes); entries > 0 mark the
      classes present in each sample.
    scores: np.array of (num_samples, num_classes) with the real-valued score
      the classifier assigned to each class for each sample.

    Returns:
    per_class_lwlrap: np.array of (num_classes,) with the lwlrap of each class.
    weight_per_class: np.array of (num_classes,) with each class's share of
      all positive labels. The overall lwlrap is
      np.sum(per_class_lwlrap * weight_per_class).
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape
    # precision_table[i, c] holds the precision credited to class c on sample
    # i; cells for classes that are not true on that sample stay at zero.
    precision_table = np.zeros((num_samples, num_classes))
    for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
        true_classes, precisions = _one_sample_positive_class_precisions(
            sample_scores, sample_truth)
        precision_table[row, true_classes] = precisions
    label_counts = np.sum(truth > 0, axis=0)
    weight_per_class = label_counts / float(np.sum(label_counts))
    # Column mean over the labelled samples only; the maximum() guard keeps
    # classes that never occur from dividing by zero.
    per_class_lwlrap = (np.sum(precision_table, axis=0) /
                        np.maximum(1, label_counts))
    return per_class_lwlrap, weight_per_class


def calculate_overall_lwlrap_sklearn(truth, scores):
    """Calculate the overall lwlrap using sklearn's LRAP implementation.

    Arguments:
    truth: np.array of (num_samples, num_classes); entries > 0 mark the
      classes present in each sample.
    scores: np.array of (num_samples, num_classes) with the classifier's
      real-valued score for each class for each sample.

    Returns:
    The label-ranking average precision over all samples that have at least
    one true label, with every sample weighted by its number of true labels.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not by
    # itself guarantee that `sklearn.metrics` has been loaded.
    from sklearn.metrics import label_ranking_average_precision_score

    # sklearn doesn't correctly apply weighting to samples with no labels, so just skip them.
    sample_weight = np.sum(truth > 0, axis=1)
    nonzero_weight_sample_indices = np.flatnonzero(sample_weight > 0)
    overall_lwlrap = label_ranking_average_precision_score(
        truth[nonzero_weight_sample_indices, :] > 0,
        scores[nonzero_weight_sample_indices, :],
        sample_weight=sample_weight[nonzero_weight_sample_indices])
    return overall_lwlrap


In [None]:
import math
class SpecDataset(Dataset):
    """Dataset yielding one mel-spectrogram (in dB) per annotation row.

    Each row of ``csv`` must provide ``recording_id``, ``species_id``,
    ``f_min``, ``f_max``, ``t_min`` and ``t_max``. For a row, the matching
    .flac file is loaded, a 4-second window starting at ``t_min`` is cut out
    (zero-padded at the end when the recording is too short) and converted
    into a mel-spectrogram restricted to the ``[f_min, f_max]`` band.

    Args:
        csv: pandas DataFrame with the columns listed above.
        transforms: optional callable applied to the dB spectrogram before it
            is returned; ``None`` (default) returns the spectrogram as is.
    """

    def __init__(self,csv,transforms=None):
        # Re-index a copy instead of resetting in place: the caller's frame
        # stays untouched and no extra "index" column is accumulated when the
        # same frame is wrapped more than once.
        self.csv = csv.reset_index(drop=True)
        self.transforms = transforms

    def __len__(self):
        return len(self.csv)

    def __getitem__(self,idx):
        """Return ``{"X": dB mel-spectrogram, "Y": species id}`` for row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Rely on torchaudio's default loading behaviour rather than passing
        # a normalisation keyword, whose name differs between releases.
        sig,sr = torchaudio.load("../input/rfcx-species-audio-detection/train/"+self.csv.recording_id[idx]+".flac")
        label = self.csv.species_id[idx]
        f_min_1 = self.csv.f_min[idx]
        f_max_1 = self.csv.f_max[idx]
        t_min_1 = self.csv.t_min[idx]

        # Cut the 4-second window that starts at the annotation's t_min.
        sig = sig[:,math.floor((t_min_1*sr)):math.floor(((t_min_1+4)*sr))]

        # Force an exact length of 4*sr samples: float rounding of the slice
        # bounds can leave one sample too many, and a window running past the
        # end of the recording is too short.
        target_len = 4*sr
        cur_len = sig.size(1)
        if cur_len > target_len:
            sig = sig[:,:target_len]
        elif cur_len < target_len:
            # Right-pad with zeros; F.pad keeps dtype and channel count, so
            # multi-channel audio is handled as well.
            sig = F.pad(sig,(0,target_len-cur_len))

        spec_fn = a_trans.MelSpectrogram(sample_rate=sr,n_fft=2048,hop_length=512,f_min=f_min_1,f_max=f_max_1)
        S_db = a_trans.AmplitudeToDB()(spec_fn(sig))
        if self.transforms is not None:
            S_db = self.transforms(S_db)
        return {"X": S_db,"Y": label}

I set up a dataset class to convert audio clips to their corresponding spectrograms. I use torchaudio because:

a) It allows me to generate spectrograms using a GPU, thus making it faster.<br>
b) Librosa is too damn slow :')

I convert the input to a 256 x 256 shaped tensor for easily inputting it to our backbone

In [None]:
class Transformer_Data(Dataset):
    """Dataset yielding a cropped dB mel-spectrogram and its label vector.

    Each row of ``csv`` must provide ``rec_id`` (recording file stem) and
    ``onehot`` (the target vector for that recording). The whole recording is
    turned into a mel-spectrogram, converted to dB and passed through a
    transform that produces the network input.

    Args:
        csv: pandas DataFrame with the columns listed above.
        transform: optional callable applied to the dB spectrogram. When
            ``None`` (default) a ``RandomResizedCrop((256,256))`` is used.
    """

    def __init__(self,csv,transform=None):
        # Re-index a copy so the caller's DataFrame (e.g. a train/test split)
        # is not modified as a side effect of building the dataset.
        self.csv = csv.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.csv)

    def __getitem__(self,idx):
        """Return ``{"X": transformed dB spectrogram, "y": label vector}``."""
        if torch.is_tensor(idx):
            # Tensors expose tolist(); to_list() does not exist on them.
            idx = idx.tolist()
        # Rely on torchaudio's default loading behaviour rather than passing
        # a normalisation keyword, whose name differs between releases.
        sig,sr = torchaudio.load("../input/rfcx-species-audio-detection/train/"+self.csv.rec_id[idx]+".flac")

        label = self.csv.onehot[idx]
        spec_fn = a_trans.MelSpectrogram(sample_rate=sr,n_fft=2048,hop_length=512)
        S_db = a_trans.AmplitudeToDB()(spec_fn(sig))
        # Fall back to the 256x256 random resized crop when no transform was
        # supplied by the caller.
        crop = self.transform if self.transform is not None else RandomResizedCrop((256,256))
        S_db = crop(S_db)
        return {"X": S_db,"y": label}
        
        
        

## The backbone

I use the Vision Transformer, which was a version of the transformer network used for image classification. More details about the same can be found in this paper: https://arxiv.org/abs/2010.11929

I did not implement it from scratch since another implementation was already available. Thanks to this amazing repo: https://github.com/lucidrains/vit-pytorch

In [None]:
!pip install vit-pytorch

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the recordings for validation; the fixed random_state keeps
# the split identical across runs.
X_train,X_test = train_test_split(new_df,test_size=0.2,random_state=42)


Set up the datasets

In [None]:
# Wrap each split in the spectrogram dataset; X_test serves as the validation set.
train_dataset = Transformer_Data(X_train)
val_dataset = Transformer_Data(X_test)

Set up the dataloaders

In [None]:
# Shuffle only the training batches. Validation order does not need to be
# randomised, and a fixed order keeps batch composition stable between epochs.
train_dataloader = DataLoader(train_dataset,batch_size=32,num_workers=4,shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=32,num_workers=4,shuffle=False)


Set up the model class. This class now makes it easy to train and can be modified at will for other networks. See how easy lightning makes it

In [None]:
from vit_pytorch import ViT
class AudioClassifier(pl.LightningModule):
    """Multi-label spectrogram classifier built on a Vision Transformer.

    The backbone takes single-channel 256x256 inputs and emits 23 logits.
    Batches are dicts with the input under ``"X"`` and the target vector
    under ``"y"``; the loss is binary cross-entropy on the raw logits.
    """

    def __init__(self):
        super().__init__()
        self.backbone = ViT(
                    image_size = 256,
                    patch_size = 32,
                    num_classes = 23,
                    dim = 1024,
                    depth = 6,
                    heads = 16,
                    mlp_dim = 2048,
                    dropout = 0.1,
                    emb_dropout = 0.1,
                    channels= 1
                )

    def forward(self,x):
        """Return the backbone's logits for a batch of spectrograms."""
        return self.backbone(x)

    def _forward_with_loss(self,batch):
        """Run the model on one batch; return (loss, targets, logits)."""
        y = batch["y"]
        y_hat = self.forward(batch["X"])
        loss = F.binary_cross_entropy_with_logits(y_hat,y)
        return loss, y, y_hat

    def training_step(self,batch,batch_idx):
        """Compute and log the training loss for one batch."""
        loss, _, _ = self._forward_with_loss(batch)
        self.log("Training loss ",loss,logger=True,prog_bar=True)
        return loss

    def validation_step(self,batch,batch_idx):
        """Log the validation loss and hand targets/logits to the epoch end."""
        loss, y, y_hat = self._forward_with_loss(batch)
        self.log("Val loss ",loss,logger=True,prog_bar=True)
        # Keep targets and logits as CPU arrays so the metric can be computed
        # once over the whole validation set without holding device memory.
        return {
            "loss": loss,
            "truth": y.detach().cpu().numpy(),
            "scores": y_hat.detach().float().cpu().numpy(),
        }

    def validation_epoch_end(self,val_ops):
        """Compute lwlrap over every validation sample seen this epoch.

        lwlrap weights samples by their label count, so averaging per-batch
        values does not in general equal the metric on the full set; the
        batches are therefore concatenated before scoring.
        """
        truth = np.concatenate([op["truth"] for op in val_ops])
        scores = np.concatenate([op["scores"] for op in val_ops])
        val_lwlwrap = calculate_overall_lwlrap_sklearn(truth,scores)
        self.log("Val lwlwrap ",val_lwlwrap,logger=True,prog_bar=True)
        return val_lwlwrap

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(),lr=1e-4)
        


Train the model
I train on a gpu currently since I am facing issues with the tpu. 16 bit precision is used to put less pressure on the memory

In [None]:
import warnings
# Install a global "ignore" filter so library warnings do not flood the
# training output.
warnings.filterwarnings("ignore")
model = AudioClassifier()
# Train for at most 5 epochs on one GPU with 16-bit precision.
trainer = pl.Trainer(gpus=1,precision=16,max_epochs=5)
trainer.fit(model,train_dataloader,val_dataloader)

Load the model from checkpoint for inference

In [None]:
### LOAD MODEL FROM CHECKPOINT ###
# NOTE(review): the path is hard-coded to the first run directory (version_0)
# and to a checkpoint file named after epoch 4; presumably it must be adjusted
# whenever training is re-run or the epoch count changes. Verify before use.
model = AudioClassifier.load_from_checkpoint("lightning_logs/version_0/checkpoints/epoch=4.ckpt")
# Put the restored model into evaluation mode.
model.eval()

Make Inference on the test data using this function. This is currently facing memory issues, will fix it in the upcoming versions

In [None]:
import glob
def inference(model,test_folder):
    """Predict per-class probabilities for every file in ``test_folder``.

    Each file is loaded, converted to a dB mel-spectrogram, passed through a
    ``RandomResizedCrop((256,256))`` and fed to ``model``; the sigmoid of the
    logits is collected.

    Args:
        model: torch module mapping a ``(1, C, 256, 256)`` batch to logits.
        test_folder: directory whose entries are the audio files to score.

    Returns:
        A CPU tensor with one row of probabilities per file, in sorted
        filename order. An empty tensor is returned when the folder contains
        no files.

    NOTE(review): the crop is random, so repeated calls can yield different
    predictions for the same file; confirm whether a deterministic resize is
    wanted at inference time.
    """
    model.eval()
    # Sort so the rows of the result map to a reproducible file order.
    filenames = sorted(glob.glob(test_folder+"/*"))
    if not filenames:
        # torch.cat would raise on an empty list.
        return torch.empty(0)

    # Send inputs to wherever the model's weights live (CPU if it has none).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    to_db = a_trans.AmplitudeToDB()
    crop = RandomResizedCrop((256,256))
    all_preds = []
    # Without no_grad every prediction keeps its autograd graph alive, so
    # memory use grows with the number of files processed.
    with torch.no_grad():
        for ii,filename in enumerate(filenames):
            print("Iteration: ",ii)
            # Rely on torchaudio's default loading behaviour rather than
            # passing a normalisation keyword, whose name differs between
            # releases.
            sig,sr = torchaudio.load(filename)
            spec_fn = a_trans.MelSpectrogram(sample_rate=sr,n_fft=2048,hop_length=512)
            S_db = crop(to_db(spec_fn(sig)))
            preds = torch.sigmoid(model(S_db.unsqueeze(0).to(device)))
            all_preds.append(preds.cpu())

    return torch.cat(all_preds)

In [None]:
'''import time
start = time.time()
preds = inference(model,"../input/rfcx-species-audio-detection/test")
print("End: ",time.time()-start)'''