# A clean, fast, simple bird identifier training pipeline in pytorch-lightning

This notebook is based on [kneroma's baseline](https://www.kaggle.com/kneroma/clean-fast-simple-bird-identifier-training-colab), and I use pytorch-lightning to organize the workflow.

I'm new to deep learning and want to learn new skills from Kaggle; any suggestions are welcome.

I will try bigger models, augmentation tricks, and so on.

You can find the external dataset in this [notebook](https://www.kaggle.com/kneroma/clean-fast-simple-bird-identifier-training-colab). Enjoy your Kaggle journey!

## Prepare the environment

### import packages

In [None]:
!pip install timm
!pip install librosa

In [None]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

import torch
from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader

import timm
import pytorch_lightning as pl

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json
from  ast import literal_eval


from IPython.display import Audio
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm
import joblib

### Set seed for everything

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), sets ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every CUDA device, not only the current one,
    # and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything()

### Define a config class for the experiment

This part is used to control our whole training pipeline.

You can change the model by setting `model_name`; the list of available models can be found [here](https://rwightman.github.io/pytorch-image-models/).

In [None]:
class config:
    """Central settings that control the whole training pipeline."""

    # When True the trainer runs a single epoch (quick smoke test).
    debug = True
    # Worker processes used by the train/validation DataLoaders.
    num_workers = 8
    # Epochs for a full (non-debug) run; also the cosine schedule length.
    epochs = 12
    # timm architecture name and the local weights file loaded into it.
    model_name = 'resnest50d_4s2x40d'
    pretrained_path = '../input/resnest50-fast-package/resnest50_fast_4s2x40d-41d14ed0.pth'
    # Size of the classifier output and of each target vector.
    num_classes = 397
    # Sample rate and clip duration stored by the dataset.
    sr = 32_000
    duration = 7
    # Keep at most this many melspecs per record; you can increase this on
    # Colab with High Memory enabled.
    max_read_samples = 5
    data_root = Path("")
    # One metadata CSV and one label-id JSON per dataset part.
    mel_paths = sorted(Path("../input/").glob("kkiller-birdclef-mels-computer-d7-part?/rich_train_metadata.csv"))
    train_label_paths = sorted(Path("../input/").glob("kkiller-birdclef-mels-computer-d7-part?/LABEL_IDS.json"))
    batch_size = 128
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Adam hyper-parameters.
    lr = 1e-3
    weight_decay = 0
    # Directory where checkpoints and the exported weights are written.
    save_path = './pl_pipeline'
    # Trainer settings: numeric precision and gradient clipping value.
    precision = 16
    gradient_clip_val = 0.1
    # Random state of the train/validation split.
    seed = 42

## Prepare data for training

This part prepares the data for training. We take a quick look at the dataset using a DataFrame.

In [None]:
def get_df(mel_paths, train_label_paths):
    """Load and merge the per-part metadata tables and label-id mappings.

    Args:
        mel_paths: Iterable of ``pathlib.Path`` objects, each pointing to a
            metadata CSV. The array file of a row is expected at
            ``<csv parent>/audio_images/<primary_label>/<filename>.npy``.
        train_label_paths: Iterable of paths to JSON files whose contents
            are merged into one dict; on key clashes later files win.

    Returns:
        Tuple ``(label_ids, df)`` where ``label_ids`` is the merged JSON
        mapping and ``df`` is the concatenation of all metadata tables with
        an added ``impath`` column and ``secondary_labels`` parsed from
        their string form into Python objects.

    Raises:
        ValueError: If ``mel_paths`` yields no file.
    """
    frames = []
    for file_path in mel_paths:
        part = pd.read_csv(str(file_path), index_col=0)
        part_root = file_path.parent
        part["impath"] = [
            part_root / "audio_images/{}/{}.npy".format(label, filename)
            for label, filename in zip(part["primary_label"], part["filename"])
        ]
        frames.append(part)

    if not frames:
        raise ValueError("get_df: no metadata files were given in mel_paths")

    # DataFrame.append was removed in pandas 2.0; concat keeps the original
    # (per-file) index just like append did.
    df = pd.concat(frames)
    df["secondary_labels"] = df["secondary_labels"].apply(literal_eval)

    label_ids = {}
    for file_path in train_label_paths:
        with open(str(file_path)) as f:
            label_ids.update(json.load(f))

    return label_ids, df

In [None]:
# Merge the metadata and label mappings of every dataset part, then take a
# first look at the resulting table.
label_ids, df = get_df(config.mel_paths, config.train_label_paths)
print(df.shape)
df.head()

In [None]:
df["primary_label"].value_counts()

In [None]:
df["label_id"].min(), df["label_id"].max()

### Show the pictures

Load all the data and take a quick look at the audio represented as images (mel spectrograms).

In [None]:
def load_data(config, df):
    """Read the array file of every row of ``df`` in parallel.

    Each file referenced by ``impath`` is loaded with ``np.load`` and
    truncated to its first ``config.max_read_samples`` entries.

    Returns:
        dict mapping each row's ``filename`` to its truncated array.
    """
    def _read_record(record):
        array = np.load(str(record.impath))
        return record.filename, array[:config.max_read_samples]

    # One delayed job per metadata row, executed by four joblib workers
    # while tqdm reports progress.
    jobs = [joblib.delayed(_read_record)(record) for record in df.itertuples(False)]
    loaded = joblib.Parallel(4)(tqdm(jobs))
    return dict(loaded)

In [None]:
# We cache the train set in memory to reduce training time: the store maps
# each filename to its (truncated) array loaded from disk.
audio_image_store = load_data(config, df)
len(audio_image_store)

In [None]:
# Inspect the first cached record: print its array shape and plot its
# first entry.
print("shape:", next(iter(audio_image_store.values())).shape)
lbd.specshow(next(iter(audio_image_store.values()))[0])

In [None]:
pd.Series([len(x) for x in audio_image_store.values()]).value_counts()

### Define a dataset class

This part defines a dataset class for our dataloader. It organizes the data into three-channel images.

In [None]:
class BirdClefDataset(Dataset):
    """Dataset serving one cached image and a smoothed one-hot target per row.

    Args:
        audio_image_store: Mapping from ``filename`` to an indexable
            collection of images for that recording.
        meta: Metadata table with at least ``filename`` and ``label_id``
            columns; it is copied and re-indexed from 0.
        config: Object providing ``sr``, ``num_classes`` and ``duration``.
        is_train: When True a random image of the recording is served on
            every access; when False the first image is always served so
            that evaluation metrics are reproducible between epochs.
    """

    def __init__(self, audio_image_store, meta, config=config, is_train=True):
        self.audio_image_store = audio_image_store
        self.meta = meta.copy().reset_index(drop=True)
        self.sr = config.sr
        self.is_train = is_train
        self.num_classes = config.num_classes
        self.duration = config.duration
        self.audio_length = self.duration*self.sr

    @staticmethod
    def normalize(image):
        """Scale ``image`` by 1/255 as float32 and stack it into 3 channels.

        NOTE(review): the 255 divisor presumably means images are stored
        with values in 0..255 -- confirm against the dataset.
        """
        image = image.astype("float32", copy=False) / 255.0
        return np.stack([image, image, image])

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the row at position ``idx``."""
        row = self.meta.iloc[idx]
        images = self.audio_image_store[row.filename]

        # Random pick while training; fixed pick otherwise, so validation
        # loss (which drives checkpointing and early stopping) is stable.
        image_idx = np.random.choice(len(images)) if self.is_train else 0
        image = self.normalize(images[image_idx])

        # Label smoothing: small value everywhere, 0.995 on the primary label.
        t = np.full(self.num_classes, 0.0025, dtype=np.float32)
        t[row.label_id] = 0.995

        return image, t

In [None]:
# Build a dataset over the full metadata table as a sanity check.
ds = BirdClefDataset(audio_image_store, meta=df, config=config, is_train=True)
len(df)

In [None]:
# Draw one random sample and show the image/target shapes together with the
# class index whose target value is at least 0.5.
x, y = ds[np.random.choice(len(ds))]
x.shape, y.shape, np.where(y >= 0.5)

In [None]:
lbd.specshow(x[0])

In [None]:
y[:5]

## Define LitModule

In [None]:
class LitModel(pl.LightningModule):
    """Lightning wrapper around a timm backbone for bird classification.

    The module builds the network, owns the train/validation datasets (a
    seeded 80/20 split of ``df``) and defines the loss, the logged metrics
    and the optimizer/scheduler.

    Arguments:
        config {object} -- config of the whole notebook
        audio_image_store {dict} -- cached images keyed by filename
        df {pandas.DataFrame} -- metadata table, one row per recording
    """

    # Attribute names under which the final linear layer is looked up, in order.
    _HEAD_NAMES = ("fc", "_fc", "classifier", "last_linear")

    def __init__(self, config, audio_image_store, df):
        super().__init__()
        self.config = config

        # Build the architecture with timm
        # (https://github.com/rwightman/pytorch-image-models) and fill it
        # with weights from a local file. map_location keeps loading
        # independent of the device the checkpoint was saved from.
        self.model = timm.create_model(config.model_name, pretrained=False)
        self.model.load_state_dict(torch.load(config.pretrained_path, map_location="cpu"))
        # The head is swapped only after loading, so the pretrained weights
        # still match the original layer shapes.
        self._replace_head(config.num_classes)

        train_idx, val_idx = train_test_split(np.arange(len(df.index)), test_size=0.2,
                                              random_state=config.seed)
        self.trainset = BirdClefDataset(audio_image_store,
                                        meta=df.iloc[train_idx].reset_index(drop=True),
                                        config=config, is_train=True)
        self.valset = BirdClefDataset(audio_image_store,
                                      meta=df.iloc[val_idx].reset_index(drop=True),
                                      config=config, is_train=False)

    def _replace_head(self, num_classes):
        """Swap the backbone's final linear layer for a ``num_classes``-way one."""
        for name in self._HEAD_NAMES:
            if hasattr(self.model, name):
                in_features = getattr(self.model, name).in_features
                setattr(self.model, name, nn.Linear(in_features, num_classes))
                return
        raise ValueError(
            f"Could not find a classifier head on {type(self.model).__name__}; "
            f"looked for attributes {self._HEAD_NAMES}"
        )

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        return self.model(x)

    def train_dataloader(self):
        # Shuffled batches of the training split.
        return DataLoader(self.trainset, batch_size=self.config.batch_size, shuffle=True,
                          num_workers=self.config.num_workers)

    def val_dataloader(self):
        # Validation batches in a fixed order.
        return DataLoader(self.valset, batch_size=self.config.batch_size, shuffle=False,
                          num_workers=self.config.num_workers)

    def loss_function(self, preds, labels):
        # Not part of the Lightning API; kept as a separate method so the
        # criterion can be swapped in one place.
        loss_fn = nn.BCEWithLogitsLoss()
        return loss_fn(preds, labels)

    @staticmethod
    def _compute_metrics(logits, targets):
        """Return ``(lrap, precision, recall, f1)`` for a batch of logits.

        Targets are binarised at 0.5 (they are label-smoothed), predictions
        are sigmoid probabilities thresholded at 0.5 for precision/recall/F1.
        """
        with torch.no_grad():
            probs = logits.sigmoid()
            hard_targets = (targets > 0.5) * 1.0
            lrap = label_ranking_average_precision_score(hard_targets.cpu().numpy(),
                                                         probs.cpu().numpy())

            hard_preds = (probs > 0.5) * 1.0
            true_positives = (hard_preds * hard_targets).sum()
            # 1e-6 guards against division by zero.
            precision = true_positives / (1e-6 + hard_preds.sum())
            recall = true_positives / (1e-6 + hard_targets.sum())
            f1 = 2 * precision * recall / (1e-6 + precision + recall)
        return lrap, precision, recall, f1

    def _log_metrics(self, stage, loss, metrics, **log_kwargs):
        """Log the loss and all metrics under ``<stage>_...`` names."""
        lrap, precision, recall, f1 = metrics
        self.log(f'{stage}_loss', loss, **log_kwargs)
        self.log(f'{stage}_label_rank_avg_precision_score', lrap, **log_kwargs)
        self.log(f'{stage}_f1', f1, **log_kwargs)
        self.log(f'{stage}_recall', recall, **log_kwargs)
        self.log(f'{stage}_precision', precision, **log_kwargs)

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop; it is independent of forward.
        x, y = batch
        # Use the module's own device rather than a global setting so the
        # inputs always end up where the weights are.
        x, y = x.to(self.device), y.to(self.device)

        output = self(x)
        loss = self.loss_function(output, y)

        self._log_metrics('train', loss, self._compute_metrics(output, y),
                          on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        # Per-batch validation: only collect logits and targets; metrics are
        # computed over the whole epoch in validation_epoch_end.
        x, y = batch
        x, y = x.to(self.device), y.to(self.device)
        output = self(x)

        return {'output': output, 'y': y}

    def validation_epoch_end(self, outputs):
        # outputs is a list with one dictionary per validation step.
        logits = torch.cat([step['output'] for step in outputs])
        targets = torch.cat([step['y'] for step in outputs])

        loss = self.loss_function(logits, targets)
        self._log_metrics('val', loss, self._compute_metrics(logits, targets),
                          on_epoch=True, prog_bar=True, logger=True)

    def configure_optimizers(self):
        # Optimizers and schedulers are returned as lists of equal length
        # (this allows multiple optimizers, e.g. for GANs).
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.lr,
                                     weight_decay=self.config.weight_decay)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5,
                                                         T_max=self.config.epochs)

        return [optimizer], [scheduler]

In [None]:
# Build the Lightning module; this also creates the train/validation split.
model = LitModel(config, audio_image_store, df)

# Save weights-only checkpoints into config.save_path, tracking the lowest
# val_loss; the file name embeds the epoch, val_loss and val_f1. The last
# checkpoint is saved as well (save_last=True).
checkpoint_callback = pl.callbacks.ModelCheckpoint(
                                        dirpath=config.save_path,
                                        filename=f'{config.model_name}'+'-{epoch}-{val_loss:.3f}-{val_f1:.3f}',
                                        save_weights_only=True,
                                        monitor="val_loss",
                                        mode="min",
                                        save_last=True,
                                    )

# Stop training early based on val_loss (all other settings are left at the
# library defaults).
early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_loss", mode="min")
# Define trainer
# Here you can adjust the device count, callbacks, number of epochs
# (a single epoch when config.debug is set), gradient clipping and precision.
trainer = pl.Trainer(
                    gpus=1,
                    callbacks=[checkpoint_callback, early_stop_callback],
                    max_epochs=1 if config.debug else config.epochs,
                    gradient_clip_val=config.gradient_clip_val,
                    precision=config.precision,
                   )

In [None]:
trainer.fit(model)

In [None]:
checkpoint_callback.best_model_path

In [None]:
# Load the checkpoint file that achieved the best monitored val_loss.
best_model = torch.load(checkpoint_callback.best_model_path)

In [None]:
# Export only the weights of the best checkpoint.
# NOTE(review): the keys presumably carry a "model." prefix because the
# network is stored as LitModel.model -- confirm before loading them into a
# bare timm model.
torch.save(best_model['state_dict'], f'{config.save_path}/best_model.pth')