## Rainforest Species Audio Detection PyTorch Starter

In [None]:
tez_path = '../input/tez-modified-tqdm/'
effnet_path = '../input/pytorch-efficientnet'
import sys
sys.path.append(tez_path)
sys.path.append(effnet_path)
sys.path.append('../input/multistartifiedkfold')

**Special thanks to [Abhishek Thakur](https://www.kaggle.com/abhishek) for the tez library, which makes training faster while staying pretty close to raw PyTorch code.**

In [None]:
import albumentations
import pandas as pd
import plotly.express as px
import seaborn as sns
import tez
import torch
from tqdm.notebook import tqdm
import torch.nn as nn
from torch.nn import functional as F
import torchaudio
import librosa
import random
import tez
import numpy as np
import matplotlib.pyplot as plt
import audioread
import cv2
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
import torch.nn as nn
from efficientnet_pytorch import EfficientNet
import soundfile
from pathlib import Path
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
path = Path('../input/rfcx-species-audio-detection')

In [None]:
df = pd.read_csv(path/'train_tp.csv')

In [None]:
# NOTE(review): this assigns the column to itself, so `df` is left unchanged.
df['recording_id'] = df['recording_id']

In [None]:
files = df.recording_id.tolist()

In [None]:
print(df.shape)
df.head()

## Processing csv files

In [None]:
# Unique recording ids found in the annotation table.
fnames = df.recording_id.unique().tolist()
# Group by the scalar column name rather than a one-element list: the groups
# are then keyed by the plain recording id, which is what `get_group(<id>)`
# is called with. Newer pandas versions expect a tuple key when the grouper
# is a length-1 list.
df_gr = df.groupby('recording_id')

## Converting labels to multi-hot encoded targets

In [None]:
# Build one length-24 indicator vector per recording: entry k is 1 when
# species id k occurs in at least one row of that recording.
bird_dict = {}
for rec_id in tqdm(fnames):
    present_species = df_gr.get_group(rec_id).species_id.unique()
    indicator = np.zeros(24)
    indicator[present_species] = 1
    bird_dict[rec_id] = indicator

In [None]:
# One row per recording: the recording id followed by its 24 species indicators.
bird_df = pd.DataFrame.from_dict(bird_dict, orient='index').reset_index()
bird_df.columns = ['recording_id', *(f'species_id_{k}' for k in range(24))]

In [None]:
bird_df.head()

In [None]:
# Smallest t_min and largest t_max per recording. The built-in 'min'/'max'
# aggregations replace the Python-level lambdas that called min()/max() on
# every group.
df_agg = (
    df.groupby('recording_id')
    .agg({'t_min': 'min', 't_max': 'max'})
    .reset_index()
)

In [None]:
# Span between the earliest t_min and the latest t_max of each recording
# (presumably seconds - confirm against the dataset description).
df_agg['duration'] = df_agg['t_max'] - df_agg['t_min']
# Spans of at most 3 are replaced by 10. The earlier `x + abs(x - 10)` form
# evaluated to the same value, but only up to floating-point rounding.
df_agg['duration'] = df_agg['duration'].mask(df_agg['duration'] <= 3, 10.0)

In [None]:
trn_df = bird_df.merge(df_agg,on='recording_id',how='left')

In [None]:
# Replace each recording id with the path of its .flac file in the train folder.
# NOTE(review): running this cell a second time prepends the folder and
# appends '.flac' again, because the column is overwritten in place.
trn_df['recording_id'] = '../input/rfcx-species-audio-detection/train/' +trn_df['recording_id'] + '.flac'

In [None]:
trn_df.sample(n=10)

In [None]:
# Names of the 24 per-species target columns.
tar_cols = [f'species_id_{idx}' for idx in range(24)]

## Returns spectrogram

In [None]:
import librosa
import cv2,os
#from https://www.kaggle.com/daisukelab/creating-fat2019-preprocessed-data
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardize an array and rescale it to 8-bit values.

    Despite the name, no channel stacking happens here: the result has the
    same shape as ``X``.

    Args:
        X: numpy array to convert.
        mean: value subtracted from ``X``; defaults to ``X.mean()``.
        std: value the centred array is divided by; defaults to the standard
            deviation of the centred array.
        norm_max: upper clipping bound applied after standardization;
            defaults to the maximum of the standardized array.
        norm_min: lower clipping bound applied after standardization;
            defaults to the minimum of the standardized array.
        eps: added to ``std`` to avoid division by zero, and used as the
            threshold below which the array is treated as constant.

    Returns:
        ``np.uint8`` array with the shape of ``X``. Values are scaled so that
        ``norm_min`` maps to 0 and ``norm_max`` to 255; a (near-)constant
        input yields all zeros.
    """
    # Explicit `is None` checks: a caller-supplied 0 (for example mean=0 or
    # norm_min=0) is a legitimate value and must not be replaced by the
    # data-derived default, which is what `value or default` would do.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Clip to the requested range, then map it linearly onto [0, 255].
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # No usable dynamic range: return an all-zero image.
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def build_spectrogram(path, offset, duration=15, width=600):
    """Load part of an audio file and return it as a fixed-width mel image.

    Args:
        path: audio file passed to ``librosa.load``.
        offset: start position of the excerpt; rounded down with ``np.floor``
            before loading.
        duration: length of the excerpt; rounded up with ``np.ceil`` before
            loading.
        width: number of spectrogram columns in the result. Shorter
            spectrograms are zero-padded on the right, longer ones are
            cropped.

    Returns:
        2-D array with ``width`` columns holding the dB-scaled mel
        spectrogram after ``mono_to_color``. The dtype is whatever
        ``mono_to_color`` returns, for padded and unpadded results alike.
    """
    y, sr = librosa.load(path, offset=np.floor(offset), duration=np.ceil(duration))
    M = librosa.feature.melspectrogram(y=y, sr=sr)
    M = librosa.power_to_db(M)
    M = mono_to_color(M)
    if M.shape[1] < width:
        # Pad with zeros on the right. The canvas reuses M's row count and
        # dtype so short and long clips produce the same kind of array.
        padded = np.zeros((M.shape[0], width), dtype=M.dtype)
        padded[:, :M.shape[1]] = M
        M = padded
    return M[:, :width]

In [None]:
# Render the spectrogram of one randomly chosen row (position 0 to 400) as a
# quick visual check; the last line echoes its shape.
n = random.randint(0, 400)
sample_row = trn_df.iloc[n]
img = build_spectrogram(
    sample_row['recording_id'],
    offset=int(sample_row['t_min']),
    duration=15,
)
img.shape

In [None]:
# Display the spectrogram computed above as an image with the 'inferno' colormap.
plt.figure(1,figsize=(10,6))
plt.imshow(img,cmap='inferno');

## MultilabelStratifiedKFold

In [None]:
# Shuffle once, then renumber the rows 0..n-1. `kf.split` yields positional
# indices while `.loc` selects by index label; without the reset the labels
# are still the pre-shuffle ones, so every fold id would be written to a
# different row than the one the stratifier selected.
trn_df = trn_df.sample(frac=1., random_state=2020).reset_index(drop=True)
trn_df['kfold'] = -1
y = trn_df[tar_cols].values
kf = MultilabelStratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
for fold, (_, val_idx) in enumerate(kf.split(X=trn_df, y=y)):
    # Each row appears in exactly one validation split; record which one.
    trn_df.loc[val_idx, 'kfold'] = fold

In [None]:
trn_df.to_csv('rain_forest_train_kfold.csv',index=False)

In [None]:
px.violin(data_frame=trn_df,x='duration',title='duration',box=True)

In [None]:
IMAGE_SIZE = 128


def _normalize_only():
    """Return a pipeline whose only step is single-channel normalization."""
    return albumentations.Compose(
        [
            #albumentations.Resize(128, 600, p=1.0),
            albumentations.Normalize(
                mean=[0.485],
                std=[0.229],
                max_pixel_value=255.0,
                p=1.0,
            ),
        ],
        p=1.0,
    )


# Training and validation use the same preprocessing; each gets its own
# pipeline object.
train_aug = _normalize_only()
valid_aug = _normalize_only()

## Audio Dataset

In [None]:
class AudioDataset:
    """Dataset that serves spectrogram tensors and target vectors.

    Args:
        audio_paths: audio file paths, one per sample.
        targets: per-sample target vectors, indexed by position.
        offset: per-sample start position handed to ``build_spectrogram``.
        duration: stored on the instance but not used when building the
            spectrogram; every sample uses ``CLIP_SECONDS``.
        augmentations: optional callable invoked as
            ``augmentations(image=...)`` whose result is read at ``"image"``.
        channel_first: accepted for interface compatibility; unused.
        grayscale: accepted for interface compatibility; unused.
    """

    # Fixed excerpt length requested from `build_spectrogram` for every sample.
    CLIP_SECONDS = 10

    def __init__(self,audio_paths,targets,offset,duration,augmentations=None,channel_first=False,grayscale=True):
        # Materialise paths and offsets so that `__getitem__` always indexes
        # by position. A pandas Series would be indexed by label instead,
        # which fails or returns the wrong row once its index is not 0..n-1.
        self.audio_paths = list(audio_paths)
        self.targets = targets
        self.offset = np.asarray(offset)
        self.duration = duration
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of samples."""
        return len(self.audio_paths)

    def __getitem__(self,item):
        """Return ``{"image": tensor, "targets": float tensor}`` for one sample."""
        image = build_spectrogram(self.audio_paths[item], self.offset[item], self.CLIP_SECONDS)
        image = np.array(image)
        if self.augmentations is not None:
            image = self.augmentations(image=image)["image"]
        # Replace NaN/inf values before handing the array to torch.
        image = np.nan_to_num(image)
        # Insert a leading axis in front of the spectrogram's own axes.
        image_tensor = torch.tensor(image).unsqueeze(0)
        return {"image": image_tensor,
                "targets": torch.tensor(self.targets[item],dtype=torch.float)}

In [None]:
FOLD = 2

In [None]:
# Rows whose fold id equals FOLD form the validation set; all others are used
# for training. Both frames are renumbered from 0.
is_valid = trn_df.kfold == FOLD
df_train = trn_df[~is_valid].reset_index(drop=True)
df_valid = trn_df[is_valid].reset_index(drop=True)
train_targets = df_train[tar_cols].values
valid_targets = df_valid[tar_cols].values

In [None]:
def _make_dataset(frame, targets, augmentations):
    """Wrap one split's dataframe and target matrix in an AudioDataset."""
    return AudioDataset(
        frame.recording_id,
        targets,
        offset=frame['t_min'].values,
        duration=frame['duration'].values,
        augmentations=augmentations,
    )


train_dataset = _make_dataset(df_train, train_targets, train_aug)
valid_dataset = _make_dataset(df_valid, valid_targets, valid_aug)

### Let's check a valid_dataset sample:

In [None]:
# Plot the first slice along axis 0 of validation sample 5's image tensor.
plt.figure(1,figsize=(10,6))
plt.imshow(valid_dataset[5]['image'].numpy()[0,:,:],cmap='inferno');

In [None]:
wp3 = '../input/efficientnet-pytorch/efficientnet-b3-c8376fa2.pth'

## Species Audio Detection Model

In [None]:
class SpeciesModel(tez.Model):
    """EfficientNet-B3 feature extractor with a 1-channel stem and a 24-output linear head."""

    def __init__(self):
        super().__init__()

        backbone = EfficientNet.from_pretrained("efficientnet-b3", weights_path=wp3)

        # Average the pretrained stem kernels over dimension 1 (keeping it as
        # size 1) so the first convolution takes a single input channel.
        stem = backbone._conv_stem
        stem.in_channels = 1
        stem.weight = torch.nn.Parameter(stem.weight.mean(1, keepdim=True))
        self.effnet = backbone

        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(1536, 24)
        # Scheduler settings stored as attributes (presumably read by tez.Model).
        self.step_scheduler_after = "epoch"
        self.step_scheduler_metric = "valid_label_rank_avg_prec_sc"

    def monitor_metrics(self, outputs, targets):
        """Return the label-ranking average precision of ``outputs`` against ``targets``."""
        scores = outputs.cpu().detach().numpy()
        truth = targets.cpu().detach().numpy()
        lrap = label_ranking_average_precision_score(truth, scores)
        return {"label_rank_avg_prec_sc": lrap}

    def fetch_optimizer(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def fetch_scheduler(self):
        """Return a ReduceLROnPlateau scheduler in "max" mode."""
        return torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            verbose=True,
            factor=0.7,
            mode="max",
            patience=2,
            threshold=0.01,
        )

    def forward(self, image, targets=None):
        """Return ``(outputs, loss, metrics)`` for a 4-D image batch.

        ``loss`` is ``None`` and ``metrics`` is empty when no targets are given.
        """
        batch_size, _, _, _ = image.shape

        features = self.effnet.extract_features(image)
        # Global average pooling, then flatten to (batch_size, features).
        pooled = F.adaptive_avg_pool2d(features, 1).reshape(batch_size, -1)
        logits = self.out(self.dropout(pooled))

        if targets is None:
            return logits, None, {}

        loss = F.binary_cross_entropy_with_logits(logits, targets.type_as(logits))
        metrics = self.monitor_metrics(logits, targets)
        return logits, loss, metrics

In [None]:
model = SpeciesModel()

In [None]:
from tez.callbacks import EarlyStopping
# Early-stopping callback configured on the validation metric in "max" mode
# with patience 5 and model_path "model.bin" (presumably where the best
# weights are saved - confirm against the tez documentation).
es = EarlyStopping(
    monitor="valid_label_rank_avg_prec_sc", model_path="model.bin", patience=5, mode="max"
    )

In [None]:
# Train on the GPU when one is visible and fall back to the CPU otherwise,
# instead of failing on a hard-coded "cuda" device. Mixed precision is only
# requested together with CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.fit(
    train_dataset,
    valid_dataset=valid_dataset,
    train_bs=64,
    valid_bs=16,
    device=device,
    epochs=10,
    callbacks=[es],
    fp16=(device == "cuda"),
)