In [None]:
# Install packages from locally attached dataset folders rather than from an index.
# timm is used further down to build the model backbone.
# NOTE(review): nothing in the visible cells imports the "evaluations" package — confirm it is still needed.
!pip install /kaggle/input/timm-pytorch-image-models/pytorch-image-models-master/
!pip install --no-deps /kaggle/input/evaluations/

In [None]:
# torchlibrosa supplies the Spectrogram / LogmelFilterBank layers used inside the model.
!pip install ../input/torchlibrosa/torchlibrosa-0.0.5-py3-none-any.whl

In [None]:
import os
import cv2
import sys
import time
import math

import random
import librosa
import warnings
import torchaudio
import torchvision
import numpy as np
import pandas as pd
import typing as tp
import IPython.display as ipd
import matplotlib.pyplot as plt

from pathlib import Path

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torch.nn.modules.utils import _pair
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU


# Raise pandas' display limits so frames of up to 500 rows / columns are shown untruncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

### Utils

In [None]:
def load_audio(path, sr):
    """
    Decodes an audio file into a mono waveform at the requested sampling rate

    Arguments:
        path -- Location of the audio file, forwarded to librosa.load
        sr {int} -- Target sampling rate, forwarded to librosa.load

    Returns:
        The waveform returned by librosa.load (the sampling rate it also returns is dropped)
    """
    waveform = librosa.load(path, sr=sr, mono=True, res_type="kaiser_fast")[0]
    return waveform

In [None]:
def load_model_weights(model, weights):
    """
    Restores a model's parameters in place from a saved state dict

    Arguments:
        model {torch.nn.Module} -- Model whose parameters are overwritten
        weights -- Checkpoint location handed to torch.load

    The checkpoint tensors are mapped to the GPU when CUDA is available and to the
    CPU otherwise. Nothing is returned.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    model.load_state_dict(torch.load(weights, map_location=torch.device(target)))

In [None]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproducibility of results

    Seeds Python's `random`, NumPy and PyTorch (CPU and every CUDA device) and
    puts cuDNN into deterministic mode.

    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker subprocesses);
    # the hash seed of the running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which defeats
    # the purpose of this function, so it must stay off.
    torch.backends.cudnn.benchmark = False

## Data

In [None]:
# Fixed seed, applied once here before any data is loaded or any model is built.
SEED = 1213
seed_everything(SEED)

In [None]:
# All data paths hang off an "input" directory that sits next to the current working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"
RAW_DATA = INPUT_ROOT / "birdsong-recognition"
# Its entries are listed further down to build the class list.
TRAIN_AUDIO_DIR = RAW_DATA / "train_audio"

In [None]:
# Training metadata table.
# NOTE(review): `train` is not referenced by any later cell visible here — confirm it is still needed.
train = pd.read_csv(RAW_DATA / "train.csv")

In [None]:
# Use the test audio and test.csv shipped with the main dataset when the folder exists;
# otherwise fall back to the copies in the "birdcall-check" dataset.
if (RAW_DATA / "test_audio").exists():
    TEST_AUDIO_DIR = RAW_DATA / "test_audio"
    test = pd.read_csv(RAW_DATA / "test.csv")
else:
    TEST_AUDIO_DIR = INPUT_ROOT / "birdcall-check" / "test_audio"
    test = pd.read_csv(INPUT_ROOT / "birdcall-check" / "test.csv")

## Parameters

In [None]:
# One label per entry of the training audio folder, in sorted order; the post-processing
# functions translate prediction column indices into labels through this list.
CLASSES = sorted(os.listdir(TRAIN_AUDIO_DIR))
# NOTE(review): BirdCLEFNet hard-codes 264 outputs and predict() concatenates them onto a
# (0, NUM_CLASSES) array, so NUM_CLASSES has to equal 264 — confirm against the dataset.
NUM_CLASSES = len(CLASSES)
# Number of DataLoader worker processes used by predict().
NUM_WORKERS = 4

In [None]:
class AudioParams:
    """Audio and mel-spectrogram settings passed around as `params`."""

    # Waveform / windowing
    sr = 32_000            # sampling rate used to load audio and to index windows
    true_kernel_size = 5   # window length in seconds (see convert_site_3)
    stride = 5             # NOTE(review): not read by any cell visible here
    img_size = None        # NOTE(review): not read by any cell visible here

    # Melspectrogram
    n_mels = 128
    fmin, fmax = 20, 16_000

## Dataset

In [None]:
def convert_site_3(df, clip_length, params):
    """
    Builds a per-window table for a clip, one row per full window

    Arguments:
        df {pandas.DataFrame} -- Rows of a single clip; only the first 'audio_id' value is read
        clip_length {int} -- Number of audio samples in the clip
        params -- Object exposing `sr` and `true_kernel_size` (window length in seconds)

    Returns:
        pandas.DataFrame -- Columns 'site', 'row_id', 'seconds', 'audio_id'; 'seconds' holds the
        end time of each window and 'row_id' is 'site_3_<audio_id>_<seconds>'
    """
    window_seconds = params.true_kernel_size
    # Floor division: a trailing partial window is dropped.
    n_windows = clip_length // (params.sr * window_seconds)

    clip_id = df['audio_id'].values[0]
    window_ends = [window_seconds * k for k in range(1, n_windows + 1)]

    return pd.DataFrame(data={
        'site': ['site_3'] * n_windows,
        'row_id': [f'site_3_{clip_id}_{int(end)}' for end in window_ends],
        'seconds': window_ends,
        'audio_id': [clip_id] * n_windows,
    })

In [None]:
def compute_melspec(y, params):
    """
    Computes a mel spectrogram in decibels for a waveform

    Arguments:
        y -- Audio waveform handed to librosa.feature.melspectrogram
        params -- Object exposing `sr`, `n_mels`, `fmin` and `fmax`

    Returns:
        numpy array (float32) -- Output of librosa.power_to_db applied to the mel spectrogram
    """
    # The waveform is passed by keyword: recent librosa releases accept `y` as
    # keyword-only, and older releases accept the keyword form as well.
    melspec = librosa.feature.melspectrogram(
        y=y,
        sr=params.sr,
        n_mels=params.n_mels,
        fmin=params.fmin,
        fmax=params.fmax
    )

    melspec = librosa.power_to_db(melspec).astype(np.float32)

    return melspec

In [None]:
class TestDataset(data.Dataset):
    """
    Serves fixed-length waveform windows of a single clip

    Arguments:
        df {pandas.DataFrame} -- Rows of one clip with 'site' and 'seconds' columns, indexed
            0..n-1; 'seconds' is the end time of each window. For 'site_3' clips the table is
            replaced by the one built by convert_site_3
        clip {numpy array} -- 1-D waveform of the clip
        params -- Object exposing `sr` and `true_kernel_size` (window length in seconds)
    """
    def __init__(self, df, clip, params):
        self.df = df
        self.clip = clip
        self.params = params

        if df['site'].values[0] == 'site_3':
            self.df = convert_site_3(df, len(clip), params)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Returns the float32 window that ends at row `idx`'s 'seconds' value."""
        # Window length comes from params (same source convert_site_3 uses), not a literal.
        window_samples = int(self.params.sr * self.params.true_kernel_size)

        end_seconds = int(self.df['seconds'][idx])
        end_index = self.params.sr * end_seconds
        # Clamp at 0 so a window ending before `true_kernel_size` seconds cannot produce a
        # negative start index, which would slice from the end of the clip.
        start_index = max(end_index - window_samples, 0)

        y = self.clip[start_index:end_index].astype(np.float32)

        # Windows that run past the end of the clip are shorter than the rest (or empty);
        # zero-pad them so every item has the same length and can be stacked into a batch.
        if len(y) < window_samples:
            y = np.pad(y, (0, window_samples - len(y)), mode="constant")

        return y

In [None]:
import timm
# NOTE(review): this repeats, with identical values, the AudioParams class already defined
# in the "Parameters" section; the later definition silently shadows the earlier one.
# Consider keeping a single definition.
class AudioParams:
    """Audio and mel-spectrogram settings passed around as `params`."""

    # Waveform / windowing
    sr = 32_000            # sampling rate used to load audio and to index windows
    true_kernel_size = 5   # window length in seconds (see convert_site_3)
    stride = 5             # NOTE(review): not read by any cell visible here
    img_size = None        # NOTE(review): not read by any cell visible here

    # Melspectrogram
    n_mels = 128
    fmin, fmax = 20, 16_000

### Model loader

In [None]:
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation

class BirdCLEFNet(nn.Module):
    """
    Waveform classifier: torchlibrosa log-mel front end followed by a timm backbone

    Arguments:
        model_name {str} -- timm architecture name passed to timm.create_model
        params -- Object exposing `sr`, `n_mels`, `fmin` and `fmax`
    """

    def __init__(self, model_name,params):
        super().__init__()
        self.model_name = model_name
        self.n_label = 264  # number of output logits of the backbone
        self.params = params

        n_fft = 2048  # shared by the STFT and the mel filter bank

        # Spectrogram extractor (frozen)
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=512,
            win_length=None,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=True,
        )

        # Logmel feature extractor (frozen)
        self.logmel_extractor = LogmelFilterBank(
            sr=params.sr,
            n_fft=n_fft,
            n_mels=params.n_mels,
            fmin=params.fmin,
            fmax=params.fmax,
            ref=1.0,
            amin=1e-10,
            top_db=80.0,
            freeze_parameters=True,
        )

        # Built here but never applied in forward().
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=8,
            time_stripes_num=2,
            freq_drop_width=4,
            freq_stripes_num=2,
        )

        self.base_model = timm.create_model(
            model_name,
            pretrained=False,
            num_classes=self.n_label,
            in_chans=3,
        )

    def forward(self, x):
        """
        Maps a batch of raw waveforms to class logits

        `x` is fed straight into the spectrogram extractor, so it is a waveform batch
        (predict() supplies batches of the 1-D windows produced by TestDataset), not an image.
        """
        spec = self.spectrogram_extractor(x)
        logmel = self.logmel_extractor(spec)

        # Standardise with statistics taken over the whole batch tensor.
        logmel = (logmel - logmel.mean()) / logmel.std()

        # Drop the axis at dim 1 (presumably a singleton channel axis — TODO confirm),
        # then repeat the map three times to match the backbone's in_chans=3.
        logmel = logmel.squeeze(dim=1)
        three_channel = torch.stack((logmel, logmel, logmel), dim=1)

        return self.base_model(three_channel)

## Inference

### Predict

In [None]:
def predict(model, dataset, batch_size=16):
    """
    Runs a model over a dataset and returns per-class probabilities

    Arguments:
        model {torch.nn.Module} -- Model returning one logit per class for each item
        dataset -- Dataset of equally sized inputs, batched with a DataLoader

    Keyword Arguments:
        batch_size {int} -- Number of items per batch (default: {16})

    Returns:
        numpy array (float64) of shape (len(dataset), NUM_CLASSES) -- Sigmoid of the model outputs
    """
    model.eval()

    # Send inputs to wherever the model lives instead of assuming a GPU, so the function
    # also works on CPU-only machines and with models on a non-default device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS
    )

    # The leading empty float64 block keeps the (0, NUM_CLASSES) result for an empty dataset
    # and the float64 output dtype; batches are gathered and concatenated once at the end.
    chunks = [np.empty((0, NUM_CLASSES))]

    with torch.no_grad():
        for x in loader:
            y_pred = model(x.to(device))
            chunks.append(torch.sigmoid(y_pred).cpu().numpy())

    return np.concatenate(chunks)

### Post-process

In [None]:
def post_process_site_12(preds, threshold=0.5, maxpreds=3):
    """
    Turns per-window probabilities into one label string per window

    Probabilities below `threshold` are zeroed, then each window's score is the sum of its own
    values and those of the previous and next windows (zeros beyond the first and last window).
    Classes whose score reaches the threshold are kept, at most `maxpreds` per window, ordered
    by decreasing score.

    Arguments:
        preds {numpy array} -- Probabilities of shape (windows, classes)

    Keyword Arguments:
        threshold {float} -- Minimum probability / score to keep a class (default: {0.5})
        maxpreds {int} -- Maximum number of classes per window (default: {3})

    Returns:
        list of str -- Space-separated class names per window ('' when nothing is kept)
    """
    kept = preds * (preds >= threshold)

    # A row of zeros stands in for the missing neighbour of the first and last window.
    blank_row = np.zeros((1, kept.shape[-1]))
    following = np.concatenate([kept[1:], blank_row])
    preceding = np.concatenate([blank_row, kept[:-1]])

    score = kept + following + preceding

    counts = np.clip((score >= threshold - 1e-5).sum(-1), 0, maxpreds)

    class_labels = []
    for row_score, n_keep in zip(score, counts):
        best = np.argsort(- row_score)[:n_keep].tolist()
        class_labels.append(" ".join(CLASSES[k] for k in best))

    return class_labels

In [None]:
def post_process_site_3(preds, threshold=0.6, maxpreds=3):
    """
    Turns per-window probabilities of a whole clip into a single label string

    Probabilities below `threshold` are zeroed and the rest summed over windows. Classes whose
    summed score reaches the threshold are kept, at most `maxpreds`, ordered by decreasing score.

    Arguments:
        preds {numpy array} -- Probabilities of shape (windows, classes)

    Keyword Arguments:
        threshold {float} -- Minimum probability / score to keep a class (default: {0.6})
        maxpreds {int} -- Maximum number of classes returned (default: {3})

    Returns:
        str -- Space-separated class names ('' when nothing is kept)
    """
    kept = preds * (preds >= threshold)
    score = kept.sum(axis=0)

    n_birds = np.clip((score >= threshold - 1e-5).sum(-1), 0, maxpreds)

    ranked = np.argsort(- score)
    return " ".join(CLASSES[k] for k in ranked[:n_birds].tolist())

In [None]:
def max_pred_gen(site, duration):
    """
    Returns the maximum number of classes to predict for a clip

    Arguments:
        site {str} -- Site identifier; anything other than "site_3" always yields 3
        duration -- Clip duration, compared against the bounds 7, 15, 30 and 60

    Returns:
        int -- 3 for non-"site_3" clips; otherwise 2, 3, 5 or 7 for the first bound that is
        at least `duration`, and 10 when the duration exceeds every bound
    """
    if site != "site_3":
        return 3

    limits = ((7, 2), (15, 3), (30, 5), (60, 7))
    return next((cap for longest, cap in limits if longest >= duration), 10)

In [None]:
def reformat_preds(preds, df, site):
    """
    Pairs label strings with row ids in a two-column table

    Arguments:
        preds -- Label string(s) for the 'birds' column: one per row, or a single string
        df {pandas.DataFrame} -- Table whose 'row_id' column provides the row ids
        site -- Not used by this function

    Returns:
        pandas.DataFrame -- Columns 'row_id' and 'birds'; empty label strings become 'nocall'
    """
    return pd.DataFrame({
        "row_id": df['row_id'].values,
        "birds": preds
    }).assign(birds=lambda frame: frame['birds'].replace([''], 'nocall'))

### Inference

In [None]:
def inference(test_df, test_audio,params,threshold=0.5,
              model_name='resnext50_32x4d',
              weights_path='../input/cornell-data-downloading-version1/birdclefnet_f0_best_model_resnext50_32x4dtry7_cornell(30).pth'):
    """
    Predicts bird labels for every clip listed in the test table

    Arguments:
        test_df {pandas.DataFrame} -- Test table with 'audio_id', 'site', 'row_id' and 'seconds' columns
        test_audio {pathlib.Path} -- Folder holding '<audio_id>.mp3' files
        params -- Audio settings (`sr`, `true_kernel_size`, mel parameters) used for loading,
            windowing and building the model

    Keyword Arguments:
        threshold {float} -- Threshold forwarded to the post-processing functions (default: {0.5})
        model_name {str} -- timm architecture name (default: {'resnext50_32x4d'})
        weights_path {str} -- Checkpoint holding the model state dict

    Returns:
        pandas.DataFrame -- Columns 'row_id' and 'birds', one row per test row
    """
    # Fall back to the CPU when no GPU is present; map_location keeps a checkpoint saved
    # on a GPU loadable in that case.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Build the model from the `params` argument rather than the global AudioParams,
    # so the model and the data pipeline always share the same audio settings.
    model = BirdCLEFNet(model_name, params)
    model.to(device)
    model.load_state_dict(torch.load(weights_path, map_location=device))

    pred_dfs = []
    for audio_id in test_df.audio_id.unique():

        audio_df = test_df[test_df['audio_id'] == audio_id].reset_index(drop=True)
        site = audio_df["site"].values[0]

        print(f'\nMaking predictions for audio {audio_id} in {site} ')

        clip = load_audio(test_audio / (audio_id + ".mp3"), params.sr)
        clip_duration = len(clip) // params.sr

        dataset = TestDataset(audio_df, clip, params)

        # Single model: its output is used directly (averaging a one-element list was a no-op).
        preds = predict(model, dataset)

        maxpreds = max_pred_gen(site, clip_duration)
        print(f'Limiting the number of birds to {maxpreds}')

        if site == 'site_3':
            preds_pp = post_process_site_3(preds, threshold=threshold, maxpreds=maxpreds)
        else:
            preds_pp = post_process_site_12(preds, threshold=threshold, maxpreds=maxpreds)

        print("Predicted classes :", preds_pp)

        pred_df = reformat_preds(preds_pp, audio_df, site)
        pred_dfs.append(pred_df)

    # pd.concat raises on an empty list; return an empty submission instead.
    if not pred_dfs:
        return pd.DataFrame(columns=["row_id", "birds"])

    sub = pd.concat(pred_dfs, axis=0, sort=False).reset_index(drop=True)
    return sub

## Prediction

### Used models

In [None]:
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Overrides inference()'s default of 0.5; forwarded to both post-processing functions.
threshold=0.3
submission = inference(test, TEST_AUDIO_DIR, AudioParams,threshold=threshold)

### Submission

In [None]:
# Write the submission file (without the index column), then show the frame as the cell output.
submission.to_csv("submission.csv", index=False)
submission