This notebook uses a model that was trained here:
https://www.kaggle.com/takamichitoda/birdclef-starter-train-precomputed-spectrogram?scriptVersionId=59442176

In [None]:
# Install timm from the local directory under /kaggle/input.
!pip install /kaggle/input/timm-pytorch-image-models/pytorch-image-models-master/
# Install the `evaluations` package (imported below for the F1 metric) without resolving its dependencies.
!pip install --no-deps /kaggle/input/evaluations/

In [None]:
import gc
import os
import librosa
import psutil
import random

import numpy as np
import pandas as pd
import soundfile as sf

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
import timm
from pathlib import Path

from evaluations.kaggle_2020 import row_wise_micro_averaged_f1_score

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end (the checkpoint is loaded with map_location="cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class config:
    """Static settings shared by the notebook cells.

    Only the path, audio and model-name entries are read by the cells visible
    in this notebook; the remaining entries (widths, seed, batch size,
    optimiser settings, data limit) are not referenced here.
    NOTE(review): they are presumably carried over from the training
    notebook -- confirm before removing any of them.
    """

    # --- paths ---
    INPUT_ROOT = "/kaggle/input/birdclef-2021"  # root used to build every CSV / audio path
    WORK_ROOT = "/kaggle/working"

    # --- audio / mel-spectrogram parameters (passed to librosa in TestDataset) ---
    SAMPLING_RATE = 32000  # also used to convert seconds to sample offsets when slicing clips
    FMIN = 20
    FMAX = 16000
    N_FFT = 2048
    SPEC_HEIGHT = 128  # number of mel bands (n_mels)
    SPEC_WIDTH= 40  # 5s * sr / N_FFT / 2
    #SPEC_WIDTH = 313
    IMAGE_WIDTH = 313

    # --- reproducibility / model / optimisation ---
    SEED = 416
    BATCH_SIZE = 256
    MODEL_NAME = "resnet18"  # key into MODEL_HEADER_INFO and the timm model name
    LEARNING_RATE = 1e-3
    # The original attribute name was misspelled; keep it as an alias so any
    # code that still reads `config.LEAENING_RATE` keeps working.
    LEAENING_RATE = LEARNING_RATE
    T_MAX = 10
    NUM_EPOCHS = 10
    N_ACCUMULATE = 1
    DATA_N_LIMIT = 100

In [None]:
def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports the value as PYTHONHASHSEED, and switches cuDNN to deterministic
    mode with benchmarking disabled.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The individual seeders are independent of each other, so apply them in one pass.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Training metadata; in this notebook it is only used to build the label vocabulary.
train_metadata_df = pd.read_csv(f"{config.INPUT_ROOT}/train_metadata.csv")
# Unique primary labels as an array. The inference loop indexes this array with a
# boolean mask over the model's output classes.
# NOTE(review): the order therefore presumably has to match the label order used
# when the checkpoint was trained -- confirm against the training notebook.
primary_labels = train_metadata_df["primary_label"].unique()

In [None]:
# TEST is True as soon as at least one .ogg file exists under test_soundscapes/.
TEST = any(Path(f"{config.INPUT_ROOT}/test_soundscapes/").glob("*.ogg"))

if not TEST:
    # No test audio present: run on the labelled train soundscapes instead,
    # which lets the notebook compute a local score further down.
    DATADIR = Path(f"{config.INPUT_ROOT}/train_soundscapes/")
    test_df = pd.read_csv(f"{config.INPUT_ROOT}/train_soundscape_labels.csv")
else:
    DATADIR = Path(f"{config.INPUT_ROOT}/test_soundscapes/")
    test_df = pd.read_csv(f"{config.INPUT_ROOT}/test.csv")
    # Placeholder target so the frame has a "birds" column like the labelled one.
    test_df["birds"] = "nocall"

# Every recording that will be run through the model.
all_audios = [*DATADIR.glob("*.ogg")]

In [None]:
# Per-backbone settings, keyed by timm model name. BirdCLEFNet.__init__ unpacks
# each value as (h_idx, n_dense):
#   h_idx   -- end index of the slice taken over the backbone's child modules
#              (the children from h_idx onwards are dropped),
#   n_dense -- input channel count of the two 1x1 Conv1d heads.
MODEL_HEADER_INFO = {
    "resnet18": (-2, 512)
}

def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the last axis by repeating every step `ratio` times.

    Args:
        x: tensor of shape (batch, classes, time).
        ratio: number of consecutive copies made of each time step.

    Returns:
        Tensor of shape (batch, classes, time * ratio) in which element
        `t` of the input occupies positions `t * ratio` .. `(t + 1) * ratio - 1`.
    """
    # Unpack first so that a tensor that is not 3-dimensional is rejected.
    _batch, _classes, _steps = x.shape
    return torch.repeat_interleave(x, ratio, dim=2)


class BirdCLEFNet(nn.Module):
    """CNN backbone followed by a per-class, attention-weighted pooling over time.

    The backbone is a timm model with its trailing child modules removed (see
    MODEL_HEADER_INFO). Two 1x1 Conv1d heads then produce, for every time step,
    a per-class score (`fc_a`) and a per-class weight (`fc_b`); the weights are
    softmax-normalised over time and used to pool the scores into one value
    per class and clip.

    Args:
        model_name: timm model name; must also be a key of MODEL_HEADER_INFO.
        n_label: number of output classes. Defaults to 397, the value that was
            previously hard-coded; a checkpoint only loads if this matches the
            value it was trained with.
    """

    def __init__(self, model_name, n_label=397):
        super().__init__()
        self.model_name = model_name
        self.n_label = n_label

        base_model = timm.create_model(model_name, pretrained=False)
        h_idx, n_dense = MODEL_HEADER_INFO[model_name]
        # Keep only the leading child modules of the backbone as the feature extractor.
        self.model_head = nn.Sequential(*list(base_model.children())[:h_idx])

        self.fc_a = nn.Conv1d(n_dense, self.n_label, 1)  # per-step class scores
        self.fc_b = nn.Conv1d(n_dense, self.n_label, 1)  # per-step pooling weights

    def forward(self, x):  # input x: (batch, channel, Hz, time)
        """Run the network on a batch of spectrogram images.

        Returns:
            dict with
              "clipwise_output":    (batch, n_class) raw scores (no sigmoid applied),
              "segmentwise_output": (batch, n_class, time * 32) per-step scores,
                                    each step repeated 32 times.
        """
        x = x.transpose(3, 2)  # (batch, channel, time, Hz)
        h = self.model_head(x)  # (batch, unit, time, Hz)

        h = F.relu(h)
        ti_pool = torch.mean(h, dim=3)  # (batch, unit, time)

        # Temporal smoothing: the 1-D pools slide along the last axis, i.e. time
        # (kernel 3, stride 1, padding 1 keeps the number of steps unchanged).
        x1 = F.max_pool1d(ti_pool, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(ti_pool, kernel_size=3, stride=1, padding=1)
        ti_pool = x1 + x2

        xa = self.fc_a(ti_pool)  # (batch, n_class, time)
        xb = self.fc_b(ti_pool)  # (batch, n_class, time)
        xb = torch.softmax(xb, dim=2)  # weights sum to 1 over time for each class

        # time pool
        clipwise_output = torch.sum(xa * xb, dim=2)
        # NOTE(review): 32 presumably mirrors the backbone's overall downsampling
        # factor along time -- confirm if another backbone is added.
        segmentwise_output = interpolate(xa, 32)

        return {
            "clipwise_output": clipwise_output,
            "segmentwise_output": segmentwise_output,
        }

In [None]:
def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is stacked three times along a new last axis, standardised
    (subtract `mean`, divide by `std + eps`), clipped to
    [`norm_min`, `norm_max`] and rescaled linearly to [0, 255].

    Args:
        X: input array; the output gains a trailing axis of size 3.
        mean: value subtracted before scaling; the array mean when None.
        std: divisor used for scaling; the std of the centred array when None.
        norm_max: upper clipping bound; the max of the standardised array when None.
        norm_min: lower clipping bound; the min of the standardised array when None.
        eps: stabiliser added to `std` and threshold below which a value
            range is treated as empty.

    Returns:
        uint8 array of shape `X.shape + (3,)`. All zeros when the standardised
        data or the requested normalisation range spans no more than `eps`.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Test against None explicitly: `value or default` would
    # silently replace a legitimate explicit 0 with the default.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    # The second condition avoids dividing by a zero-width explicit range.
    if (_max - _min) > eps and (norm_max - norm_min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        return V.astype(np.uint8)

    # Just zero
    return np.zeros_like(Xstd, dtype=np.uint8)

class TestDataset(torch.utils.data.Dataset):
    """Dataset with one item per recording.

    Each item is the recording cut into consecutive 5-second windows covering
    seconds 0-600, every window rendered as a normalised 3-channel mel
    spectrogram image, together with the row id of every window.
    """

    def __init__(self, all_audios):
        """Store the audio paths and build the image transforms.

        Args:
            all_audios: sequence of path objects; file names are expected to
                split into exactly three parts on "_".
        """
        self.all_audios = all_audios

        self.to_tensor = transforms.ToTensor()
        self.norm = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))

    def __len__(self):
        """Number of recordings."""
        return len(self.all_audios)

    def __getitem__(self, idx):
        """Load recording `idx` and return `(mel_specs, row_ids)`.

        Returns:
            mel_specs: tensor stacking one image tensor per 5-second window.
            row_ids: list of "<audio_id>_<site>_<end second>" strings, one per window.
        """
        audio_path = self.all_audios[idx]
        audio_id, site, _ = audio_path.name.split("_")
        # The sample rate reported by the file is not used; offsets below
        # are computed from config.SAMPLING_RATE.
        waveform, _ = sf.read(audio_path)

        sr = config.SAMPLING_RATE
        mel_specs, row_ids = [], []
        for head_s in range(0, 600, 5):
            tail_s = head_s + 5
            segment = waveform[head_s * sr:tail_s * sr]

            power = librosa.feature.melspectrogram(
                y=segment,
                sr=sr,
                n_fft=config.N_FFT,
                n_mels=config.SPEC_HEIGHT,
                fmin=config.FMIN,
                fmax=config.FMAX,
            )
            decibels = librosa.power_to_db(power, ref=np.max)

            # dB image -> uint8 3-channel array -> tensor -> per-channel normalisation
            image = self.norm(self.to_tensor(mono_to_color(decibels)))

            mel_specs.append(image)
            row_ids.append(f"{audio_id}_{site}_{tail_s}")

        return torch.stack(mel_specs), row_ids

In [None]:
# Read the trained weights onto the CPU first; load_state_dict copies them
# into the model's parameters wherever those live.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
path = "/kaggle/input/birdclef-starter-train-precomputed-spectrogram/birdclefnet_f0_last_model.bin"
ckpt = torch.load(path, map_location="cpu")

# Build the network on the target device and switch it to inference mode.
model = BirdCLEFNet(config.MODEL_NAME).to(device)
model.eval()
model.load_state_dict(ckpt)

In [None]:
# A class is reported for a window when its sigmoid score exceeds this value.
THRESHOLD = 0.5

dset = TestDataset(all_audios)
res_dfs = []
for mel_specs, row_ids in tqdm_notebook(dset):
    # One recording at a time: all of its windows form a single batch.
    X = mel_specs.to(device)
    with torch.no_grad():
        outputs = model(X)
    clipwise_output = outputs["clipwise_output"].sigmoid().cpu()

    predict_labels = []
    for posi in clipwise_output > THRESHOLD:
        # Boolean mask over the classes -> names of the predicted labels.
        hits = primary_labels[posi.numpy()]
        predict_labels.append(" ".join(hits) if len(hits) != 0 else "nocall")

    res_df = pd.DataFrame({"row_id": row_ids, "birds": predict_labels})
    res_dfs.append(res_df)

# One frame with the predictions of every recording (per-recording indices are kept).
submission_df = pd.concat(res_dfs, axis=0)

In [None]:
# Re-order the predictions so they follow the row order of test_df.
# An inner merge keeps the order of the left (test_df) keys and, like a
# per-row lookup, leaves out row_ids that have no prediction -- but it does so
# in a single pass instead of scanning the whole frame once per row.
submission_df = (
    test_df[["row_id"]]
    .merge(submission_df, on="row_id", how="inner")
    .reset_index(drop=True)
)
submission_df.head()

In [None]:
# Score the predictions against the "birds" column of test_df. When TEST is
# True that column is only the "nocall" placeholder, so the number is
# informative only for the labelled train soundscapes.
y_true, y_pred = (frame["birds"].tolist() for frame in (test_df, submission_df))
local_score = row_wise_micro_averaged_f1_score(y_true, y_pred)
print(local_score)

In [None]:
# Write the predictions without the index column. `index=False` is the
# documented boolean form; `index=None` only worked because None is falsy.
submission_df.to_csv("submission.csv", index=False)

In [None]:
# List the current directory, e.g. to check that submission.csv was written.
!ls

In [None]:
# Read the saved file back and show it as the cell output.
pd.read_csv("submission.csv")