In [None]:
!pip install timm evaluations

In [None]:
import gc
import os
import librosa
import psutil
import random

import numpy as np
import pandas as pd
import soundfile as sf

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms

import timm
from evaluations.kaggle_2020 import row_wise_micro_averaged_f1_score

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class config:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # Competition data directory and the writable output directory.
    INPUT_ROOT = "/kaggle/input/birdclef-2021"
    WORK_ROOT = "/kaggle/working"
    # FMIN = 20
    # FMAX = 16000
    # N_FFT = 2048
    # Height of a spectrogram and the number of time frames in one model
    # window (the training dataset keeps only the first SPEC_WIDTH frames).
    SPEC_HEIGHT = 128
    SPEC_WIDTH = 313
    # Seed handed to set_seed() and to the StratifiedKFold split.
    SEED = 416
    BATCH_SIZE = 64
    # Key into MODEL_HEADER_INFO and the architecture name passed to timm.
    MODEL_NAME = "resnet18"
    # Initial Adam learning rate.
    LEARNING_RATE = 1e-3
    # Misspelled original name, kept as an alias because existing cells read it.
    LEAENING_RATE = LEARNING_RATE
    # Cosine schedule length in epochs (multiplied by batches per epoch).
    T_MAX = 10
    NUM_EPOCHS = 10
    # Number of batches whose gradients are accumulated per optimizer step.
    N_ACCUMULATE = 1
    # Maximum number of recordings kept per primary label.
    DATA_N_LIMIT = 200

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator; its string form is
            also exported as the PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding entry point takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is stacked three times along a new last axis, standardized,
    clipped to ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: Input array; the result has shape ``X.shape + (3,)``.
        mean: Value subtracted before scaling. ``None`` uses ``X.mean()``.
        std: Divisor (plus ``eps``) for standardization. ``None`` uses the
            standard deviation of the centered data.
        norm_max: Upper clip bound in standardized units. ``None`` uses the
            maximum of the standardized data.
        norm_min: Lower clip bound in standardized units. ``None`` uses the
            minimum of the standardized data.
        eps: Added to ``std`` and used as the "flat signal" threshold.

    Returns:
        ``np.uint8`` array. All zeros when the standardized data, or the
        requested clip range, spans no more than ``eps``.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Compare against None rather than relying on truthiness:
    # an explicit 0 / 0.0 is a valid statistic and must not be replaced by
    # the data-derived default.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    # Also require a non-degenerate clip range so the rescale below can
    # never divide by zero when the caller passes norm_min == norm_max.
    if (_max - _min) > eps and (norm_max - norm_min) > eps:
        # Normalize to [0, 255]
        V = Xstd  # Xstd is a fresh array, so clipping in place is safe
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

# The precomputed spectrograms are split over several Kaggle datasets. Each
# entry pairs the set of leading label letters stored in one dataset with
# that dataset's directory; the two public lists below are index-aligned.
_SPEC_SHARDS = [
    ("a", "/kaggle/input/birdclef-spectrogram-01-a"),
    ("b", "/kaggle/input/birdclef-spectrogram-02-b"),
    ("c", "/kaggle/input/birdclef-spectrogram-03-c"),
    ("defg", "/kaggle/input/birdclef-spectrogram-04-dg"),
    ("hijklm", "/kaggle/input/birdclef-spectrogram-05-hm"),
    ("nop", "/kaggle/input/birdclef-spectrogram-06-np"),
    ("qrs", "/kaggle/input/birdclef-spectrogram-07-qs"),
    ("tuvwxyz", "/kaggle/input/birdclef-spectrogram-08-tz"),
]
SPEC_DATA_ROOT = [shard_root for _, shard_root in _SPEC_SHARDS]
SPEC_DATA_INITIAL = [shard_initials for shard_initials, _ in _SPEC_SHARDS]
class BirdCLEFTrainDataset(torch.utils.data.Dataset):
    """Serves (image, one-hot label) pairs from precomputed ``.npy`` spectrograms.

    Args:
        X: Indexable collection of file stems; item ``idx`` is loaded from
            ``{shard_root}/{label}/{stem}.npy``.
        y: Indexable collection of label strings, aligned with ``X``.
        mode: ``"train"`` keeps only the first ``config.SPEC_WIDTH`` time
            frames; any other value returns the full-length spectrogram.
    """

    def __init__(self, X, y, mode):
        self.X = X
        self.y = y
        self.mode = mode
        self.to_tensor = transforms.ToTensor()
        # Channel mean/std commonly used with ImageNet-pretrained backbones.
        self.norm = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        # Random cropping is currently disabled:
        #self.crop = transforms.RandomCrop((config.SPEC_HEIGHT, config.SPEC_WIDTH), pad_if_needed=True, padding_mode="constant")

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            Tuple ``(img, label_ohe)``: the normalized 3-channel image tensor
            and a float one-hot vector of length ``n_labels``.

        Raises:
            IndexError: If no spectrogram shard covers the label's first letter.
        """
        fname = self.X[idx]
        label = self.y[idx]

        # The shard holding a label is selected by the label's first letter.
        initial_w = label[0]
        root_idx = next(
            (i for i, v in enumerate(SPEC_DATA_INITIAL) if initial_w in v),
            None,
        )
        if root_idx is None:
            # Same exception type the bare `[...][0]` lookup produced, but
            # with a message that names the offending label.
            raise IndexError(f"no spectrogram shard for label {label!r}")
        spec_root = SPEC_DATA_ROOT[root_idx]
        mel_spec = np.load(f"{spec_root}/{label}/{fname}.npy")

        if self.mode == "train":
            mel_spec = mel_spec[:, :config.SPEC_WIDTH]  # use head 5s only

        img = mono_to_color(mel_spec)
        img = self.to_tensor(img)
        img = self.norm(img)
        #if self.mode == "train":
        #    img = self.crop(img)
        #elif self.mode == "valid":
        #    pass

        # Build the one-hot row directly rather than materializing an
        # n_labels x n_labels identity matrix for every sample.
        label_ohe = torch.zeros(n_labels)
        label_ohe[label_dic[label]] = 1.0

        return img, label_ohe

In [None]:
def mixup(input, gamma, perm):
    """Blend every sample with a permuted partner, in place.

    Computes ``gamma * input + (1 - gamma) * input[perm]`` and writes the
    result back into ``input``.

    Args:
        input: Tensor whose first dimension is indexed by ``perm``. Modified in place.
        gamma: Scalar weight of the original sample.
        perm: Index tensor selecting each sample's mixing partner.

    Returns:
        The same ``input`` tensor object, now holding the blended values.
    """
    # Indexing with a tensor copies, so the partners are unaffected by the
    # in-place scaling that follows.
    perm_input = input[perm]
    # `add_(scalar, tensor)` is a deprecated overload; use the `alpha` keyword.
    return input.mul_(gamma).add_(perm_input, alpha=1 - gamma)

# Mixup variant of the signature (currently disabled):
#def birdclef_criterion(outputs, targets, gamma, perm):
def birdclef_criterion(outputs, targets):
    """Mean binary cross-entropy (on logits) of the clip-level predictions.

    Args:
        outputs: Mapping with a ``"clipwise_output"`` tensor of logits.
        targets: Target tensor with the same shape as the logits.

    Returns:
        Scalar loss tensor.
    """
    bce_with_logits = nn.BCEWithLogitsLoss(reduction="mean")
    #clipwise_output = mixup(clipwise_output, gamma, perm)
    return bce_with_logits(outputs["clipwise_output"], targets)

def interpolate(x: torch.Tensor, ratio: int):
    """Upsample the last axis by repeating every step ``ratio`` times.

    Args:
        x: 3-D tensor laid out as (batch, classes, time).
        ratio: Number of consecutive copies made of each time step.

    Returns:
        Tensor of shape (batch, classes, time * ratio).
    """
    # Work time-major so each time step can be tiled along a new axis.
    time_major = x.transpose(1, 2)
    n_batch, n_steps, n_classes = time_major.shape
    tiled = time_major.unsqueeze(2).repeat(1, 1, ratio, 1)
    # Fold the copies into the time axis, then restore the original layout.
    stretched = tiled.reshape(n_batch, n_steps * ratio, n_classes)
    return stretched.transpose(1, 2)

# Per-architecture settings for BirdCLEFNet, keyed by timm model name:
#   (end index used to slice list(base_model.children()),
#    input channel count of the two 1x1 Conv1d heads)
_RESNET18_HEADER = (-2, 512)
MODEL_HEADER_INFO = {"resnet18": _RESNET18_HEADER}

class BirdCLEFNet(nn.Module):
    """Pretrained timm trunk followed by an attention-pooled classification head.

    Two 1x1 Conv1d layers produce per-frame class scores (``fc_a``) and
    per-frame attention logits (``fc_b``); the clip-level output is the
    attention-weighted sum of the scores over time.

    Args:
        model_name: Key of ``MODEL_HEADER_INFO`` and timm architecture name.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name
        self.n_label = n_labels

        backbone = timm.create_model(model_name, pretrained=True)
        cut_at, trunk_channels = MODEL_HEADER_INFO[model_name]
        # Keep only the leading child modules of the backbone.
        trunk_layers = list(backbone.children())[:cut_at]
        self.model_head = nn.Sequential(*trunk_layers)

        self.fc_a = nn.Conv1d(trunk_channels, self.n_label, 1)
        self.fc_b = nn.Conv1d(trunk_channels, self.n_label, 1)

    def forward(self, x):  # input x: (batch, channel, Hz, time)
        """Return clip-level and frame-level logits.

        Returns:
            Dict with ``"clipwise_output"`` (batch, n_class) and
            ``"segmentwise_output"`` (the per-frame scores repeated 32x
            along the time axis).
        """
        # Swap to (batch, channel, time, Hz) before the trunk.
        features = self.model_head(x.transpose(3, 2))  # (batch, unit, time, Hz)
        features = F.relu(features)
        pooled = features.mean(dim=3)  # (batch, unit, time)

        # channel smoothing (disabled)
        #x1 = F.max_pool1d(ti_pool, kernel_size=3, stride=1, padding=1)
        #x2 = F.avg_pool1d(ti_pool, kernel_size=3, stride=1, padding=1)
        #ti_pool = x1 + x2

        frame_scores = self.fc_a(pooled)  # (batch, n_class, time)
        # Attention weights: softmax across the time axis.
        attention = self.fc_b(pooled).softmax(dim=2)

        return {
            "clipwise_output": (frame_scores * attention).sum(dim=2),
            "segmentwise_output": interpolate(frame_scores, 32),
        }

In [None]:
# Load the competition metadata and cap every primary label at
# config.DATA_N_LIMIT recordings, keeping the highest-rated ones.
train_metadata_df = pd.read_csv(f"{config.INPUT_ROOT}/train_metadata.csv")
print("before filter train data:", len(train_metadata_df))

# Groups at or under the limit pass through untouched (original row order);
# larger groups are sorted by rating and truncated.
dfs = [
    (
        df.sort_values("rating", ascending=False).iloc[:config.DATA_N_LIMIT]
        if len(df) > config.DATA_N_LIMIT
        else df
    )
    for _, df in train_metadata_df.groupby("primary_label")
]
train_metadata_df = pd.concat(dfs).reset_index(drop=True)
print("after filter train data:", len(train_metadata_df))

filenames = train_metadata_df["filename"]
primary_labels = train_metadata_df["primary_label"]

# Class index <-> label string, numbered in order of first appearance.
unique_labels = primary_labels.unique()
label_dic_inv = dict(enumerate(unique_labels))
label_dic = {name: class_idx for class_idx, name in label_dic_inv.items()}
n_labels = len(label_dic)

In [None]:
def train_loop(train_data_loader, model, optimizer, scheduler):
    """Train ``model`` for one pass over ``train_data_loader``.

    Gradients are accumulated over ``config.N_ACCUMULATE`` consecutive
    batches before each optimizer step; a trailing partial group is also
    stepped so no gradient is dropped or carried into the next call.

    Args:
        train_data_loader: Sized iterable of ``(X, y)`` batches.
        model: Module whose output is accepted by ``birdclef_criterion``.
        optimizer: Optimizer driving ``model``'s parameters.
        scheduler: LR scheduler stepped once per batch, or ``None``.

    Returns:
        List with the (unscaled) loss value of every batch.
    """
    accumulate = max(1, int(config.N_ACCUMULATE))
    n_batches = len(train_data_loader)

    losses = []
    model.train()
    optimizer.zero_grad()
    for n_iter, (X, y) in tqdm_notebook(enumerate(train_data_loader), total=n_batches):
        X, y = X.to(device), y.to(device)

        # mixup
        #gamma = np.random.beta(0.1, 0.1)
        #perm = torch.randperm(X.size(0))
        #X = mixup(X, gamma, perm)

        outputs = model(X)
        #loss = birdclef_criterion(outputs, y, gamma, perm)
        loss = birdclef_criterion(outputs, y)
        # Scale so the accumulated gradient is the mean over the group, not
        # the sum; the logged loss below stays unscaled.
        (loss / accumulate).backward()

        # Count batches from 1: `n_iter % N == 0` would step right after the
        # first batch and leave the final group's gradients unapplied.
        is_last_batch = (n_iter + 1) == n_batches
        if (n_iter + 1) % accumulate == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Stepped per batch (not per optimizer step), matching a schedule
        # length expressed in batches.
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
    return losses

In [None]:
def make_cut_image_batch(img):
    """Cut a spectrogram image into half-overlapping fixed-width windows.

    Windows are ``config.SPEC_WIDTH`` frames wide and start every
    ``config.SPEC_WIDTH // 2`` frames, beginning at frame 0. Windows are
    emitted until one reaches the end of the clip; a window that runs past
    the end is zero-padded on the right, so a clip shorter than one window
    still yields exactly one (padded) window.

    Args:
        img: Tensor laid out as (channel, height, time).

    Returns:
        Tensor of shape (n_windows, channel, height, config.SPEC_WIDTH).
    """
    width = config.SPEC_WIDTH
    hop = max(1, width // 2)
    n_frames = img.shape[2]

    cut_images = []
    start = 0
    while True:
        # Slice first, advance afterwards, so the window at offset 0 is kept.
        _img = img[:, :, start:start + width]
        missing = width - _img.shape[2]
        if missing > 0:
            # Pad with the image's own channel/height/dtype/device.
            pad = torch.zeros(
                (img.shape[0], img.shape[1], missing),
                dtype=img.dtype, device=img.device,
            )
            _img = torch.cat([_img, pad], dim=2)
        cut_images.append(_img)
        if start + width >= n_frames:
            break
        start += hop
    cut_img_batch = torch.stack(cut_images)
    return cut_img_batch

def valid_loop(valid_dset, model):
    """Score every validation clip from its windowed crops.

    Each clip is split by ``make_cut_image_batch``, all windows are run
    through ``model`` in one batch, and the clip-level logits are taken as
    the element-wise maximum over the windows.

    Args:
        valid_dset: Iterable of ``(img, y)`` pairs, one per clip.
        model: Module returning a dict with ``"clipwise_output"`` logits.

    Returns:
        Tuple ``(losses, predicts)``: per-clip BCE-with-logits values as
        floats, and a CPU tensor with one row of max-pooled logits per clip.
    """
    bce_with_logits = nn.BCEWithLogitsLoss(reduction="mean")
    clip_losses, clip_logits = [], []

    model.eval()
    for spec_img, target in tqdm_notebook(valid_dset):
        windows = make_cut_image_batch(spec_img).to(device)
        target = target.to(device)

        with torch.no_grad():
            window_logits = model(windows)["clipwise_output"]

        # Max over the window axis gives one logit per class for the clip.
        pooled_logits = window_logits.max(0).values
        clip_losses.append(bce_with_logits(pooled_logits, target).item())
        clip_logits.append(pooled_logits)

    return clip_losses, torch.vstack(clip_logits).cpu()

In [None]:
def output_to_label(clipwise_output, thr):
    """Convert per-class scores into space-separated label strings.

    Args:
        clipwise_output: Iterable of per-clip score rows, indexed by class.
        thr: A class is emitted when its score is strictly above this value.

    Returns:
        One string per row: the selected labels joined by single spaces, in
        class-index order, or ``"nocall"`` when no class passes the threshold.
    """
    def _row_to_string(scores):
        chosen = [label_dic_inv[k] for k, score in enumerate(scores) if score > thr]
        return " ".join(chosen) if chosen else "nocall"

    return [_row_to_string(scores) for scores in tqdm_notebook(clipwise_output)]

In [None]:
!ls /kaggle/input/birdclef-resnet18/

In [None]:
# 5-fold cross-validation split, stratified on the primary label. Only the
# first fold is actually trained (see the `break` at the bottom).
skf = StratifiedKFold(n_splits=5,  shuffle=True, random_state=config.SEED)
for fold, (train_index, valid_index) in enumerate(skf.split(filenames, primary_labels)):
    print(f"### FOLD-{fold} ###")
    # Re-seed at the start of every fold, before any dataset/model is built.
    set_seed(config.SEED)
    
    # The metadata frame was re-indexed 0..n-1, so these positional split
    # indices are valid `.loc` labels.
    train_primary_labels = primary_labels.loc[train_index].values
    valid_primary_labels = primary_labels.loc[valid_index].values
    train_filenames = filenames.loc[train_index].values 
    valid_filenames = filenames.loc[valid_index].values
    train_dset = BirdCLEFTrainDataset(train_filenames, train_primary_labels, "train")
    train_data_loader = torch.utils.data.DataLoader(train_dset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=2)
    # Validation clips have varying length, so the dataset is handed to
    # valid_loop directly (one clip at a time) instead of through a DataLoader.
    valid_dset = BirdCLEFTrainDataset(valid_filenames, valid_primary_labels, "valid")
    
    model = BirdCLEFNet(config.MODEL_NAME)
    model.to(device)
    
    # Optional warm start from an earlier checkpoint (disabled).
    #path = "/kaggle/input/birdclef-resnet18/birdclefnet_f0_model_mixup_epoch20.bin"
    #ckpt = torch.load(path, map_location="cpu")
    #model.load_state_dict(ckpt)
    
    optimizer = Adam(model.parameters(), lr=config.LEAENING_RATE)
    # train_loop steps the scheduler once per batch, so the cosine period is
    # expressed in batches: batches-per-epoch times config.T_MAX epochs.
    scheduler = CosineAnnealingLR(optimizer, T_max=len(train_data_loader)*config.T_MAX, eta_min=0.0)
    
    # Per-epoch metric dicts; re-created for every fold and read by the
    # result cells that follow this loop.
    results = []
    for epoch in range(config.NUM_EPOCHS):
        print(f"epoch={epoch}")
        train_losses = train_loop(train_data_loader, model, optimizer, scheduler)
        valid_losses, valid_predicts = valid_loop(valid_dset, model)
        
        # Metric 1: micro-F1 of the arg-max class against the primary label.
        y_true = [label_dic[i] for i in valid_primary_labels]
        y_pred = valid_predicts.argmax(1).numpy()
        f1_micro = f1_score(y_true, y_pred, average="micro")
        
        # Metric 2: label strings from sigmoid scores above 0.5 ("nocall" when
        # none), scored by the imported row-wise micro-averaged F1 helper.
        predict_labels = output_to_label(valid_predicts.sigmoid(), 0.5)
        f1_micro_avg = row_wise_micro_averaged_f1_score(valid_primary_labels, predict_labels)

        t_loss, v_loss = np.array(train_losses).mean(), np.array(valid_losses).mean()
        
        res = {"t_loss": t_loss, "v_loss": v_loss, "f1_micro": f1_micro, "f1_micro_avg": f1_micro_avg}
        
        print(res)
        results.append(res)
        # Same filename every epoch: only the latest weights of the fold are kept.
        torch.save(model.state_dict(), f"{config.WORK_ROOT}/birdclefnet_f{fold}_last_model.bin")
        
        # Release cached memory between epochs.
        gc.collect()
        torch.cuda.empty_cache()
        
    break  # FOLD-0 only

In [None]:
# Tabulate the per-epoch metric dicts: one row per epoch, one column per metric.
result_df = pd.DataFrame(data=results)
result_df

In [None]:
# Train vs. validation loss per epoch. Plotting both columns through
# DataFrame.plot adds a legend so the two curves can be told apart.
fig, ax = plt.subplots()
result_df[["t_loss", "v_loss"]].plot(ax=ax)
ax.set(title="Loss per epoch", xlabel="epoch", ylabel="mean BCE-with-logits loss")
plt.show()

In [None]:
# Both validation F1 variants per epoch, with a legend and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
result_df[["f1_micro", "f1_micro_avg"]].plot(ax=ax)
ax.set(title="Validation F1 per epoch", xlabel="epoch", ylabel="F1 score")
plt.show()