This notebook trains a Transformer model that uses a CNN (convolutional) front end.  
I have not achieved a good score yet, but I will try more ideas with this architecture.


The training dataset is [Spectrogram Images](https://www.kaggle.com/takamichitoda/birdcall-spectrogram-images-cut/notebooks?sortBy=dateRun&group=profile&pageSize=20&datasetId=760815), which are cut to 312x128.

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time

import math


import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW
from torchvision.models import resnet18, resnet34, resnet50
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta

# Prefer the GPU when one is visible; fall back to the CPU so the notebook
# still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used here for repeatable runs.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode.

    Args:
        seed: Value used for all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also cover additional GPUs; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False  # type: ignore

class config:
    """Static settings for one training run; read as ``config.NAME``."""

    # --- reproducibility and cross-validation ---
    SEED = 416
    N_FOLDS = 5
    FOLD = 0  # index of the fold held out for validation

    # --- data locations and label space ---
    INPUT = "/kaggle/input/birdcall-spectrogram-images-cut/cut_image_from_resnet18_08"
    OUTPUT = "/kaggle/working"
    N_LABEL = 264

    # --- data loading ---
    TRAIN_BS = 128
    VALID_BS = 128
    TRAIN_WORKS = 0
    VALID_WORKS = 0

    # --- model and optimisation ---
    PRETRAINED = True
    DROPOUT_RATE = 0.2
    EPOCHS = 100
    TF_LR = 3e-5

In [None]:
def get_dataloder():
    """Build the train and validation loaders for fold ``config.FOLD``.

    Both loaders read the same ``ImageFolder`` rooted at ``config.INPUT``.
    The training view uses a random crop (padding when the image is smaller
    than the crop), the validation view uses a centre crop. Samples are
    assigned to folds by a stratified K-fold split over the class targets.

    Returns:
        A ``(train_data_loader, valid_data_loader)`` tuple.
    """
    crop_size = (128, 313)
    augmenting = transforms.Compose([
        transforms.RandomCrop(crop_size, pad_if_needed=True, padding_mode="constant"),
        transforms.ToTensor(),
    ])
    deterministic = transforms.Compose([
        transforms.CenterCrop(crop_size),
        transforms.ToTensor(),
    ])

    # Two views over the same files that differ only in their transform.
    all_train = datasets.ImageFolder(root=config.INPUT, transform=augmenting)
    all_valid = datasets.ImageFolder(root=config.INPUT, transform=deterministic)

    # Stratify on the class labels and keep only the configured fold.
    labels = all_train.targets
    splitter = StratifiedKFold(
        n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED
    )
    folds = list(splitter.split(labels, labels))
    trn_idx, val_idx = folds[config.FOLD]

    train_subset = torch.utils.data.Subset(all_train, trn_idx)
    valid_subset = torch.utils.data.Subset(all_valid, val_idx)

    train_data_loader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=config.TRAIN_BS,
        shuffle=True,
        num_workers=config.TRAIN_WORKS,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_subset,
        batch_size=config.VALID_BS,
        shuffle=False,
        num_workers=config.VALID_WORKS,
    )
    return train_data_loader, valid_data_loader

# Sanity check: fetch a single training batch and display its first image.
train_data_loader, _ = get_dataloder()
d = next(iter(train_data_loader))
img = d[0][0]
# ToTensor produces channel-first images; imshow expects the channel axis last.
plt.imshow(np.moveaxis(img.numpy(), 0, -1))

In [None]:
def train_fn(train_data_loader, model, optimizer):
    """Train ``model`` for one pass over ``train_data_loader``.

    Args:
        train_data_loader: Iterable of ``(X, y)`` batches that supports ``len``.
        model: Module being optimised; switched to training mode.
        optimizer: Optimizer stepped once per batch.

    Returns:
        A tuple ``(mean_loss, lrs)``: the mean of the per-batch losses and a
        list holding, for every step, the learning rate averaged over the
        optimizer's parameter groups.

    Raises:
        ZeroDivisionError: If the loader yields no batches.
    """
    # `import tqdm` alone does not load the `tqdm.notebook` submodule, so the
    # attribute lookup `tqdm.notebook.tqdm` can fail; import it explicitly.
    from tqdm.notebook import tqdm as notebook_tqdm

    losses, lrs = [], []
    model.train()
    progress = notebook_tqdm(train_data_loader, total=len(train_data_loader))
    for X, y in progress:
        y_pred = model(X.to(device))
        loss = loss_fn(y_pred, y.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        lrs.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())

    return sum(losses) / len(losses), lrs

        
def valid_fn(valid_data_loader, model, threshould=0.5):
    """Evaluate ``model`` on ``valid_data_loader`` without gradient tracking.

    The micro-averaged F1 is computed once over all validation samples.
    Averaging per-batch F1 scores would over-weight a smaller final batch.

    Args:
        valid_data_loader: Iterable of ``(X, y)`` batches that supports ``len``.
        model: Module to evaluate; switched to eval mode.
        threshould: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(f1, mean_loss)``: micro-F1 of the argmax predictions over
        the whole loader, and the mean of the per-batch losses.
    """
    # `import tqdm` alone does not load the `tqdm.notebook` submodule, so the
    # attribute lookup `tqdm.notebook.tqdm` can fail; import it explicitly.
    from tqdm.notebook import tqdm as notebook_tqdm

    losses, targets, predictions = [], [], []
    model.eval()
    progress = notebook_tqdm(valid_data_loader, total=len(valid_data_loader))
    with torch.no_grad():
        for X, y in progress:
            y_pred = model(X.to(device))
            losses.append(loss_fn(y_pred, y.to(device)).item())
            predictions.append(y_pred.argmax(1).cpu())
            targets.append(y.cpu())

    f1 = f1_score(
        torch.cat(targets).numpy(),
        torch.cat(predictions).numpy(),
        average="micro",
    )
    return f1, sum(losses) / len(losses)

In [None]:
def loss_fn(output, target):
    """Return the mean cross-entropy between logits and class indices.

    Args:
        output: Raw class scores (logits) from the model.
        target: Class indices for each sample.
    """
    return F.cross_entropy(output, target)

    
class BirdcallTransformer(nn.Module):
    """Convolutional patch embedding followed by a Transformer encoder.

    A single ``Conv2d`` with a ``(128, 16)`` kernel slides along the width of
    the input image and emits one ``embed``-dimensional vector per position.
    That sequence is passed through a Transformer encoder, projected to class
    logits at every position, and max-pooled over the sequence.

    Args:
        n_label: Number of output classes; defaults to ``config.N_LABEL``.
        embed: Embedding size (conv output channels and encoder ``d_model``).
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(
        self,
        n_label: Optional[int] = None,
        embed: int = 512,
        nhead: int = 4,
        num_layers: int = 4,
    ):
        super().__init__()
        if n_label is None:
            n_label = config.N_LABEL

        self.conv = nn.Conv2d(in_channels=3, out_channels=embed, kernel_size=(128, 16))
        # NOTE: no positional encoding is applied; the original author tried
        # one and disabled it because it did not help.
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(embed, n_label)

        self.init_weights()

    def init_weights(self):
        """Zero the decoder bias and draw its weights from U(-0.1, 0.1)."""
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        """Return per-class logits of shape ``(batch, n_label)``.

        Args:
            x: Image batch with 3 channels. The height is assumed to be 128
                so that the conv output height is 1 and ``squeeze(2)``
                removes it.
        """
        h = self.conv(x).squeeze(2)  # (batch, embed, seq)
        # The encoder layer uses the default batch_first=False and therefore
        # expects (seq, batch, embed). Feeding (batch, seq, embed) would make
        # attention mix the samples of a batch instead of time steps.
        h = h.permute(2, 0, 1)
        h = self.transformer_encoder(h)
        # Max-pool the per-position logits over the sequence dimension.
        logits, _ = self.decoder(h).max(0)
        return logits

In [None]:
print(f"### Fold-{config.FOLD} ###")

# Offset the seed by the fold index so every fold gets its own fixed seed.
set_seed(config.SEED + config.FOLD)

train_data_loader, valid_data_loader = get_dataloder()

model = BirdcallTransformer()
model.to(device)

optimizer = Adam(model.parameters(), lr=config.TF_LR)

# Best validation results seen so far.
best_loss = 9999
best_score = 0

# Per-epoch history (trn_lrs is per training step).
trn_losses = []
trn_lrs = []
val_losses = []
val_scores = []

# Number of consecutive epochs without a validation-loss improvement.
endure = 0
for epoch in range(config.EPOCHS):
    print(f"{epoch} epoch")
    tloss, lrs = train_fn(train_data_loader, model, optimizer)
    val_f1, vloss = valid_fn(valid_data_loader, model)

    # Checkpoint whenever the validation score ties or beats the best one.
    if val_f1 >= best_score:
        best_score = val_f1
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score.bin")
        print(f"Best Score Update!!! -> {best_score}")

    # Checkpoint on validation loss; this also drives the early-stop counter.
    if vloss <= best_loss:
        best_loss = vloss
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_loss.bin")
        print(f"Best Loss Update!!! -> {best_loss}")
        endure = 0
    else:
        endure += 1

    # Record this epoch and rewrite the CSV log so it is always up to date.
    trn_losses.append(tloss)
    val_losses.append(vloss)
    val_scores.append(val_f1)
    trn_lrs.extend(lrs)
    log_df = pd.DataFrame(
        {
            "train loss": trn_losses,
            "valid loss": val_losses,
            "score": val_scores,
        }
    )
    log_df.to_csv(f"{config.OUTPUT}/valid_f1_fold{config.FOLD}_score.csv", index=True)

    # Stop once the validation loss has failed to improve 9 epochs in a row.
    if endure > 8:
        print("*** early stop ***")
        break

In [None]:
print(f"Best Score: {best_score} / Best Loss: {best_loss}")

# Learning rate recorded at every training step.
plt.plot(trn_lrs)
plt.show()

# Validation score per epoch.
plt.plot(val_scores)
plt.show()

# Training and validation loss per epoch on shared axes.
for history in (trn_losses, val_losses):
    plt.plot(history)
plt.show()

In [None]:
# Bare expression at the end of the cell: shows the per-epoch log table
# (train loss, valid loss, score) as the cell output.
log_df