## An example of training and inference with Deep Label Distribution Learning (DLDL) model used in [9th place solution](https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/300947)

This notebook differs from the competition setup in the following points because of environment constraints, so its score is worse than the competition result.

| Difference | This Notebook | Competition |
| --- | --- | --- |
| model_name | swin_base_patch4_window7_224 | swin_large_patch4_window12_384 |
| image_size | 224 | 384 |
| n_epochs | 8 | 16 |
| Training environment | Kaggle | Google Colab Pro|
| PyTorch | 1.9.1 | 1.10.1 |
| RandAugment | Not used | Used |
| Metadata | Not used | Used (Probably not necessary)|

In [None]:
import sys
sys.path.append('../input/petfinderpublic/pytorch-image-models-master/pytorch-image-models-master')
sys.path.append('../input/petfinderpublic/Ranger21-main/Ranger21-main')
!mkdir -p "/root/.cache/torch/hub/checkpoints"
!cp "../input/petfinderpublic/swin_pretrained/swin_base_patch4_window7_224_22kto1k.pth" "/root/.cache/torch/hub/checkpoints/"

In [None]:
import os
import gc
import time
import math
import copy
import random
import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.io import read_image

import timm

import pickle
from sklearn.svm import SVR
from catboost import CatBoostRegressor

## Config

In [None]:
# Input locations of the competition data (CSV tables and image folders).
TRAIN_CSV = '../input/petfinder-pawpularity-score/train.csv'
TEST_CSV = '../input/petfinder-pawpularity-score/test.csv'
SAMPLE_SUBMISSION_CSV = '../input/petfinder-pawpularity-score/sample_submission.csv'
TRAIN_DATA_PATH = '../input/petfinder-pawpularity-score/train/'
TEST_DATA_PATH = '../input/petfinder-pawpularity-score/test/'
# Folder holding an alternative (cropped) copy of the training images; the
# training dataset reads from it with probability CFG.train_crop_p.
TRAIN_CROP_PATH = '../input/petfinder2-cropped-dataset/crop/'
# Destination for the pickled SVR / CatBoost head models.
OUTPUT_PATH = '../output/'

# Create the output folder up front so later saves do not fail.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
class CFG:
    """Global experiment configuration.

    The class itself (not an instance) is passed around as ``cfg``, so every
    attribute assignment on it is visible to all users of the configuration.
    """

    seed = 113
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    ### model and image features
    # I used model_name='swin_large_patch4_window12_384' and image_size=384 in competition.
    model_name = 'swin_base_patch4_window7_224'  # backbone model
    image_size = 224
    embed_dim = 512
    n_classes = 121  # number of discrete bins of the predicted label distribution
    dropout_p = 0.4  # dropout probability of head layer
    replace_mixout = True  # replace Dropout layers to Mixout layers
    mixout_p = 0.8

    ### training
    n_epochs = 8  # I used 16 in competition
    n_folds = 5
    trn_folds = [0, 1, 2, 3, 4]  # folds that are actually trained
    batch_size = 12
    batch_size_infer = batch_size * 2
    accum_iter = 1  # optimizer step every `accum_iter` mini-batches
    use_amp = True
    num_workers = 4

    ### learning rate and loss
    lr_max = 1.6e-4
    weight_decay = 1e-3
    normal_sampling_std = 2.0  # standard deviation of target distribution
    optimizer_name = 'Ranger21'
    loss_weights = [1e-3, 1e-5]  # [kl_div_loss/focal_loss, l1_loss]

    ### Scheduler
    # OneCycleLR
    div_factor = 25
    final_div_factor = 2
    warmup_epo = n_epochs * 0.6  # warm-up length in epochs (-> pct_start)
    lr_start = lr_max / div_factor  # starting learning rate
    # OneCycleLR anneals down to initial_lr / final_div_factor, i.e. relative
    # to the *starting* learning rate, not to lr_max.
    lr_min = lr_start / final_div_factor  # last minimum learning rate

    ### image
    train_crop_p = 0.7  # probability of reading the cropped image in training

    ### MixUp
    mixup_p = 0.0
    mix_alpha = 0.2
    mixup_epoch_p = {1: 0.0, 2: 0.5}  # epoch -> mixup probability from that epoch on

    ### save model weights
    min_save_epoch = int(n_epochs * 0.3)  # no checkpoint is written before this epoch
    save_file_name = f"{model_name}-ep{n_epochs}-bs{batch_size}-seed{seed}"
    output_dir = '../output/'
    save_model_path = os.path.join(output_dir, save_file_name)

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and sets the cuDNN flags.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = True  # set True to be faster

seed_everything(CFG.seed)

## Load CSV

In [None]:
def paw_to_target(x):
    """Shift a Pawpularity score up by 10 to obtain the training target."""
    offset = 10
    return x + offset


def target_to_paw(x):
    """Undo ``paw_to_target``: shift a target value down by 10."""
    offset = 10
    return x - offset

In [None]:
def create_stratified_k_folds(df, cfg, binning=True):
    """Return a re-indexed copy of ``df`` with ``kfold`` and ``bins`` columns.

    The ``target`` column is cut into equal-width bins and the rows are
    assigned to ``cfg.n_folds`` shuffled folds stratified on those bins
    (seeded with ``cfg.seed``).

    NOTE: ``binning`` is accepted but currently ignored; the target is always
    binned before stratification.
    """
    out = df.reset_index(drop=True)
    out["kfold"] = -1

    # Number of bins: floor(1 + 3.3 * log2(number of rows)).
    bin_count = int(np.floor(1+(3.3)*(np.log2(len(out)))))
    out.loc[:, "bins"] = pd.cut(out["target"], bins=bin_count, labels=False)

    splitter = StratifiedKFold(n_splits=cfg.n_folds, shuffle=True, random_state=cfg.seed)
    fold_iter = splitter.split(X=out, y=out.bins)
    for fold_no, (_, val_rows) in enumerate(fold_iter):
        # Each row appears in exactly one validation split.
        out.loc[val_rows, 'kfold'] = fold_no

    return out

In [None]:
# Load the training table and derive the shifted target used by the model.
df = pd.read_csv(TRAIN_CSV)
df['target'] = df['Pawpularity'].apply(paw_to_target)

# Assign every row to a cross-validation fold.
df = create_stratified_k_folds(df, cfg=CFG, binning=False)

# Paths of the original image and of its cropped counterpart.
df["file_path"] = [os.path.join(TRAIN_DATA_PATH, f"{image_id}.jpg") for image_id in df.Id]
df["crop_path"] = [os.path.join(TRAIN_CROP_PATH, f"{image_id}.jpg") for image_id in df.Id]

# Not assigned: only the notebook display of the resulting frame.
df.reset_index(drop=True)

## Functions

In [None]:
def make_scheduler(dataloader_length, optimizer, cfg):
    """Build a cosine-annealed OneCycleLR schedule stepped once per batch.

    ``dataloader_length`` is the number of steps per epoch; the warm-up
    fraction is ``cfg.warmup_epo / cfg.n_epochs``.
    """
    one_cycle_kwargs = {
        "max_lr": cfg.lr_max,
        "epochs": cfg.n_epochs,
        "steps_per_epoch": dataloader_length,
        "pct_start": cfg.warmup_epo / cfg.n_epochs,
        "anneal_strategy": 'cos',
        "div_factor": cfg.div_factor,
        "final_div_factor": cfg.final_div_factor,
    }
    return torch.optim.lr_scheduler.OneCycleLR(optimizer, **one_cycle_kwargs)


def make_optimizer(model, cfg, num_batches_per_epoch=None):
    """Create the Ranger21 optimizer for ``model``.

    Args:
        model: module whose parameters are optimised.
        cfg: configuration providing ``lr_max``, ``weight_decay`` and
            ``n_epochs``.
        num_batches_per_epoch: number of optimisation batches per epoch.
            When omitted, the length of the notebook-global ``train_loader``
            is used (legacy behaviour), so that global must exist.

    Returns:
        The configured ``Ranger21`` optimizer.
    """
    # https://github.com/lessw2020/Ranger21
    from ranger21 import Ranger21

    if num_batches_per_epoch is None:
        # Backward-compatible fallback to the hidden global dependency.
        num_batches_per_epoch = len(train_loader)

    optimizer = Ranger21(model.parameters(), lr=cfg.lr_max,
        betas=(0.95, 0.999), eps=1e-5, weight_decay=cfg.weight_decay,  # Adam options
        use_cheb=False,
        lookahead_active=True,
        lookahead_mergetime=5,
        lookahead_blending_alpha=0.5,
        lookahead_load_at_validation=False,
        normloss_active=True,
        normloss_factor=6e-4,
        use_adaptive_gradient_clipping=True,
        agc_clipping_value=0.1,
        use_madgrad=False,
        warmdown_active=False,  # LR annealing is handled by the external scheduler
        use_warmup=False,  # LR warm-up is handled by the external scheduler
        num_epochs=cfg.n_epochs,
        using_gc=True,
        num_batches_per_epoch=num_batches_per_epoch,
    )
    return optimizer

In [None]:
def normal_sampling(mean, label, std=CFG.normal_sampling_std):
    """Return the normal density N(mean, std**2) evaluated at ``label``."""
    exponent = - (label - mean)**2 / (2 * std**2)
    normaliser = math.sqrt(2 * math.pi) * std
    return math.exp(exponent) / normaliser


def worker_init_fn(worker_id):
    """Re-seed NumPy in a DataLoader worker so workers draw different numbers.

    The new seed is the first word of the current NumPy RNG state plus the
    worker id.
    """
    base_seed = np.random.get_state()[1][0]
    np.random.seed(base_seed + worker_id)

## Torchvision transforms

In [None]:
# Training-time preprocessing applied to the uint8 tensor returned by read_image.
train_aug = T.Compose([
    #T.RandAugment(2, 10),  # Cannot use in kaggle now. I used RA with torchvision v0.11.0.
    # One of three resize variants is picked per sample: two entries are the
    # same size-only resize and one is a resize to an exact square.
    T.RandomChoice([
        T.Resize(CFG.image_size),
        T.Resize(CFG.image_size),
        T.Resize((CFG.image_size, CFG.image_size)),
    ]),
    T.CenterCrop(CFG.image_size),
    T.ConvertImageDtype(torch.float),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Validation preprocessing: resize, center crop, float conversion and the
# same normalisation constants as in training (no random choice).
valid_aug = nn.Sequential(
    T.Resize(CFG.image_size),
    T.CenterCrop(CFG.image_size),
    T.ConvertImageDtype(torch.float),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
)


class PawpularDataset(Dataset):
    """Image dataset yielding dict samples for training, validation or test.

    In ``"train"`` and ``"valid"`` mode each sample also carries the scalar
    target and a discretised normal distribution centred on it
    (``cfg.n_classes`` bins, floored at 1e-8). In any other mode only the
    image is returned.
    """

    def __init__(self, df, cfg, augment=None, mode="test"):
        self.df = df
        self.cfg = copy.copy(cfg)
        self.augmentations = augment
        self.mode = mode
        if mode in ("train", "valid"):
            self.targets = df.target.values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # The random draw happens for every sample, whatever the mode; only
        # training samples may switch to the cropped image.
        use_crop = (np.random.rand() < self.cfg.train_crop_p) and (self.mode == "train")
        file_path = self.df.crop_path[idx] if use_crop else self.df.file_path[idx]

        image = read_image(file_path)
        if self.augmentations is not None:
            image = self.augmentations(image)

        if self.mode not in ("train", "valid"):
            return {
                "image": image,
            }

        target = self.targets[idx]
        floor = 1e-8  # keeps every bin strictly positive
        densities = (normal_sampling(target, k) for k in range(self.cfg.n_classes))
        target_dist = torch.tensor(
            [d if d > floor else floor for d in densities],
            dtype=torch.float32,
        )

        return {
            "image": image,
            "target": torch.tensor(target, dtype=torch.float),
            "target_dist": target_dist,  # target label distribution
        }

## MixUp

In [None]:
def mixup(batch, alpha):
    """Apply MixUp to a batch dict in place.

    A mixing coefficient ``lam`` is drawn from Beta(alpha, alpha) (``lam = 1``
    when ``alpha <= 0``) and every sample is blended with a randomly chosen
    partner from the same batch.

    Args:
        batch: dict with tensor entries ``"image"``, ``"target"`` and
            ``"target_dist"`` sharing the same first dimension.
        alpha: Beta distribution parameter.

    Returns:
        ``(batch, lam)`` where ``batch["image"]`` and ``batch["target_dist"]``
        are blended, ``batch["target"]`` is the *un-mixed* target of each
        sample and ``batch["target_b"]`` is the target of its partner.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    rand_idx = torch.randperm(batch["image"].size(0))

    batch["image"] = lam * batch["image"] + (1 - lam) * batch["image"][rand_idx]
    # Keep "target" un-mixed: the loss and the running RMSE blend it with
    # "target_b" using `lam` themselves, so pre-mixing it here would apply
    # the interpolation twice.
    batch["target_b"] = batch["target"][rand_idx]
    batch["target_dist"] = lam * batch["target_dist"] + (1 - lam) * batch["target_dist"][rand_idx]

    return batch, lam

## MixOut

In [None]:
# https://arxiv.org/abs/1909.11299
# https://github.com/bloodwass/mixout
# https://www.ai-shift.co.jp/techblog/2170

import math
from torch.autograd.function import InplaceFunction
from torch.nn import Parameter
import torch.nn.init as init

class Mixout(InplaceFunction):
    """Autograd function that stochastically mixes ``input`` with ``target``.

    During training, elements selected by a Bernoulli mask keep the value of
    ``input`` and the remaining ones take the value of ``target`` (zeros when
    no target is given); the result is rescaled by ``1 / (1 - p)``.
    """

    @staticmethod
    def _make_noise(input):
        # Uninitialised tensor with the dtype/device and shape of `input`.
        return input.new().resize_as_(input)

    @classmethod
    def forward(cls, ctx, input, target=None, p=0.0, training=False, inplace=False):
        """Return the mixed tensor, or ``input`` itself when ``p == 0`` or not training.

        Raises:
            ValueError: if ``p`` is outside [0, 1] or ``target`` has a
                different size than ``input``.
        """
        if p < 0 or p > 1:
            raise ValueError("A mix probability of mixout has to be between 0 and 1," " but got {}".format(p))
        if target is not None and input.size() != target.size():
            raise ValueError(
                "A target tensor size must match with a input tensor size {},"
                " but got {}".format(input.size(), target.size())
            )
        ctx.p = p
        ctx.training = training

        # Identity in evaluation mode or when mixing is disabled.
        if ctx.p == 0 or not ctx.training:
            return input

        # Without a target this degenerates to mixing with zeros.
        if target is None:
            target = cls._make_noise(input)
            target.fill_(0)
        target = target.to(input.device)

        if inplace:
            ctx.mark_dirty(input)
            output = input
        else:
            output = input.clone()

        # Bernoulli(1 - p) keep-mask. For inputs with more than one dimension
        # the mask is sampled for the first row only and repeated along dim 0.
        ctx.noise = cls._make_noise(input)
        if len(ctx.noise.size()) == 1:
            ctx.noise.bernoulli_(1 - ctx.p)
        else:
            ctx.noise[0].bernoulli_(1 - ctx.p)
            ctx.noise = ctx.noise[0].repeat(input.size()[0], 1)
        # The result of expand_as is discarded, so this line has no effect.
        ctx.noise.expand_as(input)

        if ctx.p == 1:
            output = target
        else:
            output = ((1 - ctx.noise) * target + ctx.noise * output - ctx.p * target) / (1 - ctx.p)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Propagate the gradient to ``input`` only; other arguments get ``None``."""
        if ctx.p > 0 and ctx.training:
            # NOTE(review): the forward pass rescales by 1 / (1 - p) but the
            # gradient is only masked, not rescaled - confirm this is intended.
            return grad_output * ctx.noise, None, None, None, None
        else:
            return grad_output, None, None, None, None


def mixout(input, target=None, p=0.0, training=False, inplace=False):
    """Functional interface to ``Mixout``; forwards all arguments to ``Mixout.apply``."""
    return Mixout.apply(input, target, p, training, inplace)


class MixLinear(torch.nn.Module):
    """Linear layer whose weight passes through ``mixout`` on every forward.

    ``target`` is the weight tensor to mix towards (``None`` mixes with
    zeros) and ``p`` is the mix probability handed to ``mixout``.
    """

    __constants__ = ["bias", "in_features", "out_features"]

    def __init__(self, in_features, out_features, bias=True, target=None, p=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if not bias:
            self.register_parameter("bias", None)
        else:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()
        self.target = target
        self.p = p

    def reset_parameters(self):
        """Initialise the weight (Kaiming uniform) and, if present, the bias."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        mixed_weight = mixout(self.weight, self.target, self.p, self.training)
        return F.linear(input, mixed_weight, self.bias)

    def extra_repr(self):
        prefix = "mix" if self.target is not None else "drop"
        has_bias = self.bias is not None
        return "{}={}, in_features={}, out_features={}, bias={}".format(
            prefix + "out", self.p, self.in_features, self.out_features, has_bias
        )


def replace_mixout(model, mixout_p):
    """Swap every child ``nn.Linear`` for a ``MixLinear`` and disable dropout.

    Each replacement copies the original layer's parameters and uses the
    original weight tensor as its mixout target. Child ``nn.Dropout`` modules
    get ``p = 0.0``. The model is modified in place and returned.
    """
    for parent in model.modules():
        for child_name, child in parent.named_children():
            if isinstance(child, nn.Dropout):
                child.p = 0.0
            if not isinstance(child, nn.Linear):
                continue

            original_state = child.state_dict()
            has_bias = child.bias is not None
            mix_layer = MixLinear(
                child.in_features, child.out_features, has_bias, original_state["weight"], mixout_p
            )
            mix_layer.load_state_dict(original_state)
            setattr(parent, child_name, mix_layer)
    return model

## Model

In [None]:
class DLDLModel(nn.Module):
    """Deep Label Distribution Learning model on top of a timm backbone.

    The backbone head is replaced by an embedding layer and a classifier over
    ``cfg.n_classes`` bins. The prediction is both the softmax distribution
    over the bins and its expected value.
    """

    def __init__(self, cfg, pretrained=False):
        super().__init__()
        self.cfg = copy.copy(cfg)
        # Bin values 0 .. n_classes-1 used to take the expectation.
        self.rank = torch.Tensor([i for i in range(cfg.n_classes)]).to(cfg.device)

        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=pretrained,
            in_chans=3
        )
        head_in_channels = self.backbone.head.in_features
        self.backbone.head = nn.Identity()

        self.embedding = nn.Sequential(
            nn.Dropout(cfg.dropout_p),
            nn.Linear(head_in_channels, cfg.embed_dim)
        )
        self.fc = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(cfg.dropout_p),
            nn.Linear(cfg.embed_dim, cfg.n_classes)
        )
        for m in [*self.embedding, *self.fc]:
            self._init_params(m)

        self.kldivloss_fn = nn.KLDivLoss(reduction="batchmean")
        # Named "l1" for historical reasons; the criterion is a Huber loss.
        self.l1loss_fn = nn.HuberLoss(reduction="mean")

    def _init_params(self, m):
        """Initialise a freshly created head layer according to its type."""
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, nn.Linear):
            nn.init.xavier_normal_(m.weight)
            nn.init.constant_(m.bias, 0)

    def forward(self, batch, embed=False):
        """Run the model on ``batch["image"]``.

        Returns a dict with ``"ps_dist"`` (softmax over bins), ``"log_ps_dist"``
        (log-softmax of the same logits, used by :meth:`loss`), ``"ps"``
        (expected bin value) and, when ``embed`` is truthy, ``"feat"``
        (backbone features before the head).
        """
        feats = self.backbone(batch["image"])
        logits = self.fc(self.embedding(feats))
        ps_dist = F.softmax(logits, dim=1)
        # Computed from the logits so that probabilities which underflow to 0
        # do not turn into -inf log-probabilities.
        log_ps_dist = F.log_softmax(logits, dim=1)
        ps = torch.sum(ps_dist * self.rank, dim=1)  # expected value

        outputs = {
            "ps_dist": ps_dist,
            "log_ps_dist": log_ps_dist,
            "ps": ps,
        }
        if embed:
            outputs["feat"] = feats
        return outputs

    def loss(self, outputs, batch, lam):
        """Weighted sum of the KL-divergence and Huber losses.

        Args:
            outputs: dict returned by :meth:`forward`; ``"log_ps_dist"`` is
                used when present, otherwise the log of ``"ps_dist"``.
            batch: dict with ``"target_dist"``, ``"target"`` and, for MixUp
                batches, ``"target_b"``.
            lam: MixUp coefficient; a value in (0, 1] blends ``"target"`` and
                ``"target_b"``, anything else uses ``"target"`` alone.
        """
        log_probs = outputs.get("log_ps_dist")
        if log_probs is None:
            # Outputs built by older callers only carry the probabilities.
            log_probs = torch.log(outputs["ps_dist"])
        loss_kl = self.kldivloss_fn(log_probs, batch["target_dist"])

        if lam > 0 and lam <= 1:
            # NOTE(review): assumes batch["target"] is the un-mixed target of
            # each sample - verify against the MixUp implementation.
            loss_l1 = self.l1loss_fn(outputs["ps"], lam * batch["target"] + (1 - lam) * batch["target_b"])
        else:
            loss_l1 = self.l1loss_fn(outputs["ps"], batch["target"])

        loss = loss_kl * self.cfg.loss_weights[0] + loss_l1 * self.cfg.loss_weights[1]

        return loss

## Training and validation

In [None]:
def train_one_epoch(fold, epoch, model, data_loader, optimizer, scheduler, cfg):
    """Train ``model`` for one epoch over ``data_loader``.

    Args:
        fold: index of the current fold (unused, kept for the call signature).
        epoch: 1-based epoch number, shown in the progress bar.
        model: model exposing ``__call__(batch)`` and ``loss(outputs, batch, lam)``.
        data_loader: iterable of batch dicts of tensors.
        optimizer: optimizer stepped every ``cfg.accum_iter`` batches.
        scheduler: learning-rate scheduler stepped after every batch.
        cfg: configuration (``device``, ``n_epochs``, ``mixup_p``,
            ``mix_alpha``, ``accum_iter`` and optionally ``use_amp``).

    Returns:
        ``(model, running_loss, running_rmse)`` with the sample-weighted mean
        loss and the RMSE accumulated over the epoch (both ``0.0`` when the
        loader yields no batch).
    """
    model.train()
    optimizer.zero_grad()

    # Honour the AMP switch of the configuration (defaults to enabled).
    use_amp = getattr(cfg, "use_amp", True)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    total_se = 0.0
    total_loss = 0.0
    total_processed = 0
    running_loss = 0.0
    running_rmse = 0.0

    # Iterate the loader that was passed in, not a notebook global.
    pbar = tqdm(data_loader, desc=f"Epoch [{epoch}/{cfg.n_epochs}]", ncols=120)
    for step, batch in enumerate(pbar):
        batch = {key: val.to(cfg.device, non_blocking=True) for key, val in batch.items()}

        # MixUp
        if np.random.rand() < cfg.mixup_p:
            do_mixup = True
            batch, lam = mixup(batch, alpha=cfg.mix_alpha)
        else:
            do_mixup = False
            lam = 0.  # sentinel: the loss ignores "target_b" for lam == 0

        # Training
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(batch)
            loss = model.loss(outputs, batch, lam)
        scaler.scale(loss).backward()
        # mini-batch accumulation
        if (step + 1) % cfg.accum_iter == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # running loss
        bs = batch["image"].size(0)
        total_loss += loss.cpu().detach().item() * bs
        total_processed += bs
        running_loss = total_loss / total_processed

        # running squared error; MixUp batches weight both targets by lam
        preds = outputs["ps"].cpu().detach()
        if do_mixup:
            total_se += lam * torch.square(batch["target"].cpu().detach() - preds).numpy().sum() \
            + (1 - lam ) * torch.square(batch["target_b"].cpu().detach() - preds).numpy().sum()
        else:
            total_se += torch.square(batch["target"].cpu().detach() - preds).numpy().sum()

        running_rmse = (total_se / total_processed) ** 0.5

        try:
            pbar.set_postfix({
                'loss' : '%.4f' %float(running_loss),
                "rmse": '%.4f' %float(running_rmse),
                'lr' : optimizer.param_groups[0]['lr']
            })
        except Exception:
            # Progress reporting is best effort and must never stop training,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            pass

        del loss, batch, outputs, preds
        torch.cuda.empty_cache()
        gc.collect()

        scheduler.step()  # update scheduler on every step end

    return model, running_loss, running_rmse

In [None]:
def valid_fn(model, data_loader, test_data, cfg):
    """Evaluate ``model`` on ``data_loader`` without gradients.

    Prints one summary line (labelled "Test" when ``test_data`` is truthy,
    "Validation" otherwise) and returns ``(mean_loss, rmse)``, both weighted
    by the number of samples per batch.
    """
    model.eval()

    sq_err_sum = 0.0
    loss_sum = 0.0
    n_seen = 0

    #with torch.inference_mode():
    with torch.no_grad():
        for batch in data_loader:
            batch = {name: tensor.to(cfg.device, non_blocking=True) for name, tensor in batch.items()}
            n_batch = batch["image"].size(0)
            outputs = model(batch)
            loss = model.loss(outputs, batch, lam=0.)

            # running loss & mse
            loss_sum += loss.cpu().detach().item() * n_batch
            n_seen += n_batch
            running_loss = loss_sum / n_seen

            residual = batch["target"].detach() - outputs["ps"].detach()
            sq_err_sum += torch.square(residual).cpu().numpy().sum()

    rmse = (sq_err_sum / n_seen) ** 0.5

    valstr = "Validation :" if not test_data else "Test       :"
    print(f"[{datetime.datetime.now()}] {valstr} loss={running_loss:.6f}, "
          f"rmse={rmse:.6f}"
         )

    # Drop the last batch's tensors before releasing cached GPU memory.
    del loss, batch, outputs
    torch.cuda.empty_cache()
    gc.collect()

    return running_loss, rmse

In [None]:
def plot_history(history_trn, history_val):
    '''Draw per-epoch RMSE (left) and loss (right) curves for train and validation.

    Both arguments are dicts with the list entries 'score' and 'loss'.
    '''
    epochs = range(1, len(history_trn['score']) + 1)
    plt.figure(figsize=(12, 6), tight_layout=True)

    # (history key, training label, validation label, title, y-axis label)
    panels = (
        ('score', 'training rmse', 'validation rmse', 'RMSE', 'rmse'),
        ('loss', 'training loss', 'validation loss', 'loss', 'loss'),
    )
    for position, (key, trn_label, val_label, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, history_trn[key], 'b', label=trn_label)
        plt.plot(epochs, history_val[key], 'r', label=val_label)
        plt.title(title)
        plt.legend(loc='best')
        plt.grid()
        plt.xlabel('epoch')
        plt.ylabel(y_label)

    plt.show()

## Main

In [None]:
# Cross-validation training driver: trains one model per fold listed in
# cfg.trn_folds, keeps the checkpoint with the best validation RMSE and
# reports the mean of the per-fold best scores.
train_start_time = time.time()
best_valid_scores = []

### Training
# `cfg` is an alias of the CFG class itself, so assignments such as
# `cfg.mixup_p = ...` below modify CFG globally.
cfg = CFG

for fold in range(cfg.n_folds):
    if fold not in cfg.trn_folds:
        continue

    print("")
    print("=" * 100)
    print(f"[{datetime.datetime.now()}] Fold {fold} / {cfg.n_folds - 1}")
    print("=" * 100)

    fold_start_time = time.time()

    # Rows of the current fold are held out for validation.
    train_df = df[df["kfold"] != fold].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold].reset_index(drop=True)

    ### Model
    model = DLDLModel(cfg=cfg, pretrained=True)
    if cfg.replace_mixout:
        model = replace_mixout(model, cfg.mixout_p)
    model = model.to(cfg.device)
    model.zero_grad()

    ### Dataloader
    train_dataset = PawpularDataset(df=train_df, cfg=cfg, augment=train_aug, mode="train")
    valid_dataset = PawpularDataset(df=valid_df, cfg=cfg, augment=valid_aug, mode="valid")
    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        drop_last=True,
        sampler=None,
        worker_init_fn=worker_init_fn,
        pin_memory=False,
        num_workers=cfg.num_workers
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=cfg.batch_size_infer,
        shuffle=False,
        worker_init_fn=worker_init_fn,
        pin_memory=False,
        num_workers=cfg.num_workers
    )
    ### Optimizer
    optimizer = make_optimizer(model, cfg)

    ### Scheduler
    scheduler = make_scheduler(len(train_loader), optimizer, cfg)

    ### Train epochs
    best_valid_score_epoch = 0
    best_valid_score = np.inf
    best_valid_loss = np.inf
    history_train = {'loss': [], 'score': []}
    history_valid = {'loss': [], 'score': []}

    for epoch in range(1, cfg.n_epochs + 1):
        # The MixUp probability changes at the epochs listed in mixup_epoch_p
        # and stays in effect until the next listed epoch.
        if epoch in cfg.mixup_epoch_p:
            cfg.mixup_p = cfg.mixup_epoch_p[epoch]
            print(f"[{datetime.datetime.now()}] MixUp p={cfg.mixup_p}, alpha={cfg.mix_alpha}")

        time.sleep(0.3)  # prevent splitting tqdm progress bar

        ### train
        model, train_loss, train_score = train_one_epoch(fold, epoch, model, train_loader, optimizer, scheduler, cfg)
        history_train['loss'].append(train_loss)
        history_train['score'].append(train_score)

        ### valid score
        valid_loss, valid_score = valid_fn(model, valid_loader, test_data=False, cfg=cfg)
        history_valid['loss'].append(valid_loss)
        history_valid['score'].append(valid_score)

        # New best RMSE: always recorded, but only written to disk from
        # cfg.min_save_epoch onwards.
        # NOTE(review): if the best score is reached before min_save_epoch and
        # never beaten afterwards, no checkpoint is saved for this fold.
        if valid_score < best_valid_score:
            best_valid_score = valid_score
            best_valid_score_epoch = epoch
            best_valid_loss = valid_loss
            if epoch >= cfg.min_save_epoch:
                print(f"[{datetime.datetime.now()}] Validation score improved. Saving model weights to {cfg.save_model_path}-fold{fold}.pth")
                torch.save(model.state_dict(), cfg.save_model_path + f"-fold{fold}.pth")

        # Tie on RMSE at a later epoch: prefer the checkpoint with lower loss.
        if (valid_loss < best_valid_loss) and (epoch >= cfg.min_save_epoch) \
            and (valid_score == best_valid_score) and (epoch > best_valid_score_epoch):
                best_valid_loss = valid_loss
                print(f"[{datetime.datetime.now()}] Validation loss improved. Saving model weights to {cfg.save_model_path}-fold{fold}.pth")
                torch.save(model.state_dict(), cfg.save_model_path + f"-fold{fold}.pth")


    ### fold summary
    best_valid_scores.append(best_valid_score)

    # Print fold summary
    train_elapsed_time = time.time() - train_start_time
    fold_elapsed_time = time.time() - fold_start_time
    print("")
    print('Fold elapsed time: {:.0f} min {:.0f} sec'.format(fold_elapsed_time // 60, fold_elapsed_time % 60))
    print('Training elapsed time: {:.0f} min {:.0f} sec'.format(train_elapsed_time // 60, train_elapsed_time % 60))

    ### valid score
    # The printed label says "accuracy", but the value is the best validation RMSE.
    print('Epoch({}/{}): Best validation accuracy: {:4f}'.format(best_valid_score_epoch, cfg.n_epochs, best_valid_score))

    print()
    print("history_valid")
    plot_history(history_train, history_valid)

    # Release per-fold objects before the next fold allocates its own.
    del model, train_loader, valid_loader, train_dataset, valid_dataset, optimizer, scheduler, history_train, history_valid, train_df, valid_df
    torch.cuda.empty_cache()
    gc.collect()


# Mean of the per-fold best validation RMSE values.
print(f"CV RMSE : {np.mean(best_valid_scores)}")

## OOF pred and save SVR & Catboost models

In [None]:
# CatBoost Parameters
# Keyword arguments passed to CatBoostRegressor for the extra head trained on
# the backbone embeddings; the random state follows the global seed.
cb_params = {'loss_function' : 'RMSE',
             'eval_metric' : 'RMSE',
             'iterations' : 1000,
             'grow_policy' : 'SymmetricTree',
             'depth' : 6,
             'l2_leaf_reg' : 2.0,
             'random_strength' : 1.0,
             'learning_rate' : 0.05,
             'task_type' : 'CPU',
             'devices' : '0',
             'verbose' : 0,
             'random_state': CFG.seed}

In [None]:
def get_test_aug(image_size, tta):
    """Return the inference transform for test-time-augmentation index ``tta``.

    * ``tta == 0``: resize + center crop.
    * ``tta == 1``: resize straight to a square (no crop).
    * anything else: resize + center crop + horizontal flip.

    Every variant ends with float conversion and normalisation.
    """
    if tta == 1:
        geometry = [T.Resize((image_size, image_size))]
    else:
        geometry = [T.Resize(image_size), T.CenterCrop(image_size)]
        if tta != 0:
            geometry.append(T.RandomHorizontalFlip(p=1))

    to_float_and_normalize = [
        T.ConvertImageDtype(torch.float),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
    return nn.Sequential(*geometry, *to_float_and_normalize)

In [None]:
# Out-of-fold (OOF) predictions of the neural network: every fold model
# predicts the rows it did not see in training; the backbone embeddings and
# true scores are collected for the extra heads.
df_oof_pred = df[["Id", "Pawpularity"]].copy()
# Float placeholder so the float predictions can be written without a dtype change.
df_oof_pred["oof_pred"] = -1.0

oof_trues = []
oof_valid_embeds = []

for fold_ in range(CFG.n_folds):
    print(f"[{datetime.datetime.now()}] Fold {fold_}/{CFG.n_folds - 1} predicting oof...")

    ### NN Model
    model = DLDLModel(cfg=CFG, pretrained=False)
    model.load_state_dict(torch.load(CFG.save_model_path + f"-fold{fold_}.pth", map_location='cpu'))
    model = model.eval().to(CFG.device)

    ### Datasets and Dataloaders
    fold_mask = (df.kfold == fold_).values
    df_valid = df[fold_mask].reset_index(drop=True)
    valid_dataset = PawpularDataset(df=df_valid, cfg=CFG, augment=get_test_aug(CFG.image_size, tta=0), mode="valid")
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size_infer,
        shuffle=False,  # keeps predictions in the row order of df_valid
        worker_init_fn=worker_init_fn,
        pin_memory=False,
        num_workers=CFG.num_workers
    )

    ### OOF predictions of NN
    valid_embeds = []
    valid_pss = []
    #with torch.inference_mode():
    with torch.no_grad():
        for step, batch in enumerate(tqdm(valid_loader, ncols=100)):
            batch = {key: val.to(CFG.device, non_blocking=True) for key, val in batch.items()}
            output_ = model(batch, embed=True)
            valid_embed = output_["feat"].detach().cpu()
            valid_embeds.append(valid_embed)
            valid_ps = output_["ps"].detach().cpu()
            valid_pss.append(valid_ps)

        # Back to the Pawpularity scale, clipped to the valid score range.
        valid_pred_nn = np.clip(target_to_paw(torch.cat(valid_pss, dim=0).numpy()), 1, 100)

    # Single .loc write on the frame itself instead of chained
    # `df[col].iloc[...] = ...`, which may write to a temporary copy.
    df_oof_pred.loc[fold_mask, "oof_pred"] = valid_pred_nn
    oof_valid_embeds.append(torch.cat(valid_embeds, dim=0))
    oof_trues.append(df_valid['Pawpularity'].values.astype('int32'))

# NOTE: both arrays are concatenated fold by fold, i.e. they are in fold
# order, not in the row order of `df`.
valid_embeds_all = torch.cat(oof_valid_embeds, dim=0).numpy()
oof_trues_all = np.concatenate(oof_trues)

df_oof_pred.to_csv(CFG.save_model_path + ".csv", index=False)
print()
print(" RMSE of NN OOF preds :", mean_squared_error(df_oof_pred.Pawpularity.values, df_oof_pred.oof_pred, squared=False))
print()
display(df_oof_pred)

## Histogram
plt.figure(figsize=(12, 4), tight_layout=True)
plt.hist(df_oof_pred.oof_pred, bins=100, alpha=0.3, color='red', label='Pred')
plt.hist(df_oof_pred.Pawpularity, bins=100, alpha=0.3, color='blue', label='True')
plt.legend()
plt.show()

## Scatter plot
plt.figure(figsize=(8, 8), tight_layout=True)
plt.scatter(df_oof_pred.Pawpularity, df_oof_pred.oof_pred, s=2)
plt.xlabel("True")
plt.ylabel("Pred")
plt.show()

## Train additional heads

In [None]:
# Fit one SVR and one CatBoost regressor per fold on the OOF embeddings,
# save them and record their validation predictions.
# Float placeholders so the float predictions can be written in place.
df_oof_pred["oof_pred_svr"] = -1.0
df_oof_pred["oof_pred_cat"] = -1.0

for fold_ in range(CFG.n_folds):
    print("")
    print("=" * 100)
    print(f"[{datetime.datetime.now()}] Fold {fold_} / {CFG.n_folds - 1}")
    print("=" * 100)

    ### Split embeddings
    ## Caution: following folds are different from the folds used in NN training.
    # The embedding / target arrays are in fold order while the masks come from
    # `df` row order, so these splits select different samples than the NN
    # folds. Embeddings and targets stay paired with each other.
    valid_mask = (df["kfold"] == fold_).values
    train_embeds = valid_embeds_all[np.where(df.kfold != fold_)]
    valid_embeds = valid_embeds_all[np.where(df.kfold == fold_)]
    train_trues = oof_trues_all[np.where(df.kfold != fold_)]
    valid_trues = oof_trues_all[np.where(df.kfold == fold_)]

    ### Fit SVR
    print('Fitting SVR...')
    clf1 = SVR(C=20.0)
    clf1.fit(train_embeds.astype('float32'), train_trues)

    ### Fit Catboost
    print(f'Fitting Catboost...')
    clf2 = CatBoostRegressor(**cb_params)
    clf2.fit(train_embeds, train_trues,
            eval_set=[(valid_embeds, valid_trues)],
            early_stopping_rounds=100, use_best_model=True, verbose=25)

    ### Save extra head model
    fname_svr = os.path.join(OUTPUT_PATH, f"SVR_fold_{fold_}.pkl")
    fname_cat = os.path.join(OUTPUT_PATH, f"CAT_fold_{fold_}.pkl")
    # Context managers make sure the files are flushed and closed.
    with open(fname_svr, "wb") as f_svr:
        pickle.dump(clf1, f_svr)
    with open(fname_cat, "wb") as f_cat:
        pickle.dump(clf2, f_cat)

    ### OOF predictions of additional heads
    # NOTE(review): the predictions are stored at the mask positions, which
    # pair with `oof_trues_all` at the same positions but not necessarily
    # with the Id / Pawpularity values of those rows in df_oof_pred.
    print()
    print("Predicting SVR...")
    valid_pred_svr = clf1.predict(valid_embeds)
    df_oof_pred.loc[valid_mask, "oof_pred_svr"] = valid_pred_svr
    print("Predicting Catboost...")
    valid_pred_cat = clf2.predict(valid_embeds)
    df_oof_pred.loc[valid_mask, "oof_pred_cat"] = valid_pred_cat
    print("RMSE of valid predictions:")
    print(" - SVR :", mean_squared_error(valid_trues, valid_pred_svr, squared=False))
    print(" - CAT :", mean_squared_error(valid_trues, valid_pred_cat, squared=False))

In [None]:
### CV
# Overall RMSE of the out-of-fold predictions of each predictor.
print("RMSE of OOF predictions:")
# The NN predictions are compared with the Pawpularity column of the same frame.
print(" - NN :", mean_squared_error(df_oof_pred.Pawpularity.values, df_oof_pred["oof_pred"].values, squared=False))
# The head predictions are compared with `oof_trues_all`, the array their
# validation targets were taken from.
print(" - SVR :", mean_squared_error(oof_trues_all, df_oof_pred["oof_pred_svr"].values, squared=False))
print(" - CAT :", mean_squared_error(oof_trues_all, df_oof_pred["oof_pred_cat"].values, squared=False))

# Inference

In [None]:
# Number of test-time-augmentation variants evaluated per model.
N_TTA = 3

# One NN checkpoint, one SVR head and one CatBoost head per fold. The lists
# are derived from CFG.n_folds (CFG rather than the `cfg` alias, so this cell
# also works when the training cell was not run in this session).
STATE_DICTS = [
    f"{CFG.save_model_path}-fold{fold_no}.pth" for fold_no in range(CFG.n_folds)
]
PKL_PATH_SVRS = [
    f"{OUTPUT_PATH}/SVR_fold_{fold_no}.pkl" for fold_no in range(CFG.n_folds)
]
PKL_PATH_CATS = [
    f"{OUTPUT_PATH}/CAT_fold_{fold_no}.pkl" for fold_no in range(CFG.n_folds)
]

In [None]:
# Test table plus the image path of every row.
df_test = pd.read_csv(TEST_CSV)
df_test["file_path"] = [os.path.join(TEST_DATA_PATH, f"{image_id}.jpg") for image_id in df_test.Id]

In [None]:
# Inference: every fold model (NN + its SVR / CatBoost heads) predicts every
# test image once per TTA variant.
cfg = CFG

### model list
models = []
clf_svrs = []
clf_cats = []

### Loading NN models
for state_dict in STATE_DICTS:
    print(f"Loading {CFG.model_name} : {state_dict}")
    model = DLDLModel(cfg=cfg)
    model.load_state_dict(torch.load(state_dict, map_location='cpu'))
    model.eval().to(cfg.device)
    models.append(model)

# NOTE: pickle executes arbitrary code on load; only load head files that
# were produced by this notebook.
## Loading SVR head models
for pkl_path_svr in PKL_PATH_SVRS:
    print(f"Loading {pkl_path_svr}")
    with open(pkl_path_svr, "rb") as f_svr:
        clf_svrs.append(pickle.load(f_svr))

## Loading CAT head models
for pkl_path_cat in PKL_PATH_CATS:
    print(f"Loading {pkl_path_cat}")
    with open(pkl_path_cat, "rb") as f_cat:
        clf_cats.append(pickle.load(f_cat))


### TTA loop
preds_nn_all = []
preds_svr_all = []
preds_cat_all = []

for tta in range(N_TTA):
    seed_everything(cfg.seed + 20 + tta)
    print()
    print("=" * 40)
    print(f"Inference TTA [{tta}/{N_TTA - 1}]")
    print("=" * 40)

    ### Dataloader
    dataset = PawpularDataset(df_test, cfg, augment=get_test_aug(cfg.image_size, tta), mode="test")
    data_loader = DataLoader(
        dataset,
        batch_size=cfg.batch_size_infer,
        shuffle=False,
        pin_memory=False,
        num_workers=cfg.num_workers
    )

    ### inference
    ps_nns2_ = []
    ps_svrs2_ = []
    ps_cats2_ = []
    pbar = tqdm(data_loader, desc=f"Inference", ncols=80)
    for batch in pbar:
        batch = {key: val.to(cfg.device, non_blocking=True) for key, val in batch.items()}
        ps_nns_ = []
        ps_svrs_ = []
        ps_cats_ = []
        #with torch.inference_mode():
        with torch.no_grad():
            for num, model in enumerate(models):
                ### NN pred and extract embed
                output_ = model(batch, embed=True)
                ps_ = target_to_paw(output_['ps'].detach().cpu())
                ps_nns_.append(ps_)

                ### extra head pred
                # Head `num` is always fed the embeddings of model `num`.
                embed_ = output_["feat"].detach().cpu().numpy()
                ps_svr_ = clf_svrs[num].predict(embed_)  # (bs)
                ps_cat_ = clf_cats[num].predict(embed_)  # (bs)
                ps_svrs_.append(ps_svr_)  # -> [models, (bs)]
                ps_cats_.append(ps_cat_)  # -> [models, (bs)]

        ps_nns_ = torch.stack(ps_nns_).permute(1, 0)  # (bs, models)
        ps_nns2_.append(ps_nns_)  # -> [steps, (bs, models)]
        ps_svrs_ = np.stack(ps_svrs_).transpose(1, 0)  # (bs, models)
        ps_svrs2_.append(ps_svrs_)  # -> [steps, (bs, models)]
        ps_cats_ = np.stack(ps_cats_).transpose(1, 0)  # (bs, models)
        ps_cats2_.append(ps_cats_)  # -> [steps, (bs, models)]

    ps_nns2_ = torch.cat(ps_nns2_).permute(1, 0)  # (models, steps*bs)
    preds_nn_all.append(ps_nns2_)  # -> [tta, (models, bs*steps)]
    ps_svrs2_ = np.concatenate(ps_svrs2_).transpose(1, 0)  # (models, steps*bs)
    preds_svr_all.append(ps_svrs2_)  # -> [tta, (models, bs*steps)]
    ps_cats2_ = np.concatenate(ps_cats2_).transpose(1, 0)  # (models, steps*bs)
    preds_cat_all.append(ps_cats2_)  # -> [tta, (models, bs*steps)]

preds_nn_all = torch.cat(preds_nn_all).detach().cpu().numpy()  # (tta*models, bs*steps)
preds_svr_all = np.concatenate(preds_svr_all)  # (tta*models, bs*steps)
preds_cat_all = np.concatenate(preds_cat_all)  # (tta*models, bs*steps)

del batch, dataset, data_loader, output_
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Average each predictor over all (TTA, model) rows, then blend the three
# predictors with equal weights.
nn_mean = np.mean(preds_nn_all, axis=0)
cat_mean = np.mean(preds_cat_all, axis=0)
svr_mean = np.mean(preds_svr_all, axis=0)
df_test["Pawpularity"] = (nn_mean + cat_mean + svr_mean) / 3

submission_columns = ["Id", "Pawpularity"]
df_test[submission_columns].to_csv("submission.csv", index=False)
df_test[submission_columns]

EOF