In [None]:
# Put the local copies of timm and the optimizer package on the import path
# so the `import timm` further down resolves to them.
# NOTE(review): these look like offline dataset mounts under ../input — confirm.
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/pytorch-optimizers/')

In [None]:
import os
import gc
import cv2
import copy
import time
import yaml
import random
import shutil
import warnings
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob as glob_file
from tqdm import tqdm
from PIL import Image, ImageDraw
from shutil import copyfile
from IPython.core.display import Video, display
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold

import timm
import torch
from fastai.vision.all import *

# Silence library warnings for the whole notebook.
warnings.simplefilter('ignore')
# Use fully-qualified option names: the abbreviated "max_columns" is resolved by
# pattern matching and raises OptionError when it matches more than one option.
pd.set_option("display.max_columns", 150)
pd.set_option('display.max_rows', 150)

In [None]:
# https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet.csv
# timm.list_models(pretrained=True)

# Configuration

In [None]:
# Central run configuration; the later cells read their settings from this dict.
CFG = {
    # [enabled, glob patterns]: when enabled, files matching the patterns are copied into the working directory
    "save_prev"     : [True, ["petfinder_*.pth"]],
    # Used by seed_everything, the stratified fold split and the dataloaders
    "seed"          : 42,
    # NOTE(review): not read by any cell visible here
    'device'        : "cuda:0" if torch.cuda.is_available() else "cpu",
    # Image folder; "<Id>.jpg" is appended to build each image path
    "input_img"     : '../input/petfinder-pawpularity-score/train/',
    # CSV with the training table
    "input_path"    : '../input/petfinder-smogn-dataset/train_drop_duplicated.csv',
    # Prefix for the per-fold checkpoints moved out of ./models after training
    "output_path"   : './',
    # State dict loaded in get_learner; None means the timm pretrained weights are used instead
    "pretrain"      : "../input/petfinder-fastai-semisupervised-models/petfinder_swin_binary_ss_fastai_smogn_4.pth",
    # Checkpoint file stem; "_<fold>.pth" is appended
    "save_name"     : "petfinder_swin_binary_fastai_ss_smogn",
    # Architecture name handed to timm.create_model
    "model"         : "swin_large_patch4_window7_224_in22k",
    # "binary" selects the Pawclass column (Pawpularity / 100) as label, anything else the raw Pawpularity
    "loss"          : "binary",
    # Target size of the Resize item transform
    "size"          : 224,
    # Number of stratified folds, and therefore of training runs
    "fold"          : 4,
    "batch_size"    : 32,
    "epochs"        : 5,
    # Forwarded as `n` to ImageDataLoaders.from_df
    "n_data"        : None,  # Batch * step
    # NOTE(review): not read by any cell visible here
    "mixup_ratio"   : 0,
    # Forwarded to fit_one_cycle
    "lr"            : 1e-6,
    # NOTE(review): not read by any cell visible here
    "accum_iter"    : 2,
    # Patience of the EarlyStoppingCallback
    'early_stopping': 3,
    # NOTE(review): not read by any cell visible here
    'verbose_step'  : 1,
    "num_workers"   : 4
}

CFG

In [None]:
def get_img(path):
    """Read an image from disk and return it with the channel axis reversed (BGR -> RGB).

    Args:
        path: Filesystem path of the image file.

    Returns:
        The array produced by ``cv2.imread`` with its last axis reversed.

    Raises:
        FileNotFoundError: If OpenCV could not read the file (missing or undecodable).
    """
    im_bgr = cv2.imread(path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque "NoneType is not subscriptable".
    if im_bgr is None:
        raise FileNotFoundError(f"Could not read image (missing or unreadable): {path}")
    return im_bgr[:, :, ::-1]

def sigmoid(a):
    """Logistic function ``1 / (1 + exp(-a))``, applied element-wise by NumPy."""
    exp_neg = np.exp(-a)
    denominator = 1 + exp_neg
    return 1 / denominator

def softmax(x):
    """Softmax along axis 1.

    The per-row maximum is subtracted before exponentiating, so the largest
    exponent in every row is 0. The result has the same shape as ``x`` and
    each row sums to 1.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_peak)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)

def seed_everything(seed = 42):
    '''Seed the random number generators used by this notebook.

    Exports PYTHONHASHSEED, seeds Python's `random`, NumPy and PyTorch
    (CPU and CUDA), and switches cuDNN to deterministic mode with
    benchmarking turned off.'''
    # Fixed hash seed, exported through the environment
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # cuDNN backend: both flags are needed alongside the seeds
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed all generators once, before any data splitting or model creation, using the configured seed.
seed_everything(CFG["seed"])

In [None]:
# Optionally copy checkpoints from the previous-run dataset into the working
# directory; CFG['save_prev'] is [enabled, list of glob patterns].
if CFG['save_prev'][0]:
    for file_pattern in CFG['save_prev'][1]:
        for f in glob_file(f"../input/petfinder-fastai-models-pseudolabel-2/{file_pattern}"):
            filename = os.path.basename(f)
            print(filename)
            # Pure-Python copy instead of the `!cp` shell magic: works outside
            # IPython and is safe for paths containing spaces or shell characters.
            shutil.copyfile(f, f"./{filename}")

# Load data

In [None]:
# Read the training table and derive the columns used by the later cells.
df_train = pd.read_csv(CFG['input_path'])

# Full image path for every row: <input_img><Id>.jpg
image_root = CFG['input_img']
df_train["path"] = [f"{image_root}{image_id}.jpg" for image_id in df_train.Id]

# Target divided by 100 (used as the label when CFG["loss"] == "binary")
df_train["Pawclass"] = df_train["Pawpularity"] / 100

# Equal-width bins of the scaled target; bin count is ceil(2 * n^(1/3))
num_bins = int(np.ceil(2 * len(df_train) ** (1. / 3)))
df_train['bins'] = pd.cut(df_train['Pawclass'], bins=num_bins, labels=False)

# Every remaining column is treated as a metadata feature
excluded_cols = ["Id","path", "Pawpularity","Pawclass","bins"]
meta_features = [col for col in df_train.columns if col not in excluded_cols]

print(meta_features)
print(df_train.shape)
df_train.head(2)

In [None]:
# Give every row the id of the fold in which it is held out for validation,
# stratified on the binned target.
kfold = StratifiedKFold(n_splits=CFG['fold'], random_state=CFG["seed"], shuffle=True)
df_train["fold"] = 0
fold_col = df_train.columns.get_loc("fold")
# split() yields (train, test) arrays of *positional* indices; the second one
# is the held-out part. Assign through .iloc so the result stays correct even
# if df_train's index is not the default 0..n-1 range.
for i, (_, valid_index) in enumerate(kfold.split(df_train.index, df_train['bins'])):
    df_train.iloc[valid_index, fold_col] = i
df_train['fold'] = df_train['fold'].astype('int')

df_train.fold.value_counts()

In [None]:
# Histogram of the raw Pawpularity target using 100 bins.
df_train.Pawpularity.hist(bins=100)

# Define model

In [None]:
def prepare_dataloader(df, fold):
    """Build the fastai image dataloaders for one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` are flagged as the validation
    set; the caller's frame is not modified. The label column is ``Pawclass``
    when ``CFG["loss"]`` is ``"binary"`` and ``Pawpularity`` otherwise. All
    remaining settings (batch size, image size, workers, seed, ``n``) are read
    from ``CFG``.

    Args:
        df: Frame with at least the ``path``, ``fold`` and label columns.
        fold: Id of the fold to hold out for validation.

    Returns:
        The object returned by ``ImageDataLoaders.from_df``.
    """
    target_col = "Pawclass" if CFG["loss"] == "binary" else "Pawpularity"

    # Work on a copy so the validation flag never leaks into the caller's frame
    frame = df.copy()
    frame['is_valid'] = frame['fold'] == fold

    # Colour-only batch augmentations
    colour_augs = setup_aug_tfms([Brightness(), Contrast(), Hue(), Saturation()])

    return ImageDataLoaders.from_df(
        frame,
        fn_col='path',
        label_col=target_col,
        valid_col='is_valid',
        y_block=RegressionBlock,
        item_tfms=Resize(CFG['size']),
        batch_tfms=colour_augs,
        bs=CFG['batch_size'],
        n=CFG["n_data"],
        num_workers=CFG['num_workers'],
        shuffle=True,
        seed=CFG["seed"],
    )

def petfinder_rmse(input,target):
    """Root-mean-squared error, times 100, between ``sigmoid(input)`` and ``target``.

    Args:
        input: Raw model outputs (logits); flattened before use.
        target: Targets on the same scale as the sigmoid output; flattened
            before use so that an (N, 1) target cannot silently broadcast
            against (N,) predictions into an N x N error matrix.

    Returns:
        A scalar tensor: ``100 * sqrt(mean((sigmoid(input) - target) ** 2))``.
    """
    # torch.sigmoid replaces the deprecated F.sigmoid
    preds = torch.sigmoid(input.flatten())
    return 100*torch.sqrt(F.mse_loss(preds, target.flatten()))

In [None]:
class Model(nn.Module):
    """timm backbone whose ``classifier`` layer is swapped for a single-output linear layer.

    Only works for architectures that expose their head as ``classifier``.
    """

    def __init__(self, model_name, pretrained=True):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained, in_chans=3)
        # Keep the input width of the original head, then replace the head itself
        self.n_features = backbone.classifier.in_features
        backbone.classifier = nn.Linear(self.n_features, 1)
        self.model = backbone

    def forward(self, x):
        """Run the backbone (including the replaced head) on ``x``."""
        return self.model(x)

class SwinModel(nn.Module):
    """timm backbone created with ``num_classes=0`` followed by a single-output linear layer."""

    def __init__(self, model_name, pretrained=True):
        super().__init__()
        # num_classes=0 is passed so the backbone is built without its own classification layer
        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=0, in_chans=3)
        self.linear = nn.Linear(self.model.num_features, 1)

    def forward(self, x):
        """Return one raw output per sample: backbone features through the linear layer."""
        return self.linear(self.model(x))

def get_learner(df, fold):
    """Create the fastai Learner for one cross-validation fold.

    Builds the dataloaders for ``fold``, instantiates ``SwinModel`` and, when
    ``CFG['pretrain']`` is set, initialises it from that checkpoint instead of
    the timm pretrained weights.

    Args:
        df: Training frame, forwarded to ``prepare_dataloader``.
        fold: Id of the fold held out for validation.

    Returns:
        A mixed-precision (``to_fp16``) Learner using ``BCEWithLogitsLossFlat``
        and the ``petfinder_rmse`` metric.
    """
    dataloader = prepare_dataloader(df, fold)
    if CFG['pretrain'] is None:
        model = SwinModel(CFG["model"], pretrained=True)
    else:
        model = SwinModel(CFG["model"], pretrained=False)
        # Load onto the CPU first: without map_location a checkpoint saved from
        # a GPU cannot be deserialised on a CPU-only machine. The weights are
        # copied into the model afterwards, so the checkpoint's device is irrelevant.
        state_dict = torch.load(CFG['pretrain'], map_location="cpu")
        model.load_state_dict(state_dict)
    # NOTE(review): the loss is BCE-with-logits regardless of CFG["loss"], while
    # prepare_dataloader switches the label column on that setting — confirm
    # non-"binary" runs are not expected to work with this learner.
    learner = Learner(
        dataloader,
        model,
        loss_func=BCEWithLogitsLossFlat(),
        metrics=AccumMetric(petfinder_rmse)).to_fp16()
    return learner

In [None]:
# Ref: https://fastai1.fast.ai/callbacks.lr_finder.html
# get_learner(df_train, fold=0).lr_find(end_lr=3e-2)
# SuggestedLRs(valley=3.307316728751175e-05) with pretrained on external data
# SuggestedLRs(valley=2.2654900021734647e-05)

# Run training

In [None]:
all_preds = []

# Train one model per fold and keep the checkpoint written by SaveModelCallback.
for fold in range(CFG["fold"]):
    learn = get_learner(df_train, fold)
    # NOTE(review): SaveModelCallback() is left at its default monitor while
    # early stopping tracks petfinder_rmse — confirm the kept checkpoint is
    # selected on the intended metric.
    learn.fit_one_cycle(
        CFG["epochs"],
        CFG["lr"],
        cbs=[SaveModelCallback(),
             EarlyStoppingCallback(monitor='petfinder_rmse',
                                   comp=np.less,
                                   patience=CFG['early_stopping'])]
    )
    # Move the saved checkpoint to its per-fold name before the next fold overwrites it
    shutil.move("./models/model.pth", f"{CFG['output_path']}{CFG['save_name']}_{fold}.pth")

    # Drop the learner, collect garbage first so its tensors are really
    # released, and only then hand the cached GPU memory back.
    del learn
    gc.collect()
    torch.cuda.empty_cache()

# Remove the now-empty checkpoint folder; equivalent to `rm -rf ./models`
# but works outside IPython as well.
shutil.rmtree("./models", ignore_errors=True)