## libraries

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
from timm import create_model
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import gc

## Config

In [None]:
class Config:
    """Notebook-wide settings: run size, optimisation hyper-parameters and paths."""

    # Flip to True for a quick smoke run with fewer folds and epochs.
    debug = False
    n_fold, epoch = (2, 1) if debug else (5, 5)

    # Reproducibility and optimisation.
    seed = 555
    lr = 2e-5
    batch_size = 32
    num_workers = 8

    # timm architecture name and the square input size it is fed.
    model_path = 'swin_large_patch4_window7_224'
    im_size = 224

    # Locations on disk (base_dir is not referenced in the visible cells).
    base_dir = './drive/MyDrive/petfinder'
    data_dir = '../input/petfinder-pawpularity-score/'
    model_dir = '.'

## set up environments & prepare data

- set_seed
Set random seed for random, torch, and numpy

https://docs.fast.ai/torch_core.html#set_seed

When `reproducible=True`, `set_seed` additionally sets:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
set_seed(Config.seed, reproducible=True)
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
    os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/swin-transformer/swin_large_patch4_window7_224_22kto1k.pth' '/root/.cache/torch/hub/checkpoints/swin_large_patch4_window7_224_22kto1k.pth'


In [None]:
# Point at the competition data; the listing is not displayed because it is not the cell's last expression.
dataset_path = Path(Config.data_dir)
dataset_path.ls()

# Read the training table and replace each image Id with the full path to its .jpg file.
train_df = pd.read_csv(dataset_path/'train.csv')
train_image_dir = dataset_path/'train'
train_df['path'] = [f'{train_image_dir/image_id}.jpg' for image_id in train_df['Id']]
train_df = train_df.drop('Id', axis=1)
train_df.head()

## StratifiedKFold

In [None]:
if Config.debug:
    # Work on a small subset while debugging; seeded so the subset is the same on every run.
    train_df = train_df.sample(500, random_state=Config.seed).reset_index(drop=True)

# Scale the target by 1/100; this is the label column the DataLoaders train on.
train_df['norm_score'] = train_df['Pawpularity'] / 100

# Sturges' rule: k = 1 + log2(n).
# The previous expression multiplied log2(n) by 3.3, which is the coefficient of the
# log10 form (1 + 3.322*log10(n)), and therefore produced roughly three times too many bins.
num_bins = int(np.floor(1 + np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['norm_score'], bins=num_bins, labels=False)
train_df['fold'] = -1

skf = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
# Look the column up by name instead of assuming 'fold' is the last column.
fold_col = train_df.columns.get_loc('fold')
# The second element of each split is the held-out part; those rows get fold id i.
for i, (_, valid_index) in enumerate(skf.split(train_df.index, train_df['bins'])):
    train_df.iloc[valid_index, fold_col] = i

train_df['fold'] = train_df['fold'].astype('int')
# Every row must have been held out exactly once, so no -1 placeholder may remain.
assert (train_df['fold'] >= 0).all(), 'some rows were not assigned to a fold'

train_df.fold.value_counts().plot.bar()

In [None]:
# Peek at the first rows assigned to fold 0.
train_df.loc[train_df['fold'].eq(0)].head()

## helper function

### petfinder_rmse
calculate competition metrics

### get_data
Returns the DataLoaders for a given fold, built with `ImageDataLoaders.from_df`.

- batch_tfms (augmentation) is None. When set, it is applied to the training data.
- item_tfms: resizing is applied to both training and validation data. The method 'squish' is the same as 'resize' in albumentations.
https://docs.fast.ai/vision.data.html#ImageDataLoaders.from_df
https://docs.fast.ai/vision.augment.html#Resize

### get_learner
Returns a Learner instance, which bundles the model, the data, the training parameters and so on.

- loss_func: BCEWithLogitsLossFlat is the same as nn.BCEWithLogitsLoss.
- opt_func: the default is fastai's Adam, which is the same as torch.optim.AdamW, but eps and beta2 are different:
eps = 1e-5, sqr_mom (beta2) = 0.99


reference
- https://docs.fast.ai/learner.html#Learner
- https://docs.fast.ai/optimizer.html#Adam
- https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
- https://docs.fast.ai/losses.html#BCEWithLogitsLossFlat


In [None]:
def petfinder_rmse(input,target):
    """Competition metric: RMSE on the 0-100 scale.

    Args:
        input: raw model outputs (logits); flattened before use.
        target: labels on the same scale as sigmoid(input), i.e. the score divided by 100.

    Returns:
        A scalar tensor, 100 * sqrt(mean((sigmoid(input) - target)^2)).
    """
    preds = torch.sigmoid(input.flatten())
    # Flatten the target too: a (N, 1) target against (N,) predictions would
    # otherwise broadcast to (N, N) inside mse_loss and give a wrong score.
    return 100*torch.sqrt(F.mse_loss(preds, target.flatten()))

def get_data(fold):
    """Build the DataLoaders for one cross-validation fold.

    Rows of the global train_df whose 'fold' equals `fold` are flagged as the
    validation set; all other rows are used for training. Images are resized
    to Config.im_size with the 'squish' method.
    """
    # assign() returns a new frame, so the global train_df is left untouched.
    fold_df = train_df.assign(is_valid=train_df['fold'].eq(fold))

    return ImageDataLoaders.from_df(
        fold_df,
        valid_col='is_valid',
        seed=Config.seed,
        fn_col='path',
        label_col='norm_score',
        y_block=RegressionBlock,
        bs=Config.batch_size,
        num_workers=Config.num_workers,
        item_tfms=Resize(Config.im_size, method='squish'),
    )
                            
def get_learner(fold_num):
    """Create a mixed-precision Learner for the given fold.

    The model is the pretrained timm architecture named by Config.model_path,
    with its output size taken from the fold's DataLoaders. It is trained with
    BCEWithLogitsLossFlat and reports petfinder_rmse as the metric.
    """
    dls = get_data(fold_num)
    backbone = create_model(Config.model_path, pretrained=True, num_classes=dls.c)
    return Learner(
        dls,
        backbone,
        loss_func=BCEWithLogitsLossFlat(),
        metrics=petfinder_rmse,
    ).to_fp16()

## confirm image transforms

In [None]:
# Build the fold-0 DataLoaders once so the transforms attached to the training loader can be inspected.
data_fold0 = get_data(0)
# Show the entries of the training loader's after_batch pipeline.
data_fold0.train.after_batch.fs

In [None]:
# Show the entries of the training loader's after_item pipeline (reuses data_fold0 from the previous cell).
data_fold0.train.after_item.fs

## training loop

In [None]:
all_preds = []  # kept from the original cell; nothing in this loop appends to it

# Train one model per fold and export each to Config.model_dir.
for i in range(Config.n_fold):
    print(f'Fold {i} results')
    learn = get_learner(fold_num=i)
    # No weight decay; SaveModelCallback is used with its default settings.
    learn.fit(Config.epoch, Config.lr, wd=0, cbs=[SaveModelCallback()])
    learn.recorder.plot_loss()
    # Convert back to full precision before exporting.
    learn = learn.to_fp32()
    learn.export(f'{Config.model_dir}/model_fold_{i}.pkl')
    del learn
    # Collect garbage first so tensors held by unreachable objects are actually
    # freed, then hand the now-unused cached blocks back to the GPU.
    gc.collect()
    torch.cuda.empty_cache()

## calculate cv score

In [None]:
def get_valid_data(fold):
    """Build DataLoaders from only the rows of train_df that belong to `fold`.

    Uses the same 'squish' resize as get_data so images are preprocessed the
    way the models were trained. This helper is not called in the visible cells.

    NOTE(review): no valid_col is passed, so from_df presumably applies its
    default train/valid split within the fold's rows — confirm before relying
    on `.valid` holding the whole fold.
    """
    valid_df = train_df[train_df['fold'] == fold].copy()

    dls = ImageDataLoaders.from_df(valid_df,
                               seed=Config.seed,
                               fn_col='path',
                               label_col='norm_score',
                               y_block=RegressionBlock,
                               bs=Config.batch_size,
                               num_workers=Config.num_workers,
                               item_tfms=Resize(Config.im_size, method='squish')
                                  )

    return dls

def rmse_oof(_oof_df, fold=None):
    """Print the RMSE between the 'Pawpularity' and 'pred' columns.

    Args:
        _oof_df: frame with 'fold', 'Pawpularity' and 'pred' columns.
        fold: if given, only rows of that fold are scored and the line is
            prefixed with 'fold <n>'; otherwise all rows are scored and the
            line is prefixed with 'overall'.
    """
    # Boolean filtering already yields a new frame, so no explicit copy is needed.
    scored = _oof_df if fold is None else _oof_df[_oof_df["fold"] == fold]
    target = scored['Pawpularity'].values
    y_pred = scored['pred'].values
    # Computed directly with numpy: sklearn's mean_squared_error(squared=False)
    # is deprecated and has been removed in newer scikit-learn releases.
    rmse = float(np.sqrt(np.mean((target - y_pred) ** 2)))
    label = 'overall' if fold is None else f'fold {fold}'
    print(f'{label}: {rmse}')

In [None]:
# Out-of-fold inference: each exported model predicts only the rows it did not train on.
fold_oof_frames = []
for i in range(Config.n_fold):
    print(f'fold{str(i)} inference')
    model_name = f'{Config.model_dir}/model_fold_{str(i)}.pkl'
    learn = load_learner(model_name, cpu=False)
    valid_df = train_df[train_df['fold'] == i].copy()
    # Build the test loader from the learner's own DataLoaders so inference uses
    # the preprocessing the model was trained with. The previous code built a
    # fresh DataLoaders with a default-method Resize plus augmentation
    # transforms, which did not match get_data's squish-only setup.
    valid_dl = learn.dls.test_dl(valid_df)
    valid_preds, _ = learn.get_preds(dl = valid_dl)
    _oof_df = valid_df[['fold', 'Pawpularity']].copy()
    # Predictions are on the norm_score scale; multiply by 100 to compare with Pawpularity.
    _oof_df['pred'] = valid_preds.cpu().numpy().ravel() * 100
    fold_oof_frames.append(_oof_df)
    # Release this fold's model before the next one is loaded.
    del learn, valid_dl
    gc.collect()
    torch.cuda.empty_cache()

# Concatenate once instead of growing the frame inside the loop.
oof_df = pd.concat(fold_oof_frames)

In [None]:
# Report the RMSE of every fold, then the score over all out-of-fold predictions.
for fold_id in range(Config.n_fold):
    rmse_oof(oof_df, fold=fold_id)
rmse_oof(oof_df)

In [None]:
# Put the rows back in index order before writing the predictions to disk.
oof_sorted = oof_df.sort_index()
oof_sorted.to_csv('oof.csv', index=False)

In [None]:
# Overlay the target and out-of-fold prediction distributions.
fig, ax = plt.subplots()
ax.hist(oof_df['Pawpularity'].values, alpha = 0.4, color = 'b', label = 'target', bins = 50)
ax.hist(oof_df['pred'].values, alpha = 0.4, color = 'g', label = 'prediction', bins = 50)
ax.set(title='Out-of-fold predictions vs. target', xlabel='Pawpularity', ylabel='count')
# The labels passed to hist() only show up once a legend is drawn.
ax.legend()
plt.show()