## Summary

This notebook is written in vanilla PyTorch, without a Lightning or tez training loop, to avoid some of the abstractions those libraries introduce.

It uses Optuna to find the best combination of hyperparameters which are:
* Number of layers
* Number of neurons in each layer
* Mixup coefficient
* Dropout probability
* Learning rate scheduler hyperparameters

The model fine-tunes a pretrained Swin Transformer (`swin_tiny_patch4_window7_224` from timm).

It takes only the images as input.

It trains with `BCEWithLogitsLoss` (targets scaled to the range 0-1) and validates with RMSE on the original 0-100 scale.

## Issues

The two main issues are:
* Each trial takes around 30 minutes to complete
* Validation RMSE starts at around 20.5, whereas typical fine-tuning notebooks start at about 18.5

Looking for fixes, please give a comment if you have any leads. Thanks :3

## Libraries

In [None]:
import sys
sys.path.append("../input/tez-lib")
sys.path.append("../input/timmmaster")

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import tez
import albumentations
import timm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn import metrics
import torch
from tez.callbacks import EarlyStopping
from tqdm import tqdm
import os
import random
import optuna
from optuna.trial import TrialState

## Configs

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and configures cuDNN to prefer
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
class args:
    """Notebook-wide settings, read as plain class attributes (e.g. ``args.epochs``)."""

    # name of the pretrained timm model that gets fine-tuned
    model_name = "swin_tiny_patch4_window7_224"
    # side length (in pixels) the augmentation pipelines crop / resize to
    image_size = 224

    # samples per batch for both data loaders
    batch_size = 32
    # epochs run inside each Optuna trial
    epochs = 10

    # device the model and the batches are moved to
    device = "cuda"

## Setup Datasets and DataLoaders

In [None]:
class PawpularDataset(Dataset):
    """Image-only dataset yielding ``(image, target)`` tensor pairs.

    Args:
        image_paths: Sequence of image file paths, indexed by position.
        targets: Sequence of scores aligned with ``image_paths``; each one is
            divided by 100 before being returned.
        augmentations: Callable invoked as ``augmentations(image=...)`` whose
            result exposes the transformed image under the ``'image'`` key
            (e.g. an albumentations ``Compose``), or ``None`` to skip it.
    """

    def __init__(self, image_paths, targets, augmentations):
        self.image_paths = image_paths
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, index):
        """Load, augment and tensorise the sample at ``index``.

        Returns:
            A ``(image, target)`` tuple of float tensors; the image is laid
            out channels-first and the target is the score divided by 100.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        path = self.image_paths[index]
        # cv2.imread reports a missing/corrupt file by returning None rather
        # than raising, so fail here with the offending path instead of
        # letting cvtColor crash with an opaque error.
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")

        # OpenCV loads images as BGR; convert to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations is not None:
            # the augmentation pipeline returns a dict; the transformed image
            # lives under the 'image' key
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # transform from HxWxC to CxHxW
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        # scale the 0-100 score to the 0-1 range
        targets = self.targets[index] / 100.

        return torch.tensor(image, dtype=torch.float), torch.tensor(targets, dtype=torch.float)

In [None]:
# Hold out one fold for validation and train on all the others.
fold = 0

df = pd.read_csv('../input/no-dupes-pawpularity/train_5folds.csv')

in_valid_fold = df['kfold'] == fold
df_train = df[~in_valid_fold].reset_index(drop=True)
df_valid = df[in_valid_fold].reset_index(drop=True)

# One image file per row, named after the row's Id.
train_img_paths = [
    f"../input/petfinder-pawpularity-score/train/{image_id}.jpg"
    for image_id in df_train['Id'].values
]
valid_img_paths = [
    f"../input/petfinder-pawpularity-score/train/{image_id}.jpg"
    for image_id in df_valid['Id'].values
]

In [None]:
def _imagenet_normalize():
    """Return a fresh Normalize step using the standard ImageNet channel statistics."""
    return albumentations.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
        p=1.0,
    )


# Training pipeline: random crop, colour jitter, flips/rotations, then normalisation.
_train_steps = [
    #albumentations.Resize(args.image_size, args.image_size, p=1.0),
    albumentations.RandomResizedCrop(
        height=args.image_size,
        width=args.image_size,
        scale=(0.08, 1),
        ratio=(0.75, 1),
        p=1.0,
    ),
    # colour shift
    # NOTE(review): albumentations documents these limits with integer defaults
    # (20 / 30 / 20); fractional limits of 0.2 look like a near no-op on uint8
    # images -- confirm this is intended.
    albumentations.HueSaturationValue(
        hue_shift_limit=0.2,
        sat_shift_limit=0.2,
        val_shift_limit=0.2,
        p=0.5,
    ),
    albumentations.RandomBrightnessContrast(
        brightness_limit=(-0.1, 0.1),
        contrast_limit=(-0.1, 0.1),
        p=0.5,
    ),
    albumentations.HorizontalFlip(p=0.5),
    #albumentations.VerticalFlip(p=0.5),
    albumentations.Rotate(limit=180, p=0.7),
    albumentations.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.1,
        rotate_limit=45,
        p=0.5,
    ),
    _imagenet_normalize(),
]
train_aug = albumentations.Compose(_train_steps, p=1.0)

# Validation pipeline: deterministic resize followed by the same normalisation.
_valid_steps = [
    albumentations.Resize(args.image_size, args.image_size, p=1.0),
    _imagenet_normalize(),
]
valid_aug = albumentations.Compose(_valid_steps, p=1.0)

In [None]:
# Pair each split's image paths with its raw Pawpularity scores and the
# augmentation pipeline that belongs to that split.
train_dataset = PawpularDataset(train_img_paths, df_train.Pawpularity.values, train_aug)
valid_dataset = PawpularDataset(valid_img_paths, df_valid.Pawpularity.values, valid_aug)

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=2,
)

# Validation only aggregates an error over the whole split, so the order is
# irrelevant; keep it fixed rather than spending RNG draws on a shuffle.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    pin_memory=True,
    num_workers=2,
)

In [None]:
# x is images, y is targets
def mixup_data(x, y, mixup_alpha):
    """Blend every sample in a batch with a randomly chosen partner (mixup).

    Args:
        x: Batch of inputs, indexed along dimension 0.
        y: Targets aligned with ``x`` along dimension 0.
        mixup_alpha: Used for both shape parameters of the Beta distribution
            the mixing weight is drawn from; values <= 0 disable mixing
            (the weight is fixed at 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x`` is
        ``lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y``, ``y_b`` is
        ``y[perm]`` and ``lam`` is the mixing weight.
    """
    lam = np.random.beta(mixup_alpha, mixup_alpha) if mixup_alpha > 0 else 1

    batch_size = x.size(0)
    # Draw the permutation on the CPU, then move it to wherever the batch
    # lives, so the function works on any device instead of requiring CUDA.
    index = torch.randperm(batch_size).to(x.device)

    # mix each sample with the sample the permutation pairs it with
    mixed_x = lam * x + (1 - lam) * x[index]

    # targets of the original samples and of their mixing partners
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam

# pred is the model's forward output for the mixed batch
def mixup_loss(loss_fn, pred, y_a, y_b, lam):
    """Return the ``lam``-weighted blend of the losses against both target sets.

    ``loss_fn`` is called as ``loss_fn(pred, target)``: once with ``y_a``
    (weight ``lam``) and once with ``y_b`` (weight ``1 - lam``).
    """
    loss_against_a = loss_fn(pred, y_a)
    loss_against_b = loss_fn(pred, y_b)
    return lam * loss_against_a + (1 - lam) * loss_against_b

## Set model to optimize [Optuna]

In [None]:
# Source: https://github.com/optuna/optuna-examples/blob/main/pytorch/pytorch_simple.py

def define_model(trial):
    """Build the network for one Optuna trial.

    The pretrained timm model named by ``args.model_name`` has its head
    replaced by a linear layer, optionally followed by a second linear layer;
    each of these is followed by dropout and ReLU, and a final linear layer
    produces a single output. The number of layers, their widths and the
    dropout probabilities are sampled from ``trial``.
    """
    backbone = timm.create_model(args.model_name, pretrained=True, in_chans=3)
    # how many hidden linear layers to use in this trial (1 or 2)
    depth = trial.suggest_int('n_layers', 1, 2)

    # input width of the head that ships with the pretrained model
    width_in = backbone.head.in_features
    modules = []
    for layer_idx in range(depth):
        width_out = trial.suggest_int('n_units_l{}'.format(layer_idx), 64, 512)
        if layer_idx == 0:
            # the first hidden layer replaces the backbone's own head, so the
            # backbone itself becomes the first module of the stack
            backbone.head = nn.Linear(width_in, width_out)
            block = backbone
        else:
            block = nn.Linear(width_in, width_out)
        drop_prob = trial.suggest_float('dropout_l{}'.format(layer_idx), 0.1, 0.5)
        modules += [block, nn.Dropout(drop_prob), nn.ReLU()]

        # this layer's output width feeds the next layer
        width_in = width_out

    # final projection to a single logit
    modules.append(nn.Linear(width_in, 1))

    return nn.Sequential(*modules)

## Set objective function [Optuna]

In [None]:
def _train_one_epoch(model, optimizer, loss_fn, mixup_alpha):
    """Run one mixup training pass over ``train_loader``."""
    model.train()
    for img, target in train_loader:
        img = img.to(args.device)
        target = target.view(-1, 1).to(args.device)
        mixed_img, target_a, target_b, lam = mixup_data(img, target, mixup_alpha)

        # standard zero out gradients, calc loss, backprop and step pattern
        optimizer.zero_grad()
        output = model(mixed_img)
        loss = mixup_loss(loss_fn, output, target_a, target_b, lam)
        loss.backward()
        optimizer.step()


def _validation_rmse(model):
    """Return the RMSE over the whole of ``valid_loader`` on the 0-100 scale."""
    # Credit: https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/289790 for calculating epoch rmse
    squared_error_sum = 0.0
    n_samples = 0
    mse = nn.MSELoss()

    model.eval()
    with torch.no_grad():
        for img, target in valid_loader:
            img, target = img.to(args.device), target.to(args.device)
            # the model emits logits: sigmoid maps them to 0-1, * 100 to 0-100
            prediction = torch.sigmoid(model(img)) * 100
            batch_mse = mse(prediction, target.view(-1, 1) * 100)
            # weight each batch mean by its size so a smaller last batch is
            # handled correctly; .item() keeps the running sum a Python float
            batch_size = target.shape[0]
            squared_error_sum += batch_mse.item() * batch_size
            n_samples += batch_size

    return (squared_error_sum / n_samples) ** 0.5


def objective(trial):
    """Train one sampled configuration and return its validation RMSE.

    Samples the model architecture (via ``define_model``), the cosine
    warm-restart scheduler parameters and the mixup coefficient from
    ``trial``, trains for ``args.epochs`` epochs, and reports the validation
    RMSE to Optuna after every epoch so unpromising trials can be pruned.

    Returns:
        The validation RMSE (a Python float) of the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    model = define_model(trial).to(args.device)

    # NOTE(review): Adam runs with its default learning rate here (1e-3 in
    # PyTorch), which is high for fine-tuning a pretrained transformer and may
    # explain the poor starting RMSE -- consider tuning it.
    optimizer = optim.Adam(model.parameters())
    T_0 = trial.suggest_int('T_0', 10, 20)
    # log-uniform sampling; replaces the deprecated suggest_loguniform
    eta_min = trial.suggest_float('eta_min', 1e-6, 1e-4, log=True)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer=optimizer,
        T_0=T_0,
        eta_min=eta_min,
    )
    mixup_alpha = trial.suggest_float('mixup_alpha', 0.1, 0.4)
    # other hyperparams to consider including: activation funcs (relu, gelu, leaky relu)

    # build the loss module once instead of once per batch
    loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(args.epochs):
        print(f'Epoch {epoch} start!')
        _train_one_epoch(model, optimizer, loss_fn, mixup_alpha)
        # Advance the schedule once per epoch (so T_0 is counted in epochs);
        # without this call the sampled T_0 / eta_min would have no effect.
        lr_scheduler.step()

        epoch_rmse = _validation_rmse(model)
        print(f'Epoch RMSE: {epoch_rmse}')

        trial.report(epoch_rmse, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return epoch_rmse

## Optimization happens here

In [None]:
# Search for the configuration with the lowest objective value, within a
# two-hour time budget.
two_hours_in_seconds = 2 * 60 * 60
study = optuna.create_study(direction='minimize')
study.optimize(objective, timeout=two_hours_in_seconds)

In [None]:
# Split the study's trials by final state: stopped early vs. ran to the end.
pruned_trials, complete_trials = (
    study.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

In [None]:
# Summarise the study: trial counts, then the best trial's score and parameters.
print("Study Statistics: ")
print("    Number of finished trials: ", len(study.trials))
print("    Number of pruned trials: ", len(pruned_trials))
print("    Number of complete trials: ", len(complete_trials))
print("-------------------------------")
# study.best_trial raises when no trial has completed (e.g. every trial was
# pruned or the time budget ran out first), so only report it when one exists.
if complete_trials:
    print("Best Trial: ")
    trial = study.best_trial
    # the objective returns the last-epoch validation RMSE, not the BCE loss
    print("    RMSE: ", trial.value)
    print("    Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
else:
    print("No trial completed, so there is no best trial to report.")

## Visualizations

In [None]:
# Plot how much each hyperparameter contributed to the objective value,
# as estimated by Optuna from the trials in the study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()

In [None]:
# Plot the objective value of each trial over the course of the study.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()