In [None]:
!pip install python-box timm pytorch-lightning==1.4.0 

## Import Libraries / Load Data :

In [None]:
import os
import warnings
from pprint import pprint
from glob import glob
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torchvision.transforms as T
from box import Box
from timm import create_model
from sklearn.model_selection import StratifiedKFold
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks.progress import ProgressBarBase
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import LightningDataModule, LightningModule


warnings.filterwarnings("ignore")

In [None]:
# Debugging aid kept on for the whole run; seeding makes runs repeatable.
torch.autograd.set_detect_anomaly(True)
seed_everything(2021)

# Read the competition tables and rewrite each bare image Id in the "Id"
# column into the full path of its .jpg file.
train = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
train["Id"] = '/kaggle/input/petfinder-pawpularity-score/train/' + train["Id"] + ".jpg"

test = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')
test["Id"] = '/kaggle/input/petfinder-pawpularity-score/test/' + test["Id"] + ".jpg"

## EDA on Tabular Data / Value Check :

In [None]:
# Dimensions of the training table as (rows, columns).
train.shape

In [None]:
# Per-column summary of the training table (dtypes, non-null counts).
train.info()

In [None]:
# Number of fully duplicated rows in the training table.
int(train.duplicated().sum())

In [None]:
# Range of the target column.
pawpularity = train.Pawpularity
print('Min = ', pawpularity.min())
print('Max = ', pawpularity.max())

In [None]:
# One count plot per column, skipping the first and the last column
# (presumably the image Id and the Pawpularity target -- verify column order).
for col in train.columns[1:-1]:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=train, ax=ax)

In [None]:
# For every column except the first and last, overlay the Pawpularity
# histograms of the rows holding its two most frequent values.
for col in train.columns[1:-1]:
    # value_counts() is ordered by frequency, so positions 0 and 1 are the
    # two most common values of the column.
    frequent_values = train[col].value_counts().index
    v1, v2 = frequent_values[0], frequent_values[1]

    fig, ax = plt.subplots(figsize=(10, 5))
    for value, colour in ((v1, 'green'), (v2, 'pink')):
        scores = train.loc[train[col] == value, 'Pawpularity']
        sns.histplot(scores, label=value, color=colour, ax=ax)
    ax.set_title(col)
    ax.legend()

In [None]:
# Violin plot of Pawpularity against each column (first and last excluded).
for col in train.columns[1:-1]:
    fig, ax = plt.subplots()
    sns.violinplot(x=col, y='Pawpularity', data=train, ax=ax)
    ax.set_title(col)

We observe no significant difference in the Pawpularity distribution across the values of any metadata column. This suggests that the tabular data (metadata) will not add value to our model, so we decide to ignore it in this competition.

Let's work only on the images and extract meaningful insights from them ;)

## config

In [None]:
# Experiment configuration. It is wrapped in a Box below so nested values can
# be read with attribute access (e.g. config.model.name).
config = {
    # Keyword arguments unpacked into pl.Trainer(**config.trainer).
    'trainer': {
        # Train on one GPU when CUDA is available; otherwise fall back to CPU
        # instead of failing when the Trainer is constructed.
        'gpus': 1 if torch.cuda.is_available() else 0,
        'accumulate_grad_batches': 1,
        'progress_bar_refresh_rate': 1,
        'fast_dev_run': False,
        'num_sanity_val_steps': 0,
        'resume_from_checkpoint': None,
    },
    'transform': {
        'name': 'get_default_transforms'
    },
    'model': {
        # timm architecture name and size of the regression head output.
        'name': 'swin_tiny_patch4_window7_224',
        'output_dim': 1
    },
    # The optimizer/scheduler/loss names are dotted paths that the model
    # resolves with eval(); keep them pointing at trusted library classes.
    'optimizer': {
        'name': 'optim.AdamW',
        'params': {
            'lr': 1e-5
        },
    },
    'scheduler': {
        'name': 'optim.lr_scheduler.CosineAnnealingWarmRestarts',
        'params': {
            'T_0': 20,
            # eta_min is the floor of the cosine cycle and must stay below
            # the optimizer's lr (1e-5); the previous 1e-4 made the schedule
            # *increase* the learning rate instead of annealing it.
            'eta_min': 1e-6,
        }
    },
    'loss': 'nn.BCEWithLogitsLoss',
}

config = Box(config)

## Dataset Class :

In [None]:
class PetfinderDataset(Dataset):
    """Image dataset driven by a dataframe of image paths.

    Args:
        df: Table with an "Id" column holding image file paths and,
            optionally, a "Pawpularity" column holding the targets.
        image_size: Side length of the square every image is resized to.

    Items are ``(image, label)`` pairs when the frame has a "Pawpularity"
    column, and the bare ``image`` otherwise (e.g. for the test set).
    """

    def __init__(self, df, image_size=224):
        self._X = df["Id"].values
        # Targets are optional so the same class also serves unlabeled data.
        self._y = df["Pawpularity"].values if "Pawpularity" in df.keys() else None
        self._transform = T.Resize([image_size, image_size])

    def __len__(self):
        return len(self._X)

    def __getitem__(self, idx):
        image = self._transform(read_image(self._X[idx]))
        # Only touch the labels after confirming they exist; indexing them
        # unconditionally raised TypeError for unlabeled frames.
        if self._y is None:
            return image
        return image, self._y[idx]

In [None]:
class PetfinderDataModule(LightningDataModule):
    """Serves the train and validation dataloaders for one fold.

    Args:
        train_df: Dataframe of training rows, passed to PetfinderDataset.
        val_df: Dataframe of validation rows, passed to PetfinderDataset.
        batch_size: Batch size of both dataloaders.
        num_workers: Worker processes used by both dataloaders.
        image_size: Side length images are resized to by the dataset.
    """

    def __init__(self, train_df, val_df, batch_size=64, num_workers=4, image_size=224):
        super().__init__()
        self._train_df = train_df
        self._val_df = val_df
        self._batch_size = batch_size
        self._num_workers = num_workers
        self._image_size = image_size

    def __create_dataset(self, train=True):
        """Build the dataset for the training (train=True) or validation split."""
        frame = self._train_df if train else self._val_df
        return PetfinderDataset(frame, self._image_size)

    def train_dataloader(self):
        """Shuffled loader over the training split; the last partial batch is dropped."""
        return DataLoader(
            self.__create_dataset(True),
            batch_size=self._batch_size,
            shuffle=True,
            num_workers=self._num_workers,
            pin_memory=False,
            drop_last=True,
        )

    def val_dataloader(self):
        """Ordered loader over the validation split; every sample is kept."""
        return DataLoader(
            self.__create_dataset(False),
            batch_size=self._batch_size,
            shuffle=False,
            num_workers=self._num_workers,
            pin_memory=False,
            drop_last=False,
        )


In [None]:
# augmentation
def get_default_transforms():
    """Build the image pipeline applied to each batch inside the model.

    Returns:
        A ``T.Compose`` that randomly flips the input horizontally and
        vertically, converts it to ``torch.float`` and normalizes its three
        channels with the mean/std constants below.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        T.RandomHorizontalFlip(),
        T.RandomVerticalFlip(),
        T.ConvertImageDtype(torch.float),
        T.Normalize(mean=channel_mean, std=channel_std),
    ]
    return T.Compose(steps)

## Modeling and Training :

In [None]:
class Model(pl.LightningModule):
    """Image regressor: a timm backbone followed by a dropout + linear head.

    Labels are divided by 100 before the loss is computed; predictions are the
    sigmoid of the logits multiplied by 100, so the RMSE logged at the end of
    each epoch is on the original label scale.

    Args:
        cfg: Box-style config. Reads ``cfg.model.name``,
            ``cfg.model.output_dim``, ``cfg.loss``, ``cfg.optimizer`` and
            ``cfg.scheduler``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.__build_model()
        # NOTE(security): eval() resolves dotted class names taken from the
        # config; only ever feed it trusted, locally defined configuration.
        self._criterion = eval(self.cfg.loss)()
        self.transform = get_default_transforms()
        # Validation has to be deterministic, so it uses the same pipeline
        # with the random flips removed. Applying the augmentations there made
        # `val_loss` (used by early stopping and checkpointing) noisy.
        self._eval_transform = T.Compose(
            [
                step
                for step in self.transform.transforms
                if not isinstance(step, (T.RandomHorizontalFlip, T.RandomVerticalFlip))
            ]
        )
        self.save_hyperparameters(cfg)

    def __build_model(self):
        """Create the pretrained backbone (no classifier) and the output head."""
        self.backbone = create_model(
            self.cfg.model.name, pretrained=True, num_classes=0, in_chans=3
        )
        num_features = self.backbone.num_features
        self.fc = nn.Sequential(
            nn.Dropout(0.5), nn.Linear(num_features, self.cfg.model.output_dim)
        )

    def forward(self, x):
        """Return the raw head output (logits) for a batch of images."""
        f = self.backbone(x)
        out = self.fc(f)
        return out

    def training_step(self, batch, batch_idx):
        loss, pred, labels = self.__share_step(batch, 'train')
        return {'loss': loss, 'pred': pred, 'labels': labels}

    def validation_step(self, batch, batch_idx):
        loss, pred, labels = self.__share_step(batch, 'val')
        return {'pred': pred, 'labels': labels}

    def __share_step(self, batch, mode):
        """Run one forward pass and compute the loss.

        Args:
            batch: ``(images, labels)`` pair from the dataloader.
            mode: ``'train'`` applies the augmenting transform; any other
                value applies the deterministic evaluation transform.

        Returns:
            ``(loss, pred, labels)`` where ``pred`` and ``labels`` are
            detached CPU tensors rescaled by 100.
        """
        images, labels = batch
        labels = labels.float() / 100.0
        transform = self.transform if mode == 'train' else self._eval_transform
        images = transform(images)

        logits = self.forward(images).squeeze(1)
        loss = self._criterion(logits, labels)

        pred = logits.sigmoid().detach().cpu() * 100.
        labels = labels.detach().cpu() * 100.
        return loss, pred, labels

    def training_epoch_end(self, outputs):
        self.__share_epoch_end(outputs, 'train')

    def validation_epoch_end(self, outputs):
        self.__share_epoch_end(outputs, 'val')

    def __share_epoch_end(self, outputs, mode):
        """Log the epoch RMSE over all step outputs as ``{mode}_loss``."""
        preds = torch.cat([out['pred'] for out in outputs])
        labels = torch.cat([out['labels'] for out in outputs])
        metrics = torch.sqrt(((labels - preds) ** 2).mean())
        self.log(f'{mode}_loss', metrics)

    def configure_optimizers(self):
        """Instantiate the optimizer and scheduler named in the config."""
        # NOTE(security): same eval()-on-config caveat as in __init__.
        optimizer = eval(self.cfg.optimizer.name)(
            self.parameters(), **self.cfg.optimizer.params
        )
        scheduler = eval(self.cfg.scheduler.name)(
            optimizer,
            **self.cfg.scheduler.params
        )
        return [optimizer], [scheduler]

In [None]:
# 5-fold cross-validation, stratified on the Pawpularity target. Each fold
# trains a fresh model and keeps the checkpoint with the lowest `val_loss`.
skf = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=2021
)

for fold, (train_idx, val_idx) in enumerate(skf.split(train["Id"], train["Pawpularity"])):
    # split() yields positional indices, so select rows with iloc; loc is
    # label-based and only matched by accident of the default RangeIndex.
    train_df = train.iloc[train_idx].reset_index(drop=True)
    val_df = train.iloc[val_idx].reset_index(drop=True)
    datamodule = PetfinderDataModule(train_df, val_df)
    model = Model(config)
    earystopping = EarlyStopping(monitor="val_loss")
    lr_monitor = callbacks.LearningRateMonitor()
    loss_checkpoint = callbacks.ModelCheckpoint(
        filename="best_loss",
        monitor="val_loss",
        save_top_k=1,
        mode="min",
        save_last=False,
    )
    logger = TensorBoardLogger(config.model.name)

    trainer = pl.Trainer(
        logger=logger,
        max_epochs=2,
        callbacks=[lr_monitor, loss_checkpoint, earystopping],
        **config.trainer,
    )
    trainer.fit(model, datamodule=datamodule)