<h1 style='text-align: center'> Pawpularity Outlier Prediction </h1>

Today I tried building a model that predicts the outliers in the dataset (with respect to their Pawpularity score), since outliers are something a general model should not be too good at.
I separated the data into 3 different categories: Pawpularity < 10, Pawpularity == 100, and the rest.
Then I trained the model using an EfficientNet Backbone.
I haven't run a lot of experiments, but to be honest the approach doesn't seem too promising: the CV score is not really better, if at all. If you want more details, take a look at the code.

In [None]:
import sys
sys.path.append('/kaggle/input/packages/packages')

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.io import read_image
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import WandbLogger
import wandb
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from box import Box
import timm

# wandb.login()

# Config

In [None]:
# Experiment configuration. Wrapping the nested mapping in a Box lets the rest
# of the notebook read entries with attribute access (e.g. cfg.model.name).
cfg = Box(dict(
    # Images are resized to img_size x img_size by the dataset.
    img_size=240,
    # Number of StratifiedKFold splits.
    n_splits=5,
    # Multiplier handed to get_balanced_split when sampling 'middle' rows.
    ratio=1,
    model=dict(
        name='efficientnet_b1',
        dropout=0.5,
        out_dim=3,
    ),
    train_loader=dict(
        batch_size=64,
        shuffle=True,
    ),
    val_loader=dict(
        batch_size=64,
    ),
    # Dotted names below are resolved with eval() by the model.
    criterion='nn.BCEWithLogitsLoss',
    optim=dict(
        name='torch.optim.Adam',
        params=dict(lr=1e-5),
    ),
    scheduler=dict(
        name='torch.optim.lr_scheduler.CosineAnnealingLR',
        params=dict(T_max=6),
    ),
))

# Dataset/Module

In [None]:
class PDataset(Dataset):
    """Dataset yielding ``(image, target, pawpularity)`` triples.

    Args:
        cfg: configuration object; only ``cfg.img_size`` is read here.
        df: DataFrame providing the columns 'img_path', 'Pawpularity',
            '<10', '100' and 'middle'.
    """

    def __init__(self, cfg, df):
        side = cfg.img_size
        self.cfg = cfg
        # Every image is resized to a square of side cfg.img_size.
        self._transform = T.Resize([side, side])
        self.img_path = df['img_path'].values
        self.paw = df['Pawpularity'].values
        # Class indicator columns, in the order '<10', '100', 'middle'.
        self.y = df[['<10', '100', 'middle']].values

    def __len__(self):
        """Return the number of rows (one sample per row)."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Read and resize image ``idx``; return it with its target row and score."""
        resized = self._transform(read_image(self.img_path[idx]))
        return resized, self.y[idx], self.paw[idx]
        
class PDataModule(pl.LightningDataModule):
    """Lightning data module serving the balanced train/val datasets of one fold.

    Args:
        cfg: configuration object; reads ``ratio``, ``train_loader`` and
            ``val_loader``.
        df: full DataFrame from which both partitions are selected.
        train_idx: row labels of the training partition.
        val_idx: row labels of the validation partition.
    """

    def __init__(self, cfg, df, train_idx, val_idx):
        super().__init__()
        self.cfg, self.df = cfg, df
        self.train_idx, self.val_idx = train_idx, val_idx

    def setup(self, stage=None):
        """Build the train and val datasets from the balanced split."""
        balanced_train, balanced_val = get_balanced_split(
            self.df, self.train_idx, self.val_idx, self.cfg.ratio
        )
        self.train = PDataset(self.cfg, balanced_train)
        self.val = PDataset(self.cfg, balanced_val)

    def train_dataloader(self):
        """Return the training loader, configured by ``cfg.train_loader``."""
        loader_options = self.cfg.train_loader
        return DataLoader(self.train, **loader_options)

    def val_dataloader(self):
        """Return the validation loader, configured by ``cfg.val_loader``."""
        loader_options = self.cfg.val_loader
        return DataLoader(self.val, **loader_options)

# Augmentation

In [None]:
# Per-channel normalisation statistics, in RGB order.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def get_augmentation():
    """Return the transform pipelines keyed by 'train' and 'val'.

    Both pipelines end by converting the image to float and normalising it
    with the ImageNet statistics; only 'train' prepends the random flips and
    the colour jitter.
    """

    def float_and_normalise():
        # Fresh transform instances for each pipeline.
        return [
            T.ConvertImageDtype(torch.float),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]

    # NOTE(review): an earlier comment estimated that about 15% of images are
    # altered by the jitter; that figure is not derivable from these
    # parameters -- confirm against the torchvision documentation.
    random_ops = [
        T.RandomHorizontalFlip(p=0.5),
        T.RandomVerticalFlip(p=0.2),
        T.ColorJitter(brightness=.2, contrast=.2, saturation=.2),
    ]

    return {
        'train': T.Compose(random_ops + float_and_normalise()),
        'val': T.Compose(float_and_normalise()),
    }

# Network

In [None]:
class PModel(pl.LightningModule):
    """timm backbone followed by a dropout + linear head over the class logits.

    Args:
        cfg: configuration object; reads ``model.name``, ``model.dropout``,
            ``model.out_dim``, ``criterion``, ``optim.name`` and
            ``optim.params``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.__build_model()
        self.save_hyperparameters(cfg)
        # NOTE: eval() executes whatever string the config holds; this is only
        # acceptable because cfg is written in this notebook. Never feed it
        # externally supplied configuration.
        self.criterion = eval(self.cfg.criterion)()
        self._augmentation = get_augmentation()

    def __build_model(self):
        # NOTE(review): no `pretrained` argument is passed to timm here --
        # confirm that starting from timm's default initialisation is intended.
        self.backbone = timm.create_model(self.cfg.model.name, in_chans=3)
        # The head consumes the backbone's 1000-wide output.
        self.classifier = nn.Sequential(
            nn.Dropout(p=self.cfg.model.dropout),
            nn.Linear(1000, self.cfg.model.out_dim)
        )

    def forward(self, x):
        """Return the raw (pre-activation) outputs of the head for ``x``."""
        out = self.backbone(x)
        out = self.classifier(out)
        return out

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self.__share_step(batch, 'train')
        self.log('train_loss', loss, on_step = True, on_epoch = True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self.__share_step(batch, 'val')
        self.log('val_loss', loss)

    def __share_step(self, batch, mode):
        """Apply the ``mode`` transform pipeline, run the model, return the loss."""
        img, y, _ = batch
        img = self._augmentation[mode](img)
        # squeeze(-1) only drops a trailing singleton output dimension. A bare
        # squeeze() would also drop the batch axis of a single-sample batch,
        # leaving the prediction shape different from the target shape.
        y_hat = self(img).squeeze(-1)
        loss = self.criterion(y_hat, y.float())
        return loss

    def configure_optimizers(self):
        """Build the optimizer named in ``cfg.optim``.

        ``cfg.scheduler`` is not applied: only the optimizer is returned.
        """
        # NOTE: eval() on a config string -- see the remark in __init__.
        optim = eval(self.cfg.optim.name)(self.parameters(), **self.cfg.optim.params)
        return optim

    # used for calculating RMSE
    def predict_step(self, batch, batch_idx):
        """Return the model outputs and the raw Pawpularity values of a batch.

        Returns:
            dict with 'y_hat' (outputs after the 'val' transform pipeline) and
            'paw' (the Pawpularity values delivered with the batch).
        """
        img, _, paw = batch
        img = self._augmentation['val'](img)
        # Keep the batch axis even when the batch holds a single sample.
        y_hat = self(img).squeeze(-1)
        return {'y_hat': y_hat, 'paw': paw}

# Prepare Data

In [None]:
# Root directory of the competition data.
BASE_PATH = '../input/petfinder-pawpularity-score'
# Metadata columns of train.csv; they are dropped during data preparation.
META = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
]


def get_file_path(path):
    """Return the location of a training image.

    Args:
        path: image identifier (the notebook passes values of the 'Id' column).

    Returns:
        ``<BASE_PATH>/train/<path>.jpg`` as a string.
    """
    return '/'.join((BASE_PATH, 'train', f'{path}.jpg'))

def _balance_partition(part, ratio):
    """Keep every outlier row of ``part`` plus a random sample of 'middle' rows."""
    outlier_index = part[(part['<10'] == 1) | (part['100'] == 1)].index
    middle_rows = part[part['middle'] == 1]
    # The sample size is tied to the number of '<10' rows. It is rounded so a
    # fractional ratio works, and capped so it never exceeds what is available
    # (sampling without replacement would raise otherwise).
    wanted = int(round(part['<10'].sum() * ratio))
    n_middle = max(0, min(wanted, len(middle_rows)))
    middle_index = middle_rows.sample(n_middle).index
    return part.loc[outlier_index.append(middle_index)]


def get_balanced_split(df, train_idx, val_idx, ratio):
    """Select the train and val partitions and down-sample their 'middle' rows.

    Each partition keeps all rows flagged '<10' or '100' and adds a random
    sample of 'middle' rows whose size is ``ratio`` times the number of '<10'
    rows in that partition (rounded, and limited to the 'middle' rows that
    exist). The sample is unseeded, so repeated calls may pick different rows.

    Args:
        df: DataFrame with the indicator columns '<10', '100' and 'middle'.
        train_idx: row labels of the training partition.
        val_idx: row labels of the validation partition.
        ratio: number of 'middle' rows to keep per '<10' row; may be fractional.

    Returns:
        Tuple ``(train, val)`` of DataFrames, outlier rows first.
    """
    train = _balance_partition(df.loc[train_idx], ratio)
    val = _balance_partition(df.loc[val_idx], ratio)
    return train, val

In [None]:
# Load the training table and attach the image location of every row.
df = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
df['img_path'] = df['Id'].map(get_file_path)

# 0/1 indicator columns for the three target classes:
# score below 10, score exactly 100, and everything in between.
paw_score = df['Pawpularity']
df['<10'] = (paw_score < 10).astype(int)
df['100'] = (paw_score == 100).astype(int)
df['middle'] = ((paw_score > 9) & (paw_score < 100)).astype(int)

# The metadata columns are not used by this notebook.
df = df.drop(columns=META)

df.head()

In [None]:
# Mean Pawpularity overall and per class; these serve as the values predicted
# for each class when scoring the outlier model.
paw_scores = df['Pawpularity']
mean_base = paw_scores.mean()
mean_middle = paw_scores[df['middle'] == 1].mean()
mean_less_10 = paw_scores[df['<10'] == 1].mean()
# Every row of the '100' class has a score of exactly 100.
mean_100 = 100

print(f'mean_base: {mean_base}, mean_middle: {mean_middle}, mean_less_10: {mean_less_10}, mean_100: {mean_100}')

In [None]:
# Per-fold list of prediction batches returned by trainer.predict.
preds = []

# Directory holding already trained fold checkpoints. Set it to None to train
# every fold in this run instead.
PATH_TO_MODELS = '/kaggle/input/outlier-models-paw/models'

skf = StratifiedKFold(n_splits=cfg.n_splits)

for fold, (train_idx, val_idx) in enumerate(skf.split(df['Id'], df['middle'])):

    if PATH_TO_MODELS is None:

        ##############
        # Initialize #
        ##############

        dm = PDataModule(cfg, df, train_idx, val_idx)
        model = PModel(cfg)

        wandb_logger = WandbLogger(
            project = 'pawpular_outlier_prediction',
            config = cfg,
            reinit = True,
            group = 'aug + CosineAnnealingLR + ratio:1',
            name = f'fold_{fold}',
        )
        checkpoint = ModelCheckpoint(
            dirpath = '/kaggle/working/models',
            filename = f'fold_{fold}',
            monitor = 'val_loss',
            save_top_k = 1
        )
        earlystopping = EarlyStopping(
            monitor = 'val_loss',
            patience = 4,
        )

        trainer = pl.Trainer(
            fast_dev_run = False,
            gpus = 1 if torch.cuda.is_available() else 0,
            logger = wandb_logger,
            callbacks = [checkpoint, earlystopping],
            progress_bar_refresh_rate = 5,
            log_every_n_steps = 3
        )

        #########
        # Train #
        #########

        trainer.fit(model, dm)
        wandb.finish()

        # Evaluate the checkpoint this run just produced. Joining a path onto
        # PATH_TO_MODELS is impossible in this branch because it is None.
        ckpt_path = checkpoint.best_model_path
    else:
        # Evaluate the stored checkpoint of this fold.
        ckpt_path = os.path.join(PATH_TO_MODELS, f'fold_{fold}-v1.ckpt')

    ############################
    # Validate on whole val set#
    ############################

    model = PModel.load_from_checkpoint(ckpt_path, cfg=cfg)
    trainer = pl.Trainer(gpus = 1 if torch.cuda.is_available() else 0)

    # Unlike training, scoring uses every validation row (no balancing).
    whole_validation_set = PDataset(cfg, df.loc[val_idx])
    whole_validation_loader = DataLoader(whole_validation_set, batch_size=64)

    y_hat = trainer.predict(model, whole_validation_loader)
    preds.append(y_hat)

In [None]:
# Pawpularity value predicted for each class index. The order follows the
# target columns used by the dataset: '<10', '100', 'middle'.
_CLASS_MEANS = (mean_less_10, mean_100, mean_middle)
pred_to_val = dict(enumerate(_CLASS_MEANS))

def compute_rmse(y_hat, threshold, class_values=None, baseline=None):
    """Score the thresholded outlier model against a constant baseline.

    For every sample the class with the highest softmax probability is used
    only when that probability exceeds ``threshold``; otherwise the 'middle'
    class (index 2) is predicted, so the model never bets on an outlier it is
    unsure about. The predicted class is mapped to a Pawpularity value and the
    root mean squared error against the true scores is computed.

    Args:
        y_hat: iterable of dicts with 'y_hat' (outputs for three classes, one
            row per sample) and 'paw' (true Pawpularity values).
        threshold: minimum top probability required to trust a prediction.
        class_values: mapping of class index 0/1/2 to the value predicted for
            it; defaults to the module-level ``pred_to_val``.
        baseline: constant predicted by the reference model; defaults to the
            module-level ``mean_base``.

    Returns:
        Tuple ``(baseline_rmse, model_rmse, outlier_count)``: two 0-d tensors
        and the number of samples for which class 0 or 1 was predicted. Both
        RMSE values are NaN when ``y_hat`` holds no batches.
    """
    if class_values is None:
        class_values = pred_to_val
    if baseline is None:
        baseline = mean_base

    fallback_class = 2
    value_table = torch.tensor(
        [float(class_values[k]) for k in range(3)], dtype=torch.float64
    )

    predicted = []
    targets = []
    outlier_count = 0
    for output in y_hat:
        logits = torch.as_tensor(output['y_hat']).detach().cpu()
        paw = torch.as_tensor(output['paw']).detach().cpu()
        # A lone sample may arrive as a 1-D vector; restore the batch axis so
        # the softmax is always taken over the class dimension.
        logits = logits.reshape(-1, logits.shape[-1])
        probs = torch.softmax(logits.float(), dim=-1)
        top_prob, top_class = probs.max(dim=-1)
        # If the model is *threshold* certain about its prediction, we take
        # this prediction; otherwise we fall back to the 'middle' class.
        confident = top_prob > threshold
        chosen = torch.where(
            confident, top_class, torch.full_like(top_class, fallback_class)
        )
        predicted.append(value_table[chosen])
        targets.append(paw.reshape(-1).to(torch.float64))
        # Count class decisions directly rather than comparing floats.
        outlier_count += int((chosen != fallback_class).sum())

    if not targets:
        nan = torch.tensor(float('nan'))
        return nan, nan, 0

    outlier_baseline = torch.cat(predicted)
    pawpularity = torch.cat(targets)

    # RMSE takes the square root of the *mean* squared error; taking the root
    # per element first would yield the mean absolute error instead.
    mean_baseline_rmse = torch.sqrt(torch.mean((float(baseline) - pawpularity) ** 2))
    outlier_baseline_rmse = torch.sqrt(torch.mean((outlier_baseline - pawpularity) ** 2))

    return mean_baseline_rmse, outlier_baseline_rmse, outlier_count

In [None]:
# Confidence thresholds to evaluate, and the metrics collected for each one.
threshold_values = [0, 0.2, 0.4, 0.6, 0.8, 1]
baseline_values = []
outlier_values = []
outlier_count = []

# `preds` holds one list of prediction batches per fold. Flatten it so the
# metrics cover every fold's validation set; `y_hat` alone only contains the
# batches of the last fold.
all_fold_outputs = [batch for fold_outputs in preds for batch in fold_outputs]

for confidence in threshold_values:
    bv, ov, oc = compute_rmse(all_fold_outputs, confidence)
    baseline_values.append(bv)
    outlier_values.append(ov)
    outlier_count.append(oc)

In [None]:
# Left panel: error of both models per threshold.
# Right panel: how many outliers the model predicted at each threshold.
fig, ax = plt.subplots(1,2, figsize=(18,7))
rmse_ax, count_ax = ax

curves = (
    (outlier_values, 'Outlier Model'),
    (baseline_values, 'Mean Baseline'),
)
for curve, curve_label in curves:
    rmse_ax.plot(threshold_values, curve, label=curve_label)
rmse_ax.set_xlabel('Threshold value', fontsize=18)
rmse_ax.legend(prop={'size': 18})
rmse_ax.set_title('CV RMSE', fontsize=18)

# One bar per threshold, labelled with the threshold it belongs to.
bar_positions = np.arange(len(outlier_count))
count_ax.bar(bar_positions, outlier_count)
count_ax.set(xticks=bar_positions, xticklabels=threshold_values)
count_ax.set_xlabel('Threshold value', fontsize=18)
count_ax.set_title('Outliers predicted', fontsize=18)

fig.tight_layout()
fig.show()