In [None]:
import os
import sys
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import cv2
import gc
import seaborn as sns
import albumentations as A
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

sys.path.append("../input/efficientnet")

from  efficientnet_pytorch import EfficientNet

1. This kernel is an attempt to understand the behaviour of the trained baseline model: [https://www.kaggle.com/narendra/pawpularity-baseline-submission]
2. The distribution of training scores is roughly Gaussian, so images with Pawpularity <= 10 or > 60 are underrepresented.
3. During baseline training the model may become biased towards the most heavily represented scores (around 20-40), because those images are sampled more often.
4. As a result, images with high scores may be pulled down and images with low scores pushed up, which can make the distribution of predictions differ significantly from the expected one.

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

In [None]:
def seed_everything(seed=10):
    """Seed the NumPy, Python ``random`` and PyTorch generators.

    Args:
        seed: Integer seed applied to all three generators. Defaults to 10,
            the value this notebook has always used.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

seed_everything()

In [None]:
# Folder holding the resized training images and the CSV with the labels.
train_folder = "../input/pawpularity-resize-256/resized"
TRAIN_CSV_PATH = "../input/petfinder-pawpularity-score/train.csv"

train_df = pd.read_csv(TRAIN_CSV_PATH)
print(train_df.shape)

In [None]:
# Bucket Pawpularity into bins of width 10; anything that would land in bin 10+
# is folded into bin 9 so there are at most ten bins (0-9).
train_df['bin_num'] = (train_df['Pawpularity'] // 10).clip(upper=9)
# Regression target: Pawpularity divided by 100.
train_df['score'] = train_df['Pawpularity'] / 100
train_df.head()

In [None]:
def kfold(train_df, k=5):
    """Assign every row to one of ``k`` folds, stratified by ``bin_num``.

    The frame is shuffled with a fixed seed, then each bin's rows are cut into
    ``k`` consecutive slices of ``len(bin) // k`` rows; leftover rows go to the
    last fold. Bins with fewer than ``k`` rows put one row in each of the
    first folds instead of failing.

    Args:
        train_df: DataFrame with at least the columns ``Id`` and ``bin_num``.
            It is not modified.
        k: Number of folds (positive integer).

    Returns:
        A shuffled copy of ``train_df`` with an extra integer ``fold`` column
        holding values in ``[0, k - 1]``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    bins = train_df.bin_num.unique()
    # Fixed random_state keeps the fold assignment reproducible across runs.
    shuffled = train_df.sample(frac=1.0, random_state=22)
    fold_map = {}

    for bin_num in bins:
        image_ids = shuffled[shuffled.bin_num == bin_num].Id.values
        num_images = len(image_ids)
        # A bin smaller than k would give a slice length of 0 (and a zero
        # range step in a naive loop); clamp it to 1 row per fold.
        slice_length = max(num_images // k, 1)
        # Position -> fold; the remainder past k full slices lands in fold k-1.
        fold_nums = np.minimum(np.arange(num_images) // slice_length, k - 1)
        fold_map.update(zip(image_ids, fold_nums.tolist()))

    df = shuffled.copy()
    df['fold'] = df['Id'].map(fold_map)
    return df

# Transformations

In [None]:
# Training-time augmentation pipeline. Every step except the flip and the
# rotation is applied to each image (p=1.0); Normalize runs last.
train_transform=A.Compose([
    A.HorizontalFlip(p=0.5),
    # Named OpenCV constant instead of the bare integer 2 (same value).
    A.Rotate(p=0.7, limit=(-20, 20), border_mode=cv2.BORDER_REFLECT),
    A.RGBShift(p=1.0, r_shift_limit=(-15, 15),
               g_shift_limit=(-15, 15),
               b_shift_limit=(-15, 15)
              ),
    A.RandomBrightnessContrast(p=1.0),
    # Between 5 and 10 holes, each 8-12 px wide and tall.
    A.CoarseDropout(p=1.0,min_holes=5, max_holes=10,
                    min_width=8, max_width=12,
                    min_height=8, max_height=12),
    
    A.Normalize(p=1.0)
])
# Evaluation only normalizes, so predictions are deterministic.
val_transform = A.Compose([A.Normalize(p=1.0)])

# Read Images

In [None]:
def read_image(image_name, phase):
    """Load one image as RGB and apply the transform for the given phase.

    Args:
        image_name: Image id; the file read is ``<train_folder>/<image_name>.jpg``.
        phase: ``'eval'`` applies ``val_transform``; any other value applies
            ``train_transform``.

    Returns:
        The ``'image'`` entry of the albumentations output for the loaded image.

    Raises:
        FileNotFoundError: if OpenCV could not read the file.
    """
    filepath = os.path.join(train_folder, "{}.jpg".format(image_name))
    img = cv2.imread(filepath)
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising, which would otherwise surface as an opaque cvtColor error.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(filepath))
    # OpenCV loads BGR; convert to RGB before the transforms run.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    transform = val_transform if phase == 'eval' else train_transform
    return transform(image=img)['image']

# Model

In [None]:
class Baseline(nn.Module):
    """EfficientNet-b0 feature extractor followed by a small regression head.

    ``forward`` only uses the backbone's ``extract_features`` output, pools it
    to one 1280-dimensional vector per sample, and maps it through ``fc`` and
    a sigmoid, so each output lies in (0, 1).
    """

    def __init__(self):
        super().__init__()
        self.efficient_net = EfficientNet.from_pretrained(
            'efficientnet-b0', include_top=True
        )
        self.avg_pooling = nn.AdaptiveAvgPool2d(1)

        # Layer order is kept as-is so the parameter names (fc.0, fc.1, ...)
        # stay the same.
        head_layers = [
            nn.BatchNorm1d(1280),
            nn.Linear(1280, 512),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return one sigmoid-squashed score per input sample, shape (batch, 1)."""
        feature_maps = self.efficient_net.extract_features(x)
        pooled = self.avg_pooling(feature_maps)
        # Collapse the pooled maps to one flat vector per sample.
        flat = pooled.view(x.size(0), -1)
        return torch.sigmoid(self.fc(flat))

# Dataset and dataloaders

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(image_tensor, score_tensor)`` pairs.

    Args:
        image_ids: Indexable collection of image ids passed to ``read_image``.
        scores: Indexable collection of target scores, aligned with ``image_ids``.
        phase: Forwarded to ``read_image`` to select the transform.
    """

    def __init__(self, image_ids, scores, phase):
        self.image_ids, self.scores, self.phase = image_ids, scores, phase

    def __len__(self):
        """Number of samples, i.e. the number of image ids."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load, transform and tensorise the sample at position ``idx``."""
        pixels = read_image(self.image_ids[idx], self.phase)
        target = self.scores[idx]

        # NOTE(review): transpose(0, 2) swaps the first and last axes, so an
        # (H, W, C) array becomes (C, W, H) rather than (C, H, W). It is kept
        # unchanged because the saved model was presumably trained with the
        # same layout. TODO confirm before switching to permute(2, 0, 1).
        image_tensor = torch.tensor(pixels, dtype=torch.float32).transpose(0, 2)
        score_tensor = torch.tensor(target, dtype=torch.float32)
        return image_tensor, score_tensor

In [None]:
def get_dataloaders(fold_num, df, batch_size=64):
    """Build the train/validation dataloaders for one cross-validation fold.

    Args:
        fold_num: Fold held out for validation; all other folds are used for
            training.
        df: DataFrame with ``Id``, ``score`` and ``fold`` columns.
        batch_size: Batch size for both loaders. Defaults to 64, the value
            that used to be hard-coded.

    Returns:
        ``(train_dataloader, val_dataloader)``. The training loader shuffles;
        the validation loader keeps order and keeps the last partial batch.
    """
    # Compute the fold membership once instead of re-filtering per column.
    is_val = df.fold == fold_num
    train_part = df[~is_val]
    val_part = df[is_val]

    train_dataset = Dataset(train_part.Id.values, train_part.score.values, phase='train')
    val_dataset = Dataset(val_part.Id.values, val_part.score.values, phase='eval')

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    return (train_dataloader, val_dataloader)

# training

In [None]:
def rmse(y, yhat):
    """Root-mean-squared error between targets and predictions.

    Args:
        y: Tensor of target values.
        yhat: Tensor of predictions. If it has the same number of elements as
            ``y`` but a different shape (e.g. ``(B, 1)`` against ``(B,)``), it
            is reshaped to ``y``'s shape first.

    Returns:
        A 0-dim tensor holding ``sqrt(mean((y - yhat) ** 2))``.
    """
    # Without this, (B,) - (B, 1) silently broadcasts to a (B, B) matrix of
    # all pairwise differences and the loss is wrong.
    if y.shape != yhat.shape and y.numel() == yhat.numel():
        yhat = yhat.reshape(y.shape)
    return torch.sqrt(torch.mean((y - yhat) ** 2))

In [None]:
def evaluate(val_dataloader, model):
    """Compute the RMSE of ``model`` over every sample in ``val_dataloader``.

    Squared errors are summed over all batches and divided by the total
    sample count before the square root, so the result is the dataset-level
    RMSE and does not depend on how samples are split into batches.

    Args:
        val_dataloader: Iterable of ``(X, y)`` tensor batches.
        model: Module mapping ``X`` to one prediction per sample; it is put
            into eval mode.

    Returns:
        A 0-dim tensor with the RMSE.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()
    # Run the batches on whatever device the model's parameters live on; a
    # parameter-free model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    sq_err_sum = 0.0
    num_samples = 0
    with torch.no_grad():
        for X, y in val_dataloader:
            if target_device is not None:
                X = X.to(target_device)
                y = y.to(target_device)
            # Match the target's shape so (B, 1) predictions are not broadcast
            # against (B,) targets into a (B, B) matrix.
            yhat = model(X).reshape(y.shape)
            sq_err_sum = sq_err_sum + torch.sum((y - yhat) ** 2)
            num_samples += y.numel()

    if num_samples == 0:
        raise ValueError("val_dataloader yielded no samples")
    return torch.sqrt(sq_err_sum / num_samples)

In [None]:
# Load the saved baseline model and place it on the selected device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. The file presumably holds a whole pickled Baseline module
# (not a state_dict), which needs the Baseline class defined in this notebook
# to deserialize - TODO confirm.
# NOTE(review): newer PyTorch releases may default torch.load to
# weights_only=True, which would reject a full-module pickle - verify against
# the installed version.
model=torch.load("../input/pawpularity-baseline/model1.pth", map_location=device)
model=model.to(device)

In [None]:
# Assign folds, then keep fold 0 as the held-out split analysed below.
train_fold_df = kfold(train_df)
is_val_fold = train_fold_df['fold'] == 0
val_df = train_fold_df.loc[is_val_fold].copy()

val_df.head()

In [None]:

# Wrap the fold-0 rows in an eval-phase dataset and an ordered dataloader so
# predictions line up with the rows of val_df.
val_image_ids = val_df['Id'].to_numpy()
val_scores = val_df['score'].to_numpy()
val_dataset = Dataset(val_image_ids, val_scores, phase='eval')

val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False,
    drop_last=False,
)

In [None]:
# Collect the model's predicted score for every validation image, in
# dataloader order.
yval = []
model.eval()
with torch.no_grad():
    for it, (X, _) in enumerate(val_dataloader):
        if it % 10 == 0:
            # Progress: number of images processed so far.
            print(it * val_dataloader.batch_size)
        # The model lives on `device`; the batch must be moved there too or
        # the forward pass fails when running on CUDA.
        X = X.to(device)
        yhat = model(X)
        yval.extend(yhat.view(-1).tolist())
print(len(yval))

In [None]:
# Attach the predictions and bucket them the same way as `bin_num`
# (width-10 bins on the 0-100 scale, capped at bin 9).
val_df['pred_score'] = yval
# Cast to int so pred_bin has the same dtype as bin_num; floor-dividing a
# float score otherwise yields float categories (3.0 instead of 3) in plots.
val_df['pred_bin'] = (val_df['pred_score'] * 100 // 10).clip(upper=9).astype(int)

val_df.head()

In [None]:
# Overlay the density of true scores and predicted scores on one axis.
score_fig, score_ax = plt.subplots()
sns.kdeplot(data=val_df[['score', 'pred_score']], fill=True, ax=score_ax)
score_ax.set_title("Score (Vs) Predicted Score")
plt.show()

1. The predicted scores are concentrated in the middle region, with a higher peak than the true score distribution.

# Let's check the distributions

In [None]:
# Top panel: true bin counts; bottom panel: predicted bin counts.
_, ax = plt.subplots(nrows=2, ncols=1)
for panel, column in zip(ax, ('bin_num', 'pred_bin')):
    sns.countplot(x=val_df[column], ax=panel)
plt.show()

In [None]:
# For each true bin, show how many images fell into each predicted bin.
_, hue_ax = plt.subplots(figsize=(17, 5))
sns.countplot(data=val_df, x='bin_num', hue='pred_bin', ax=hue_ax)
plt.show()

1. The model effectively ignores images with a Pawpularity score below 10.
2. As expected, there are spikes at predicted classes 2, 3 and 4.

Further improvements:

1. Use classification or a distance-based loss to penalize samples that are predicted far away from their true score.
2. Reduce the bias towards overrepresented samples.
3. Use GANs to generate more of the underrepresented samples.
4. Training is overfitting; better augmentations or auxiliary tasks are needed to improve the learned representations.