# Competition Task
The objective of this competition is to predict a popularity score ("Pawpularity") for a pet, given its picture and accompanying metadata. The metadata describes aesthetic features of the picture and is hand-labeled.

# Competition Metric
Submissions are scored on the __Root Mean Squared Error__

# Code Requirements
- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 9 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named submission.csv

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os, random, math
import itertools
from pathlib import Path

from tqdm import tqdm
import gc

import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from plotly.offline import iplot

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

# Default size and title font for every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams['axes.titlesize'] = 12

# List what is available in the competition input folder.
print(os.listdir('../input/petfinder-pawpularity-score/'))
   
from time import time, strftime, gmtime

# Wall-clock start (seconds since the epoch); read again in the last cell to
# report the notebook's total run time.
start = time()
print(start)

import datetime
print(str(datetime.datetime.now()))

import warnings
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Root folder of the competition data; the CSV files and the train/ image
# folder are read relative to it.
base_dir = '../input/petfinder-pawpularity-score/'

In [None]:
# Load the training table and preview it.
train = pd.read_csv(f"{base_dir}train.csv")
print(train.shape)
train.head()

In [None]:
# Load the test table and preview it.
test = pd.read_csv(f"{base_dir}test.csv")
print(test.shape)
test.head()

In [None]:
# Load the sample submission; its 'Pawpularity' column is overwritten with
# the model's predictions at the end of the notebook.
sub = pd.read_csv(f"{base_dir}sample_submission.csv")
print(sub.shape)
sub.head()

In [None]:
# Print column dtypes / non-null counts for both tables. The calls are made
# as separate statements because `DataFrame.info()` prints and returns None;
# wrapping both in a tuple made the cell echo a meaningless `(None, None)`.
train.info()
test.info()

In [None]:
# Summary statistics, one row per numeric column.
train.describe().transpose()

In [None]:
# Histogram of the regression target with a KDE overlay.
plt.figure(figsize = (16, 8))
sns.histplot(data = train['Pawpularity'], kde = True, color = 'green');

In [None]:
# One count plot per hand-labelled metadata flag, each bar annotated with its
# share of the training rows.
fig, axes = plt.subplots(4, 3, figsize = (20, 16))
axes = axes.ravel()

palette = itertools.cycle(sns.color_palette())

# The metadata flags sit directly after 'Id' (assumes the CSV layout is
# Id, 12 flags, Pawpularity - the same 12 names as config['CAT_COLS']).
# Slicing from the front instead of `[1:-2]` keeps the last flag in the grid
# when this cell runs before any extra column has been appended to `train`,
# and stays correct after columns are appended.
meta_cols = train.columns[1:13]
n_rows = len(train)

for ax, col in zip(axes, meta_cols):
    sns.countplot(data = train, x = col, ax = ax, color = next(palette))
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above each bar.
        ax.text(p.get_x() + p.get_width() / 2.0, height + 3,
                f"{round(100 * height / n_rows, 2)}%",
                ha = 'center')
fig.tight_layout()

<font size = 4> Binning the pawpularity score into 4 categories</font>

In [None]:
# Split the score range into four equal-width buckets with readable labels.
bins = [0, 25, 50, 75, 100]
paw_labels = ['Not So Good', 'Average', 'Good', 'Great']
train['paw_binnned'] = pd.cut(train['Pawpularity'], labels = paw_labels, bins = bins)

In [None]:
# Count plot of the binned score, each bar annotated with its share of rows.
# The column is passed by keyword: the first positional parameter of
# `sns.countplot` is `data`, so a positional Series is not treated as `x`.
ax = sns.countplot(data = train, x = 'paw_binnned')
ax.set_title('Distribution of Pawpularity Score - Binned')
n_rows = len(train['paw_binnned'])
for p in ax.patches:
    height = p.get_height()
    # Percentage label centred just above each bar.
    ax.text(p.get_x() + p.get_width() / 2.0, height + 3,
            f"{round(100 * height / n_rows, 2)}%",
            ha = 'center')

- We see that most of the scores fall in the 25 to 50 range
- The next largest category is the 0 to 25 range
- We'll visualize a few images from each of these categories to better understand how they're scored

In [None]:
def pawlot_helper(nrows: int, ncols: int, category: str):
    """Show a grid of random training photos from one binned score category.

    ``nrows * ncols`` ids are drawn with ``np.random.choice`` from the rows of
    the global ``train`` frame whose 'paw_binnned' value equals ``category``.
    Each photo is read from ``{base_dir}train/`` and titled with the category
    and the photo's Pawpularity score.
    """
    in_category = train['paw_binnned'] == category
    sampled_ids = np.random.choice(train[in_category]['Id'], nrows * ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize = (16, 14))
    for ax, pet_id in zip(axes.ravel(), sampled_ids):
        bgr = cv2.imread(f"{base_dir}train/{pet_id}.jpg")
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        score = train[train['Id'] == pet_id]['Pawpularity'].values[0]
        ax.set_title(f"{category} - {score}")

    plt.show()

In [None]:
# 3x3 random sample from the top score bucket.
pawlot_helper(nrows = 3, ncols = 3, category = 'Great')

In [None]:
# 3x3 random sample from the 'Good' bucket.
pawlot_helper(nrows = 3, ncols = 3, category = 'Good')

In [None]:
# 3x3 random sample from the 'Average' bucket.
pawlot_helper(nrows = 3, ncols = 3, category = 'Average')

In [None]:
# 3x3 random sample from the lowest score bucket.
pawlot_helper(nrows = 3, ncols = 3, category = 'Not So Good')

- Some pictures that look like they deserve a high score were given low scores - possibly a manual labeling error
- Pictures are of different sizes - needs resizing before feeding into a NN

- #### Let's check the pets' pictures by category one by one:

In [None]:
def category_plot_helper(df: pd.DataFrame, nrows: int, ncols: int, category: str):
    """Show a grid of random training photos whose ids come from ``df``.

    ``nrows * ncols`` ids are drawn with ``np.random.choice`` from ``df['Id']``.
    Every photo is read from ``{base_dir}train/``, resized to 224x224 for
    display and titled with ``category`` and its id.
    """
    sampled_ids = np.random.choice(df['Id'], nrows * ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize = (20, 10))
    for ax, pet_id in zip(axes.ravel(), sampled_ids):
        photo = cv2.imread(f"{base_dir}train/{pet_id}.jpg")
        # OpenCV loads BGR; matplotlib expects RGB.
        photo = cv2.cvtColor(photo, cv2.COLOR_BGR2RGB)
        ax.imshow(cv2.resize(photo, (224, 224)))
        ax.set_title(f"{category} \n {pet_id}")
        ax.axis('off')
    fig.tight_layout()
    plt.show()

# Focus 
- Pet stands out against uncluttered background, not too close / far.

In [None]:
# Six random photos with the 'Subject Focus' flag set, then six without it.
for flag_value, title in ((1, 'Focus'), (0, 'No Focus')):
    subset = train.loc[train['Subject Focus'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

# Eyes 
- Both eyes are facing front or near-front, with at least 1 eye / pupil decently clear.

In [None]:
# Six random photos with the 'Eyes' flag set, then six without it.
for flag_value, title in ((1, 'Eyes'), (0, 'No Clear Eyes')):
    subset = train.loc[train['Eyes'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

# Face 
- Decently clear face, facing front or near-front.

In [None]:
# Six random photos with the 'Face' flag set, then six without it.
for flag_value, title in ((1, 'Face'), (0, 'No Clear Face')):
    subset = train.loc[train['Face'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

# Near 
- Single pet taking up significant portion of photo (roughly over 50% of photo width or height).

In [None]:
# Six random photos with the 'Near' flag set, then six without it.
for flag_value, title in ((1, 'Near'), (0, 'No Near')):
    subset = train.loc[train['Near'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

# Group 
- More than 1 pet in the photo.

In [None]:
# Six random photos with the 'Group' flag set, then six without it.
for flag_value, title in ((1, 'Group'), (0, 'No Group')):
    subset = train.loc[train['Group'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

# Occlusion 
- Specific undesirable objects blocking part of the pet (i.e. human, cage or fence). Note that not all blocking objects are considered occlusion.

In [None]:
# Six random photos with the 'Occlusion' flag set, then six without it.
for flag_value, title in ((1, 'Occlusion'), (0, 'No Occlusion')):
    subset = train.loc[train['Occlusion'] == flag_value, ['Id']].copy()
    category_plot_helper(subset, 1, 6, title)

del subset, flag_value, title
_ = gc.collect()

In [None]:
# Photos whose flag columns all sum to zero, plus eight random examples.
# NOTE(review): the `1:-2` slice depends on the column layout of `train` at
# the moment this cell runs - confirm it still covers exactly the flag columns.
flag_total = train.iloc[:, 1:-2].sum(axis = 1)
temp = train[flag_total == 0]
print(f"Num of pets scoring 0 in all the categories: {len(temp)}")
fig, axes = plt.subplots(2, 4, figsize = (16, 8))
axes = axes.ravel()
sampled_ids = np.random.choice(temp['Id'], 8)
for ax, idx in zip(axes, sampled_ids):
    photo = cv2.imread(f"{base_dir}train/{idx}.jpg")
    # OpenCV loads BGR; matplotlib expects RGB.
    photo = cv2.cvtColor(photo, cv2.COLOR_BGR2RGB)
    ax.imshow(cv2.resize(photo, (224, 224)))
    ax.set_title(f"{idx}")
    ax.axis('off')
fig.tight_layout()

# Modeling and Prediction

In [None]:
!pip install -q efficientnet >> /dev/null

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K

import efficientnet.tfkeras as efn

import yaml
from kaggle_datasets import KaggleDatasets

In [None]:
# Ask Kaggle for the GCS path of the competition dataset; the image paths in
# the next cell are built on top of it.
GCS_PATH  = KaggleDatasets().get_gcs_path('petfinder-pawpularity-score')
GCS_PATH

In [None]:
# Full image path for every row: <GCS_PATH>/<split>/<Id>.jpg
for frame, split in ((train, 'train'), (test, 'test')):
    frame['img_path'] = GCS_PATH + f'/{split}/' + frame['Id'] + '.jpg'

In [None]:
# Central experiment configuration. It is written to config.yaml right after
# the literal so the settings of a run are saved next to its outputs.
config = {
    'DEBUG': False,                  # True -> train on a 20% sample of `train`
    'DIR': base_dir,
    'DEVICE': 'TPU',                 # downgraded to "GPU" when no TPU can be reached
    'EPOCHS': 15,
    'MODEL': 'efn.EfficientNetB2',   # only printed; create_model() hard-codes EfficientNetB2
    'FOLDS': 5,
    'SEED': 777,
    'VERBOSE': 1,
    'BATCH_SIZE': 16,                # per replica; multiplied by REPLICAS before use
    'IMG_SIZE': 512,                 # images are resized to IMG_SIZE x IMG_SIZE
    'LOSS': 'RMSE',
    'OPT': 'Adam',
    'SCHEDULER': 'exp', # LR decay after warm-up: 'exp' or 'cosine' (see get_lr_callback)
    
    # Random flips
    'hflip': True, 
    'vflip': False,
    
    'clip': False,     # clip augmented pixel values back into [0, 1]
    
    # Dropout augmentation parameters; only referenced by the commented-out
    # dropout call in build_augmenter.
    'drop_prob': 0.75,
    'drop_cnt': 10,
    'drop_size': 0.05,

    # Saturation / contrast ranges and brightness / hue max deltas for the
    # random colour augmentations.
    'sat': [0.7, 1.3],
    'cont': [0.8, 1.2],
    'bri': 0.15,
    'hue': 0.05,
    
    'CAT_COLS': ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                   'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'],
    'TARGET_COL': ['Pawpularity']
}

# Persist the configuration used for this run.
with open(r'config.yaml', 'w') as f:
    yaml.dump(config, f)

In [None]:
def seeding(SEED):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy, Python's ``random`` module and TensorFlow with ``SEED``,
    exports ``PYTHONHASHSEED`` and asks TensorFlow for deterministic cuDNN
    kernels.

    Args:
        SEED: integer seed shared by all generators.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it must be
    # '1' / 'true' to take effect, so writing the seed value into it
    # (e.g. '777') did not enable deterministic kernels.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    tf.random.set_seed(SEED)
    print('seeding done!!!')
seeding(config['SEED'])

### TPU Config

In [None]:
# Pick the tf.distribute strategy: a TPU strategy when a TPU can be reached
# and initialised, otherwise TensorFlow's default (CPU / single GPU) strategy.
if config['DEVICE'] == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # This handler used to read `except _:`, which is not an exception
            # class, so a failed initialisation raised a second error instead
            # of being reported. Report it and fall back to the default
            # strategy so that `strategy` is always defined below.
            print("failed to initialize TPU")
            print(err)
            config['DEVICE'] = "GPU"
    else:
        config['DEVICE'] = "GPU"

if config['DEVICE'] != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if config['DEVICE'] == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
# Number of devices the strategy replicates over; used to scale the batch size.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
#Calculate bins for folds
# Number of bins from Sturges' rule, floor(1 + log2(n)); the resulting integer
# bin index is what the folds are stratified on.
n_samples = len(train)
num_bins = int(np.floor(1 + np.log2(n_samples)))
print(num_bins)
train['bins'] = pd.cut(train['Pawpularity'], bins = num_bins, labels = False)
train.head(2)

In [None]:
# Assign every training row to a validation fold, stratified on the binned
# target computed in the previous cell.
skf = StratifiedKFold(n_splits = config['FOLDS'], shuffle = True, random_state = config['SEED'])
for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['bins'])):
    # val_idx holds positions; using them with .loc assumes `train` still has
    # its default 0..n-1 index - TODO confirm if the frame is ever re-indexed.
    train.loc[val_idx, 'folds'] = fold
# NOTE: `trn_idx` from the last iteration stays in scope and is read later by
# the training loop when it computes steps_per_epoch.
# Fold x bin sizes, to eyeball the stratification.
train.groupby(['folds', 'bins']).size()

# Data Pipeline

In [None]:
#Thanks to @awsaf for his boilerplate data pipeline

def build_decoder(with_labels = True, target_size = config['IMG_SIZE'], ext = 'jpg'):
    """Create the function that turns an image path into a model-ready tensor.

    The returned function reads the file, decodes it as a 3-channel PNG or
    JPEG according to ``ext``, resizes it to ``target_size`` x ``target_size``
    and scales pixel values by 1/255.

    Args:
        with_labels: when true, the returned function takes ``(path, label)``
            and also returns the label cast to float32.
        target_size: side length of the square output image.
        ext: file extension, one of 'png', 'jpg' or 'jpeg'.

    Returns:
        ``decode_with_labels`` if ``with_labels`` else ``decode``.

    Raises:
        ValueError: from the returned function, for an unsupported ``ext``.
    """
    def decode(path):
        raw = tf.io.read_file(path)
        if ext == 'png':
            pixels = tf.image.decode_png(raw, channels = 3)
        elif ext in ['jpg', 'jpeg']:
            pixels = tf.image.decode_jpeg(raw, channels = 3)
        else:
            raise ValueError("Image extension not supported")

        pixels = tf.image.resize(pixels, (target_size, target_size))
        pixels = tf.cast(pixels, tf.float32) / 255.0
        # Pin the static shape so downstream ops see a fully defined tensor.
        return tf.reshape(pixels, [target_size, target_size, 3])

    def decode_with_labels(path, label):
        return decode(path), tf.cast(label, tf.float32)

    if with_labels:
        return decode_with_labels
    return decode


def build_augmenter(with_labels = True, dim = config['IMG_SIZE']):
    """Create the random-augmentation function applied to training images.

    The returned function applies, in order: optional horizontal / vertical
    flips, random hue, saturation, contrast and brightness (ranges taken from
    ``config``), and an optional clip to [0, 1].

    Args:
        with_labels: when true, the returned function takes ``(img, label)``
            and passes the label through unchanged.
        dim: side length of the square image, used to pin the output shape.

    Returns:
        ``augment_with_labels`` if ``with_labels`` else ``augment``.
    """
    def augment(img, dim = dim):
        if config['hflip']:
            img = tf.image.random_flip_left_right(img)
        if config['vflip']:
            img = tf.image.random_flip_up_down(img)
        img = tf.image.random_hue(img, config['hue'])
        sat_low, sat_high = config['sat'][0], config['sat'][1]
        img = tf.image.random_saturation(img, sat_low, sat_high)
        cont_low, cont_high = config['cont'][0], config['cont'][1]
        img = tf.image.random_contrast(img, cont_low, cont_high)
        img = tf.image.random_brightness(img, config['bri'])
        #img = dropout(img, DIM = dim, PROBABILITY = config['drop_prob'], CT = config['drop_cnt'], SZ = config['drop_size'])
        if config['clip']:
            img = tf.clip_by_value(img, 0, 1)
        return tf.reshape(img, [dim, dim, 3])

    def augment_with_labels(img, label):
        return augment(img), label

    if with_labels:
        return augment_with_labels
    return augment


def build_dataset(paths, labels = None, batch_size = 32, cache = True, decode_fn = None, augment_fn = None,
                  augment = True, repeat = True, shuffle = 1024, cache_dir = "", drop_remainder = False):
    """Build the tf.data input pipeline for a list of image paths.

    Pipeline order: decode -> cache -> repeat -> shuffle -> augment -> batch
    -> prefetch.

    Args:
        paths: image file paths.
        labels: optional targets aligned with ``paths``; when None the
            dataset yields images only.
        batch_size: number of examples per batch.
        cache: cache decoded images (in memory when ``cache_dir`` is "",
            otherwise in ``cache_dir``).
        decode_fn: path -> image function; defaults to ``build_decoder``.
        augment_fn: augmentation function; defaults to ``build_augmenter``.
        augment: apply ``augment_fn``.
        repeat: repeat the dataset indefinitely.
        shuffle: shuffle buffer size. Falsy disables shuffling; ``True`` is
            accepted as "use the default buffer size" (1024).
        cache_dir: directory for the on-disk cache.
        drop_remainder: drop the last incomplete batch.

    Returns:
        The batched, prefetched ``tf.data.Dataset``.
    """
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok = True)

    has_labels = labels is not None
    if decode_fn is None:
        decode_fn = build_decoder(has_labels)

    if augment_fn is None:
        augment_fn = build_augmenter(has_labels)

    # `shuffle` is both the on/off switch and the buffer size. Callers in this
    # notebook pass `shuffle = True`; forwarding that bool to Dataset.shuffle
    # would request a buffer of size True (at best a one-element buffer, i.e.
    # no real shuffling), so a bare True is mapped to the default buffer size.
    # The buffer holds decoded images, so larger sizes cost host memory.
    if shuffle is True:
        shuffle_buffer = 1024
    elif shuffle:
        shuffle_buffer = int(shuffle)
    else:
        shuffle_buffer = 0

    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls = AUTO)
    ds = ds.cache(cache_dir) if cache else ds
    ds = ds.repeat() if repeat else ds
    if shuffle_buffer:
        ds = ds.shuffle(shuffle_buffer, seed = config['SEED'])
        # Element order does not matter once shuffled, so let tf.data trade
        # determinism for throughput.
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)
    ds = ds.map(augment_fn, num_parallel_calls = AUTO) if augment else ds
    ds = ds.batch(batch_size, drop_remainder = drop_remainder)
    ds = ds.prefetch(AUTO)
    return ds

In [None]:
#Sanity check
def plot_dataset(dataset, row: int, col: int):
    """Plot the first ``row * col`` images of the dataset's first batch.

    Each image is titled with its label. The batch must contain at least
    ``row * col`` examples.
    """
    n_tiles = row * col
    for images, targets in dataset.take(1):
        plt.figure(figsize = (16, 12))
        for tile in range(n_tiles):
            plt.subplot(row, col, tile + 1)
            plt.imshow(images[tile].numpy())
            plt.title(f"Pawpularity: {targets[tile].numpy()}", color = 'r')
            plt.axis('off')
            plt.grid(False)
        plt.show()

In [None]:
# Sanity check: one shuffled, augmented batch built from (up to) the first
# 2000 rows of fold 0.
fold = 0
fold_df = train[train['folds'] == fold].head(2000)
paths  = list(fold_df['img_path'])
labels = fold_df[config['TARGET_COL']].to_numpy()
dataset = build_dataset(paths, labels, batch_size = config['BATCH_SIZE'] * REPLICAS, cache = False,
                        repeat = True, shuffle = True, augment = True)
plot_dataset(dataset, 3, 3)

In [None]:
# Sanity check: one un-shuffled, un-augmented batch built from (up to) the
# first 2000 rows outside fold 0.
fold = 0
fold_df = train[train['folds'] != fold].head(2000)
paths  = list(fold_df['img_path'])
labels = fold_df[config['TARGET_COL']].to_numpy()
dataset = build_dataset(paths, labels, batch_size = config['BATCH_SIZE'] * REPLICAS, cache = True,
                        repeat = False, shuffle = False, augment = False)
plot_dataset(dataset, 3, 3)

# Release the pipeline before the models are built.
del dataset
_ = gc.collect()

In [None]:
def get_lr_callback(batch_size = 8, plot = False):
    """Build the learning-rate schedule callback used for training.

    The schedule ramps linearly from ``lr_start`` to ``lr_max`` over
    ``lr_ramp_ep`` epochs, optionally holds ``lr_max`` for ``lr_sus_ep``
    epochs, then decays towards ``lr_min`` - exponentially or along a cosine
    curve, depending on ``config['SCHEDULER']``.

    Args:
        batch_size: batch size used to scale ``lr_max`` (together with the
            global ``REPLICAS``).
        plot: when true, plot the schedule over ``config['EPOCHS']`` epochs.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` wrapping the schedule.

    Raises:
        ValueError: from the schedule function, once the decay phase starts,
            if ``config['SCHEDULER']`` is neither 'exp' nor 'cosine'.
    """
    lr_start   = 0.000005
    lr_max     = 0.00000125 * REPLICAS * batch_size
    lr_min     = 0.000001
    lr_ramp_ep = 5
    lr_sus_ep  = 0
    lr_decay   = 0.8

    def lrfn(epoch):
        if epoch < lr_ramp_ep:
            # Linear warm-up.
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start

        elif epoch < lr_ramp_ep + lr_sus_ep:
            # Hold at the peak.
            lr = lr_max

        elif config['SCHEDULER'] == 'exp':
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

        elif config['SCHEDULER'] == 'cosine':
            decay_total_epochs = config['EPOCHS'] - lr_ramp_ep - lr_sus_ep + 3
            decay_epoch_index = epoch - lr_ramp_ep - lr_sus_ep
            phase = math.pi * decay_epoch_index / decay_total_epochs
            cosine_decay = 0.5 * (1 + math.cos(phase))
            lr = (lr_max - lr_min) * cosine_decay + lr_min

        else:
            # Without this branch an unknown scheduler name left `lr` unbound
            # and surfaced as an opaque UnboundLocalError mid-training.
            raise ValueError(
                f"Unknown SCHEDULER {config['SCHEDULER']!r}; expected 'exp' or 'cosine'")
        return lr

    if plot:
        epochs = np.arange(config['EPOCHS'])
        plt.figure(figsize = (10, 5))
        plt.plot(epochs, [lrfn(epoch) for epoch in epochs], marker = 'o')
        plt.xlabel('epoch')
        plt.ylabel('learnig rate')
        plt.title('Learning Rate Scheduler')
        plt.show()

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)

    return lr_callback

_ = get_lr_callback(config['BATCH_SIZE'], plot = True )

In [None]:
#TF RMSE Loss
def RMSE(y_true, y_pred):
    """Root mean squared error over all elements, as a TensorFlow scalar."""
    residual = tf.subtract(y_true, y_pred)
    mean_squared = tf.math.reduce_mean(tf.math.square(residual))
    return tf.math.sqrt(mean_squared)

In [None]:
def create_model(dim = config['IMG_SIZE']):
    """Build and compile the regression network inside the strategy scope.

    An ImageNet-initialised EfficientNetB2 backbone (no top) is followed by
    global average pooling, a 64-unit SELU layer and a single linear output.
    The model is compiled with Adam, the custom ``RMSE`` loss and a metric
    named 'rmse'.

    Args:
        dim: side length of the square RGB input.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    with strategy.scope():
        backbone = efn.EfficientNetB2(weights = 'imagenet',
                                      include_top = False,
                                      input_shape = (dim, dim, 3))

        pooled = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
        hidden = tf.keras.layers.Dense(64, activation = 'selu')(pooled)
        score = tf.keras.layers.Dense(1)(hidden)

        model = tf.keras.Model(inputs = backbone.input, outputs = score)

        # The learning rate set here is only the starting value; training
        # passes a LearningRateScheduler callback to fit().
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
                      loss = RMSE,
                      metrics = tf.keras.metrics.RootMeanSquaredError(name = 'rmse'))

    return model

In [None]:
# Build the network once just to print its layer summary, then release it.
model = create_model(config['IMG_SIZE'])
model.summary()

del model
_ = gc.collect()

# Model Training

In [None]:
if config['DEBUG']:
    # Quick-run mode: keep a random 20% of the rows and renumber the index.
    debug_rows = train.sample(frac = 0.2)
    train = debug_rows.reset_index(drop = True)
    print(train.shape)

In [None]:
def plot_history(history):
    """Plot train vs. validation curves for the loss and the 'rmse' metric.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain 'loss', 'val_loss', 'rmse' and 'val_rmse'.
    """
    panels = (('loss', 'Loss: RMSE Loss'), ('rmse', 'Metric: RMSE'))
    for position, (key, title) in enumerate(panels, start = 1):
        val_key = f'val_{key}'
        plt.subplot(1, 2, position)
        plt.plot(history.history[key])
        plt.plot(history.history[val_key])
        plt.legend([key, val_key])
        plt.title(title)
    plt.show()

In [None]:
# Cross-validated training: for each fold, train on the other folds, keep the
# best checkpoint by validation RMSE, then predict the fold (OOF) and the
# test set.
oof_preds = []
oof_scores = []
test_preds = []
# Global batch size: per-replica batch times the number of replicas.
batch_size = config['BATCH_SIZE'] * REPLICAS

for fold in range(config['FOLDS']):
    print('#############' * 10)
    print(f"Fold: {fold + 1}")
    train_df = train[train['folds'] != fold].copy()
    valid_df = train[train['folds'] == fold].copy()
    print(f"Training with Model: {config['MODEL']}; Image Size: {config['IMG_SIZE']}; Batch Size: {batch_size}")
    print(f"Num of Train Images: {len(train_df)}; Num of Valid Images: {len(valid_df)}")
    print()
    train_paths = train_df['img_path'].values
    train_labels = train_df['Pawpularity'].values
    valid_paths = valid_df['img_path'].values
    valid_labels = valid_df['Pawpularity'].values
    #print(train_paths.shape, train_labels.shape, valid_paths.shape, valid_labels.shape)

    K.clear_session()
    checkpoint = tf.keras.callbacks.ModelCheckpoint(f"paw_model_{fold}.h5", monitor = 'val_rmse', verbose = 1, save_best_only = True,
                                                    save_weights_only = False, mode = 'min', save_freq = 'epoch')
    train_dataset = build_dataset(train_paths, train_labels, cache = False, batch_size = batch_size,
                                   repeat = True, shuffle = True, augment = True)
    valid_dataset = build_dataset(valid_paths, valid_labels, cache = True, batch_size = batch_size,
                                   repeat = False, shuffle = False, augment = False)

    model = create_model(dim = config['IMG_SIZE'])

    # One epoch = one pass over this fold's training rows. The step count is
    # taken from `train_df` (not from the `trn_idx` left over by the
    # fold-split cell, which describes the last split only and goes stale in
    # DEBUG mode). `batch_size` is already the global batch, so it is not
    # divided by REPLICAS a second time. Integer result, at least 1.
    steps_per_epoch = max(1, len(train_df) // batch_size)

    # NOTE(review): get_lr_callback multiplies its argument by REPLICAS again,
    # so passing the global batch size scales lr_max by REPLICAS twice (the
    # schedule plotted earlier used the per-replica size) - confirm intent.
    print('Model Training...')
    history = model.fit(
                        train_dataset,
                        epochs = config['EPOCHS'],
                        callbacks = [checkpoint, get_lr_callback(batch_size)],
                        steps_per_epoch = steps_per_epoch,
                        validation_data = valid_dataset,
                        verbose = 1
                    )
    print('Load best model for prediction...')
    model.load_weights(f"paw_model_{fold}.h5")
    print('Predict Valid - OOF...')
    valid_preds = model.predict(valid_dataset, batch_size = batch_size, verbose = 1)
    oof_preds.append(valid_preds)

    # Compute the fold score once and keep it as a plain float so it can be
    # formatted and serialised without carrying a TensorFlow tensor around.
    fold_rmse = float(RMSE(valid_labels.reshape(-1, 1).astype(np.float32), valid_preds))
    print(f"Fold RMSE: {fold_rmse:0.4f}")
    oof_scores.append(fold_rmse)

    #plot history
    plot_history(history)

    print('Predict on Test dataset...')
    test_paths = test['img_path'].values
    test_dataset = build_dataset(test_paths, labels = None, cache = True, batch_size = batch_size,
                               repeat = False, shuffle = False, augment = False)

    test_preds.append(model.predict(test_dataset, batch_size = batch_size, verbose = 1))

    # Free the model and pipelines before the next fold.
    del model, train_dataset, valid_dataset
    _ = gc.collect()

In [None]:
#save parameters
# Record the per-fold validation scores in the saved configuration.
# They are converted to plain Python floats first: RMSE() returns TensorFlow
# scalars, which do not serialise to readable YAML.
config['SCORES'] = [float(score) for score in oof_scores]
with open(r'config.yaml', 'w') as f:
    yaml.dump(config, f)

In [None]:
# Flatten the per-fold validation predictions into one 1-D array
# (fold order, not the row order of `train`).
stacked_oof = np.concatenate(oof_preds)
oof_pred = stacked_oof.reshape(-1)

In [None]:
# Average the per-fold test predictions and write the submission file.
fold_average = np.mean(test_preds, axis = 0)
preds = fold_average.reshape(-1)
sub['Pawpularity'] = preds
sub.to_csv('./submission.csv', index = False)

In [None]:
# Side-by-side KDEs of the training target, the out-of-fold predictions and
# the averaged test predictions.
fig, ax = plt.subplots(1, 3, figsize = (16, 8))
ax = ax.ravel()

panels = (
    ('Train Target Distribution', train['Pawpularity'], 'green'),
    ('OOF Prediction Distribution', oof_pred, 'blue'),
    ('Test Prediction Distribution', sub['Pawpularity'], 'red'),
)
for axis, (title, values, colour) in zip(ax, panels):
    axis.set_title(title)
    sns.kdeplot(values, shade = True, ax = axis, color = colour)

In [None]:
# Total notebook run time as HH:MM:SS, measured from `start` in the first cell.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))