# Overview

## Files

*   `train/` -    Folder containing training set photos of the form {id}.jpg, where {id} is a unique Pet Profile ID.
*   `train.csv` - Metadata (described below) for each photo in the training set as well as the target, the photo's Pawpularity score. The `Id` column gives the photo's unique Pet Profile ID, corresponding to the photo's file name.
*    `test/` - Folder containing randomly generated images in a format similar to the training set photos. The actual test data comprises about 6800 pet photos similar to the training set photos.
*    `test.csv` - Randomly generated metadata similar to the training set metadata.
*    `sample_submission.csv` - A sample submission file in the correct format.

##  Photo Metadata

The `train.csv` and `test.csv` files contain metadata for photos in the training set and test set, respectively. 
Each pet photo is labeled with the value of `1` (Yes) or `0` (No) for each of the following features:

*    `Subject Focus` - Pet stands out against uncluttered background, not too close / far.
*    `Eyes` - Both eyes are facing front or near-front, with at least 1 eye / pupil decently clear.
*    `Face` - Decently clear face, facing front or near-front.
*    `Near` - Single pet taking up significant portion of photo (roughly over 50% of photo width or height).
*    `Action` - Pet in the middle of an action (e.g., jumping).
*    `Accessory` - Accompanying physical or digital accessory / prop (i.e. toy, digital sticker), excluding collar and leash.
*    `Group` - More than 1 pet in the photo.
*    `Collage` - Digitally-retouched photo (i.e. with digital photo frame, combination of multiple photos).
*    `Human` - Human in the photo.
*    `Occlusion` - Specific undesirable objects blocking part of the pet (i.e. human, cage or fence). Note that not all blocking objects are considered occlusion.
*    `Info` - Custom-added text or labels (i.e. pet name, description).
*    `Blur` - Noticeably out of focus or noisy, especially for the pet’s eyes and face. For Blur entries, “Eyes” column is always set to 0.


# Setup

In [None]:
import warnings
# Install a global "ignore" filter so that warnings raised later in the
# notebook (e.g. library deprecation notices) are not printed in cell output.
warnings.filterwarnings("ignore")

In [None]:
import os
import numpy as np
import pandas as pd 
import tensorflow as tf
import math
import random
import shutil

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import keras.backend as K

from PIL import Image
from tensorflow import keras
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras.utils import Sequence
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import EfficientNetB0
from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown, Latex

In [None]:
# matplotlib
# Default font sizes for general text, axes titles and tick labels.
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)  
plt.rc('xtick', labelsize=10)  
plt.rc('ytick', labelsize=10)

# seaborn
# NOTE(review): sns.set() applies its own matplotlib rcParams, so it presumably
# overrides the font sizes configured with plt.rc above -- confirm whether the
# seaborn calls should run first.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

# Tensorflow/Keras
# Sets TensorFlow's C++ log filter to level '3', presumably to silence its
# INFO/WARNING/ERROR console output.
# NOTE(review): tensorflow is already imported above; this variable may need
# to be set before that import to take effect -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
class Cfg:
    """Central configuration: file locations, training settings and column names."""
    # Seed passed to train_test_split.
    RANDOM_STATE = 2021
    # Input CSV files and image folders of the competition data set.
    TRAIN_DATA = '../input/petfinder-pawpularity-score/train.csv'
    TEST_DATA = '../input/petfinder-pawpularity-score/test.csv'
    # Path of the sample submission (not referenced by the code in this notebook).
    SUBMISSION = '../input/petfinder-pawpularity-score/sample_submission.csv'    
    IMG_FOLDER = '../input/petfinder-pawpularity-score/train'
    IMG_TEST_FOLDER = '../input/petfinder-pawpularity-score/test'
    # Working folder where resized copies of the images are cached.
    IMG_RESIZE_FOLDER = './resized'
    # Output path of the generated submission CSV.
    SUBMISSION_FILE = './submission.csv'
    
    # Fraction of the training rows that is sampled before splitting (1 = all).
    SAMPLE_FRAC = 1
    # Number of training epochs passed to model.fit.
    NUM_EPOCHS = 10
    # Learning rate of the Adam optimizer.
    LEARNING_RATE = 0.0001
    # Share of the sampled rows held out as the validation set.
    TEST_SIZE = 0.3
    # Default number of samples per batch produced by DataGenerator.
    BATCH_SIZE = 64
    # Edge length (pixels) images are resized to; also the model input size.
    IMG_SIZE = 128
    
    # Column used as data frame index when reading the CSV files.
    INDEX = 'Id'
    # Name of the regression target column.
    TARGET = 'Pawpularity'
    # Metadata columns fed to the meta branch of the model.
    FEATURES = [
        'Subject Focus', 
        'Eyes', 
        'Face', 
        'Near', 
        'Action', 
        'Accessory', 
        'Group', 
        'Collage', 
        'Human', 
        'Occlusion', 
        'Info', 
        'Blur'
    ]

In [None]:
# Create the folder for resized images. exist_ok=True turns the call into a
# no-op when the folder already exists, so the cell can be re-run safely and
# no separate (race-prone) existence check is needed.
os.makedirs(Cfg.IMG_RESIZE_FOLDER, exist_ok=True)

# Load and prepare data

In [None]:
def read_data(
    train_file: str = Cfg.TRAIN_DATA,
    test_file: str = Cfg.TEST_DATA
) -> (pd.DataFrame, pd.DataFrame):
    """Read the training and test metadata CSV files.

    Args:
        train_file: Path of the training CSV. Defaults to `Cfg.TRAIN_DATA`.
        test_file: Path of the test CSV. Defaults to `Cfg.TEST_DATA`.

    Returns:
        A `(train_df, test_df)` tuple of pandas data frames, both indexed by
        the `Cfg.INDEX` column.
    """
    # Read from the paths that were passed in (the defaults are the Cfg
    # paths), so that callers can actually load files from other locations.
    train_df = pd.read_csv(train_file, index_col=Cfg.INDEX)
    test_df = pd.read_csv(test_file, index_col=Cfg.INDEX)

    return train_df, test_df


train_df, test_df = read_data()

In [None]:
# Display the training metadata frame as the cell output.
train_df

In [None]:
# Display the test metadata frame as the cell output.
test_df

### Summary

* The training set consists of 9912 rows and 13 columns. The target variable is `Pawpularity`. 
* The test data set consists of 8 rows and the same columns as the training set, with the exception of the target variable.


# Exploratory data analysis

In [None]:
# Summary statistics for every column; the 'count' row is dropped from the table.
train_df.describe().drop('count')

## Target `Pawpularity`

In [None]:
# Two panels side by side: histogram (with KDE) and box plot of the target.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: distribution of the target in 30 bins.
sns.histplot(data=train_df, x=Cfg.TARGET, bins=30, kde=True, legend=True, ax=ax[0])
ax[0].set(title='Target Distribution', xlabel='Pawpularity', ylabel='Count')

# Right panel: box plot of the same column.
sns.boxplot(data=train_df, y=Cfg.TARGET, ax=ax[1])

plt.show()

### Summary

* The mean and std of the target are: $\mu=38.039$ and $\sigma=20.592$.
* The distribution of the target shows that there are some outliers in the upper area.

## Meta data features

In [None]:
# One panel per metadata feature: target histogram split by the feature value.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 15))

for f, ax in zip(Cfg.FEATURES, axes.flat):
    sns.histplot(
        data=train_df, x=Cfg.TARGET, hue=f,
        bins=30, kde=True, legend=True, alpha=0.3, ax=ax,
    )

fig.tight_layout()
plt.show()

In [None]:
# One panel per metadata feature: number of rows for each feature value.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 15))

for f, ax in zip(Cfg.FEATURES, axes.ravel()):
    sns.countplot(data=train_df, x=f, ax=ax, alpha=0.8)

fig.tight_layout()
plt.show()

## Correlation

In [None]:
# Pairwise correlation of all columns of the training frame.
corr_df = train_df.corr()

fig, ax = plt.subplots(figsize=(15, 15))

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells would only repeat the lower triangle.
mask = np.triu(np.ones_like(corr_df, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(
    corr_df, 
    mask=mask, 
    cmap=cmap, 
    vmin = -0.75, 
    vmax = 0.75,
    center=0,
    square=True,
    annot = True,
    # Fixed two decimals for every annotation. The previous spec "0.0" is a
    # one-significant-digit general format, which prints a varying number of
    # decimals and can switch to scientific notation for small values.
    fmt=".2f",
    linewidths=.5)

fig.tight_layout()
plt.show()

### Summary

* There is no significant correlation between the features. 

# Visualization

In [None]:
def get_image(image_id, image_folger=Cfg.IMG_FOLDER, data=train_df, resize=True):
    """Return the photo `{image_id}.jpg` as an RGB image of IMG_SIZE x IMG_SIZE.

    The resized image is cached as a JPEG in `Cfg.IMG_RESIZE_FOLDER`; when a
    cached copy for `image_id` exists it is loaded instead of the original.

    Args:
        image_id: Identifier of the photo; the file name is `{image_id}.jpg`.
        image_folger: Folder containing the original photos (the parameter
            name is kept as is so existing callers keep working).
        data: Unused; kept for backward compatibility.
        resize: Unused; kept for backward compatibility.

    Returns:
        A PIL image in RGB mode.
    """
    file_name = '{}.jpg'.format(image_id)
    resized_path = os.path.join(Cfg.IMG_RESIZE_FOLDER, file_name)

    if os.path.isfile(resized_path):
        # convert() reads the pixel data, so the file can be closed right away
        # instead of staying open until the lazily loaded image is used.
        with Image.open(resized_path) as cached:
            return cached.convert('RGB')

    with Image.open(os.path.join(image_folger, file_name)) as original:
        # Force three channels: the image model is built for 3-channel input,
        # and batches can only be stacked when all arrays share one shape.
        img = original.convert('RGB').resize((Cfg.IMG_SIZE, Cfg.IMG_SIZE))

    # The cache folder may have been removed (see the cleanup cell); recreate
    # it rather than failing on save.
    os.makedirs(Cfg.IMG_RESIZE_FOLDER, exist_ok=True)
    img.save(resized_path)

    return img

In [None]:
def plot_images(data, nrows=5, ncols=5, figsize=(15, 15)):
    """Show a grid of randomly chosen photos titled with their target value.

    Args:
        data: Data frame indexed by image id and containing the `Cfg.TARGET`
            column. Images are loaded through `get_image`.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        figsize: Figure size passed to matplotlib.

    If `data` has fewer rows than grid cells, all rows are shown and the
    remaining cells are hidden.
    """
    # Never ask for more rows than exist; DataFrame.sample would raise.
    n_images = min(nrows * ncols, len(data))
    indices = data.sample(n_images).index

    # squeeze=False always yields a 2-D array of axes, also for a 1x1 grid.
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    flat_axes = axes.flatten()

    for index, ax in zip(indices, flat_axes):
        ax.imshow(get_image(index))

        # Take the title from the frame that was passed in, not from the
        # global training frame, so the function works for any `data`.
        ax.set_title(data.loc[index, Cfg.TARGET], fontsize=18)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

    # Hide grid cells that received no image.
    for ax in flat_axes[n_images:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Bucket edges; bucket i covers pawpularity_range[i] < score <= pawpularity_range[i+1].
pawpularity_range = [0, 20, 40, 60, 80, 100]


def query(i):
    """Return the pandas query string that selects the rows of bucket `i`."""
    lower, upper = pawpularity_range[i], pawpularity_range[i + 1]
    return '{} < Pawpularity and Pawpularity <= {}'.format(lower, upper)


# For every bucket: print a heading, then show five random photos from it.
for i in range(len(pawpularity_range) - 1):
    display(Markdown(
        '### Pawpularity `{}` - `{}`'.format(pawpularity_range[i], pawpularity_range[i + 1])
    ))

    df = train_df.query(query(i))
    plot_images(df, nrows=1, ncols=5, figsize=(15, 5))

# Data generator

In [None]:
class DataGenerator(Sequence):
    """Keras `Sequence` yielding batches of images plus metadata features.

    Each batch is `[images, meta]` (inference) or `([images, meta], target)`
    (training/validation), where `images` are the arrays of the photos
    returned by `get_image` and `meta` holds the `Cfg.FEATURES` columns.

    Args:
        data: Data frame indexed by image id that contains the
            `Cfg.FEATURES` columns.
        target: Optional target values in the same row order as `data`.
            When `None`, batches contain inputs only.
        img_folder: Folder the original images are read from.
        batch_size: Number of samples per batch.
    """
    def __init__(
        self, 
        data, 
        target=None, 
        img_folder=Cfg.IMG_FOLDER, 
        batch_size=Cfg.BATCH_SIZE
    ):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.data = data
        self.target = target
        self.batch_size = batch_size
        self.img_folder = img_folder
    
    def __len__(self):
        """Return the number of batches; the last one may be smaller."""
        return math.ceil(len(self.data) / self.batch_size)
    
    def __getitem__(self, idx):
        """Return batch number `idx`."""
        start_idx = idx * self.batch_size
        end_idx = start_idx + self.batch_size
        batch = self.data[start_idx : end_idx]
        
        images = np.array([
            np.array(get_image(image_id, self.img_folder))
            for image_id in batch.index.values
        ])
        meta = np.array(batch[Cfg.FEATURES]).astype(np.float32)
        
        # Only a missing target means "inputs only". The former additional
        # `.any()` test would also have dropped a target consisting of zeros.
        if self.target is None:
            # NOTE(review): returned as a list because Keras presumably
            # interprets a tuple as (inputs, targets) -- confirm before
            # changing the container type.
            return [images, meta]
        
        target = np.array(self.target[start_idx : end_idx]).astype(np.float32)
        return [images, meta], target

# Model

## Image CNN model

In [None]:
def get_image_model(img_size=Cfg.IMG_SIZE, n_channel=3):
    """Build the CNN branch that turns an image into a 128-value feature vector.

    Four Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with 32, 64, 128 and
    256 filters are followed by Flatten, Dense(128, relu) and Dropout(0.5).

    Args:
        img_size: Height and width of the square input image.
        n_channel: Number of input channels.

    Returns:
        A `keras.Model` named 'image_cnn_model'.
    """
    inputs = layers.Input((img_size, img_size, n_channel))

    # Convolution/pooling stages, created in order of increasing filter count.
    features = inputs
    for n_filters in (32, 64, 128, 256):
        features = layers.Conv2D(
            filters=n_filters, kernel_size=(3, 3), activation='relu')(features)
        features = layers.MaxPooling2D(pool_size=(2, 2))(features)

    # Dense head on top of the flattened feature maps.
    features = layers.Flatten()(features)
    features = layers.Dense(128, activation='relu')(features)
    features = layers.Dropout(0.5)(features)

    return keras.Model(inputs=inputs, outputs=features, name='image_cnn_model')

In [None]:
# Build the image branch with its default arguments and print its layer summary.
image_model = get_image_model()
image_model.summary()

## Meta NN Model

In [None]:
def get_meta_model(n_meta_features=12):
    """Build the dense branch that processes the photo metadata features.

    The input passes through three relu Dense layers with 12, 24 and 12 units.

    Args:
        n_meta_features: Number of metadata values per sample.

    Returns:
        A `keras.Model` named 'meta_nn_model'.
    """
    inputs = layers.Input(shape=(n_meta_features,))

    hidden = inputs
    for n_units in (12, 24, 12):
        hidden = layers.Dense(n_units, activation='relu')(hidden)

    return keras.Model(inputs=inputs, outputs=hidden, name='meta_nn_model')

In [None]:
# Build the metadata branch with its default arguments and print its layer summary.
meta_model = get_meta_model()
meta_model.summary()

## Model

In [None]:
def get_model(image_model, meta_model):
    """Join the image and metadata branches into one regression model.

    The outputs of both branches are concatenated along axis 1 and fed into
    a single linear Dense unit.

    Args:
        image_model: Model providing the image branch.
        meta_model: Model providing the metadata branch.

    Returns:
        A `keras.Model` taking `[image_model.input, meta_model.input]`.
    """
    merged = layers.Concatenate(axis=1)([image_model.output, meta_model.output])
    prediction = layers.Dense(1, activation='linear')(merged)

    return keras.Model(
        inputs=[image_model.input, meta_model.input],
        outputs=prediction,
    )

In [None]:
# Combine both branches into the final model and print its layer summary.
model = get_model(image_model, meta_model)
model.summary()

# Train

In [None]:
# Draw (and thereby shuffle) the configured fraction of the training rows.
# The sample is seeded as well; otherwise the row order -- and with it the
# train/validation split below -- would differ between runs despite the
# fixed random_state of train_test_split.
data = train_df.sample(frac=Cfg.SAMPLE_FRAC, random_state=Cfg.RANDOM_STATE)

# Split data into train and validation data sets
X_train, X_val, y_train, y_val = train_test_split(
    data[Cfg.FEATURES],
    data[Cfg.TARGET],
    test_size=Cfg.TEST_SIZE, 
    random_state=Cfg.RANDOM_STATE
)

# Batch generators; images are looked up through the data frame index.
train_generator = DataGenerator(X_train, y_train)
val_generator = DataGenerator(X_val, y_val)

In [None]:
# Compile model: Adam optimizer, mean squared error as loss, and the root
# mean squared error reported under the name 'rmse'.
model.compile(
    optimizer=keras.optimizers.Adam(
        learning_rate=Cfg.LEARNING_RATE,
    ), 
    loss = keras.losses.MeanSquaredError(),
    metrics=[ 
        keras.metrics.RootMeanSquaredError(name='rmse')
    ]
)

# Stop training once the validation loss has not improved for 3 epochs.
# Validation data is passed to fit(), so the validation loss is monitored:
# the training loss keeps falling while the model overfits and therefore
# cannot signal when to stop.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        patience=3)
]

In [None]:
%%time

# Train on the training generator and evaluate on the validation generator
# after every epoch; `result.history` holds the per-epoch metrics.
result = model.fit(
    train_generator,
    epochs=Cfg.NUM_EPOCHS,
    validation_data=val_generator,
    callbacks=callbacks
)

# Prediction

In [None]:
# Predict the validation set and flatten the model output to one dimension.
y_pred = model.predict(val_generator).ravel()

# Collect prediction, true value and absolute error per validation sample.
df = pd.DataFrame({'y_pred': y_pred, 'y_val': y_val})
df['error'] = (df['y_pred'] - df['y_val']).abs()

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(23, 5))

# Panel 0: training and validation RMSE per epoch
ax[0].plot(result.history['rmse'])
ax[0].plot(result.history['val_rmse'])
ax[0].set(title='Model RMSE', ylabel='RMSE', xlabel='Epoch')
ax[0].legend(['train', 'val'], loc='upper right')

# Panel 1: mean prediction per bin of true values with a regression line
sns.regplot(data=df, x='y_val', y='y_pred', x_estimator=np.mean, x_bins=30, ax=ax[1])
ax[1].set(title='Regression True vs. Pred', ylabel='Pred (val)', xlabel='True (val)')

# Panel 2: distribution of the predicted values
sns.histplot(data=df, x='y_pred', bins=30, kde=True, legend=True, ax=ax[2])
ax[2].set(title='Pred Target Distribution', xlabel='Pred Pawpularity', ylabel='Count')

# Panel 3: absolute error plotted against the prediction
sns.scatterplot(data=df, x='y_pred', y='error', ax=ax[3])
ax[3].set(title='Residuals', xlabel='Prediction', ylabel='Absolute error')

plt.tight_layout()
plt.show()

# Submission

In [None]:
# No target is passed, so the generator yields model inputs only; the images
# are read from the test image folder.
test_generator = DataGenerator(test_df, img_folder=Cfg.IMG_TEST_FOLDER)
# Flatten the model output to a one-dimensional array of predictions.
y_pred_submission = model.predict(test_generator).reshape(-1)

In [None]:
# One row per test image: the id as index and the predicted score as the
# only column.
submission_df = pd.DataFrame(
    {Cfg.TARGET: y_pred_submission},
    index=pd.Index(test_df.index, name=Cfg.INDEX),
)

submission_df

In [None]:
# Save the submission file; the index (image id) is written as the first column.
submission_df.to_csv(Cfg.SUBMISSION_FILE)

In [None]:
# cleanup: delete the cache of resized images. The folder is only removed when
# it exists, so re-running this cell does not raise FileNotFoundError.
if os.path.isdir(Cfg.IMG_RESIZE_FOLDER):
    shutil.rmtree(Cfg.IMG_RESIZE_FOLDER)