# Predict Pawpularity using EfficientNet Model

In [None]:
import os
import random

import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# All competition inputs live under one Kaggle dataset directory.
_COMPETITION_DIR = '../input/petfinder-pawpularity-score'

# CSV tables for the train and test sets.
TRAIN_DATA_PATH = f'{_COMPETITION_DIR}/train.csv'
TEST_DATA_PATH = f'{_COMPETITION_DIR}/test.csv'

# Directories holding the train and test image files.
TRAIN_DIRECTORY = f'{_COMPETITION_DIR}/train'
TEST_DIRECTORY = f'{_COMPETITION_DIR}/test'

In [None]:
TARGET_NAME = 'Pawpularity'  # name of the target column to predict
VAL_SIZE = 0.15  # fraction of the labelled rows held out for validation
SEED = 5  # random seed used for set_seed() and for the train/validation split

In [None]:
# Settings and training parameters
AUTOTUNE = tf.data.experimental.AUTOTUNE  # passed to map()/prefetch() of the input datasets
IMG_SIZE = 224  # images are resized (with padding) to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 64  # number of samples per dataset batch
DROPOUT_RATE = 0.2  # rate of the dropout layer in the regression head
LEARNING_RATE = 1e-3  # initial learning rate of the decay schedule
DECAY_STEPS = 100  # decay_steps of the exponential decay schedule
DECAY_RATE = 0.96  # decay_rate of the exponential decay schedule
EPOCHS = 500  # upper bound on epochs passed to fit(); early stopping may end sooner
PATIENCE = 5  # patience of the EarlyStopping callback

In [None]:
# Pretrained image classification model EfficientNetB0, stored as a Keras .h5 file.
# From https://www.kaggle.com/ekaterinadranitsyna/keras-applications-models
# The file is loaded with tf.keras.models.load_model and used as the feature extractor.
IMG_MODEL = '../input/keras-applications-models/EfficientNetB0.h5'

## Functions

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used in the notebook.

    Seeds NumPy, the `random` module and TensorFlow, and sets the
    PYTHONHASHSEED and TF_DETERMINISTIC_OPS environment variables.
    :param seed: Random seed
    :return: None
    """
    # Environment switches read by Python / TensorFlow
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

    # Same seed for each library-level generator
    for seeder in (np.random.seed, random.seed, tf.random.set_seed):
        seeder(seed)


def set_display():
    """Configure how charts and pd.DataFrames are displayed.
    """
    # Charts: style sheet, default figure size and font size
    plt.style.use('fivethirtyeight')
    plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 14})

    # DataFrames: no limit on displayed rows/columns, floats with 4 decimals
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)
    pd.set_option('display.float_format', '{:.4f}'.format)


def id_to_path(img_id: str, dir: str):
    """Build the path of the JPEG file that belongs to an image Id.
    :param img_id: Image Id (file name without the '.jpg' extension)
    :param dir: Path to the directory with images
    :return: Image file path
    """
    file_name = f'{img_id}.jpg'
    return os.path.join(dir, file_name)


@tf.function
def get_image(path: str) -> tf.Tensor:
    """Read a JPEG file and turn it into a model-ready tensor.

    The image is decoded with 3 channels, resized with padding to
    IMG_SIZE x IMG_SIZE, cast to int32 and passed through the
    EfficientNet `preprocess_input` function.
    :param path: Path to image file
    :return: Tensor with preprocessed image
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    padded = tf.image.resize_with_pad(decoded, IMG_SIZE, IMG_SIZE)
    pixels = tf.cast(padded, dtype=tf.int32)
    return tf.keras.applications.efficientnet.preprocess_input(pixels)


@tf.function
def process_dataset(path: str, label: int) -> tuple:
    """Map one (path, label) dataset element to (image, label).
    :param path: Path to image file
    :param label: Target value of the image, passed through unchanged
    :return: tf.Tensor with preprocessed image, unchanged label
    """
    image = get_image(path)
    return image, label


@tf.function
def get_dataset(x, y=None) -> tf.data.Dataset:
    """Build a batched, prefetched dataset from file paths and optional targets.

    With targets, every element is an (image, target) pair; without
    targets, every element is just the image.
    :param x: Input data for the model (array of file paths)
    :param y: Target values for the model, or None for unlabelled data
    :return TensorFlow Dataset object
    """
    if y is None:
        # Unlabelled data: elements are paths only
        source = tf.data.Dataset.from_tensor_slices(x)
        loader = get_image
    else:
        # Labelled data: elements are (path, target) pairs
        source = tf.data.Dataset.from_tensor_slices((x, y))
        loader = process_dataset

    images = source.map(loader, num_parallel_calls=AUTOTUNE)
    return images.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


def plot_history(hist):
    """Plot training and validation loss and RMSE per epoch.

    Draws two stacked charts sharing the x axis: the loss on top and
    the root mean squared error below.
    :param hist: Tensorflow history object from model.fit()
    """
    metrics = hist.history
    train_loss, valid_loss = metrics['loss'], metrics['val_loss']
    train_rmse = metrics['root_mean_squared_error']
    valid_rmse = metrics['val_root_mean_squared_error']

    # Epoch numbers start at 1 along the x axis
    epochs = range(1, len(train_loss) + 1)

    _, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
    panels = (
        ('MSE Loss', train_loss, valid_loss),
        ('Root Mean Squared Error', train_rmse, valid_rmse),
    )
    for axis, (title, train_values, valid_values) in zip(axes, panels):
        axis.plot(epochs, train_values, 'bo', label='Training')
        axis.plot(epochs, valid_values, 'ro', label='Validation')
        axis.set_title(title)
        axis.legend()

    # Only the bottom chart carries the shared x axis label
    axes[-1].set_xlabel('Epochs')

    plt.tight_layout()
    plt.show()

## Data Processing

In [None]:
# Seed the random number generators and apply the display settings
set_seed(SEED)
set_display()

In [None]:
# Train data set: load the CSV table, report its shape and preview the first rows
data_train = pd.read_csv(TRAIN_DATA_PATH)
print(f'Train data shape: {data_train.shape}')
data_train.head()

# Define the list of duplicate images and remove them from the train set

In [None]:
# From https://www.kaggle.com/valleyzw/petfinder-duplicate-images
# Each entry holds the Ids of two duplicate images joined by an underscore.
sims = ['13d215b4c71c3dc603cd13fc3ec80181_373c763f5218610e9b3f82b12ada8ae5',
       '5ef7ba98fc97917aec56ded5d5c2b099_67e97de8ec7ddcda59a58b027263cdcc',
       '839087a28fa67bf97cdcaf4c8db458ef_a8f044478dba8040cc410e3ec7514da1',
       '1feb99c2a4cac3f3c4f8a4510421d6f5_264845a4236bc9b95123dde3fb809a88',
       '3c50a7050df30197e47865d08762f041_def7b2f2685468751f711cc63611e65b',
       '37ae1a5164cd9ab4007427b08ea2c5a3_3f0222f5310e4184a60a7030da8dc84b',
       '5a642ecc14e9c57a05b8e010414011f2_c504568822c53675a4f425c8e5800a36',
       '2a8409a5f82061e823d06e913dee591c_86a71a412f662212fe8dcd40fdaee8e6',
       '3c602cbcb19db7a0998e1411082c487d_a8bb509cd1bd09b27ff5343e3f36bf9e',
       '0422cd506773b78a6f19416c98952407_0b04f9560a1f429b7c48e049bcaffcca',
       '68e55574e523cf1cdc17b60ce6cc2f60_9b3267c1652691240d78b7b3d072baf3',
       '1059231cf2948216fcc2ac6afb4f8db8_bca6811ee0a78bdcc41b659624608125',
       '5da97b511389a1b62ef7a55b0a19a532_8ffde3ae7ab3726cff7ca28697687a42',
       '78a02b3cb6ed38b2772215c0c0a7f78e_c25384f6d93ca6b802925da84dfa453e',
       '08440f8c2c040cf2941687de6dc5462f_bf8501acaeeedc2a421bac3d9af58bb7',
       '0c4d454d8f09c90c655bd0e2af6eb2e5_fe47539e989df047507eaa60a16bc3fd',
       '5a5c229e1340c0da7798b26edf86d180_dd042410dc7f02e648162d7764b50900',
       '871bb3cbdf48bd3bfd5a6779e752613e_988b31dd48a1bc867dbc9e14d21b05f6',
       'dbf25ce0b2a5d3cb43af95b2bd855718_e359704524fa26d6a3dcd8bfeeaedd2e',
       '43bd09ca68b3bcdc2b0c549fd309d1ba_6ae42b731c00756ddd291fa615c822a1',
       '43ab682adde9c14adb7c05435e5f2e0e_9a0238499efb15551f06ad583a6fa951',
       'a9513f7f0c93e179b87c01be847b3e4c_b86589c3e85f784a5278e377b726a4d4',
       '38426ba3cbf5484555f2b5e9504a6b03_6cb18e0936faa730077732a25c3dfb94',
       '589286d5bfdc1b26ad0bf7d4b7f74816_cd909abf8f425d7e646eebe4d3bf4769',
       '9f5a457ce7e22eecd0992f4ea17b6107_b967656eb7e648a524ca4ffbbc172c06',
       'b148cbea87c3dcc65a05b15f78910715_e09a818b7534422fb4c688f12566e38f',
       '3877f2981e502fe1812af38d4f511fd2_902786862cbae94e890a090e5700298b',
       '8f20c67f8b1230d1488138e2adbb0e64_b190f25b33bd52a8aae8fd81bd069888',
       '221b2b852e65fe407ad5fd2c8e9965ef_94c823294d542af6e660423f0348bf31',
       '2b737750362ef6b31068c4a4194909ed_41c85c2c974cc15ca77f5ababb652f84',
       '01430d6ae02e79774b651175edd40842_6dc1ae625a3bfb50571efedc0afc297c',
       '72b33c9c368d86648b756143ab19baeb_763d66b9cf01069602a968e573feb334',
       '03d82e64d1b4d99f457259f03ebe604d_dbc47155644aeb3edd1bd39dba9b6953',
       '851c7427071afd2eaf38af0def360987_b49ad3aac4296376d7520445a27726de',
       '54563ff51aa70ea8c6a9325c15f55399_b956edfd0677dd6d95de6cb29a85db9c',
       '87c6a8f85af93b84594a36f8ffd5d6b8_d050e78384bd8b20e7291b3efedf6a5b',
       '04201c5191c3b980ae307b20113c8853_16d8e12207ede187e65ab45d7def117b']

In [None]:
# For every duplicate pair, collect the Id of the image to drop:
# the first image when its score is strictly lower, otherwise the second
# (so ties drop the second image).

# Index the scores by Id once instead of scanning the whole frame for each
# lookup. If an Id occurred more than once, its first row is the one used.
scores_by_id = data_train.drop_duplicates(subset='Id').set_index('Id')[TARGET_NAME]

less_score_duplicates = []
for pair in sims:
    p1, p2 = pair.split('_')
    lower = p1 if scores_by_id.loc[p1] < scores_by_id.loc[p2] else p2
    less_score_duplicates.append(lower)

less_score_duplicates

In [None]:
# Shape of the train data before the duplicates are removed
data_train.shape

In [None]:
# Number of images selected for removal
len(less_score_duplicates)

In [None]:
# Drop the lower-scored image of each duplicate pair.
# The explicit copy makes the filtered frame an independent object, so the
# column added to it later does not trigger pandas' SettingWithCopyWarning.
data_train = data_train.loc[~data_train['Id'].isin(less_score_duplicates)].copy()
data_train.shape

In [None]:
# Test data set: load the CSV table, report its shape and preview the first rows
data_test = pd.read_csv(TEST_DATA_PATH)
print(f'Test data shape: {data_test.shape}')
data_test.head()

In [None]:
# Add the image file path of every train and test row, built from its Id.
data_train['path'] = data_train['Id'].map(
    lambda img_id: id_to_path(img_id, TRAIN_DIRECTORY))
data_test['path'] = data_test['Id'].map(
    lambda img_id: id_to_path(img_id, TEST_DIRECTORY))

# Hold out VAL_SIZE of the labelled rows for validation (seeded shuffle).
labelled = data_train[['path', TARGET_NAME]]
train_subset, valid_subset = train_test_split(
    labelled, test_size=VAL_SIZE, random_state=SEED, shuffle=True)

In [None]:
# Rows and columns of the training part of the split
train_subset.shape 

In [None]:
# Rows and columns of the validation part of the split
valid_subset.shape

In [None]:
# Create TensorFlow datasets.
# Train and validation datasets yield (image, target) pairs;
# the test dataset has no targets and yields images only.
train_ds = get_dataset(x=train_subset['path'], y=train_subset[TARGET_NAME])
valid_ds = get_dataset(x=valid_subset['path'], y=valid_subset[TARGET_NAME])
test_ds = get_dataset(x=data_test['path'])

In [None]:
# Pretrained image classification model, loaded from the IMG_MODEL file
feature_model = tf.keras.models.load_model(IMG_MODEL)

# Freeze weights in the original model so that only layers added on top of it are trained
feature_model.trainable = False

In [None]:
# Regression model: random horizontal flip augmentation, then the pretrained
# feature extraction model, followed by batch normalization, dropout,
# a 32-unit ReLU layer and a single-unit output for the score.
model_layers = [
    tf.keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
    tf.keras.layers.experimental.preprocessing.RandomFlip(mode='horizontal'),
    feature_model,
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(DROPOUT_RATE, name='top_dropout'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, name='score'),
]
image_model = tf.keras.models.Sequential(model_layers)

In [None]:
# Learning rate schedule: start at LEARNING_RATE and decay it exponentially
# with DECAY_RATE over DECAY_STEPS, in discrete (staircase) steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    LEARNING_RATE,
    decay_steps=DECAY_STEPS,
    decay_rate=DECAY_RATE,
    staircase=True,
)

In [None]:
# Compile the model: Adam driven by the decaying learning rate schedule,
# MSE as the training loss and RMSE as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
image_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Print the layer-by-layer overview of the model
image_model.summary()

In [None]:
# Monitor the validation loss and stop training once it has not improved
# for PATIENCE epochs; the weights of the best epoch are restored.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=PATIENCE, restore_best_weights=True)

In [None]:
# Train for at most EPOCHS epochs; the early stopping callback may end sooner.
# `workers` / `use_multiprocessing` are intentionally not passed: Keras
# documents them as applying to generator / Sequence input only, and the
# inputs here are tf.data datasets that already parallelise their own loading.
history = image_model.fit(train_ds, validation_data=valid_ds,
                          epochs=EPOCHS, callbacks=[early_stop])

In [None]:
# Chart the training and validation loss / RMSE per epoch
plot_history(history)

## Inference

In [None]:
# Popularity score prediction.
# `workers` / `use_multiprocessing` are not passed: Keras documents them as
# applying to generator / Sequence input only, and test_ds is a tf.data dataset.
predictions = image_model.predict(test_ds)

# The model ends in a single-unit layer, so the result presumably has one
# column per row; flatten it to one value per test row before storing it.
data_test[TARGET_NAME] = predictions.ravel()

In [None]:
# Write the Id / predicted score pairs to the submission file and preview them
submission = data_test[['Id', TARGET_NAME]]
submission.to_csv('submission.csv', index=False)
submission.head()