# Predicting Pawpularity using EfficientNet

The publicly shared results in this competition show that models using only EfficientNets (no ViT, RAPIDS SVR, etc.) achieve an average score of about 18.60 RMSE on the test set; the best score I came across is 18.56, shared by [LEANDRO ROSER](https://www.kaggle.com/leangab/tf-pawpularity-efficientnet-metadata-ensamble).

I am trying to achieve the best possible score using only EfficientNets as the backbone of the model, and this notebook will be updated whenever a new best score is achieved.

The model architecture:
- EfficientNetB3 with noisy-student pre-trained weights.
- Fully connected layer for meta data.
- Fully connected head.
 
Tools used to optimize the results:
- Keras Tuner for hyperparameter optimization with random search.
- 7-fold cross-validation.
- Data Augmentation.

**Note:** This notebook takes around 6 hours to fully run.

If you find this notebook useful, feel free to share it.


In [None]:
# Importing libraries
import os
import random
import gc
import warnings

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, concatenate, BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling2D, LeakyReLU
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.utils import plot_model
import keras_tuner as kt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training table (image Id, metadata columns, Pawpularity)
Train_df = pd.read_csv(
    '../input/petfinder-pawpularity-score/train.csv'
)
Train_df.head()

In [None]:
# Load the test table and keep a copy of the raw image ids: the Id column is
# later overwritten with file paths, but the submission needs the original ids
Test_df = pd.read_csv(
    '../input/petfinder-pawpularity-score/test.csv'
)
Test_id = Test_df['Id'].copy()
Test_df.head()

In [None]:
# Seed every random number generator in use (Python, NumPy, TensorFlow)
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Input pipeline constants
batch_size = 32        # examples per batch
image_size = 300       # images are resized to image_size x image_size
channels = 3           # colour channels requested when decoding JPEGs
shuffle_size = 1024    # shuffle buffer size for training datasets

# Let tf.data pick the parallelism / prefetch depth
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Replace each image id with the path of its JPEG file
Train_df['Id'] = '../input/petfinder-pawpularity-score/train/' + Train_df['Id'] + '.jpg'
Test_df['Id'] = '../input/petfinder-pawpularity-score/test/' + Test_df['Id'] + '.jpg'

In [None]:
# Defining functions to decode image paths and preprocess images 
def read_img():
    """Build the dataset mapping that turns an image path into a model input.

    Returns:
        A function ``mapping(path, struct_data, score)`` for ``Dataset.map``
        that yields ``((image, struct_data), score)``, where ``image`` is the
        decoded, resized and EfficientNet-preprocessed picture.
    """
    def img_to_array(path):
        # Read the file and decode it as a JPEG with `channels` channels
        raw_bytes = tf.io.read_file(path)
        pixels = tf.image.decode_jpeg(raw_bytes, channels=channels)
        # Cast to float32, then resize to the square model input size
        pixels = tf.image.resize(
            tf.cast(pixels, tf.float32), (image_size, image_size)
        )
        return tf.keras.applications.efficientnet.preprocess_input(pixels)

    def mapping(path, struct_data, score):
        features = (img_to_array(path), struct_data)
        return features, score

    return mapping

def augment(data, score):
    """Apply random photometric/flip augmentation to the image of one example.

    Args:
        data: ``(image, struct_data)`` pair; only the image is modified.
        score: target value, passed through untouched.

    Returns:
        ``((augmented_image, struct_data), score)``.
    """
    img, meta = data
    # Same operations, in the same order: flip, saturation, contrast, brightness
    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_saturation(img, lower=0.95, upper=1.05)
    img = tf.image.random_contrast(img, lower=0.95, upper=1.05)
    img = tf.image.random_brightness(img, max_delta=0.1)
    return (img, meta), score

def preprocess(ds, batch_size, ds_type):
    """Turn a dataset of (path, struct_data, score) into batched model inputs.

    Args:
        ds: dataset yielding ``(path, struct_data, score)`` elements.
        batch_size: number of examples per batch.
        ds_type: ``'train'`` enables augmentation and shuffling; any other
            value skips both.

    Returns:
        The mapped, (optionally augmented and shuffled,) batched and
        prefetched dataset.
    """
    ds = ds.map(read_img(), num_parallel_calls=AUTOTUNE)
    if ds_type == 'train':
        ds = (
            ds.map(augment, num_parallel_calls=AUTOTUNE)
              .shuffle(shuffle_size, reshuffle_each_iteration=True)
        )
    return ds.batch(batch_size).prefetch(AUTOTUNE)

In [None]:
def create_ds(df, ds_type):
    """Build a batched dataset from a labelled dataframe.

    Args:
        df: dataframe with an ``Id`` column holding image paths, a
            ``Pawpularity`` column holding the target, and the metadata in
            every column between the first and the last one.
        ds_type: forwarded to ``preprocess`` (``'train'`` turns on
            augmentation and shuffling).

    Returns:
        The dataset produced by ``preprocess`` with the global ``batch_size``.
    """
    image_paths = df['Id'].values
    metadata = df.iloc[:, 1:-1]
    scores = df['Pawpularity'].values
    slices = Dataset.from_tensor_slices((image_paths, metadata, scores))
    return preprocess(slices, batch_size, ds_type)

In [None]:
# The test table has no target column, so an all-zero placeholder (same shape
# and dtype as the first metadata column) stands in for the score; this lets
# the test set reuse the same (path, struct_data, score) pipeline
placeholder_scores = np.zeros_like(Test_df.iloc[:, 1].values)
Test_ds = preprocess(
    Dataset.from_tensor_slices(
        (Test_df['Id'].values, Test_df.iloc[:, 1:], placeholder_scores)
    ),
    batch_size,
    'test',
)

In [None]:
# Hold-out split used for hyperparameter tuning: the first 9000 rows train,
# the remaining rows validate.
# Two independently shuffled views of the same training rows are zipped so
# that mixup can pair each batch with a differently ordered batch.
Train_ds_one = create_ds(Train_df.iloc[:9000], 'train')
Train_ds_two = create_ds(Train_df.iloc[:9000], 'train')
# Validation must be deterministic: any ds_type other than 'train' makes
# preprocess() skip the random augmentation and the shuffle.
Val_ds = create_ds(Train_df.iloc[9000:], 'val')

Train_ds = tf.data.Dataset.zip((Train_ds_one, Train_ds_two))

In [None]:
# Mixup augmentation functions
def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2):
    """Draw ``size`` Beta-distributed samples from two Gamma draws.

    If X ~ Gamma(concentration_1) and Y ~ Gamma(concentration_0), then
    X / (X + Y) follows Beta(concentration_1, concentration_0).

    Args:
        size: number of samples to draw.
        concentration_0: alpha of the second Gamma draw.
        concentration_1: alpha of the first Gamma draw.

    Returns:
        A 1-D tensor of length ``size``.
    """
    first_draw = tf.random.gamma(shape=[size], alpha=concentration_1)
    second_draw = tf.random.gamma(shape=[size], alpha=concentration_0)
    total = first_draw + second_draw
    return first_draw / total


def mix_up(ds_one, ds_two, alpha=0.2):
    """Blend two batches into a single mixup batch.

    A mixing weight is sampled per example from Beta(alpha, alpha) and the
    same weight is applied to the image, the metadata row and the label, so
    all three parts of an example stay consistent with each other.

    Args:
        ds_one: first batch, structured as ``((images, struct), labels)``.
        ds_two: second batch with the same structure and batch size.
        alpha: concentration of the Beta distribution the weights come from.

    Returns:
        ``((images, struct), labels)`` where ``struct`` and ``labels`` are
        float32 and ``labels`` has a trailing axis of size 1.
    """
    # Unpack two datasets
    (images_one, struct_one), labels_one = ds_one
    (images_two, struct_two), labels_two = ds_two
    n_examples = tf.shape(images_one)[0]

    # Sample lambda once per example and reshape it so it broadcasts over
    # rank-4 images and over rank-2 metadata/labels
    lam = sample_beta_distribution(n_examples, alpha, alpha)
    image_weight = tf.reshape(lam, (n_examples, 1, 1, 1))
    row_weight = tf.reshape(lam, (n_examples, 1))

    # Combine one example from each batch into a single example
    images = images_one * image_weight + images_two * (1 - image_weight)

    # The metadata must be mixed with the *second* batch's metadata; blending
    # struct_one with itself would leave it unmixed while image and label change
    struct_one = tf.cast(struct_one, dtype='float32')
    struct_two = tf.cast(struct_two, dtype='float32')
    struct = struct_one * row_weight + struct_two * (1 - row_weight)

    labels_one = tf.expand_dims(tf.cast(labels_one, dtype='float32'), axis=1)
    labels_two = tf.expand_dims(tf.cast(labels_two, dtype='float32'), axis=1)
    labels = tf.math.multiply(labels_one, row_weight) + \
             tf.math.multiply(labels_two, (1 - row_weight))
    return (images, struct), labels

In [None]:
# Apply mixup to every pair of zipped training batches
Train_ds_mu = Train_ds.map(
    lambda first_batch, second_batch: mix_up(first_batch, second_batch, alpha=0.2),
    num_parallel_calls=AUTOTUNE,
)

In [None]:
# Show the first 9 mixed-up images of one batch in a 3x3 grid, each titled
# with its blended label
(sample_images, sample_structs), sample_labels = next(iter(Train_ds_mu.take(1)))
plt.figure(figsize=(10, 10))
preview_pairs = zip(sample_images[:9], sample_labels[:9])
for position, (img, label) in enumerate(preview_pairs, start=1):
    ax = plt.subplot(3, 3, position)
    pixels = img.numpy().astype(np.uint8)
    plt.imshow(pixels)
    ax.set_title(label.numpy().tolist())
    plt.axis("off")

In [None]:
# Load the saved EfficientNetB3 model, swap in the noisy-student weights and
# freeze it so the backbone is not updated during training
EffNetB3_path = "../input/efficientnetb3pretrained/EfficientNetB3.h5"
EffB3 = tf.keras.models.load_model(EffNetB3_path)
EffB3.load_weights(
    '../input/efficientnet-keras-noisystudent-weights-b0b7/noisystudent/noisy.student.notop-b3.h5'
)
EffB3.trainable = False

In [None]:
# Defining the neural network model used for hyperparameters optimization
def create_model_hp(hp):
    """Build and compile the two-input model for one hyperparameter trial.

    The image branch runs the frozen EffB3 backbone followed by pooling,
    batch normalisation, dropout and a stack of dense layers; the metadata
    branch is a stack of dense layers over the 12 metadata inputs. Both
    branches are concatenated and fed to a small regression head.

    Args:
        hp: KerasTuner hyperparameter container used to declare/sample the
            dropout rates, layer counts and layer widths.

    Returns:
        A compiled Keras model taking ``[image, metadata]`` and trained with
        MSE loss, tracking root mean squared error.
    """
    # Image branch
    Inp1 = Input(shape=(image_size, image_size, channels))
    out1 = EffB3(Inp1)
    out1 = GlobalAveragePooling2D()(out1)
    out1 = BatchNormalization()(out1)
    Eff_drop = hp.Float('eff_drop_rate', min_value=0.1, max_value=0.5, step=0.05)
    out1 = Dropout(Eff_drop)(out1)
    hp_activation = 'selu'  # ReLU, ELU were tested in previous trials.
    hp_units_eff = hp.Int('units_eff', min_value=4, max_value=32, step=4)
    hp_layers_eff = hp.Int('layers_eff', min_value=1, max_value=3, step=1)
    for _ in range(hp_layers_eff):
        out1 = Dense(hp_units_eff, activation=hp_activation, kernel_initializer='he_normal')(out1)

    # Metadata branch
    Inp2 = Input(shape=(12,))
    hp_units_meta = hp.Int('units_meta', min_value=2, max_value=16, step=2)
    hp_layers_meta = hp.Int('layers_meta', min_value=1, max_value=3, step=1)
    # Chain the layers: each Dense consumes the previous layer's output.
    # Feeding Inp2 to every layer would leave only the last one connected,
    # which would make the 'layers_meta' hyperparameter meaningless.
    out2 = Inp2
    for _ in range(hp_layers_meta):
        out2 = Dense(hp_units_meta, activation=hp_activation, kernel_initializer='he_normal')(out2)

    # Regression head over both branches
    out = concatenate([out1, out2], axis=1)
    Top_drop = hp.Float('top_drop_rate', min_value=0.1, max_value=0.5, step=0.05)
    out = Dropout(Top_drop)(out)
    hp_units_top = hp.Int('units_top', min_value=4, max_value=32, step=4)
    out = Dense(hp_units_top, activation=hp_activation, kernel_initializer='he_normal')(out)
    out = Dense(1, activation='relu')(out)

    PawModel = Model(inputs=[Inp1, Inp2], outputs=out)

    PawModel.compile(loss='mse',
              optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3),   # LR was chosen based on previous trials.
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return PawModel

In [None]:
# Set up a KerasTuner random search that minimises the validation RMSE
tuning_objective = kt.Objective("val_root_mean_squared_error", direction="min")
tuner = kt.RandomSearch(
    create_model_hp,
    objective=tuning_objective,
    max_trials=80,
    seed=42,
    directory='my_dir',
    project_name='KT_Paw',
)

In [None]:
# Print the hyperparameters declared by create_model_hp
tuner.search_space_summary(extended=True)

In [None]:
# Running the hyperparameter tuning process
# NOTE(review): no `epochs` argument is passed to search(). If the tuner
# forwards these arguments to Model.fit unchanged, every trial trains for the
# default single epoch and the patience-based callbacks below can never
# trigger - confirm the intended per-trial training budget.
stop_early = EarlyStopping(monitor='val_loss', patience=5)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=1, min_lr=1e-6
)

tuner.search(
    Train_ds_mu,
    validation_data=Val_ds,
    callbacks=[stop_early, reduce_lr],
)

In [None]:
# Collect the best hyperparameters found by the search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Activation and learning rate are fixed in create_model_hp, not searched
best_activation = 'selu'
best_LR = 0.001
best_drop_eff = best_hps.get('eff_drop_rate')
best_units_eff = best_hps.get('units_eff')
best_layers_eff = best_hps.get('layers_eff')
best_units_meta = best_hps.get('units_meta')
best_layers_meta = best_hps.get('layers_meta')
best_drop_top = best_hps.get('top_drop_rate')
best_units_top = best_hps.get('units_top')
# The backbone is EfficientNetB3, so the report labels it EffB3
print(f"""
The hyperparameter search is complete. The optimal number of units for the dense layers are the following:
EffB3 drop rate: {best_drop_eff}
EffB3 number of layers: {best_layers_eff}
EffB3 dense layers units: {best_units_eff}
Meta number of layers: {best_layers_meta}
Meta data dense layers units: {best_units_meta}
Top drop rate: {best_drop_top}
Top dense layer: {best_units_top}\n
The optimal activation function is {best_activation} and the optimal learning rate is {best_LR}.
""")


In [None]:
# Drop the tuning datasets before cross-validation rebuilds them per fold
del Train_ds_one, Train_ds_two
del Train_ds, Train_ds_mu, Val_ds
gc.collect()

In [None]:
# Defining the neural network model for cross-validation
def create_model_cv():
    """Build and compile the final model from the tuned hyperparameters.

    Mirrors the architecture of ``create_model_hp`` with every searched value
    replaced by its tuned counterpart (``best_*`` globals): image branch on
    the frozen EffB3 backbone, metadata branch over the 12 metadata inputs,
    then a regression head over the concatenation of both.

    Returns:
        A compiled Keras model taking ``[image, metadata]`` and trained with
        MSE loss, tracking root mean squared error.
    """
    # Image branch
    Inp1 = Input(shape=(image_size, image_size, channels))
    out1 = EffB3(Inp1)
    out1 = GlobalAveragePooling2D()(out1)
    out1 = BatchNormalization()(out1)
    # Use the tuned dropout rate rather than a hard-coded one
    out1 = Dropout(best_drop_eff)(out1)
    for _ in range(best_layers_eff):
        out1 = Dense(best_units_eff, activation=best_activation, kernel_initializer='he_normal')(out1)

    # Metadata branch: chain the layers so that all of them are part of the
    # graph (applying each one to Inp2 would keep only the last layer)
    Inp2 = Input(shape=(12,))
    out2 = Inp2
    for _ in range(best_layers_meta):
        out2 = Dense(best_units_meta, activation=best_activation, kernel_initializer='he_normal')(out2)

    # Regression head, including the tuned dropout before the top dense layer
    out = concatenate([out1, out2], axis=1)
    out = Dropout(best_drop_top)(out)
    out = Dense(best_units_top, activation=best_activation, kernel_initializer='he_normal')(out)
    out = Dense(1, activation='relu')(out)

    PawModel = Model(inputs=[Inp1, Inp2], outputs=out)

    PawModel.compile(loss='mse',
              optimizer = tf.keras.optimizers.Adam(learning_rate=best_LR),
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return PawModel

In [None]:
# Build one instance of the cross-validation model and draw its layer graph
# with tensor shapes
mod = create_model_cv()
plot_model(mod, show_shapes=True)

In [None]:
# Custom metric that reports the learning rate used in the current epoch
def get_lr_metric(optimizer):
    """Return a metric function that reads the optimizer's decayed learning rate.

    Args:
        optimizer: optimizer exposing ``_decayed_lr``.

    Returns:
        A function ``lr(y_true, y_pred)`` that ignores its arguments and
        returns ``optimizer._decayed_lr(tf.float32)``.
    """
    def lr(y_true, y_pred):
        current_rate = optimizer._decayed_lr(tf.float32)
        return current_rate

    return lr

In [None]:
# Variables for the cross-validation training process
counter = 0
test_pred = np.zeros((Test_df.shape[0],))
results_list = []
pred_list = []

# 7-fold cross-validation loop, stratified on the Pawpularity column
n_folds = 7
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
fold_splits = kfold.split(X=Train_df.iloc[:, :-1], y=Train_df.iloc[:, -1])
for counter, (train, val) in enumerate(fold_splits, start=1):
    print(f'\n Fold {counter}:\n')

    Train_fold = Train_df.iloc[train, :]
    Val_fold = Train_df.iloc[val, :]

    # Two independently shuffled views of the training fold, zipped and
    # blended batch-by-batch with mixup
    Train_ds_one = create_ds(Train_fold, 'train')
    Train_ds_two = create_ds(Train_fold, 'train')
    Train_ds = tf.data.Dataset.zip((Train_ds_one, Train_ds_two))
    Train_ds_mu = Train_ds.map(
        lambda ds_one, ds_two: mix_up(ds_one, ds_two, alpha=0.2),
        num_parallel_calls=AUTOTUNE)
    # Validation must not be augmented or shuffled, otherwise val_loss (which
    # drives early stopping and the LR schedule) becomes noisy; any ds_type
    # other than 'train' disables both in preprocess()
    Val_ds = create_ds(Val_fold, 'val')

    # A fresh head is trained for every fold
    model = create_model_cv()

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, min_lr=1e-6)
    early_stopping = EarlyStopping(patience=7, restore_best_weights=True)
    results = model.fit(Train_ds_mu,
                      epochs=15,
                      validation_data = Val_ds,
                      callbacks=[early_stopping, reduce_lr], verbose=1)
    results_list.append(results)

    # Prediction on test
    pred_list.append(model.predict(Test_ds))
    print('='*25)

    # Freeing up memory
    del model, results
    del Train_ds, Val_ds, Train_ds_one, Train_ds_two, Train_ds_mu
    del Train_fold, Val_fold
    gc.collect()

In [None]:
# Calculating the average prediction on the test set: the per-fold
# predictions are summed and divided by the number of folds that were run
Average_pred = sum(pred_list)/counter

In [None]:
# Build the submission table (original image ids plus the averaged
# prediction) and write it to submission.csv without the index
Submission_df = Test_id.to_frame(name='Id')
Submission_df['Pawpularity'] = Average_pred
Submission_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table as the cell output
Submission_df

**If you find this notebook useful, don't forget to upvote it!**