# Using a Pretrained EfficientNet to Predict Pet Popularity

Original Version: https://www.kaggle.com/ekaterinadranitsyna/pretrained-feature-model-keras

Please respect the original work and upvote it too. Credits: https://www.kaggle.com/ekaterinadranitsyna

## What is different from the original work:

1. Converting to N-fold training
2. EfficientNetB4 as pretrained model
3. Saving Models, Training Preds and Valid Preds for experiments with boosting models and tabular data.
4. Wandb Integration
5. Saving config for future reference.
6. Support for saving Image Embeddings.
7. Changing From Sequential API to Functional API for multioutput.
8. Shuffle during Training

Pretrained **EfficientNetB4 model from Keras applications** is used to extract features from images resized (with padding) to `IMG_SIZE` x `IMG_SIZE` pixels (299 x 299 in this configuration). The popularity score is estimated based solely on images; tabular data is ignored. Since image quality affects the target value, only horizontal flip is used for data augmentation.

In [None]:
import os
import random

import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow.keras.backend as K
import yaml

import wandb
from wandb.keras import WandbCallback
from kaggle_secrets import UserSecretsClient

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [None]:
# Experiment settings are collected in one dict so that they can be dumped
# to YAML and handed to Weights & Biases later in the notebook.
config = {
    # Training table; the fold loop selects rows through its `kfold` column
    'TRAIN_DATA_PATH': '../input/pawpularity-folds/train_5folds.csv',
}

# Folders with the train / test JPEG images
TRAIN_DIRECTORY = '../input/petfinder-pawpularity-score/train'
TEST_DIRECTORY = '../input/petfinder-pawpularity-score/test'

# Test table (kept outside of `config`)
TEST_DATA_PATH = '../input/petfinder-pawpularity-score/test.csv'

In [None]:
# Target column name and settings for the cross-validated run
TARGET_NAME = 'Pawpularity'
config.update(
    SEED=541,     # value passed to set_seed()
    FOLDS=5,      # number of folds iterated by the training loop
    DEBUG=False,  # True -> 50-row subsets and a single epoch
)

In [None]:
# TensorFlow settings and training hyper-parameters
AUTOTUNE = tf.data.experimental.AUTOTUNE
config.update({
    'IMG_SIZE': 299,              # side length images are resized/padded to
    'BATCH_SIZE': 64,             # batch size of every tf.data pipeline
    'DROPOUT_RATE': 0.2,          # rate of the dropout layer in the model head
    'LEARNING_RATE': 1e-3,        # initial rate of the exponential-decay schedule
    'DECAY_STEPS': 100,           # decay_steps of that schedule
    'DECAY_RATE': 0.96,           # decay_rate of that schedule
    'EPOCHS': 100,                # maximum number of epochs per fold
    'PATIENCE': 4,                # early-stopping patience on val_loss
    'USE_WANDB': False,           # enable Weights & Biases tracking
    'WANDB_PROJECT': 'pawpularity',
    'WANDB_MODE': 'offline',
})

In [None]:
# Pretrained image classification model EfficientNetB4
# from tf.keras.applications with global average pooling as a final layer.
# In this notebook the model is loaded from a public dataset on Kaggle
# at https://www.kaggle.com/ekaterinadranitsyna/keras-applications-models
# get_model() loads this file and freezes its weights.
config['IMG_MODEL'] = '../input/keras-applications-models/EfficientNetB4.h5'

In [None]:
# Save the configuration to config.yaml for future reference.
# NOTE: the dump happens before the DEBUG branch below overrides
# config['EPOCHS'], so that override is not reflected in the file.
with open(r'config.yaml', 'w') as file:
    yaml.dump(config, file)

## Functions

In [None]:
def use_wandb():
    """Log in to Weights & Biases and start a training run.

    In offline mode (config['WANDB_MODE'] == 'offline') the WANDB_MODE
    environment variable is set to "offline" and a placeholder key made
    of 40 'X' characters is used for the login. Otherwise the key is read
    from the Kaggle secret named "wandb_api".

    :return: Run object returned by wandb.init()
    """
    if config['WANDB_MODE'] == 'offline':
        os.environ["WANDB_MODE"] = "offline"
        # Placeholder key - no real secret is looked up in offline mode
        api_key = 'X' * 40
    else:
        api_key = UserSecretsClient().get_secret("wandb_api")

    wandb.login(key=api_key)

    # The whole config dict is attached to the run
    return wandb.init(project=config['WANDB_PROJECT'],
                      job_type='train',
                      config=config)

def set_seed(seed=42):
    """Seed every random number generator used in the notebook.

    Seeds NumPy, Python's `random` module and TensorFlow, then exports
    PYTHONHASHSEED and TF_DETERMINISTIC_OPS as environment variables.
    :param seed: Random seed
    :return: None
    """
    for seeder in (np.random.seed, random.seed, tf.random.set_seed):
        seeder(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })


def set_display():
    """Configure matplotlib charts and pandas DataFrame rendering.

    Charts use the 'fivethirtyeight' style, a 12 x 8 figure and font
    size 14. DataFrames are shown without column/row limits and floats
    are printed with four decimals.
    """
    # Chart appearance
    plt.style.use('fivethirtyeight')
    plt.rcParams['figure.figsize'] = (12, 8)
    plt.rcParams['font.size'] = 14

    # DataFrame rendering
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)
    pd.set_option('display.float_format', '{:.4f}'.format)


def id_to_path(img_id: str, dir: str):
    """Build the path of the JPEG file that belongs to an image Id.
    :param img_id: Image Id (file name without extension)
    :param dir: Path to the directory with images
    :return: `dir` joined with '<img_id>.jpg'
    """
    file_name = f'{img_id}.jpg'
    return os.path.join(dir, file_name)


@tf.function
def get_image(path: str) -> tf.Tensor:
    """Read a JPEG file and turn it into a model-ready image tensor.

    The file is decoded to 3 channels, resized with padding to
    IMG_SIZE x IMG_SIZE (taken from `config`), cast to int32 and passed
    through the EfficientNet `preprocess_input` function.
    :param path: Path to image file
    :return: Tensor with preprocessed image
    """
    side = config['IMG_SIZE']
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    padded = tf.image.resize_with_pad(pixels, side, side)
    as_int = tf.cast(padded, dtype=tf.int32)
    return tf.keras.applications.efficientnet.preprocess_input(as_int)


@tf.function
def process_dataset(path: str, label: int) -> tuple:
    """Map a (path, label) pair to an (image, label) pair.
    :param path: Path to image file
    :param label: Target value belonging to the image; passed through unchanged
    :return: tf.Tensor with preprocessed image, unchanged label
    """
    image = get_image(path)
    return image, label


@tf.function
def get_dataset(x, y=None, type='train') -> tf.data.Dataset:
    """Build a batched, prefetched tf.data pipeline from image file paths.

    With labels, elements are (image, label) pairs; without labels the
    dataset yields images only. The batch size comes from
    config['BATCH_SIZE'].
    :param x: Input data for the model (array of file paths)
    :param y: Target values for the model, or None for unlabeled data
    :param type: 'train' shuffles labeled data (buffer of 1024 elements),
        so iteration order then no longer follows the order of `x`;
        any other value keeps the input order. Ignored when `y` is None.
    :return TensorFlow Dataset object
    """
    if y is None:
        # Unlabeled data: only images are produced
        dataset = tf.data.Dataset.from_tensor_slices(x)
        dataset = dataset.map(get_image, num_parallel_calls=AUTOTUNE)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((x, y))
        if type == 'train':
            # Shuffle the (path, label) pairs before the images are loaded
            dataset = dataset.shuffle(buffer_size=1024)
        dataset = dataset.map(process_dataset, num_parallel_calls=AUTOTUNE)

    dataset = dataset.batch(config['BATCH_SIZE'])
    return dataset.prefetch(buffer_size=AUTOTUNE)


def plot_history(hist):
    """Draw training and validation curves for one model.fit() run.

    The upper panel shows the loss, the lower panel the RMSE metric of
    the 'score' output; both panels share the epoch axis.
    :param hist: Tensorflow history object from model.fit()
    """
    records = hist.history

    # Collect every series up front: (panel title, training, validation)
    panels = [
        ('MSE Loss',
         records['loss'],
         records['val_loss']),
        ('Root Mean Squared Error',
         records['score_root_mean_squared_error'],
         records['val_score_root_mean_squared_error']),
    ]

    # Epoch numbers start at 1
    epochs = range(1, len(records['loss']) + 1)

    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

    for axis, (title, train_values, valid_values) in zip(axes, panels):
        axis.plot(epochs, train_values, 'bo', label='Training')
        axis.plot(epochs, valid_values, 'ro', label='Validation')
        axis.set_title(title)
        axis.legend()

    # Only the lower panel carries the x-axis label
    axes[-1].set_xlabel('Epochs')

    plt.tight_layout()
    plt.show()

## Data Processing

In [None]:
# Seed the random number generators and apply the display settings
set_seed(config['SEED'])
set_display()

# Start a Weights & Biases run only when tracking is enabled in the config;
# `run` is finished at the very end of the notebook.
if config['USE_WANDB']:
    run = use_wandb()

In [None]:
# Train data set: loaded from config['TRAIN_DATA_PATH']; the fold loop
# later relies on its `Id`, `kfold` and target columns.
data_train = pd.read_csv(config['TRAIN_DATA_PATH'])
print(f'Train data shape: {data_train.shape}')
data_train.head()

In [None]:
# Test data set: per-fold predictions are added to this frame as new columns
data_test = pd.read_csv(TEST_DATA_PATH)
print(f'Test data shape: {data_test.shape}')
data_test.head()

In [None]:
# Build the image file path for every row of the train and test tables.
data_train['path'] = [
    id_to_path(img_id, TRAIN_DIRECTORY) for img_id in data_train['Id']
]
data_test['path'] = [
    id_to_path(img_id, TEST_DIRECTORY) for img_id in data_test['Id']
]

# Debug mode: keep the first 50 rows of each table and train one epoch.
if config['DEBUG']:
    data_train, data_test = data_train.head(50), data_test.head(50)
    config['EPOCHS'] = 1

# No random hold-out split is made here: the validation rows of each fold
# are selected through the `kfold` column inside the training loop.

## Modelling

In [None]:
def get_model():
    """Build and compile the two-output image model.

    A frozen pretrained network (loaded from config['IMG_MODEL']) turns
    IMG_SIZE x IMG_SIZE x 3 images into features. A small head on top
    produces the 32-unit 'embedding' output and the single-unit 'score'
    output. Only 'score' has a loss (MSE) and a metric (RMSE).
    :return: Compiled tf.keras.Model with outputs [embedding, score]
    """
    layers = tf.keras.layers

    # Pretrained feature extractor; its weights stay fixed during training
    backbone = tf.keras.models.load_model(config['IMG_MODEL'])
    backbone.trainable = False

    # Input -> random horizontal flip -> frozen backbone -> batch
    # normalization -> dropout -> dense embedding -> score.
    side = config['IMG_SIZE']
    inputs = layers.Input(shape=(side, side, 3))
    flipped = layers.experimental.preprocessing.RandomFlip(mode='horizontal')(inputs)
    features = backbone(flipped)
    normalized = layers.BatchNormalization()(features)
    regularized = layers.Dropout(config['DROPOUT_RATE'], name='top_dropout')(normalized)
    embedding = layers.Dense(32, activation='relu', name='embedding')(regularized)
    score = layers.Dense(1, name='score')(embedding)

    model = tf.keras.Model(inputs=inputs, outputs=[embedding, score])

    # Learning rate is multiplied by DECAY_RATE every DECAY_STEPS steps
    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=config['LEARNING_RATE'],
        decay_steps=config['DECAY_STEPS'],
        decay_rate=config['DECAY_RATE'],
        staircase=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)

    # The embedding output is exposed for export only: no loss, no metric
    model.compile(
        optimizer=optimizer,
        loss={'embedding': None,
              'score': tf.keras.losses.MeanSquaredError()},
        metrics={'embedding': None,
                 'score': tf.keras.metrics.RootMeanSquaredError()})

    return model

In [None]:
# Build the model once and store its initial weights; every fold reloads
# 'default_weights.h5' so that all folds start from the same weights.
image_model = get_model()
image_model.save_weights('default_weights.h5')

In [None]:
# Print the layer-by-layer overview of the model
image_model.summary()

In [None]:
# Draw the model graph including tensor shapes and dtypes
tf.keras.utils.plot_model(image_model, show_shapes=True,show_dtype=True)

In [None]:
# Folder for the per-fold prediction files. os.makedirs replaces the
# shell escape so the cell is plain Python, and exist_ok=True makes it
# safe to re-run when the folder already exists.
os.makedirs('preds', exist_ok=True)

In [None]:
from sklearn import metrics

In [None]:
# Unlabeled test images; built once and reused for every fold
test_ds = get_dataset(x=data_test['path'])
history_objs = []  # one Keras History object per fold (plotted afterwards)
scores = []        # validation MSE per fold (aggregated into the CV score)

# To monitor validation loss and stop the training.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=config['PATIENCE'], restore_best_weights=True)


for fold in range(config['FOLDS']):
    K.clear_session()
    print("##"*50)
    print(f"##### Training for fold {fold}:")

    # Rows whose `kfold` equals the current fold are held out for validation
    is_valid = data_train['kfold'] == fold
    train_subset = data_train[~is_valid][['path', TARGET_NAME]].reset_index(drop=True)
    valid_subset = data_train[is_valid][['path', TARGET_NAME]].reset_index(drop=True)

    # Create TensorFlow datasets
    train_ds = get_dataset(x=train_subset['path'], y=train_subset[TARGET_NAME], type='train')
    valid_ds = get_dataset(x=valid_subset['path'], y=valid_subset[TARGET_NAME], type='valid')
    # Same rows as train_ds but without shuffling. Predictions are written
    # back to train_subset by position, so they must come out in row order;
    # predicting on the shuffled train_ds would pair scores and embeddings
    # with the wrong images.
    train_eval_ds = get_dataset(x=train_subset['path'], y=train_subset[TARGET_NAME], type='valid')

    # Checkpoint holding the model with the lowest validation loss of this fold
    weights_path = f'paw_fold_{fold}.h5'
    bm = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                            verbose=1,
                                            monitor='val_loss',
                                            mode='min',
                                            save_best_only=True,
                                            save_weights_only=False)

    # Fresh model per fold, reset to the shared initial weights
    image_model = get_model()
    image_model.load_weights('default_weights.h5')
    callbacks = [early_stop, bm]
    if config['USE_WANDB']:
        callbacks.append(WandbCallback(save_model=False))

    history = image_model.fit(train_ds, validation_data=valid_ds,
                              epochs=config['EPOCHS'], callbacks=callbacks,
                              use_multiprocessing=True, workers=-1)

    K.clear_session()
    # Continue with the best checkpoint of this fold
    image_model.load_weights(weights_path)
    print(f"##### Predicting for fold {fold}:")

    # The model has two outputs: index 0 = embedding, index 1 = score
    train_preds = image_model.predict(
        train_eval_ds, use_multiprocessing=True, workers=os.cpu_count())
    valid_preds = image_model.predict(
        valid_ds, use_multiprocessing=True, workers=os.cpu_count())

    # The score output has shape (N, 1); flatten it to a plain column
    train_subset['predicted_score'] = train_preds[1].ravel()
    embeddings = pd.DataFrame(train_preds[0])
    train_subset = pd.concat([train_subset, embeddings], axis=1)
    valid_subset['predicted_score'] = valid_preds[1].ravel()

    # Compute the fold MSE once: its root is reported, the MSE itself is
    # stored so the overall CV score can be derived from the fold MSEs.
    fold_mse = metrics.mean_squared_error(
        valid_subset[TARGET_NAME].values, valid_subset['predicted_score'].values)
    print(f"##### Score for fold {fold}:", np.sqrt(fold_mse))
    scores.append(fold_mse)

    train_subset.to_csv(f'preds/train_preds{fold}.csv', index=False)
    valid_subset.to_csv(f'preds/valid_preds{fold}.csv', index=False)

    # Release per-fold data before the next fold is built
    del train_subset
    del valid_subset
    del train_ds
    del valid_ds
    del train_eval_ds

    print(f"##### Predicting for fold {fold} on Test Set:")
    data_test[f'{TARGET_NAME}_fold{fold}'] = image_model.predict(
        test_ds, use_multiprocessing=True, workers=os.cpu_count())[1].ravel()

    history_objs.append(history)

# One file with the test predictions of all folds
data_test.to_csv('preds/test_preds.csv', index=False)

In [None]:
# Per-fold validation MSE values collected by the training loop
print(scores)

In [None]:
# `scores` holds one MSE per fold, so the square root of their average
# gives the cross-validation RMSE.
print("CV:",np.sqrt(np.sum(scores)/config['FOLDS']))  

In [None]:
# Show the loss / RMSE curves of every fold, in fold order
for idx, history in enumerate(history_objs):
    print(f"##### Fold {idx} training:")
    plot_history(history)

## Generating Submission File

In [None]:
# Columns written by the fold loop are named '<TARGET_NAME>_fold<k>'
fold_prefix = TARGET_NAME + '_fold'
pred_columns = [name for name in data_test.columns if name.startswith(fold_prefix)]

# The final prediction is the row-wise average of all fold predictions
data_test[TARGET_NAME] = data_test[pred_columns].mean(axis=1)

submission = data_test[['Id', TARGET_NAME]]
submission.to_csv('submission.csv', index=False)
print("All Done!")

# Close the Weights & Biases run if one was started
if config['USE_WANDB']:
    run.finish()