In the current competition there are already a lot of very strong notebooks, such as those from [Chris Deotte](https://www.kaggle.com/cdeotte) and [Abhishek Thakur](https://www.kaggle.com/abhishek). These notebooks are, however, based on current state-of-the-art models such as Swin Transformers.

I was pretty surprised when I noticed that the score of my notebook with only an EfficientNet B2 (and not even the V2 version...) came pretty close to their scores.

So ... why not share it with the Kaggle Community ;-)

The main process this notebook uses is the following:
1. An EfficientNetB2-NS model is trained as a classifier with Stratified 10 Fold Cross Validation. The key is to train for only a limited number of epochs with a very low learning rate.
2. Next, each of the 10 feature models is used to extract features for the train and test sets.
3. As a last step, a Stratified 6 Fold CV training with CatBoost is performed on the extracted features and the predictions on the test set are made.

And if you do like the notebook ... then please give an upvote for it. And definitely let me know your questions and/or remarks in the comments for this notebook.

Enjoy!

In [None]:
# Install libraries
!pip install '../input/pawpularset/Keras_Applications-1.0.8-py3-none-any.whl'
!pip install '../input/pawpularset/efficientnet-1.1.1-py3-none-any.whl'

# Import libraries
import gc
import numpy as np
import pandas as pd
import random
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import efficientnet.tfkeras as efn
from tensorflow.keras.layers import Input

In [None]:
# --- Image / input pipeline settings ---
IMG_SIZE = 384       # images are resized to IMG_SIZE x IMG_SIZE
CHANNELS = 3         # number of channels requested when decoding the JPEG files
BATCH_SIZE = 16

# --- Cross validation settings ---
Q = 30               # number of quantile bins used to build the stratification label
FOLDS = 6            # number of folds for the CatBoost cross validation
FEATURE_FOLDS = 10   # number of folds (and thus models) for the EfficientNet feature models
SEED = 4261

# --- Feature model training settings ---
EPOCHS = 10
LR = 5e-6
VERBOSE = 1

# --- Switches ---
TRAIN_FEATURE_MODEL = True

# --- Folders ---
DATA_DIR = '../input/petfinder-pawpularity-score/'
TRAIN_DIR = f'{DATA_DIR}train/'
TEST_DIR = f'{DATA_DIR}test/'

## SET TPU / GPU

In [None]:
# Configure Strategy. Assume TPU...if not set default for GPU/CPU
# `tpu` stays None when the TPU branch below fails before the resolver is assigned.
tpu = None
try:
    # Resolve the TPU cluster, connect to it, initialise it and build a TPU strategy on top of it.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # Fallback path; presumably ValueError is what the resolver raises when no TPU is attached — TODO confirm.
    # Enable XLA
    tf.config.optimizer.set_jit(enabled = "autoclustering")
    # Use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
    
# NOTE(review): `strategy` is not referenced anywhere else in the code visible here
# (no `strategy.scope()` around model creation) — confirm whether that is intended.
# Set Auto Tune
AUTOTUNE = tf.data.experimental.AUTOTUNE   

## Training data

In [None]:
# Read the training metadata and turn every Id into the full path of its image file
train_df = pd.read_csv(DATA_DIR + 'train.csv')
train_df['Id'] = [f'{TRAIN_DIR}{image_id}.jpg' for image_id in train_df['Id']]

pawpularity = train_df['Pawpularity']

# Bin the score into Q quantiles; the bin number is the label the stratified splits are based on
train_df['stratify_label'] = pd.qcut(pawpularity, q = Q, labels = range(Q))

# Score divided by 100: the target the feature model is trained on
train_df['target_value'] = pawpularity / 100.

# Summary
print(f'train_df: {train_df.shape}')
train_df.head()

## Test data

In [None]:
# Read the test metadata and turn every Id into the full path of its image file
test_df = pd.read_csv(DATA_DIR + 'test.csv')
test_df['Id'] = [f'{TEST_DIR}{image_id}.jpg' for image_id in test_df['Id']]

# Placeholder score column so the test frame has a 'Pawpularity' column like the train frame
test_df['Pawpularity'] = 0

# Summary
print(f'test_df: {test_df.shape}')
test_df.head()

## TF Dataset support code

In [None]:
def build_augmenter(is_labelled):
    """Return the augmentation function matching the dataset element layout.

    When `is_labelled` is truthy the returned function takes (img, label) and
    returns (augmented img, label); otherwise it takes and returns img only.
    """
    def augment(img):
        # Deliberately mild augmentations only...too much augmentation hurts performance
        flipped = tf.image.random_flip_left_right(img)
        flipped = tf.image.random_flip_up_down(flipped)

        recoloured = tf.image.random_saturation(flipped, 0.95, 1.05)
        recoloured = tf.image.random_brightness(recoloured, 0.05)
        recoloured = tf.image.random_contrast(recoloured, 0.95, 1.05)

        return tf.image.random_hue(recoloured, 0.05)

    def augment_with_labels(img, label):
        # The label passes through untouched
        return augment(img), label

    if is_labelled:
        return augment_with_labels
    return augment

def build_decoder(is_labelled):
    """Return the function that turns an image path into a decoded image tensor.

    When `is_labelled` is truthy the returned function takes (path, label) and
    returns (image, label); otherwise it takes a path and returns the image only.
    """
    def decode(path):
        # Read the file and decode it as a JPEG with CHANNELS channels
        raw = tf.io.read_file(path)
        pixels = tf.image.decode_jpeg(raw, channels = CHANNELS)

        # Scale to float32 by dividing by 255, then resize to IMG_SIZE x IMG_SIZE
        scaled = tf.cast(pixels, tf.float32) / 255.0
        return tf.image.resize(scaled, (IMG_SIZE, IMG_SIZE))

    def decode_with_labels(path, label):
        # The label passes through untouched
        return decode(path), label

    if is_labelled:
        return decode_with_labels
    return decode

def create_dataset(df, batch_size = 32, is_labelled = False, augment = False, repeat = False, shuffle = False):
    """Build a batched and prefetched tf.data pipeline of decoded images for `df`.

    Args:
        df: DataFrame with an 'Id' column holding the image paths and, when
            `is_labelled` is true, a 'target_value' column holding the targets.
        batch_size: Number of elements per batch.
        is_labelled: Yield (image, target) pairs instead of images only.
        augment: Apply the random augmentations from `build_augmenter`.
        repeat: Repeat the dataset indefinitely.
        shuffle: Shuffle the samples, with a new order on every pass over the data.

    Returns:
        The resulting tf.data.Dataset.
    """
    decode_fn = build_decoder(is_labelled)
    augmenter_fn = build_augmenter(is_labelled)

    # Create Dataset
    if is_labelled:
        dataset = tf.data.Dataset.from_tensor_slices((df['Id'].values, df['target_value'].values))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((df['Id'].values))

    # Shuffle the lightweight path/label records BEFORE decoding and BEFORE repeat:
    # - the buffer then holds short strings instead of decoded IMG_SIZE x IMG_SIZE float images,
    #   so it can cover the whole frame and give a full shuffle;
    # - shuffling before repeat keeps every sample exactly once per pass over the data.
    if shuffle:
        dataset = dataset.shuffle(max(len(df), 1), reshuffle_each_iteration = True)

    dataset = dataset.map(decode_fn, num_parallel_calls = AUTOTUNE)
    if augment:
        dataset = dataset.map(augmenter_fn, num_parallel_calls = AUTOTUNE)
    if repeat:
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE)

    return dataset

## EfficientNet Feature Model Support Code

In [None]:
# Set Callbacks
def model_checkpoint(fold):
    """Return the checkpoint callback for the feature model of the given fold.

    The callback writes the model weights (weights only) to
    'feature_model_{fold}.h5' whenever 'val_rmse' reaches a new minimum.
    """
    weights_path = f'feature_model_{fold}.h5'
    checkpoint_options = {
        'verbose': 1,
        'monitor': 'val_rmse',
        'mode': 'min',
        'save_weights_only': True,
        'save_best_only': True,
    }
    return tf.keras.callbacks.ModelCheckpoint(weights_path, **checkpoint_options)

def unfreeze_model(model):
    """Mark every layer in `model.layers` trainable, except BatchNormalization layers.

    BatchNormalization layers are explicitly set to non-trainable; all other
    layers are set to trainable. The model is modified in place.
    """
    batch_norm_type = tf.keras.layers.BatchNormalization
    for layer in model.layers:
        layer.trainable = not isinstance(layer, batch_norm_type)

def create_model():
    """Build and compile the EfficientNetB2 feature model.

    The network is an EfficientNetB2 backbone without top, average pooled and
    initialised from the noisy-student weight file, followed by Dropout(0.25)
    and a single sigmoid unit. It is compiled with Adam (learning rate LR),
    binary cross-entropy loss and an RMSE metric named 'rmse'.

    Returns:
        The compiled tf.keras.Model.
    """
    # Backbone: headless EfficientNetB2 with global average pooling
    backbone = efn.EfficientNetB2(weights = '../input/pawpularset/efficientnet-b2_noisy-student_notop.h5',
                                  include_top = False,
                                  classes = None,
                                  input_shape = (IMG_SIZE, IMG_SIZE, CHANNELS),
                                  pooling = 'avg')

    # Everything trainable except the BN layers
    unfreeze_model(backbone)

    # Head: dropout followed by one sigmoid output
    dropped = tf.keras.layers.Dropout(0.25)(backbone.output)
    prediction = tf.keras.layers.Dense(1, activation = 'sigmoid')(dropped)
    model = tf.keras.Model(inputs = backbone.input, outputs = prediction)

    # Compile
    optimizer = tf.keras.optimizers.Adam(learning_rate = LR)
    loss = tf.keras.losses.BinaryCrossentropy()
    rmse = tf.keras.metrics.RootMeanSquaredError('rmse')
    model.compile(optimizer = optimizer, loss = loss, metrics = [rmse])

    return model

## EfficientNet Feature Model Training

Next we train a number of EfficientNet B2 models to be used as feature extractors. Training is only done for a few epochs with a low learning rate. Using the 'noisy-student' weights as baseline to finetune turned out to work a lot better than the 'imagenet' weights.

A baseline set of fine-tuned models is available in the added dataset.

In [None]:
if TRAIN_FEATURE_MODEL:
    # Best validation RMSE of every fold. The metric is computed on 'target_value',
    # i.e. on the Pawpularity score divided by 100.
    all_val_rmse = []

    # Stratified Training: one feature model per fold, stratified on the binned score
    kfold = StratifiedKFold(n_splits = FEATURE_FOLDS, shuffle = True, random_state = SEED)
    for fold, (train_index, val_index) in enumerate(kfold.split(train_df.index, train_df['stratify_label'])):
        print(f'\n===== Fold {fold}\n')

        # Pre model.fit cleanup: drop the previous fold's graph/state before building a new model
        tf.keras.backend.clear_session()
        gc.collect()

        # Create Model
        model = create_model()

        # Create TF Datasets
        trn = train_df.iloc[train_index]
        val = train_df.iloc[val_index]
        training_dataset = create_dataset(trn, batch_size = BATCH_SIZE, is_labelled = True, augment = True, repeat = True, shuffle = True)
        # The validation dataset is finite (no repeat) and fit() gets no validation_steps, so every
        # epoch evaluates the COMPLETE validation fold. Limiting it to val.shape[0] // BATCH_SIZE steps
        # would silently leave the last (val.shape[0] % BATCH_SIZE) samples out of 'val_rmse'.
        validation_dataset = create_dataset(val, batch_size = BATCH_SIZE, is_labelled = True, augment = False, repeat = False, shuffle = False)

        # Fit Model. The training dataset repeats forever, so steps_per_epoch defines the epoch length.
        history = model.fit(training_dataset,
                            epochs = EPOCHS,
                            steps_per_epoch = trn.shape[0] // BATCH_SIZE,
                            callbacks = [model_checkpoint(fold)],
                            validation_data = validation_dataset,
                            verbose = VERBOSE)

        # Validation Information: the lowest 'val_rmse' over the epochs, which is also the
        # epoch whose weights the checkpoint callback keeps (save_best_only on 'val_rmse')
        best_val_rmse = min(history.history['val_rmse'])
        all_val_rmse.append(best_val_rmse)
        print(f'\nValidation RMSE: {best_val_rmse}\n')

    # Summary
    print(f'Final Mean RMSE for {FEATURE_FOLDS} Fold CV Training: {np.mean(all_val_rmse)}')

## CatBoost 6 Fold CV Training

In the last step I am looping through and loading each of the 10 trained feature extraction models. For each of the individual feature extraction models we will extract the features for the training and test data. Next a complete 6 fold Cross Validation training run is performed with CatBoost.

This means that the final predictions are based on the average of 10 * 6 predictions for the PawPularity score. Not bad ;-)

If you train the feature models and run CatBoost in one go, make sure the feature models are loaded from the path where they were just saved.

In [None]:
# Targets for the CatBoost stage: the binned score for stratification, the raw score as label
Y_pawpularity = train_df['Pawpularity'].values
Y_strat = train_df['stratify_label'].values

# Accumulators: summed test predictions (one row per test image) and the score of every feature model
all_oof_score = []
preds_final = np.zeros((len(test_df), 1))

In [None]:
# Loop through all Feature Extraction Models

# Where the feature model weights live: the working directory when they were trained in this
# run (TRAIN_FEATURE_MODEL = True), otherwise the provided weight files in the added dataset.
FEATURE_MODEL_DIR = '' if TRAIN_FEATURE_MODEL else '../input/pawpularset/'

# Reset the accumulators in place so that re-running this cell does not add to earlier results
preds_final[:] = 0
all_oof_score.clear()

for fold_index in range(FEATURE_FOLDS):
    # Every feature model gets its own CatBoost seed. It is derived from SEED instead of
    # incrementing SEED itself, so the constant keeps its value and re-runs are reproducible.
    cb_seed = SEED + fold_index

    print('\n\n====================================================================================================')
    print(f'===== Run for Feature Model {fold_index} ======================================================================\n')

    # Pre model.fit cleanup
    tf.keras.backend.clear_session()
    gc.collect()

    # Create Model and load the weights of this feature model
    model = create_model()
    model.load_weights(f'{FEATURE_MODEL_DIR}feature_model_{fold_index}.h5')

    # Strip Last layers to be able to extract features: create_model() ends with Dropout and Dense,
    # so layers[-3] is the layer feeding the Dropout, i.e. the pooled EfficientNet output.
    model = tf.keras.Model(inputs = model.input, outputs = model.layers[-3].output)

    # Summary...only on first load (summary() prints by itself)
    if fold_index == 0:
        model.summary()

    # Feature Extraction
    print('\n===== Extracting Features')
    cb_train_set = create_dataset(train_df, batch_size = BATCH_SIZE, is_labelled = True, augment = False, repeat = False, shuffle = False)
    cb_test_set = create_dataset(test_df, batch_size = BATCH_SIZE, is_labelled = False, augment = False, repeat = False, shuffle = False)
    cb_train_features = model.predict(cb_train_set, verbose = VERBOSE)
    cb_test_features = model.predict(cb_test_set, verbose = VERBOSE)

    print('\n===== Feature Set Shapes')
    print(f'Train Feature Set Shape: {cb_train_features.shape}')
    print(f'Test Feature Set Shape: {cb_test_features.shape}')

    # Stratified Training for CatBoost
    print(f'\n===== Running CatBoost - SEED {cb_seed}')

    # Set CatBoost Parameters (identical for every fold of this feature model)
    cb_params = {'loss_function' : 'RMSE',
                 'eval_metric' : 'RMSE',
                 'iterations' : 1000,
                 'grow_policy' : 'SymmetricTree',
                 'depth' : 6,
                 'l2_leaf_reg' : 2.0,
                 'random_strength' : 1.0,
                 'learning_rate' : 0.05,
                 'task_type' : 'CPU',
                 'devices' : '0',
                 'verbose' : 0,
                 'random_state': cb_seed}

    # Sum of the per-fold validation RMSEs; averaged after the fold loop
    oof_score = 0

    # NOTE(review): these folds are independent of the folds the feature model was trained on, so
    # the validation rows here may have been seen by the feature model during its training. The
    # reported score is therefore presumably optimistic — confirm before comparing it to other CV scores.
    kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = cb_seed)
    for idx, (train, val) in enumerate(kfold.split(cb_train_features, Y_strat)):
        print(f'\n===== CatBoost Fold {idx} ===============================================================================================')

        train_x, train_y = cb_train_features[train], Y_pawpularity[train]
        val_x, val_y = cb_train_features[val], Y_pawpularity[val]

        # Create and Fit CatBoost Model; the validation fold also drives early stopping
        cb_model = CatBoostRegressor(**cb_params)
        cb_model.fit(train_x, train_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 100, verbose = 250)

        y_pred = cb_model.predict(val_x)

        # Add this model's test predictions as a column vector to the running sum
        preds_final += cb_model.predict(cb_test_features).reshape(-1, 1)

        # Update OOF Score
        oof_score += np.sqrt(mean_squared_error(val_y, y_pred))

        # Cleanup
        del cb_model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    # OOF Score for CatBoost run: mean of the per-fold validation RMSEs
    oof_score /= FOLDS
    all_oof_score.append(oof_score)
    print(f'CatBoost OOF Score: {oof_score}')
    print('Test Predictions Cumulative...')
    print(preds_final[:5])

In [None]:
# Average the CatBoost scores collected for the individual feature models
final_oof_rmse = np.mean(all_oof_score)
print(f'Final OOF RMSE Score for all feature models: {final_oof_rmse}')

## Create submission file

In [None]:
# preds_final holds the SUM of the test predictions of all FOLDS * FEATURE_FOLDS CatBoost models.
# The average is written to a new array instead of dividing preds_final in place, so re-running
# this cell does not divide the predictions a second time.
mean_preds = preds_final / (FOLDS * FEATURE_FOLDS)

# Fill the sample submission with the averaged predictions and write the submission file
submission_df = pd.read_csv(f'{DATA_DIR}sample_submission.csv')
submission_df['Pawpularity'] = mean_preds.ravel()
submission_df.to_csv('submission.csv', index = False)

# Summary
submission_df.head(10)