# PetFinder.my - Pawpularity Contest Solution using a DOLG model and additional categorical data

**Training code which was submitted to the Petfinder.my Competition**

This notebook deals with:
* training code of a Deep Orthogonal Fusion of Local and Global Features (DOLG) model
* adding an additional branch which uses categorical information
* categorical data includes (i) image meta features like colorfulness, saturation, size ratio ... and (ii) image content features like animal type and breed
* training can be executed on TPU

**CV score was around 17.5 and Private Leaderboard score ~17.8**

In [None]:
import joblib
import os

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa

from functools import partial
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from kaggle_datasets import KaggleDatasets


from tensorflow.keras import layers
from tensorflow.keras import applications, layers, Model, Input
from tensorflow.keras import (layers, Sequential, activations, initializers)
from tensorflow.keras.applications import EfficientNetB5

# DOLG implementation with EfficientNetB5 base
**The DOLG implementation originates from https://github.com/innat/DOLG-TensorFlow (see also the author's Kaggle notebooks at https://www.kaggle.com/ipythonx). Some adaptations were made to use an EfficientNet base.**

In [None]:
class MultiAtrous(tf.keras.Model):
    """Bank of dilated convolutions whose outputs are each pooled, projected and upsampled.

    For every dilation rate a 3x3 (by default) convolution with 256 filters is
    applied to the input. Each result is then passed through the *same*
    ``gap_branch`` (spatial mean -> 1x1 conv with 512 filters -> ReLU ->
    bilinear upsampling) and the per-rate results are concatenated along the
    channel axis.

    Args:
        dilation_rates: iterable of dilation rates, one convolution per rate.
        upsampling: upsampling factor applied after the pooled 1x1 feature map.
        kernel_size: kernel size of the dilated convolutions.
        padding: padding mode of the dilated convolutions.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, dilation_rates=(6, 12, 18), upsampling=1, kernel_size=3, padding="same", **kwargs):
        super(MultiAtrous, self).__init__(name='MultiAtrous', **kwargs)
        # Copy into a fresh list: the default is immutable and a caller-owned
        # sequence is never shared with (or mutated through) this instance.
        self.dilation_rates = list(dilation_rates)
        self.kernel_size = kernel_size
        self.upsampling = upsampling
        self.padding = padding

        # One dilated convolution per requested rate.
        self.dilated_convs = [
            layers.Conv2D(
                filters=int(1024 / 4),
                kernel_size=self.kernel_size,
                padding=self.padding,
                dilation_rate=rate,
            )
            for rate in self.dilation_rates
        ]

        # Spatial mean (keeps a 1x1 map), channel projection, then upsampling.
        self.gap_branch = Sequential(
            [
                layers.Lambda(lambda t4d: K.mean(t4d, axis=(1, 2), keepdims=True), name='GlobalAverage2D'),
                layers.Conv2D(int(1024 / 2), kernel_size=1),
                layers.Activation('relu'),
                layers.UpSampling2D(size=self.upsampling, interpolation="bilinear"),
            ],
            name='gap_branch',
        )

    def call(self, inputs, training=None, **kwargs):
        """Return the channel-wise concatenation of all per-rate branch outputs."""
        local_feature = []
        for dilated_conv in self.dilated_convs:
            x = dilated_conv(inputs)
            # The same gap_branch instance (shared weights) is applied to every rate.
            x = self.gap_branch(x)
            local_feature.append(x)
        return tf.concat(local_feature, axis=-1)

    def get_config(self):
        """Return the constructor arguments merged into the base config."""
        config = {
            'dilation_rates': self.dilation_rates,
            'kernel_size': self.kernel_size,
            'padding': self.padding,
            'upsampling': self.upsampling,
        }
        base_config = super(MultiAtrous, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class DOLGLocalBranch(tf.keras.Model):
    """Local branch: multi-atrous features re-weighted by a learned attention map.

    Args:
        img_size: input image side length; ``int(img_size / 32)`` is used as the
            upsampling factor of the ``MultiAtrous`` block.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, img_size, **kwargs):
        super(DOLGLocalBranch, self).__init__(name='LocalBranch', **kwargs)
        self.multi_atrous = MultiAtrous(padding='same', upsampling=int(img_size / 32))
        self.conv1 = layers.Conv2D(1024, kernel_size=1)
        # No bias: the convolution is immediately followed by batch normalization.
        self.conv2 = layers.Conv2D(1024, kernel_size=1, use_bias=False)
        self.conv3 = layers.Conv2D(1024, kernel_size=1)
        self.bn = layers.BatchNormalization()

    def call(self, inputs, training=None, **kwargs):
        """Return the L2-normalized local features multiplied by their attention map."""
        local_feat = self.multi_atrous(inputs)
        local_feat = self.conv1(local_feat)
        local_feat = tf.nn.relu(local_feat)
        local_feat = self.conv2(local_feat)
        local_feat = self.bn(local_feat)

        # Normalize every spatial descriptor along the channel axis. Without an
        # explicit axis, l2_normalize divides by the norm of the WHOLE tensor
        # (batch included), which makes each sample depend on its batch mates.
        norm_local_feat = tf.math.l2_normalize(local_feat, axis=-1)

        # Attention scores: ReLU -> 1x1 conv -> softplus (strictly positive weights).
        attn_map = tf.nn.relu(local_feat)
        attn_map = self.conv3(attn_map)
        attn_map = activations.softplus(attn_map)
        return norm_local_feat * attn_map

class OrthogonalFusion(layers.Layer):
    """Fuse a global descriptor with the part of the local features orthogonal to it.

    ``call`` expects ``[local_feat, global_feat]`` where ``local_feat`` is a
    channels-last 4-D map ``[batch, height, width, depth]`` and ``global_feat``
    is ``[batch, depth]``. For every spatial position the local descriptor is
    projected onto the global descriptor; the projection is subtracted and the
    remainder is concatenated (channel axis) with the broadcast global
    descriptor, giving ``[batch, height, width, 2 * depth]``.
    """

    def __init__(self, **kwargs):
        super().__init__(name='OrthogonalFusion', **kwargs)

    def call(self, inputs):
        local_feat, global_feat = inputs
        height = local_feat.shape[1]
        width = local_feat.shape[2]
        depth = local_feat.shape[3]

        # Work in channels-last layout throughout: [B, H*W, C]. Reshaping a
        # channel-first [B, C, H*W] tensor straight back to [B, H, W, C] would
        # silently mix the spatial and channel axes.
        flat_local = tf.reshape(local_feat, [-1, height * width, depth])

        # <local, global> for every spatial position: [B, H*W, 1].
        dots = tf.matmul(flat_local, tf.expand_dims(global_feat, axis=2))
        # Scale the global descriptor by each dot product: [B, H*W, C].
        projection = tf.matmul(dots, tf.expand_dims(global_feat, axis=1))
        projection = tf.reshape(projection, [-1, height, width, depth])

        # Divide by ||global||^2 to turn the scaled vector into the projection.
        global_feat_norm = tf.norm(global_feat, ord=2, axis=1)
        projection = projection / tf.reshape(global_feat_norm * global_feat_norm, shape=[-1, 1, 1, 1])

        orthogonal_comp = local_feat - projection

        # Tile the global descriptor over all spatial positions and append it.
        global_feat = tf.expand_dims(tf.expand_dims(global_feat, axis=1), axis=1)
        global_feat = tf.broadcast_to(global_feat, tf.shape(local_feat))
        output = tf.concat([global_feat, orthogonal_comp], axis=-1)
        return output

class GeneralizedMeanPooling2D(layers.Layer):
    """Generalized-mean (GeM) pooling over the two spatial axes with a trainable exponent.

    Args:
        init_norm: initial value of the per-channel exponent ``p``.
        normalize: if True, L2-normalize the pooled vector along axis 1.
        epsilon: small constant added to the spatial mean before taking the root.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, init_norm=3.0, normalize=False, epsilon=1e-6, **kwargs):
        super(GeneralizedMeanPooling2D, self).__init__(name='GeM', **kwargs)
        self.init_norm = init_norm
        self.normalize = normalize
        self.epsilon = epsilon

    def build(self, input_shape):
        # One trainable exponent per input channel.
        self.p = self.add_weight(
            name="norms",
            shape=(input_shape[-1],),
            initializer=initializers.constant(self.init_norm),
            trainable=True,
        )
        super(GeneralizedMeanPooling2D, self).build(input_shape)

    def call(self, inputs):
        # Take the magnitude BEFORE raising to the power: a negative base with a
        # non-integer exponent is NaN, and `p` stops being an integer as soon as
        # it is trained. For inputs where the old expression was defined the
        # result is unchanged.
        powered = tf.abs(inputs) ** self.p
        x = tf.reduce_mean(powered, axis=[1, 2], keepdims=False) + self.epsilon
        x = x ** (1.0 / self.p)
        if self.normalize:
            x = tf.nn.l2_normalize(x, 1)
        return x

    def get_config(self):
        """Return the constructor arguments merged into the base config."""
        config = {'init_norm': self.init_norm, 'normalize': self.normalize, 'epsilon': self.epsilon}
        base_config = super(GeneralizedMeanPooling2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    

class DOLGNet(tf.keras.Model):
    """DOLG feature extractor on top of an ImageNet-initialised EfficientNetB5.

    Two intermediate EfficientNet activations are tapped: ``block5g_add`` feeds
    the local branch and ``block7c_add`` feeds the global branch. Both are merged
    by ``OrthogonalFusion`` and average-pooled into one feature vector.

    Args:
        img_size: side length of the square RGB input image.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, img_size, **kwargs):
        self.img_size = img_size
        super(DOLGNet, self).__init__(name='DOLGNet', **kwargs)

        self.orthogonal_fusion = OrthogonalFusion()
        self.local_branch = DOLGLocalBranch(img_size)
        self.glob_branch_pool = Sequential(
            [layers.GlobalAveragePooling2D(), layers.Dense(1024, activation=None)],
            name='GlobalBranchPooling',
        )

        backbone_input = Input((img_size, img_size, 3))
        base = applications.EfficientNetB5(
            include_top=False,
            weights='imagenet',
            input_shape=(img_size, img_size, 3),
            input_tensor=backbone_input,
        )

        # Every backbone layer is trainable except the batch-normalization layers.
        for backbone_layer in base.layers:
            backbone_layer.trainable = not isinstance(backbone_layer, tf.keras.layers.BatchNormalization)

        # Expose the two tap points as the outputs of a sub-model named 'EfficientNet'.
        tap_outputs = [
            base.get_layer('block5g_add').output,
            base.get_layer('block7c_add').output,
        ]
        self.new_base = Model([base.inputs], tap_outputs, name='EfficientNet')

        self.classifier = Sequential([layers.GlobalAveragePooling2D(name='HeadGAP')], name='Classifiers')

    def call(self, inputs, training=None, **kwargs):
        """Return the pooled fusion of local and global features for ``inputs``."""
        to_local, to_global = self.new_base(inputs)
        fused = self.orthogonal_fusion(
            [self.local_branch(to_local), self.glob_branch_pool(to_global)]
        )
        return self.classifier(fused)

    def build_graph(self):
        """Wrap ``call`` in a functional ``Model`` whose input is named ``cnn_input``."""
        image_input = tf.keras.layers.Input(
            shape=(self.img_size, self.img_size, 3), name="cnn_input"
        )
        return Model(inputs=[image_input], outputs=self.call(image_input))

# Initialize TPU

In [None]:
# Try to attach to a TPU; on any failure fall back to the default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    DEVICE = "TPU"
    batchsize_factor = 32
except Exception as tpu_error:
    # `except Exception` (instead of a bare `except`) keeps the best-effort
    # fallback but no longer swallows KeyboardInterrupt / SystemExit, and the
    # reason for the fallback is reported instead of being silently dropped.
    print(f"TPU not available ({tpu_error!r}); falling back to the default strategy")
    DEVICE = "notTPU"
    strategy = tf.distribute.get_strategy()
    batchsize_factor = 32


AUTOTUNE = tf.data.experimental.AUTOTUNE

# Global batch size = per-replica batch size * number of replicas.
REPLICAS = strategy.num_replicas_in_sync
BATCH_SIZE = batchsize_factor * REPLICAS
GCS_PATH = KaggleDatasets().get_gcs_path('tfrecs-new')
FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.tfrecords')

# Inspect TFRecords

The dataset contains several features:
* All features provided by the competition hosts (accessory, info, collage, action, face, near, human...)
* An image quality assessment feature called "brisque" and calculated with the piq pytorch package (https://github.com/photosynthesis-team/piq)
* A colorfulness score (calculated according to https://www.pyimagesearch.com/2017/06/05/computing-image-colorfulness-with-opencv-and-python/)
* The image size and the size ratio (both scaled between 0 and 1)
* Image brightness and saturation
* The type of the animal (0: dog, 1: cat). For the classification a pretrained ResNet50V2 was used
* The breed of the dog (using an Inception architecture trained on the Kaggle dog breed dataset). The values were one-hot encoded afterwards.

In [None]:
# Open only the first TFRecord file; it is used below to discover the feature keys.
raw_dataset = tf.data.TFRecordDataset(FILENAMES[0])

def _get_keys(raw_dataset):
    """Return the sorted feature names found in the first record of ``raw_dataset``.

    The names are sorted because the iteration order of a protobuf map is not
    specified; a stable order keeps the layout of the stacked feature vector
    built from these names identical from run to run.

    Args:
        raw_dataset: a dataset of serialized ``tf.train.Example`` records.

    Returns:
        A list with the feature names of the first record, in sorted order.

    Raises:
        ValueError: if ``raw_dataset`` yields no record.
    """
    for raw_record in raw_dataset.take(1):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        return sorted(dict(example.features.feature))
    # Previously this path returned None and failed later with an opaque TypeError.
    raise ValueError("Cannot read feature keys: the TFRecord dataset is empty")

# Collect the TFRecord feature names, dropping the image payload, its dimensions
# and the target score: none of those is part of the meta-feature vector.
CONSIDERED_COLS = list(
    filter(
        lambda name: name not in ["image", "image_height", "image_width", "score"],
        _get_keys(raw_dataset),
    )
)
print(f"{len(CONSIDERED_COLS)} features e.g. {CONSIDERED_COLS[0:5]}")

# Lower-case every remaining feature name.
CONSIDERED_COLS = [name.lower() for name in CONSIDERED_COLS]

# Prepare TFRecords and apply light augmentations

In [None]:
# Square resolution every image is resized to; its first entry is also the model's input side length.
DEFAULT_IMG_SIZE = (512,512)

def _parse_image(proto, train):
    """Parse one serialized example into a dict of tensors.

    Args:
        proto: a serialized example.
        train: when truthy, the float ``score`` target is parsed as well.

    Returns:
        The result of ``tf.io.parse_single_example`` for the image bytes, the
        image dimensions, every column in ``CONSIDERED_COLS`` (as float32
        scalars) and, if requested, the score.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_width': tf.io.FixedLenFeature([], tf.int64),
        'image_height': tf.io.FixedLenFeature([], tf.int64),
    }

    # Every meta feature is stored as a scalar float.
    for column in CONSIDERED_COLS:
        schema[column] = tf.io.FixedLenFeature([], tf.float32)

    if train:
        schema["score"] = tf.io.FixedLenFeature([], tf.float32)

    return tf.io.parse_single_example(proto, schema)


def decode_image(image, img_size, normalize=False):
    """Decode JPEG bytes, centre-crop to a square and resize to ``img_size``.

    Args:
        image: JPEG-encoded image bytes.
        img_size: target size passed to ``tf.image.resize``.
        normalize: if True, cast to float16 and divide by 255.

    Returns:
        A tensor reshaped to ``[*img_size, 3]``.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)

    # Crop to the shorter of height/width so the resize does not distort the image.
    dims = tf.shape(decoded)
    side = tf.minimum(dims[-3], dims[-2])
    squared = tf.image.resize_with_crop_or_pad(decoded, side, side)
    resized = tf.image.resize(squared, img_size)

    # EfficientNet shouldn't be normalized as this is done in a custom model layer
    if normalize:
        resized = tf.cast(resized, tf.float16) / 255.0

    return tf.reshape(resized, [*img_size, 3])

def get_image_and_label(proto, train, img_size):
    """Turn one serialized example into ``(features, label)``.

    ``features`` holds the decoded image under ``cnn_input`` and the stacked
    ``CONSIDERED_COLS`` values under ``dense_input``. The label is the score
    divided by 100 when ``train`` is truthy, otherwise ``None``.
    """
    sample = _parse_image(proto, train=train)

    image = decode_image(sample["image"], img_size)
    meta_vector = tf.stack([sample[column] for column in CONSIDERED_COLS])
    features = {"cnn_input": image, "dense_input": meta_vector}

    if not train:
        return features, None

    # Divide the score by 100 to obtain the training target.
    return features, tf.cast(sample["score"], tf.float32) / 100.0

def augmentation(img):
    """Apply random flips, an occasional transpose and a random 90-degree rotation."""
    # Independent random horizontal and vertical flips.
    img = tf.image.random_flip_up_down(tf.image.random_flip_left_right(img))

    # Transpose when the draw exceeds 0.75.
    transpose_draw = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    if transpose_draw > 0.75:
        img = tf.image.transpose(img)

    # One draw selects among three, two, one or zero quarter turns.
    rotation_draw = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    if rotation_draw > 0.75:
        img = tf.image.rot90(img, k=3)
    elif rotation_draw > 0.5:
        img = tf.image.rot90(img, k=2)
    elif rotation_draw > 0.25:
        img = tf.image.rot90(img, k=1)

    return img

def augmentation_wrapper(x, y):
    """Augment the ``cnn_input`` entry of ``x`` in place and return ``(x, y)``."""
    x["cnn_input"] = augmentation(x["cnn_input"])
    return x, y

def scaling_wrapper(x, y):
    """Return only the ``cnn_input`` entry of ``x`` divided by 255; ``y`` is ignored."""
    image = x["cnn_input"]
    return image / 255.0

def get_tfrecord_size(tfrecord):
    """Count the elements of ``tfrecord`` by iterating over it once."""
    count = 0
    for _ in tfrecord:
        count += 1
    return count

def get_training_dataset(tfr, batchsize, img_size=DEFAULT_IMG_SIZE):
    """Build the endlessly repeating, augmented and shuffled training pipeline.

    Args:
        tfr: dataset of serialized examples.
        batchsize: number of samples per batch; incomplete batches are dropped.
        img_size: target image size forwarded to ``get_image_and_label``.

    Returns:
        A prefetched dataset of ``(features, label)`` batches.
    """
    parse = partial(get_image_and_label, train=True, img_size=img_size)

    # Decode and augment in parallel: JPEG decoding on a single thread is
    # otherwise the bottleneck of the input pipeline.
    dataset = tfr.map(parse, num_parallel_calls=AUTOTUNE)
    dataset = dataset.repeat()
    dataset = dataset.map(augmentation_wrapper, num_parallel_calls=AUTOTUNE)
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batchsize, drop_remainder=True)
    return dataset.prefetch(AUTOTUNE)

def get_validation_dataset(tfr, batchsize, img_size=DEFAULT_IMG_SIZE):
    """Build the validation pipeline (no augmentation, no shuffling, single pass).

    Args:
        tfr: dataset of serialized examples.
        batchsize: number of samples per batch.
        img_size: target image size forwarded to ``get_image_and_label``.

    Returns:
        A prefetched dataset of ``(features, label)`` batches. Because
        ``drop_remainder=True`` the final incomplete batch is not evaluated.
    """
    parse = partial(get_image_and_label, train=True, img_size=img_size)
    # Parallel decoding; the element order is unchanged.
    dataset = tfr.map(parse, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batchsize, drop_remainder=True)
    return dataset.prefetch(AUTOTUNE)

def get_normalization_batch(tfr, batchsize, img_size=DEFAULT_IMG_SIZE):
    """Build a dataset of image batches scaled by 1/255, without labels or meta features.

    Args:
        tfr: dataset of serialized examples.
        batchsize: number of images per batch; incomplete batches are dropped.
        img_size: target image size forwarded to ``get_image_and_label``.

    Returns:
        A prefetched dataset whose elements are image batches only.
    """
    parse = partial(get_image_and_label, train=True, img_size=img_size)
    # Decode and rescale in parallel.
    dataset = tfr.map(parse, num_parallel_calls=AUTOTUNE)
    dataset = dataset.map(scaling_wrapper, num_parallel_calls=AUTOTUNE)
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batchsize, drop_remainder=True)
    return dataset.prefetch(AUTOTUNE)

# Building the complete model consisting of a DOLG branch and another branch for including the categorical features

In [None]:
def dolg():
    """Reset Keras' layer-name counters and return the DOLG network as a functional model."""
    tf.keras.backend.reset_uids()
    side = DEFAULT_IMG_SIZE[0]
    return DOLGNet(img_size=side).build_graph()

In [None]:
def dense_net():
    """Build the meta-feature branch: embedding -> flatten -> 10-unit ReLU dense layer.

    The input is named ``dense_input`` and has one entry per column in
    ``CONSIDERED_COLS``.
    """
    n_features = len(CONSIDERED_COLS)
    meta_input = tf.keras.layers.Input(shape=(n_features,), name="dense_input")

    # NOTE(review): an Embedding layer looks rows up by integer index, so the
    # float feature values are used as indices here — confirm this is intended
    # for the continuous features.
    embedded = tf.keras.layers.Embedding(
        input_dim=n_features, output_dim=10, name="emb_2"
    )(meta_input)
    flattened = tf.keras.layers.Flatten()(embedded)
    hidden = tf.keras.layers.Dense(10, activation="relu")(flattened)

    return tf.keras.Model(inputs=meta_input, outputs=hidden)

In [None]:
def stack_model(models):
    """Join several branch models into one regression model with a sigmoid output.

    The branch outputs are concatenated and passed through dense layers of
    1024, 512, 128 and 32 units (ReLU), with 30 % dropout after the first two,
    followed by a single sigmoid unit.

    Args:
        models: Keras models whose ``input`` / ``output`` tensors are combined.

    Returns:
        A ``tf.keras.Model`` taking all branch inputs and producing one value.
    """
    combined_inputs = []
    combined_outputs = []
    for branch in models:
        combined_inputs.append(branch.input)
        combined_outputs.append(branch.output)

    x = tf.keras.layers.concatenate(combined_outputs)

    # Two wide blocks, each followed by dropout.
    x = tf.keras.layers.Dense(1024, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.3)(x)

    # Narrowing tail down to the single prediction.
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    prediction = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    return tf.keras.Model(inputs=combined_inputs, outputs=[prediction])

# Learning rate scheduler

In [None]:
EPOCHS = 15


def lrfn(epoch, bs=BATCH_SIZE, epochs=EPOCHS):
    """Return the learning rate for ``epoch``: polynomial warm-up, then cosine decay.

    The rate ramps from ``LR_START`` towards ``LR_MAX`` during the first
    ``LR_RAMPUP_EPOCHS`` epochs, optionally stays at ``LR_MAX`` for
    ``LR_SUSTAIN_EPOCHS`` epochs and then follows half a cosine wave down to
    ``LR_FINAL``, which is reached at the last epoch and held afterwards.

    Args:
        epoch: zero-based epoch index.
        bs: unused; kept so existing callers keep working.
        epochs: total number of training epochs the schedule is laid out for.

    Returns:
        The learning rate as a Python float.
    """
    import math

    LR_START = 1e-5
    LR_MAX = 1e-4
    LR_FINAL = 5e-5
    LR_RAMPUP_EPOCHS = 3
    LR_SUSTAIN_EPOCHS = 0
    decay_epochs = epochs - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS - 1

    if epoch < LR_RAMPUP_EPOCHS:
        # Interpolate over the distance LR_MAX - LR_START; using the sum would
        # aim the ramp at LR_MAX + 2 * LR_START instead of LR_MAX.
        return LR_START + (LR_MAX - LR_START) * (epoch / LR_RAMPUP_EPOCHS) ** 2.5

    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    epoch_diff = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
    # Clamp the phase to [0, 1]: beyond the last epoch the rate stays at
    # LR_FINAL instead of climbing back up the cosine, and a schedule with no
    # decay epochs no longer divides by zero.
    progress = min(epoch_diff / decay_epochs, 1.0) if decay_epochs > 0 else 1.0
    cosine_weight = (math.cos(progress * math.pi) + 1) / 2
    return LR_FINAL + (LR_MAX - LR_FINAL) * cosine_weight

In [None]:
# Plot the learning-rate schedule for the first 20 epoch indices.
import matplotlib.pyplot as plt
rng = list(range(20))
y = list(map(lrfn, rng))
plt.plot(rng, y);

# Overall architecture

**The model has two different inputs: an image for the CNN part and a feature vector containing the meta information**

In [None]:
# Build one instance of each branch and of the combined model, only to draw the architecture diagram.
_dolg = dolg()
_dense = dense_net()
stacked_model = stack_model([_dolg,_dense])
# Compiled without loss or optimizer; the training models are built separately per fold.
stacked_model.compile()
tf.keras.utils.plot_model(stacked_model, show_shapes=True)

# KFold Splitting and Training

In [None]:
# Split the TFRecord *files* (not single images) into 4 cross-validation folds.
kfold = KFold(n_splits=4, shuffle=True, random_state=0)

# os.listdir returns names in arbitrary order, so sort them: with a fixed
# random_state the fold assignment is then reproducible across runs and
# machines. The suffix filter mirrors the '*.tfrecords' glob used for FILENAMES.
filenames = sorted(f for f in os.listdir("../input/tfrecs-new") if f.endswith(".tfrecords"))

# folds[i]["split"] == (training file names, validation file names)
folds = {}
for fold, (train_idx, val_idx) in enumerate(kfold.split(filenames)):
    folds[fold] = {"split": (np.take(filenames, train_idx), np.take(filenames, val_idx))}

In [None]:
# Dataset options that allow elements to be produced in non-deterministic order.
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

# When running on TPU, initialize the TPU system again before training.
# NOTE(review): presumably this clears TPU state left over from earlier cells — confirm.
if DEVICE =="TPU":
    tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
# Training histories, one entry per trained fold.
cv_history = []


img_size = DEFAULT_IMG_SIZE

# Only the listed folds are trained in this run.
for fold in [3]:

    # Select the TFRecord files of this fold by their base name.
    validation_records = tf.data.TFRecordDataset(
        [f for f in FILENAMES if f.split("/")[-1] in folds[fold]["split"][1]],
        num_parallel_reads=AUTOTUNE)

    train_records = tf.data.TFRecordDataset(
        [f for f in FILENAMES if f.split("/")[-1] in folds[fold]["split"][0]],
        num_parallel_reads=AUTOTUNE)

    # Training records may be read in any order.
    train_records = train_records.with_options(ignore_order)

    validation_data = get_validation_dataset(validation_records, BATCH_SIZE, img_size)

    print(f"Used batchsize: {BATCH_SIZE}")

    train_data = get_training_dataset(train_records, BATCH_SIZE, img_size)

    # Counting requires one full pass over each record set.
    train_size = get_tfrecord_size(train_records)
    validation_size = get_tfrecord_size(validation_records)

    with strategy.scope():

        model1 = dolg()
        model2 = dense_net()
        model = stack_model([model1, model2])

        # get data to set mean and std for the normalization layers
        model.get_layer('EfficientNet').get_layer('normalization').adapt(
            get_normalization_batch(train_records, BATCH_SIZE, img_size)
        )

        # `learning_rate` is the supported argument name; `lr` is a deprecated
        # alias. The value is only a starting point: the scheduler callback
        # below sets the rate at the start of every epoch.
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)

        # Targets are scores scaled to [0, 1], hence cross-entropy on a sigmoid output.
        loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.01)

        model.compile(loss=loss, optimizer=opt, metrics=["mse", tf.keras.metrics.RootMeanSquaredError()])

        model.summary()

        # The lambda passes only the epoch index on to lrfn.
        cb_lr = tf.keras.callbacks.LearningRateScheduler(lambda epoch: lrfn(epoch), verbose=1)

        cb_earlystop = tf.keras.callbacks.EarlyStopping(
            patience=3, restore_best_weights=True, verbose=1)

        params = {
            # Use the same constant the LR schedule is laid out for, so the
            # two can never drift apart.
            "epochs": EPOCHS,
            # The training dataset repeats forever, so an epoch needs an explicit length.
            "steps_per_epoch": train_size // BATCH_SIZE,
            "validation_data": validation_data,
            "callbacks": [cb_earlystop, cb_lr]
        }

        print(f"Fold: {fold}, {train_size} train images {validation_size} validation images")

        history = model.fit(train_data, **params)

        cv_history.append(history.history)
