# Pawpularity Prediction: Vision Transformer from Scratch

## Table of Contents
- Summary
- Setup
- Configuration
- Helpers
- Import datasets
- Data Preprocessing
- Model Development
- Submission
- Reference

## Summary
* Create a Vision Transformer from scratch and modify it so that it can also accept tabular inputs. Tabular inputs are optional: with the Functional API, it's easy to control whether or not the model uses them.
* Change the regression problem into a classification-style problem: normalize the Pawpularity score to the range 0 to 1 and use BinaryCrossEntropy as the loss function.
* Apply Data Augmentation to dataset during Training.


## Setup

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import tensorflow_addons as tfa

## Configuration

In [None]:
class Config:
    """Hyperparameters and dataset constants shared by the whole notebook."""

    # --- Input images ---
    image_size = 128
    input_shape = [image_size, image_size, 3]

    # --- Optimisation ---
    learning_rate = 4e-4
    weight_decay = 1e-4
    batch_size = 128
    num_epochs = 30

    # --- Target ---
    num_classes = 1
    # The Pawpularity column is divided by this value before training and
    # predictions are multiplied by it again for the submission.
    scale_factor = 100.0

    # --- Patch embedding ---
    patch_size = 16
    num_patches = (image_size // patch_size) ** 2
    projection_dim = 64

    # --- Transformer encoder ---
    num_heads = 4
    transformer_layers = 8
    # Widths of the feed-forward MLP inside every transformer block.
    transformer_units = [projection_dim * 2, projection_dim]

    # --- Prediction head ---
    mlp_head_units = [2048, 1024]

    # Metadata columns of train.csv / test.csv fed to the tabular branch.
    tabular_columns = [
        'Subject Focus',
        'Eyes',
        'Face',
        'Near',
        'Action',
        'Accessory',
        'Group',
        'Collage',
        'Human',
        'Occlusion',
        'Info',
        'Blur',
    ]

## Helpers

### Display images

In [None]:
def display_images(images, row_count, column_count):
    """Show ``images`` in a ``row_count`` x ``column_count`` grid without axes.

    Images fill the grid row by row. Cells for which no image is available
    are left blank.

    Args:
        images: Sized, indexable collection of images accepted by ``imshow``.
        row_count: Number of rows in the grid.
        column_count: Number of columns in the grid.
    """
    # squeeze=False keeps ``axs`` two-dimensional even when the grid has a
    # single row or column, so every grid shape is handled the same way.
    _, axs = plt.subplots(row_count, column_count, figsize=(10, 10), squeeze=False)
    available = len(images)
    # ``axs.flat`` walks the grid in row-major order, i.e. the cell at
    # (i, j) receives images[i * column_count + j].
    for position, ax in enumerate(axs.flat):
        if position < available:
            ax.imshow(images[position])
        ax.axis('off')
    plt.show()

### Preprocess images

In [None]:
def preprocess_image(image_url):
    """Read a JPEG file and return it as a resized float32 RGB tensor.

    The decoded pixels are divided by 255 and the image is resized to
    ``Config.image_size`` x ``Config.image_size``.

    Args:
        image_url: Path of the JPEG file to load.

    Returns:
        The preprocessed image tensor.
    """
    target_size = (Config.image_size, Config.image_size)
    raw_bytes = tf.io.read_file(image_url)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.image.resize(scaled, target_size)

### Multi Layer Perceptron

In [None]:
def mlp(x, hidden_units, dropout_rate):
    """Apply a GELU ``Dense`` layer followed by ``Dropout`` for each width.

    Args:
        x: Input tensor.
        hidden_units: Iterable with the number of units of each Dense layer.
        dropout_rate: Dropout rate used after every Dense layer.

    Returns:
        The output tensor of the last Dropout layer, or ``x`` unchanged when
        ``hidden_units`` is empty.
    """
    output = x
    for width in hidden_units:
        hidden = layers.Dense(width, activation=tf.nn.gelu)(output)
        output = layers.Dropout(dropout_rate)(hidden)
    return output

### Custom RMSE function that can calculate Pawpularity Score correctly after Normalization

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error after rescaling both inputs by ``Config.scale_factor``.

    Targets and predictions are multiplied by the scale factor first, so the
    metric is reported on the rescaled score range rather than on the
    normalised values.
    """
    scale = Config.scale_factor
    error = y_true * scale - y_pred * scale
    mean_squared_error = tf.reduce_mean(tf.square(error))
    return tf.sqrt(mean_squared_error)

## Import datasets

In [None]:
# Load the training table, the test table and the submission template.
train, test, sample_submission = map(
    pd.read_csv,
    (
        "../input/petfinder-pawpularity-score/train.csv",
        "../input/petfinder-pawpularity-score/test.csv",
        "../input/petfinder-pawpularity-score/sample_submission.csv",
    ),
)

## Data Preprocessing

In [None]:
# Peek at the first rows of the freshly loaded training table.
train.head()

In [None]:
# Add a "file_path" column holding the location of each row's JPEG image.
for frame, image_dir in (
    (train, "../input/petfinder-pawpularity-score/train/"),
    (test, "../input/petfinder-pawpularity-score/test/"),
):
    # Bind image_dir as a default so each lambda keeps its own directory.
    frame["file_path"] = frame["Id"].apply(
        lambda identifier, image_dir=image_dir: image_dir + identifier + ".jpg"
    )

In [None]:
# Show the first rows again, now including the file_path column.
train.head()

In [None]:
# Plot the distribution of the target score.
train["Pawpularity"].hist()

### Images with High Scores

In [None]:
# Show a random item_width x item_width grid of images scoring at least 90.
item_width = 5
grid_size = item_width ** 2
data = train[train.Pawpularity >= 90]
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only when fewer than grid_size rows qualify,
# so the grid is always completely filled.
chosen_rows = np.random.choice(data.shape[0], grid_size, replace=data.shape[0] < grid_size)
image_urls = data.iloc[chosen_rows]["file_path"]
for images in tf.data.Dataset.from_tensor_slices(image_urls).map(preprocess_image).batch(grid_size):
    display_images(images.numpy(), item_width, item_width)

### Images with Low Scores

In [None]:
# Show a random item_width x item_width grid of images scoring at most 10.
item_width = 5
grid_size = item_width ** 2
data = train[train.Pawpularity <= 10]
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only when fewer than grid_size rows qualify,
# so the grid is always completely filled.
chosen_rows = np.random.choice(data.shape[0], grid_size, replace=data.shape[0] < grid_size)
image_urls = data.iloc[chosen_rows]["file_path"]
for images in tf.data.Dataset.from_tensor_slices(image_urls).map(preprocess_image).batch(grid_size):
    display_images(images.numpy(), item_width, item_width)

### Images with Median Scores

In [None]:
# Show a random item_width x item_width grid of images scoring between 40 and 60.
item_width = 5
grid_size = item_width ** 2
data = train[(train.Pawpularity >= 40) & (train.Pawpularity <= 60)]
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only when fewer than grid_size rows qualify,
# so the grid is always completely filled.
chosen_rows = np.random.choice(data.shape[0], grid_size, replace=data.shape[0] < grid_size)
image_urls = data.iloc[chosen_rows]["file_path"]
for images in tf.data.Dataset.from_tensor_slices(image_urls).map(preprocess_image).batch(grid_size):
    display_images(images.numpy(), item_width, item_width)

### Normalize Pawpularity Score from 0 to 1.

In [None]:
# Divide the target by Config.scale_factor so it can be compared with the
# model's sigmoid output.
# NOTE: re-running this cell divides the column again.
train["Pawpularity"] = train["Pawpularity"] / Config.scale_factor

## Model Development

### Preprocess function

In [None]:
def preprocess(image_url, tabular):
    """Convert one ``(path, row)`` pair into ``((image, features), label)``.

    Args:
        image_url: Path of the image file.
        tabular: 1-D tensor whose first element is the label and whose
            remaining elements are the tabular features.

    Returns:
        A ``((image, features), label)`` tuple.
    """
    label = tabular[0]
    features = tabular[1:]
    return (preprocess_image(image_url), features), label

### Data Augmentation

In [None]:
# Random image augmentations: horizontal flip and zoom.
# RandomRotation(factor=0.02) is currently disabled.
augmentation_steps = [
    layers.Input(Config.input_shape),
    layers.experimental.preprocessing.RandomFlip("horizontal"),
    layers.experimental.preprocessing.RandomZoom(height_factor=0.2, width_factor=0.2),
]
augmentation_layer = keras.Sequential(augmentation_steps)

### Patch Creation Layer

In [None]:
class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length of each square patch.
        **kwargs: Standard ``Layer`` keyword arguments (for example ``name``).
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return a ``[batch, patches, values_per_patch]`` tensor for ``images``."""
        batch_size = tf.shape(images)[0]
        # Window size equals stride, so the patches tile the image without overlap.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # The last axis already holds one flattened patch; merge the patch
        # grid axes into a single sequence axis.
        patch_dims = patches.shape[-1]
        return tf.reshape(patches, [batch_size, -1, patch_dims])

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

Let's understand what the Patch Creation Layer does. It simply splits the image into an NxN grid of patches.

In [None]:
# Pick one random training image, show it, then show the patches cut from it.
index = np.random.choice(train.shape[0])
plt.figure(figsize=(4, 4))
sample_path = tf.constant(train.iloc[index]["file_path"], dtype=tf.string)
image = preprocess_image(sample_path)
print(image.shape)
plt.imshow(np.squeeze(image))
plt.axis("off")

# Add a batch axis and resize to the model's input size before patching.
target_size = (Config.image_size, Config.image_size)
resized_image = tf.image.resize(tf.convert_to_tensor([image]), size=target_size)
print(resized_image.shape)
patches = Patches(Config.patch_size)(resized_image)
print(f"Image size: {Config.image_size} X {Config.image_size}")
print(f"Patch size: {Config.patch_size} X {Config.patch_size}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")

# Draw the patches of the single image on a square grid.
n = int(np.sqrt(patches.shape[1]))
patch_shape = (Config.patch_size, Config.patch_size, 3)
plt.figure(figsize=(4, 4))
for position, patch in enumerate(patches[0], start=1):
    plt.subplot(n, n, position)
    plt.imshow(tf.reshape(patch, patch_shape).numpy())
    plt.axis("off")

### Patch Encoder Layer

In [None]:
class PatchEncoder(layers.Layer):
    """Linearly project patches and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output dimension of the projection and the embedding.
        **kwargs: Standard ``Layer`` keyword arguments (for example ``name``).
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept so that get_config can report it.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus the embedding of each patch index."""
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        return self.projection(patch) + self.position_embedding(positions)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

### Vision Transformer Model

In [None]:
def create_vision_transformer(use_tabular_inputs=False):
    """Build the Vision Transformer that predicts a single sigmoid score.

    The model always declares two inputs, the image and the tabular
    features, but the tabular branch only contributes to the output when
    ``use_tabular_inputs`` is true.

    Args:
        use_tabular_inputs: Whether to concatenate features derived from the
            tabular input with the image features before the output layer.

    Returns:
        An uncompiled ``keras.Model`` taking ``[image, tabular]`` inputs.
    """
    tabular_inputs = tf.keras.Input(len(Config.tabular_columns))
    inputs = layers.Input(shape=Config.input_shape)

    # Image -> augmented image -> patch sequence -> embedded tokens.
    augmented = augmentation_layer(inputs)
    patch_sequence = Patches(Config.patch_size)(augmented)
    tokens = PatchEncoder(Config.num_patches, Config.projection_dim)(patch_sequence)

    for _ in range(Config.transformer_layers):
        # Self-attention on the normalised tokens, with a residual connection.
        normed_tokens = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=Config.num_heads,
            key_dim=Config.projection_dim,
            dropout=0.1,
        )(normed_tokens, normed_tokens)
        after_attention = layers.Add()([attended, tokens])

        # Feed-forward MLP on the normalised result, with a second residual.
        feed_forward = layers.LayerNormalization(epsilon=1e-6)(after_attention)
        feed_forward = mlp(
            feed_forward, hidden_units=Config.transformer_units, dropout_rate=0.1
        )
        tokens = layers.Add()([feed_forward, after_attention])

    # Flatten all tokens into one vector and run the prediction head.
    representation = layers.LayerNormalization(epsilon=1e-6)(tokens)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    features = mlp(representation, hidden_units=Config.mlp_head_units, dropout_rate=0.5)

    if not use_tabular_inputs:
        head_input = features
    else:
        image_x = layers.Dense(128, activation=tf.nn.gelu)(features)
        tabular_x = mlp(tabular_inputs, hidden_units=[16] * 10, dropout_rate=0.5)
        head_input = tf.keras.layers.Concatenate(axis=1)([image_x, tabular_x])

    outputs = layers.Dense(1, activation="sigmoid")(head_input)
    return keras.Model(inputs=[inputs, tabular_inputs], outputs=outputs)

Let's get a big picture of what this model looks like.

In [None]:
# Build a model with the tabular branch enabled and draw its layer graph.
model = create_vision_transformer(use_tabular_inputs=True)
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

This Model accepts images with shape (image_size, image_size, 3) and tabular information (if needed) with shape (12) as input. It generates output with shape (1). 

In [None]:
# Smoke test: feed one random sample through the model and print the output shape.
image = np.random.normal(size=(1, *Config.input_shape))
tabular = np.random.normal(size=(1, len(Config.tabular_columns)))
print(image.shape, tabular.shape)
prediction = model((image, tabular))
print(prediction.shape)

### Model Training
I will use a TensorFlow Dataset here to preprocess and cache tensors. The first epoch is very slow because it is preprocessing the data; after that, it will be much faster.

In [None]:
# 5-fold cross-validation: train one model per fold, keep each fold's best
# checkpoint (lowest validation loss) and its training history.
tf.keras.backend.clear_session()
models = []
historys = []
kfold = KFold(n_splits=5, shuffle=True, random_state=997)
# For the current random state, 5th fold can generate a better validation rmse and faster convergence.
train_best_fold = False
best_fold = 4
# Column 0 is the label; the remaining columns are the tabular features,
# which is the layout `preprocess` expects.
label_and_features = ["Pawpularity"] + Config.tabular_columns
for index, (train_indices, val_indices) in enumerate(kfold.split(train)):
    if train_best_fold and index != best_fold:
        continue
    # KFold yields positional indices, so select the rows by position.
    train_rows = train.iloc[train_indices]
    val_rows = train.iloc[val_indices]
    x_train = train_rows["file_path"]
    tabular_train = train_rows[label_and_features]
    x_val = val_rows["file_path"]
    tabular_val = val_rows[label_and_features]

    checkpoint_path = "model_%d.h5" % (index)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        save_best_only=True,
        save_weights_only=True
    )
    early_stop = tf.keras.callbacks.EarlyStopping(
        min_delta=1e-4,
        patience=20
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        factor=0.3,
        patience=10,
        min_lr=1e-7
    )
    callbacks = [checkpoint, reduce_lr, early_stop]

    loss = tf.keras.losses.BinaryCrossentropy()
    optimizer = tfa.optimizers.AdamW(
        learning_rate=Config.learning_rate,
        weight_decay=Config.weight_decay
    )

    # Cache the decoded samples BEFORE shuffling and batching. Caching after
    # shuffle/batch would store the first epoch's batches and replay them in
    # the same order on every later epoch.
    train_ds = (
        tf.data.Dataset.from_tensor_slices((x_train, tabular_train))
        .map(preprocess)
        .cache()
        .shuffle(512)
        .batch(Config.batch_size)
        .prefetch(1)
    )
    # Validation data is not shuffled, so caching whole batches is fine.
    val_ds = (
        tf.data.Dataset.from_tensor_slices((x_val, tabular_val))
        .map(preprocess)
        .batch(Config.batch_size)
        .cache()
        .prefetch(1)
    )

    # You can choose whether to use tabular inputs
    model = create_vision_transformer(use_tabular_inputs=True)
    model.compile(loss=loss, optimizer=optimizer, metrics=[rmse, "mae", "mape"])
    history = model.fit(train_ds, epochs=Config.num_epochs, validation_data=val_ds, callbacks=callbacks)

    # Plot the training curves of this fold, one figure per metric group.
    for metrics in [("loss", "val_loss"), ("mae", "val_mae", "rmse", "val_rmse"), ("mape", "val_mape")]:
        pd.DataFrame(history.history, columns=metrics).plot()
        plt.show()

    # Restore the best checkpoint before keeping the model for inference.
    model.load_weights(checkpoint_path)
    historys.append(history)
    models.append(model)

## Submission

In [None]:
def preprocess_test_data(image_url, tabular):
    """Convert one test ``(path, features)`` pair into ``((image, features), 0)``.

    Args:
        image_url: Path of the image file.
        tabular: Tabular feature values of the same row.

    Returns:
        A ``((image, tabular), 0)`` tuple.
    """
    image = preprocess_image(image_url)
    # 0 won't be used in prediction, but it's needed in this scenario or the tabular variable is treated as label.
    return (image, tabular), 0

In [None]:
# Batched, cached test pipeline; rows keep the order of the test table.
test_paths = test["file_path"]
test_features = test[Config.tabular_columns]
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_features))
    .map(preprocess_test_data)
    .batch(Config.batch_size)
    .cache()
    .prefetch(1)
)

When we submit the result, don't forget to multiply the predictions by 100.

In [None]:
# Write submission.csv from either the average of all fold models (default)
# or the single model with the lowest recorded validation RMSE.
use_best_result = False
if not use_best_result:
    # Ensemble: average the per-sample predictions of every trained model.
    total_results = [fold_model.predict(test_ds).reshape(-1) for fold_model in models]
    results = np.mean(total_results, axis=0).reshape(-1)
    sample_submission["Pawpularity"] = Config.scale_factor * results
else:
    if train_best_fold:
        # Only one fold was trained, so it is the only model available.
        best_model = models[0]
    else:
        # Scan every epoch of every fold for the lowest validation RMSE;
        # the strict comparison keeps the earliest fold on ties.
        best_fold, best_score = 0, 10e8
        epoch_scores = (
            (fold, score)
            for fold, history in enumerate(historys)
            for score in history.history["val_rmse"]
        )
        for fold, score in epoch_scores:
            if score < best_score:
                best_fold, best_score = fold, score
        print("Best Score:%.2f Best Fold: %d"%(best_score, best_fold + 1))
        best_model = models[best_fold]
    sample_submission["Pawpularity"] = Config.scale_factor * best_model.predict(test_ds).reshape(-1)
# Predictions were rescaled by Config.scale_factor above; save them.
sample_submission.to_csv("submission.csv", index=False)

## Reference
- [Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/)
- [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)

**If you find my notebook useful, give me an upvote.**