In [None]:
!pip install -U tensorflow-addons

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import cv2
import os
import scipy.io
import shutil

### Hyperparameters

In [None]:
# --- Input geometry -------------------------------------------------------
# Images are fed to the model as image_size x image_size x 3 arrays and are cut
# into patch_size x patch_size tiles.
image_size = 224
patch_size = 32
input_shape = (image_size, image_size, 3)
# Number of tiles per image (tiles per side, squared)
num_patches = (image_size // patch_size) ** 2

# --- Optimisation ---------------------------------------------------------
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 32
num_epochs = 100

# --- Transformer encoder --------------------------------------------------
projection_dim = 64
num_heads = 4
transformer_layers = 4
# Widths of the dense layers inside each transformer block's MLP
transformer_units = [projection_dim * 2, projection_dim]

# --- Output head ----------------------------------------------------------
# Widths of the dense layers applied before the 4-value bounding-box output
mlp_head_units = [2048, 1024, 512, 64, 32]

### Prepare dataset

In [None]:
# Download the Caltech-101 zip through the Keras file cache, rooted at the
# current directory, and have Keras extract it once the download completes.
path_to_download_file = keras.utils.get_file(
    origin="https://data.caltech.edu/tindfiles/serve/e41f5188-0b32-41fa-801b-d1e840915e80/",
    fname="caltech_101_zipped",
    cache_dir="./",
    extract=True,
    archive_format="zip",
)

In [None]:
# Unpack the image tarball found under datasets/caltech-101/ into the current directory.
shutil.unpack_archive(
    filename='datasets/caltech-101/101_ObjectCategories.tar.gz',
    extract_dir='./',
)

In [None]:
# Unpack the annotation tarball found under datasets/caltech-101/ into the current directory.
shutil.unpack_archive(
    filename='datasets/caltech-101/Annotations.tar',
    extract_dir='./',
)

In [None]:
# Directories (relative to the working directory) holding the images and the
# bounding-box annotation files that the following cells read.
path_images, path_annot = (
    '101_ObjectCategories/airplanes/',
    'Annotations/Airplanes_Side_2/',
)

In [None]:
def _regular_files(directory):
  """Return the names of the regular files located directly inside `directory`."""
  names = []
  for entry in os.listdir(directory):
    # Skip sub-directories and anything else that is not a plain file
    if os.path.isfile(os.path.join(directory, entry)):
      names.append(entry)
  return names


# File names only (not full paths); they are joined with the directory when loaded.
image_paths = _regular_files(path_images)
annot_paths = _regular_files(path_annot)

In [None]:
# The loading loop pairs images and annotations by list position, so both
# listings are sorted in place into the same lexicographic order.
# NOTE(review): this assumes the file names sort consistently across the two directories.
for listing in (image_paths, annot_paths):
  listing.sort()

In [None]:
# Preview the first ten entries of each sorted listing to eyeball the pairing.
image_paths[:10], annot_paths[:10]

In [None]:
# Load every annotated image together with its bounding box.
# Images and annotations are paired by position in the two sorted listings.
num_samples = len(annot_paths)
# The first 80% of the samples form the training set, the remainder the test set.
num_train = int(num_samples * 0.8)

images, targets = [], []
for i in range(num_samples):
  # 'box_coord' is consumed below as [top_y, bottom_y, left_x, right_x]
  annot = scipy.io.loadmat(os.path.join(path_annot, annot_paths[i]))['box_coord'][0]
  top_left_x, top_left_y = annot[2], annot[0]
  bottom_right_x, bottom_right_y = annot[3], annot[1]

  image = keras.utils.load_img(os.path.join(path_images, image_paths[i]))
  # Size is captured before any resizing: the box is normalised by the original size
  (w, h) = image.size[:2]

  # Only training images are resized here; test images keep their original
  # resolution and are resized at prediction time.
  if i < num_train:
    image = image.resize((image_size, image_size))

  images.append(keras.utils.img_to_array(image))

  # Store the box as fractions of the image width/height
  targets.append((
       float(top_left_x) / w,
       float(top_left_y) / h,
       float(bottom_right_x) / w,
       float(bottom_right_y) / h
  ))

# Training images all share the same shape, so they stack into one dense array.
x_train = np.asarray(images[:num_train])
y_train = np.asarray(targets[:num_train])

# Test images may differ in size. Stacking differently-shaped arrays with
# np.asarray is an error in recent NumPy releases (and was shape-dependent
# before), so they are stored explicitly in a 1-D object array holding one
# image array per element. Iterating or slicing it still yields the images.
test_images = images[num_train:]
x_test = np.empty(len(test_images), dtype=object)
for j, test_image in enumerate(test_images):
  x_test[j] = test_image
y_test = np.asarray(targets[num_train:])

### MLP layer

In [None]:
def mlp(x, hidden_units, dropout_rate):
  """Stack Dense(GELU) + Dropout pairs on top of `x`.

  Args:
    x: input tensor.
    hidden_units: iterable with the width of each dense layer, in order.
    dropout_rate: rate used by the Dropout layer that follows every dense layer.

  Returns:
    The output tensor of the last Dropout layer (or `x` itself when
    `hidden_units` is empty).
  """
  for width in hidden_units:
    dense_out = layers.Dense(width, activation=tf.nn.gelu)(x)
    x = layers.Dropout(dropout_rate)(dense_out)
  return x

### Patch creation layer

In [None]:
class Patches(layers.Layer):
  """Cuts a batch of images into flattened, non-overlapping square patches.

  Args:
    patch_size: side length, in pixels, of each square patch.
    **kwargs: standard `keras.layers.Layer` arguments (name, dtype, ...).
  """

  def __init__(self, patch_size, **kwargs):
    # Forward the base-layer arguments so the layer can be rebuilt from its config
    super().__init__(**kwargs)
    self.patch_size = patch_size

  def get_config(self):
    """Return the layer config, including `patch_size`, for serialization."""
    config = super().get_config()
    config.update({'patch_size': self.patch_size})
    return config

  def call(self, images):
    """Extract patches from a 4-D image batch.

    Returns:
      A tensor of shape [batch, patches_per_image, values_per_patch].
    """
    # Dynamic batch size, so the layer works with an unknown batch dimension
    batch_size = tf.shape(images)[0]
    # Stride equal to the patch size makes the patches tile without overlap
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, self.patch_size, self.patch_size, 1],
        strides=[1, self.patch_size, self.patch_size, 1],
        rates=[1, 1, 1, 1],
        padding='VALID'
    )
    # Collapse the patch grid into a single sequence axis
    return tf.reshape(patches, [batch_size, -1, patches.shape[-1]])

#### Display patches

In [None]:
# Show the first training image as a reference.
plt.figure(figsize=(4, 4))
plt.imshow(x_train[0].astype('uint8'))
plt.axis('off')

# Run the Patches layer on a batch containing just that image.
patches = Patches(patch_size)(tf.convert_to_tensor([x_train[0]]))
print(f'Image size: {image_size}x{image_size}')
print(f'Patch_size: {patch_size}x{patch_size}')
print(f'{patches.shape[1]} patches per image')
print(f'{patches.shape[-1]} elements per patch')
print(f'Patches shape: {patches.shape}')

# Lay the patches out on an n x n grid (the patch count is a perfect square here).
n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for i, patch in enumerate(patches[0]):
  plt.subplot(n, n, i + 1)
  # Each flattened patch is restored to patch_size x patch_size x 3 for display
  patch_img = tf.reshape(patch, (patch_size, patch_size, 3))
  plt.imshow(patch_img.numpy().astype('uint8'))
  plt.axis('off')

### Patch encoder

In [None]:
class PatchEncoder(layers.Layer):
  """Projects flattened patches and adds a learned position embedding.

  Args:
    num_patches: number of patches per image (size of the position table).
    projection_dim: width of the projected patch vectors and of the embeddings.
    **kwargs: standard `keras.layers.Layer` arguments (name, dtype, ...).
  """

  def __init__(self, num_patches, projection_dim, **kwargs):
    # Forward the base-layer arguments so the layer can be rebuilt from its config
    super().__init__(**kwargs)
    self.num_patches = num_patches
    # Kept so that get_config can report it
    self.projection_dim = projection_dim
    self.projection = layers.Dense(projection_dim)
    self.position_embedding = layers.Embedding(
        input_dim=num_patches, output_dim=projection_dim
    )

  def get_config(self):
    """Return the layer config, including both constructor arguments."""
    config = super().get_config()
    config.update({
        'num_patches': self.num_patches,
        'projection_dim': self.projection_dim,
    })
    return config

  def call(self, patch):
    """Return the projected patches plus the embedding of each patch position."""
    # One index per patch position: 0 .. num_patches - 1
    positions = tf.range(start=0, limit=self.num_patches, delta=1)
    encoded = self.projection(patch) + self.position_embedding(positions)
    return encoded

### Build the ViT model

In [None]:
def create_vit_object_detector(
  input_shape,
  patch_size,
  num_patches,
  projection_dim,
  num_heads,
  transformer_units,
  transformer_layers,
  mlp_head_units
):
  """Assemble a ViT-style Keras model that outputs four values per image.

  Args:
    input_shape: shape of a single input image, passed to `layers.Input`.
    patch_size: side length of the square patches produced by `Patches`.
    num_patches: number of patches per image, passed to `PatchEncoder`.
    projection_dim: width of the patch encoding; also used as the attention key size.
    num_heads: number of attention heads in every transformer block.
    transformer_units: dense-layer widths of the MLP inside each block. Its
      output is added to the block's attention result, so the last width has
      to be compatible with `projection_dim`.
    transformer_layers: number of transformer blocks to stack.
    mlp_head_units: dense-layer widths of the head applied before the output.

  Returns:
    A `keras.Model` mapping an image batch to a 4-value bounding-box output.
  """
  image_input = layers.Input(shape=input_shape)
  patch_sequence = Patches(patch_size)(image_input)
  tokens = PatchEncoder(num_patches, projection_dim)(patch_sequence)

  for _ in range(transformer_layers):
    # Attention half: normalise, self-attend, then add the un-normalised input back
    normed_tokens = layers.LayerNormalization(epsilon=1e-6)(tokens)
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=projection_dim, dropout=0.1
    )
    attended = self_attention(normed_tokens, normed_tokens)
    attention_residual = layers.Add()([attended, tokens])

    # Feed-forward half: normalise, run the MLP, then add the attention result back
    normed_residual = layers.LayerNormalization(epsilon=1e-6)(attention_residual)
    feed_forward = mlp(normed_residual, transformer_units, 0.1)
    tokens = layers.Add()([feed_forward, attention_residual])

  # tokens is [batch_size, num_patches, projection_dim] at this point; normalise
  # it and flatten it to [batch_size, num_patches * projection_dim].
  normalized = layers.LayerNormalization(epsilon=1e-6)(tokens)
  flattened = layers.Flatten()(normalized)
  representation = layers.Dropout(0.3)(flattened)
  print(representation.get_shape())

  # Dense head followed by the four output units describing the bounding box
  features = mlp(representation, mlp_head_units, dropout_rate=0.3)
  bounding_box = layers.Dense(4)(features)

  return keras.Model(inputs=image_input, outputs=bounding_box)


### Run the experiment

In [None]:
def run_experiment(model, learning_rate, weight_decay, batch_size, num_epochs):
  """Compile `model` and train it on the notebook-level `x_train` / `y_train`.

  Args:
    model: Keras model to train (modified in place).
    learning_rate: learning rate for the AdamW optimizer.
    weight_decay: weight decay for the AdamW optimizer.
    batch_size: number of samples per gradient step.
    num_epochs: upper bound on the number of epochs; early stopping may end sooner.

  Returns:
    The `History` object returned by `model.fit`.
  """
  optimizer = tfa.optimizers.AdamW(
      learning_rate=learning_rate, weight_decay=weight_decay
  )

  model.compile(optimizer=optimizer, loss=keras.losses.MeanSquaredError())

  # A weights-only checkpoint needs an actual file name. The bare directory './'
  # produces checkpoint files with an empty base name, and newer Keras releases
  # reject weights-only paths that do not end in '.weights.h5'.
  checkpoint_filepath = './vit_object_detector.weights.h5'
  checkpoint_callback = keras.callbacks.ModelCheckpoint(
      checkpoint_filepath, monitor='val_loss',
      save_best_only=True, save_weights_only=True
  )

  history = model.fit(
      x=x_train, y=y_train,
      batch_size=batch_size,
      epochs=num_epochs,
      # The validation loss drives both checkpointing and early stopping
      validation_split=0.1,
      callbacks=[
        checkpoint_callback, keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
      ]
  )

  return history

In [None]:
vit_object_detector = create_vit_object_detector(
    input_shape, patch_size, num_patches, projection_dim, num_heads,
    transformer_units, transformer_layers, mlp_head_units
)

In [None]:
history = run_experiment(vit_object_detector, learning_rate, weight_decay, batch_size, num_epochs)

### Evaluate the model

In [None]:
def bounding_box_iou(box_predicted, box_truth):
  """Compute the intersection-over-union of two boxes.

  Both boxes are given as [left_x, top_y, right_x, bottom_y]. Widths and
  heights are computed as `max - min + 1`, as in the original implementation.

  A box whose right/bottom edge lies before its left/top edge (which an
  unconstrained regressor can predict) is treated as having zero area instead
  of a negative one, and a zero union yields 0.0 instead of a ZeroDivisionError.

  Args:
    box_predicted: predicted box, indexable with at least four elements.
    box_truth: ground-truth box, same layout.

  Returns:
    The IoU as a float; 0.0 when the boxes do not overlap or the union is empty.
  """
  def _area(left, top, right, bottom):
    # Clamp each side length at zero so inverted boxes cannot yield negative areas
    return max(0, right - left + 1) * max(0, bottom - top + 1)

  intersection_area = _area(
      max(box_predicted[0], box_truth[0]),
      max(box_predicted[1], box_truth[1]),
      min(box_predicted[2], box_truth[2]),
      min(box_predicted[3], box_truth[3]),
  )
  box_predicted_area = _area(*box_predicted[:4])
  box_truth_area = _area(*box_truth[:4])

  union_area = box_predicted_area + box_truth_area - intersection_area
  if union_area <= 0:
    # Both boxes are degenerate: there is nothing to overlap with
    return 0.0

  return intersection_area / float(union_area)
  


In [None]:
import matplotlib.patches as plot_patches

In [None]:
def get_bbox(coords, w, h):
  """Convert relative box coordinates into integer pixel coordinates.

  Args:
    coords: indexable [left_x, top_y, right_x, bottom_y], each expressed as a
      fraction of the image width (x values) or height (y values).
    w: image width used to scale the x values.
    h: image height used to scale the y values.

  Returns:
    A list [left_x, top_y, right_x, bottom_y] with each scaled value passed
    through `int()`.
  """
  # x values scale with the width, y values with the height
  scales = (w, h, w, h)
  return [int(coords[k] * scales[k]) for k in range(4)]

In [None]:
def draw_bbox(bbox, ax, is_preds):
  """Draw a box outline on `ax` and describe it in the axis label.

  Args:
    bbox: [left_x, top_y, right_x, bottom_y] in the coordinates of the image
      shown on `ax`.
    ax: matplotlib axes to draw on.
    is_preds: True to label the box as 'Predicted', False to label it 'Target'.
  """
  top_left_x, top_left_y = bbox[:2]
  bottom_right_x, bottom_right_y = bbox[2:]
  # Rectangle takes the anchor corner plus width and height, not two corners
  rect = plot_patches.Rectangle(
      (top_left_x, top_left_y),
      bottom_right_x - top_left_x,
      bottom_right_y - top_left_y,
      facecolor='none',
      edgecolor='red',
      linewidth=1
  )

  label = 'Predicted' if is_preds else 'Target'
  ax.add_patch(rect)
  # The label lists all four coordinates; the last one is the bottom-right y
  # (it previously repeated the bottom-right x).
  corner_text = ', '.join(
      str(value)
      for value in (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
  )
  ax.set_xlabel(label + ': ' + corner_text)

In [None]:
# Running sum of the per-image IoU; the following cell divides it by the image count.
mean_iou = 0.0
for i, test_image in enumerate(x_test[:10]):
  fig, (ax_pred, ax_truth) = plt.subplots(1, 2, figsize=(15, 15))

  # Left panel will carry the predicted box, right panel the ground truth
  ax_pred.imshow(test_image.astype('uint8'))
  ax_truth.imshow(test_image.astype('uint8'))

  # Resize to the model's input size and add the batch axis before predicting
  model_input = np.expand_dims(
      cv2.resize(test_image, (image_size, image_size)), axis=0
  )
  preds = vit_object_detector.predict(model_input)[0]

  # Boxes are relative, so they are scaled back with the original image size
  h, w = test_image.shape[:2]
  box_predicted = get_bbox(preds, w, h)
  draw_bbox(box_predicted, ax_pred, is_preds=True)

  box_truth = get_bbox(y_test[i], w, h)
  draw_bbox(box_truth, ax_truth, is_preds=False)

  mean_iou += bounding_box_iou(box_predicted, box_truth)

In [None]:
# mean_iou holds the IoU sum accumulated over the evaluated test images;
# divide by how many images were evaluated to report the average.
num_evaluated = len(x_test[:10])
print(f'mean_iou: {mean_iou / num_evaluated}')