# Object Detection API - Your own images!


## Download your own data first

* Potentially find images from wikimedia: https://commons.wikimedia.org/wiki/Main_Page

* Get a few images of the same size, say 512x512
* Download them
* Zip the files and upload them to Google drive
* Next, download them/ or upload them directly here on the left!

## You will fine-tune an SSD object detector (SSD MobileNet V1 FPN 640x640) on your data

Model Zoo where you can find the URL to the new model: https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md


In [None]:
# Delete any existing models directory so the clone below starts from a clean state
# (comment this line out if you want to keep an existing checkout)
!rm -rf ./models/

# clone the Tensorflow Model Garden (shallow clone: --depth 1 fetches only the latest commit)
!git clone --depth 1 https://github.com/tensorflow/models/

In [None]:
# Compile the Object Detection API protocol buffers:
# protoc generates Python modules from every .proto file and writes them
# relative to models/research/ (--python_out=.)
!cd models/research/ && protoc object_detection/protos/*.proto --python_out=.

You will write a file based on the [setup script](https://github.com/tensorflow/models/blob/master/research/object_detection/packages/tf2/setup.py) in the official repo to work with the packages in the current version of Colab. We removed some of the packages that are not needed in this lab to make the installation faster.

In [None]:
%%writefile models/research/setup.py

# Trimmed-down setup script for installing the Object Detection API with pip.
import os
from setuptools import find_packages
from setuptools import setup

# Only the dependencies this notebook needs.
REQUIRED_PACKAGES = [
    'tf-models-official==2.8.0',
    'tensorflow_io'
]

setup(
    name='object_detection',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    include_package_data=True,
    # Ship the object_detection packages plus every package found under ./slim
    packages=(
        [p for p in find_packages() if p.startswith('object_detection')] +
        find_packages(where=os.path.join('.', 'slim'))),
    # The slim packages live in the slim/ subdirectory, so map each one to its folder
    package_dir={
        'datasets': os.path.join('slim', 'datasets'),
        'nets': os.path.join('slim', 'nets'),
        'preprocessing': os.path.join('slim', 'preprocessing'),
        'deployment': os.path.join('slim', 'deployment'),
        'scripts': os.path.join('slim', 'scripts'),
    },
    description='Tensorflow Object Detection Library',
    python_requires='>3.6',
)

In [None]:
# Install the package described by models/research/setup.py (the script you just wrote);
# pip also installs the requirements listed in it
!python -m pip install models/research

## Imports

Let's now import the packages you will use in this assignment.

In [None]:
import matplotlib
import matplotlib.pyplot as plt

import os
import random
import zipfile
import io
import scipy.misc
import numpy as np

import glob
import imageio
from six import BytesIO
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display, Javascript
from IPython.display import Image as IPyImage

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
tf.get_logger().setLevel('ERROR')

# Import Object Detection API packages

# import the label map utility module
from object_detection.utils import label_map_util

# import module for reading and updating configuration files.
from object_detection.utils import config_util

# import module for visualization. use the alias `viz_utils`
from object_detection.utils import visualization_utils as viz_utils

# import module for building the detection model
from object_detection.builders import model_builder

# import module for utilities in Colab
from object_detection.utils import colab_utils

## Utilities

You'll define a couple of utility functions for loading images and plotting detections. This code is provided for you.

In [None]:
def load_image_into_numpy_array(path):
    """Load an image from file into a numpy array.

    Puts image into numpy array to feed into tensorflow graph.
    Note that by convention we put it into a numpy array with shape
    (height, width, channels), where channels=3 for RGB.

    The image is converted to RGB before it is turned into an array, so
    grayscale, palette and RGBA files also produce a 3-channel result
    instead of failing on the reshape.

    Args:
    path: a file path.

    Returns:
    uint8 numpy array with shape (img_height, img_width, 3)
    """

    # Read the raw bytes through a context manager so the file handle is
    # always closed.
    with tf.io.gfile.GFile(path, 'rb') as image_file:
        img_data = image_file.read()

    # Force three channels regardless of the mode the file was saved in.
    image = Image.open(BytesIO(img_data)).convert('RGB')

    # np.array() on a PIL image yields a writable (height, width, 3) copy
    # directly, which avoids the slow per-pixel getdata() round trip.
    return np.array(image, dtype=np.uint8)


def plot_detections(image_np,
                    boxes,
                    classes,
                    scores,
                    category_index,
                    figsize=(12, 16),
                    image_name=None):
    """Draw boxes and labels on a copy of an image, then save or show it.

    The input image is left untouched; annotations are drawn on a copy using
    normalized box coordinates and a `min_score_thresh` of 0.8.

    Args:
    image_np: uint8 numpy array with shape (img_height, img_width, 3)
    boxes: a numpy array of shape [N, 4]
    classes: a numpy array of shape [N]. Note that class indices are 1-based,
          and match the keys in the label map.
    scores: a numpy array of shape [N] or None.  If scores=None, then
          this function assumes that the boxes to be plotted are groundtruth
          boxes and plot all boxes as black with no classes or scores.
    category_index: a dict containing category dictionaries (each holding
          category index `id` and category name `name`) keyed by category indices.
    figsize: size for the figure. Accepted for API compatibility; the body
          does not currently use it.
    image_name: a name for the image file. When given (and non-empty) the
          annotated image is written to this path; otherwise it is shown
          with `plt.imshow`.
    """

    annotated = image_np.copy()

    # Draws in place on `annotated`.
    viz_utils.visualize_boxes_and_labels_on_image_array(
        annotated,
        boxes,
        classes,
        scores,
        category_index,
        use_normalized_coordinates=True,
        min_score_thresh=0.8)

    if not image_name:
        plt.imshow(annotated)
        return

    plt.imsave(image_name, annotated)


## Download your own data

* Potentially find images from wikimedia: https://commons.wikimedia.org/wiki/Main_Page

* Get a few images of the same size, say 512x512
* Download them
* Zip the files and upload them to Google drive
* Next, download them/ or upload them directly here on the left!

If you upload to Google Drive, you need to make sure the zip file is shareable (publicly), then get the file ID by copying the link and extracting the ID part.

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate this Colab session and build a Google Drive client from the
# resulting application-default credentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Drive file ID of the zipped training images (the ID part of the share link)
train_file = '1-8GtxXpzoKUPd9uHE68KvuGxRgFPdM3H'
downloaded = drive.CreateFile({'id': train_file})
downloaded.GetContentFile('Pieds.zip')

# unzip to a local directory; the context manager closes the archive even
# if extraction raises
local_zip = './Pieds.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('./training')

### Visualize the training images

Next, you'll want to inspect the images that you just downloaded. 

* You can inspect the *training* directory (using the `Files` button on the left side of this Colab) to see the filenames of the training images. The paths for the images will look like this:

```
./training/pied-kingfisher-1.jpeg
./training/pied-kingfisher-2.jpeg
./training/pied-kingfisher-3.jpeg
...
```

In [None]:
%matplotlib inline

# assign the name (string) of the directory containing the training images
train_image_dir = './training'

# declare an empty list
train_images_np = []

# run a for loop for each image
for i in range(1, 8):

    # define the path (string) for each image
    image_path = os.path.join(train_image_dir, "pied-kingfisher-" +str(i) + ".jpeg")
    print(image_path)

    # load images into numpy arrays and append to a list
    train_images_np.append(load_image_into_numpy_array(image_path))

## Check the shapes of your images

In [None]:
# Print the shape of every loaded training image
for loaded_image in train_images_np:
    print(loaded_image.shape)

<a name='gt_boxes_definition'></a>
## Prepare data for training

In this section, you will create your ground truth boxes. 


In [None]:
# Define the list of ground truth boxes.
# It starts empty and is handed to the annotation tool as its box storage.
gt_boxes = []

In [None]:
# annotate the training images;
# `gt_boxes` is passed as `box_storage_pointer`, so the drawn boxes are
# presumably written into that list (one entry per image) — confirm against colab_utils
colab_utils.annotate(train_images_np, box_storage_pointer=gt_boxes)

#### View your ground truth box coordinates
Whether you chose to draw your own or use the given boxes, please check your list of ground truth box coordinates.

In [None]:
# print the coordinates of your ground truth boxes
# (each element of `gt_boxes` is printed on its own line)
for gt_box in gt_boxes:
  print(gt_box)

### Define the category index dictionary

You'll need to tell the model which integer class ID to assign to the 'kingfisher' category, and what 'name' to associate with that integer id.

- kf_class_id: By convention, class ID integers start numbering from 1,2,3, onward.
  - If there is ever a 'background' class, it could be assigned the integer 0, but in this case, you're just predicting the one kingfisher class.
  - Since you are just predicting one class (kingfisher), we assign `1` to the kingfisher class ID.

- category_index: we need to define the `category_index` dictionary, which will have the same structure as this:
```
{human_class_id : 
  {'id'  : human_class_id, 
   'name': 'human_so_far'}
}
```
- num_classes: Since you are predicting one class, assign `1` to the number of classes that the model will predict.
  - This will be used during data preprocessing and again when you configure the model.


In [None]:
# The only class this model predicts; class IDs start counting at 1
kf_class_id = 1

# Category description for the kingfisher class, keyed by its class ID
category_index = {
    kf_class_id: {
        'id': kf_class_id,
        'name': 'kingfisher',
    },
}

# The model predicts exactly one class
num_classes = 1


### Data preprocessing
You will now do some data preprocessing so it is formatted properly before it is fed to the model:
- Convert the class labels to one-hot representations
- convert everything (i.e. train images, gt boxes and class labels) to tensors.

This code is provided for you.

In [None]:
# `label_id_offset` shifts every class ID down by a fixed amount so that the
# one-hot labels handed to the model start counting non-background classes
# at index zero.
label_id_offset = 1

# One entry per training example in each of these lists
train_image_tensors = []
gt_classes_one_hot_tensors = []
gt_box_tensors = []

for (train_image_np, gt_box_np) in zip(train_images_np, gt_boxes):

    # image -> float32 tensor with a leading batch dimension
    image_tensor = tf.convert_to_tensor(train_image_np, dtype=tf.float32)
    train_image_tensors.append(tf.expand_dims(image_tensor, axis=0))

    # ground truth boxes -> float32 tensor
    gt_box_tensors.append(tf.convert_to_tensor(gt_box_np, dtype=tf.float32))

    # every box belongs to class 1; subtracting the offset makes it class 0
    one_indexed_classes = np.ones(shape=[gt_box_np.shape[0]], dtype=np.int32)
    zero_indexed_groundtruth_classes = tf.convert_to_tensor(
        one_indexed_classes - label_id_offset)

    # one-hot encode the zero-indexed classes
    one_hot_classes = tf.one_hot(zero_indexed_groundtruth_classes, num_classes)
    gt_classes_one_hot_tensors.append(one_hot_classes)

print('Done prepping data.')

In [None]:
# Display the list of one-hot encoded class tensors (one tensor per training image)
gt_classes_one_hot_tensors

## Visualize the images with their ground truth bounding boxes

You should see your annotated training images with their bounding boxes after running the cell below. 


In [None]:
# define the figure size
plt.figure(figsize=(30, 15))

# Plot every image that has ground truth boxes instead of a hard-coded count
num_to_plot = min(len(train_images_np), len(gt_boxes))

# Keep the original 2x4 grid and only add rows when there are more than 8 images
num_cols = 4
num_rows = max(2, (num_to_plot + num_cols - 1) // num_cols)

# use the `plot_detections()` utility function to draw the ground truth boxes
for idx in range(num_to_plot):
    num_boxes = gt_boxes[idx].shape[0]

    # give boxes a score of 100%: one score per box, so images annotated
    # with several boxes get a score array of matching length
    dummy_scores = np.ones(shape=[num_boxes], dtype=np.float32)

    plt.subplot(num_rows, num_cols, idx + 1)
    plot_detections(
        train_images_np[idx],
        gt_boxes[idx],
        np.ones(shape=[num_boxes], dtype=np.int32),
        dummy_scores, category_index)

plt.show()

### Download checkpoints



In [None]:
# Download the SSD MobileNet V1 FPN 640x640 (COCO17) checkpoint
!wget http://download.tensorflow.org/models/object_detection/tf2/20200711/ssd_mobilenet_v1_fpn_640x640_coco17_tpu-8.tar.gz

# untar (decompress) the tar file
!tar -xf ssd_mobilenet_v1_fpn_640x640_coco17_tpu-8.tar.gz

# move the checkpoint folder into models/research/object_detection/test_data/
!mv ssd_mobilenet_v1_fpn_640x640_coco17_tpu-8/checkpoint models/research/object_detection/test_data/

- In the Colab, on the left side table of contents, click on the folder icon to display the file browser for the current workspace.  
- Navigate to `models/research/object_detection/configs/tf2`.  The folder has multiple .config files.  
- Look for the file corresponding to the checkpoint you downloaded: `ssd_mobilenet_v1_fpn_640x640_coco17_tpu-8.config`
- You can double-click the config file to view its contents. 
- Set the `pipeline_config` to a string that contains the full path to that config file, in other words: `models/research/.../... .config`

In [None]:
# Clear Keras' global state before building a new model
tf.keras.backend.clear_session()

# define the path to the .config file for SSD MobileNet V1 FPN 640x640
# (it matches the checkpoint downloaded above)
pipeline_config = '/content/models/research/object_detection/configs/tf2/ssd_mobilenet_v1_fpn_640x640_coco17_tpu-8.config'

# Load the configuration file into a dictionary
configs = config_util.get_configs_from_pipeline_file(pipeline_config)

# See what configs looks like
configs

## Get the model config

This tells us details about the model

In [None]:
# Read in the object stored at the key 'model' of the configs dictionary
# (.get returns None instead of raising if the key is missing)
model_config = configs.get('model')

# see what model_config looks like
model_config

# Notice something on the 2nd line?


### Modify model_config
- Modify num_classes from the default `90` to the `num_classes` that you set earlier in this notebook.
- Freeze batch normalization 
  - Batch normalization is not frozen in the default configuration.
  - If you inspect the `model_config` object, you'll see that `freeze_batchnorm` is nested under `ssd` just like `num_classes`.
  - Freeze batch normalization by setting the relevant field to `True`.

In [None]:
# Modify the number of classes from its default of 90 to the `num_classes`
# defined earlier in this notebook, so the model always agrees with the
# one-hot labels built during preprocessing
model_config.ssd.num_classes = num_classes

# Freeze batch normalization from False to True
model_config.ssd.freeze_batchnorm = True

# See what model_config now looks like after you've customized it!
model_config

## Model builder

We included this library earlier.

Now we build the model and pass it the configurations

In [None]:
# Build the detection model from the customized config, in training mode
detection_model = model_builder.build(model_config=model_config, is_training=True)

View what you just created

In [None]:
# List the attributes of the model object (look for `_box_predictor` and `_feature_extractor`)
vars(detection_model)

### Isolate layers we want to re-use
You will now isolate the layers of detection_model that you wish to reuse so that you can restore the weights to just those layers.

You'll see that detection_model contains several variables. Two of these will be relevant to you:
```
...
_box_predictor': <object_detection.predictors.convolutional_keras_box_predictor.WeightSharedConvolutionalBoxPredictor at 0x7f5205eeb1d0>,
...
_feature_extractor': <object_detection.models.ssd_resnet_v1_fpn_keras_feature_extractor.SSDResNet50V1FpnKerasFeatureExtractor at 0x7f52040f1ef0>,
```

Your end goal is to create a custom model which reuses parts of, but not all of the layers of RetinaNet (currently stored in ``detection_model.``)

The parts of RetinaNet that you want to reuse are: (here I specifically denoted them as 1 and 2 so the explanations are a bit easier to follow below)

1. Bounding box regression prediction layer (`_box_predictor`)
2. Feature extraction layers (`_feature_extractor`)

The part of RetinaNet that you **will not want to reuse** is the classification prediction layer (since you will define and train your own classification layer specific to your own class, kingfishers in this notebook). The current classification prediction layer was trained on another dataset.

For the parts of RetinaNet that you want to reuse, you will also restore the weights from the checkpoint that you selected.

`detection_model._box_predictor` contains a few things we will use:
1. `_box_prediction_head`
2. `_base_tower_layers_for_heads`

In [None]:
# List the attributes of the box predictor
# (look for `_base_tower_layers_for_heads` and `_box_prediction_head`)
vars(detection_model._box_predictor)

Among the variables listed, a few will be relevant to you:

```
...
_base_tower_layers_for_heads
...
_box_prediction_head
...
```

First of all, where is box_predictor:
https://github.com/tensorflow/models/blob/master/research/object_detection/predictors/convolutional_keras_box_predictor.py 

1. **Let's consider ``_base_tower_layers_for_heads``: in the code we find this:**
https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/object_detection/predictors/convolutional_keras_box_predictor.py#L385
```
    # Stack the base_tower_layers in the order of conv_layer, batch_norm_layer
    # and activation_layer
    base_tower_layers = []
    for i in range(self._num_layers_before_predictor):
      base_tower_layers.extend([conv_layers[i]])
```

So `detection_model._box_predictor._base_tower_layers_for_heads` contains:

* The layers for the prediction before the final bounding box prediction
* The layers for the prediction before the final class prediction.


2. **Let's consider ``_box_prediction_head``: in the code we find this:**
https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/object_detection/predictors/convolutional_keras_box_predictor.py#L248

```
box_prediction_head: The head that predicts the boxes.
```

This points to the bounding box prediction layer, which you'll want to use for your model.

### Now create the checkpoint based on what we want to reuse

``_box_predictor`` has two objects nested in it, namely ``_base_tower_layers_for_heads`` and ``_box_prediction_head``. Since these are nested, we need to create the checkpoints for these separately. We can't create one giant checkpoint. This is why the code below is split into two.

In [None]:
# NOTE(review): `_ssd` is not referenced anywhere else in this notebook —
# confirm the built model really has this attribute; if it does not, this
# cell raises AttributeError and can be removed.
detection_model._ssd

In [None]:
# Checkpoint view of the box predictor that exposes only the two nested
# objects we want restored: the shared tower layers and the box regression
# head. The class prediction head is deliberately left out.
box_predictor = detection_model._box_predictor
tmp_box_predictor_checkpoint = tf.train.Checkpoint(
    _base_tower_layers_for_heads=box_predictor._base_tower_layers_for_heads,
    _box_prediction_head=box_predictor._box_prediction_head,
)

# Checkpoint view of the model: the partial box predictor defined above
# (bounding box regression layer) plus the complete feature extractor.
tmp_model_checkpoint = tf.train.Checkpoint(
    _box_predictor=tmp_box_predictor_checkpoint,
    _feature_extractor=detection_model._feature_extractor,
)

Now let's restore the checkpoint from the local disk based on what we downloaded and what we have defined that we want.

In [None]:
# Prefix of the checkpoint files that were moved into test_data/checkpoint
checkpoint_path = 'models/research/object_detection/test_data/checkpoint/ckpt-0'

# Define a checkpoint whose `model` entry is the partial view of the
# detection model (feature extractor + box regression layers only)
checkpoint = tf.train.Checkpoint(
    model=tmp_model_checkpoint
)

# Restore the selected weights from the checkpoint path. Only part of the
# saved model is being loaded on purpose, so `expect_partial()` silences the
# warnings about checkpoint values that are never used.
checkpoint.restore(checkpoint_path).expect_partial()

## Actually load the weights

As it stands, we have defined which weights should be restored, but they have not actually been loaded yet. The model's layers only create their variables the first time they are called, and the checkpoint restore is deferred until those variables exist. So here, we pass some dummy data through the model to build its variables and trigger the restore. Until that happens, the model has no weights.

In [None]:
# use the detection model's `preprocess()` method and pass a dummy image
# (a batch of one all-zero 640x640 image with 3 channels); it returns the
# preprocessed image together with its shape information
tmp_image, tmp_shapes = detection_model.preprocess(tf.zeros([1, 640, 640, 3]))

In [None]:
# run a prediction with the preprocessed image and shapes;
# the result itself is not used later in the notebook — the call is made
# so that the model runs once on real-shaped input
tmp_prediction_dict = detection_model.predict(tmp_image, tmp_shapes)

In [None]:
# Count the model's trainable variables; a non-zero count shows they now exist
len(detection_model.trainable_variables)

Weights restored!

### Next, some hyper-parameters...

In [None]:
# set the batch_size (number of examples drawn for each training step)
batch_size = 7

# set the number of batches (how many training steps the fine-tuning loop runs)
num_batches = 140

# Set the learning rate
learning_rate = 0.01

# set the optimizer and pass in the learning_rate (SGD with momentum 0.9)
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)

## Decide on which layers to fine-tune

In [None]:
# Inspect the layers of detection_model:
# print index, name, shape and dtype of every trainable variable
for i,v in enumerate(detection_model.trainable_variables):
    print(f"i: {i} \t name: {v.name} \t shape:{v.shape} \t dtype={v.dtype}")

Notice that there are some layers whose names are prefixed with the following:

```
WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalBoxHead
...
WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalClassHead
...
WeightSharedConvolutionalBoxPredictor/BoxPredictionTower
...
WeightSharedConvolutionalBoxPredictor/ClassPredictionTower
...
```
We want to fine-tune these ones:

* The **bounding box head** variables (which predict bounding box coordinates - so we will want to fine-tune this)
* The **class head** variables (which predict the class/category - so we will want to fine-tune this)

And the rest?

* "tower" refers to layers that are before the prediction layer




In [None]:
# Collect the variables to fine-tune: every trainable variable whose name
# starts with the shared prefix of the box head and the class head
to_fine_tune = [
    variable
    for variable in detection_model.trainable_variables
    if variable.name.startswith('WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutional')
]

### Function to perform one training step

In [None]:
# decorate with @tf.function for faster training
@tf.function
def train_step_fn(image_list,
                groundtruth_boxes_list,
                groundtruth_classes_list,
                model,
                optimizer,
                vars_to_fine_tune):
    """A single training iteration.

    Args:
      image_list: A list of [1, height, width, 3] Tensor of type tf.float32.
        Note that the height and width can vary across images, as each image
        is passed through `model.preprocess` inside this function.
      groundtruth_boxes_list: A list of Tensors of shape [N_i, 4] with type
        tf.float32 representing groundtruth boxes for each image in the batch.
      groundtruth_classes_list: A list of Tensors of shape [N_i, num_classes]
        with type tf.float32 representing the one-hot groundtruth classes for
        each image in the batch.
      model: The detection model to train. It must provide
        `provide_groundtruth`, `preprocess`, `predict` and `loss`.
      optimizer: The optimizer used to apply the gradients.
      vars_to_fine_tune: The list of variables that are updated; all other
        variables of the model are left untouched.

    Returns:
      A scalar tensor representing the total loss for the input batch.
    """

    # Provide the ground truth to the model
    model.provide_groundtruth(
        groundtruth_boxes_list=groundtruth_boxes_list,
        groundtruth_classes_list=groundtruth_classes_list
    )

    # Only the forward pass and the loss are recorded on the tape
    with tf.GradientTape() as tape:

        # Preprocess the images one by one
        preprocessed_image_list = []
        true_shape_list = []

        for img in image_list:
            processed_img, true_shape = model.preprocess(img)
            preprocessed_image_list.append(processed_img)
            true_shape_list.append(true_shape)

        # Stack the single-image results into one batch
        preprocessed_image_tensor = tf.concat(preprocessed_image_list, axis=0)
        true_shape_tensor = tf.concat(true_shape_list, axis=0)

        # Make a prediction
        prediction_dict = model.predict(preprocessed_image_tensor, true_shape_tensor)

        # Calculate the total loss (sum of both losses)
        losses_dict = model.loss(prediction_dict, true_shape_tensor)

        total_loss = losses_dict['Loss/localization_loss'] + losses_dict['Loss/classification_loss']

    # Calculate the gradients after leaving the tape context, so the
    # backward pass and the optimizer update are not recorded themselves
    gradients = tape.gradient(total_loss, vars_to_fine_tune)

    # Optimize the model's selected variables
    optimizer.apply_gradients(zip(gradients, vars_to_fine_tune))

    return total_loss

## Actual Training

In [None]:
print('Start fine-tuning!', flush=True)

# Sample indices from the examples that were actually preprocessed. The
# tensor lists can be shorter than `train_images_np` when fewer images were
# annotated than loaded, and indexing past their end would raise IndexError.
num_examples = len(train_image_tensors)

for idx in range(num_batches):
    # Grab keys for a random subset of examples
    all_keys = list(range(num_examples))
    random.shuffle(all_keys)
    example_keys = all_keys[:batch_size]

    # Get the ground truth
    gt_boxes_list = [gt_box_tensors[key] for key in example_keys]
    gt_classes_list = [gt_classes_one_hot_tensors[key] for key in example_keys]

    # get the images
    image_tensors = [train_image_tensors[key] for key in example_keys]

    # Training step (forward pass + backwards pass)
    total_loss = train_step_fn(image_tensors,
                               gt_boxes_list,
                               gt_classes_list,
                               detection_model,
                               optimizer,
                               to_fine_tune
                              )

    # Report the loss every 10 batches
    if idx % 10 == 0:
        print('batch ' + str(idx) + ' of ' + str(num_batches)
        + ', loss=' +  str(total_loss.numpy()), flush=True)

print('Done fine-tuning!')

## Download some test images

Follow same process as for your training images.

In [None]:
# uncomment if you want to delete existing files
!rm -rf ./results

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

train_file = '1s82FBELO3xCCQVYj1OB3mgDq58ZJh2WR'
downloaded = drive.CreateFile({'id': train_file})
downloaded.GetContentFile('Pieds-Test.zip')

# unzip test images
local_zip = './Pieds-Test.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('./results')
zip_ref.close()

## Load them into numpy arrays

In [None]:
# directory that holds the extracted test images
test_image_dir = './results/'

# each entry is one test image with a leading batch dimension
test_images_np = []

# The test files are numbered 1 through 4.
# You only need to do this once
for i in range(1, 5):
    file_name = 'test-' + str(i) + '.jpeg'
    image_path = os.path.join(test_image_dir, file_name)
    print(image_path)

    # load the image and add the batch axis in front
    image_np = load_image_into_numpy_array(image_path)
    test_images_np.append(np.expand_dims(image_np, axis=0))

We have 4 testing images

In [None]:
# Number of test images that were loaded
len(test_images_np)

Each has the following dimensions

In [None]:
# Shape of the first test image, including the leading batch axis added while loading
test_images_np[0].shape

In [None]:
@tf.function
def detect(input_tensor):
    """Run the fine-tuned `detection_model` on one input image.

    The image goes through the model's `preprocess`, `predict` and
    `postprocess` methods, in that order.

    Args:
    input_tensor: A [1, height, width, 3] Tensor of type tf.float32.
      Note that height and width can be anything since the image will be
      immediately resized according to the needs of the model within this
      function.

    Returns:
    A dict containing 3 Tensors (`detection_boxes`, `detection_classes`,
      and `detection_scores`).
    """
    model = detection_model

    # preprocessing returns the image together with its shape information,
    # which both later stages need
    model_input, true_shapes = model.preprocess(input_tensor)
    raw_predictions = model.predict(model_input, true_shapes)

    # postprocess() turns the raw predictions into the final detections
    return model.postprocess(raw_predictions, true_shapes)

## Visualise results

Results will appear in the /results folder

In [None]:
# detected class indices are shifted by this offset before they are plotted
label_id_offset = 1

# first box and first score of every test image
results = {'boxes': [], 'scores': []}

for i, test_image_np in enumerate(test_images_np):
    input_tensor = tf.convert_to_tensor(test_image_np, dtype=tf.float32)
    detections = detect(input_tensor)

    # drop the batch dimension of each output
    image_boxes = detections['detection_boxes'][0]
    image_scores = detections['detection_scores'][0]
    image_classes = (
        detections['detection_classes'][0].numpy().astype(np.uint32)
        + label_id_offset)

    # save the annotated image as ./results/gif_frame_NNN.jpg
    frame_path = "./results/gif_frame_" + ('%03d' % i) + ".jpg"
    plot_detections(
        test_image_np[0],
        image_boxes.numpy(),
        image_classes,
        image_scores.numpy(),
        category_index, figsize=(15, 20), image_name=frame_path)

    # keep only the first detection of this image
    results['boxes'].append(image_boxes[0].numpy())
    results['scores'].append(image_scores[0].numpy())


## Visualise a single result

In [None]:
# Show the saved annotated frame with index 002 (the third test image, since frames are numbered from 000)
display(IPyImage('./results/gif_frame_002.jpg'))
print()