In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Object Detection using tf.keras in eager execution mode

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/images/object_detection.ipynb"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/images/object_detection.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/images/object_detection.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

In this tutorial, we will discuss how to detect objects in images. We'll build an object detection neural network using tf.keras in eager execution mode.

## Specific concepts that will be covered:
In the process, we will build practical experience and develop intuition around the following concepts

* Building a model on top of a tf.keras pretrained model. 
* Multi-task (classification and regression) learning. 
* Gradient backpropagation in the eager mode. 

## Things that are important for understanding the object detection pipeline. You can check the source code for more details:
* Convert an object detection dataset in JSON format into TFRecords
* Load tfrecords with tf.data
* Anchor box generations and bounding box operations
* Loss computation with hard negative mining.

## Importing packages

In [None]:
from __future__ import division
from __future__ import print_function

import functools
import os

import tensorflow as tf
import yaml

from object_detection_lib import citycam_dataset_converter
from object_detection_lib import anchor_lib
from object_detection_lib import dataset_lib
from object_detection_lib import bbox_lib
from object_detection_lib import model_lib

## Set up environment for tensorflow

In [None]:
# Quiet TensorFlow's native (C++) logging; '3' is the most restrictive level.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Switch TensorFlow to eager execution. This must run once, before any other
# TensorFlow operation is created in this session.
tf.enable_eager_execution()

## Build the dataset

For this tutorial, we use the public [CityCam](https://www.citycam-cmu.com) dataset, which was released with the paper [Understanding Traffic Density from Large-Scale Web Camera Data](https://arxiv.org/abs/1703.05868). The dataset contains a list of images and an annotation JSON file, a format that is widely used for object detection tasks. The library code contains methods to convert this format to TFRecord, the format used by TensorFlow, and to load the records with tf.data. 

Please follow the [link](https://www.citycam-cmu.com/dataset) and click on the Download Sample Data button. It will direct you to a Google Drive folder. Please download the tar file and set `tarfile_path` below to the downloaded path.

The dataset contains images, bounding box for each vehicle and mask for the specific scene. The annotation only contains the vehicle in the mask region. When training the neural network, the mask will be used to filter out the loss outside of the mask.

The annotation JSON file contains a list of dictionaries with the keys: **labels, mask_name, image_name, bboxes**. The mask_name and image_name values are paths relative to the dataset directory. 

Image with bounding box| Mask
- | -
![image](images/od_image.png) | ![mask](images/od_mask.png)

In [None]:
tarfile_path = "164.tar.gz"  # Replace with your downloaded path.

# Fail loudly if the archive is missing. Raising (instead of calling exit())
# stops "Run All" at this cell with a readable error and leaves the kernel
# alive so the path can be corrected and the cell re-run.
if not os.path.exists(tarfile_path):
    raise IOError(
        "Please make sure the tarfile you entered is correct: {}".format(
            tarfile_path))

# Convert the dataset. The converter returns two file paths, one for the
# training split and one for the validation split (presumably TFRecord files;
# see citycam_dataset_converter for details).
train_filepath, val_filepath = citycam_dataset_converter.convert(tarfile_path)

## Generate anchor boxes for the network.

Multiple anchor boxes (filters) were introduced for object detection in the paper [Faster R-CNN: Towards Real-Time Object
Detection with Region Proposal Networks](https://arxiv.org/pdf/1506.01497.pdf). They help the network detect objects at different scales.

![anchor](images/od_anchor1.png)

In [None]:
# Size of the output grid the anchors are laid out on, and of the input image.
output_h, output_w = 15, 22
input_shape_h, input_shape_w = 240, 352

# Input pixels per output cell along (height, width). With true division
# enabled these are floats.
stride_h = input_shape_h / output_h
stride_w = input_shape_w / output_w
anchor_strides = [stride_h, stride_w]

anchors = anchor_lib.anchor_gen(
    output_h, output_w, anchor_stride=anchor_strides)

## Specify the parameters for loading the data

train_ds and val_ds are built with tf.data. Each element they yield is a dictionary with the following keys:
* **bboxes**: bounding box.
* **labels**: labels.
* **image**: decoded images.
* **mask**: decoded masks.
* **bboxes_preprocessed**: preprocessed bounding boxes. It is generated relative to the anchor boxes.
* **labels_preprocessed**: preprocessed labels. It is generated relative to the anchor boxes.

In [None]:
# Batching.
batch_size = 32

# IoU thresholds and the label values passed to the dataset reader for
# negative and ignored anchors (see dataset_lib.read_data for how they are used).
pos_iou_threshold, neg_iou_threshold = 0.7, 0.3
neg_label_value, ignore_label_value = -1, -2

# Settings shared by every dataset built in this notebook.
shared_reader_kwargs = {
    "anchors": anchors,
    "batch_size": batch_size,
    "pos_iou_threshold": pos_iou_threshold,
    "neg_iou_threshold": neg_iou_threshold,
    "neg_label_value": neg_label_value,
    "ignore_label_value": ignore_label_value,
}
dataset_builder_fn = functools.partial(
    dataset_lib.read_data, **shared_reader_kwargs)

# Training-only settings.
epoch = 30
shuffle_buffer_size = 1000

# NOTE(review): `image_arg` is forwarded as-is to dataset_lib.read_data;
# confirm its meaning against the library.
train_ds = dataset_builder_fn(
    train_filepath,
    epoch=epoch,
    shuffle_buffer_size=shuffle_buffer_size,
    image_arg=True)

# The validation set uses the reader's defaults for the remaining options.
val_ds = dataset_builder_fn(val_filepath)

## Build model
Due to the small sample data size, we use a pretrained ResNet50 model for feature extraction. A classification branch and a regression branch are added after the ResNet50. The classification branch predicts the likelihood of certain objects at a certain location, and the regression branch predicts the size of the objects. Please see the region proposal network in [Faster R-CNN: Towards Real-Time Object
Detection with Region Proposal Networks](https://arxiv.org/pdf/1506.01497.pdf) and [SSD: Single Shot MultiBox Detector](https://arxiv.org/pdf/1512.02325.pdf) for more details.

![](images/od_branches2.png)

In [None]:
def build_model(num_classes, anchor_num_per_output):
    """Builds the detection model: a frozen ResNet50 with two 1x1 conv heads.

    Args:
        num_classes: Number of object classes. The classification head gets
            `num_classes + 1` channels per anchor.
        anchor_num_per_output: Number of anchors per output location.

    Returns:
        A `tf.keras.models.Model` that takes the ResNet50 input and outputs
        the list `[classification_branch, regression_branch]`.
    """
    backbone = tf.keras.applications.resnet50.ResNet50(
        include_top=False, weights="imagenet")

    # Freeze every backbone layer; only the layers added below stay trainable.
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    # Tap an intermediate activation of the backbone as the feature map.
    # NOTE(review): the layer name is auto-generated by Keras, so it may
    # differ across versions or after building a model more than once in the
    # same session - confirm.
    features = backbone.get_layer(name="activation_39").output
    features = tf.keras.layers.Dropout(0.5)(features)

    # Channels per head: (num_classes + 1) scores and 4 box values per anchor.
    classification_channels = (num_classes + 1) * anchor_num_per_output
    regression_channels = 4 * anchor_num_per_output

    classification_branch = tf.keras.layers.Conv2D(
        classification_channels, (1, 1))(features)
    regression_branch = tf.keras.layers.Conv2D(
        regression_channels, (1, 1))(features)

    return tf.keras.models.Model(
        backbone.input, [classification_branch, regression_branch])

## Training
### Specify training configuration and parameters

The model will be saved in the "models" directory.

In [None]:
# Number of class in the dataset.
num_classes = 10

# length of anchor scales (3) * length of aspect ratio (3)
anchor_num_per_output = 9

# Initialize the model
od_model = build_model(num_classes, anchor_num_per_output)

# Initialize for the learning.
learning_rate = 0.001
decay_step = 1000
decay_alpha = 0.000001

# The global step is looked up once; it drives the learning-rate schedule
# below and is passed to the optimizer's apply_gradients in the training loop.
global_step = tf.train.get_or_create_global_step()
decayed_lr = tf.train.cosine_decay(
    learning_rate=learning_rate,
    global_step=global_step,
    decay_steps=decay_step,
    alpha=decay_alpha)

optimizer = tf.train.AdamOptimizer(decayed_lr)

# Initialize for the loss.
classification_loss_weight = 1
regression_loss_weight = 10
negative_ratio = 3

# Bind the fixed loss settings so the training loop only has to pass the
# network output and the preprocessed targets.
compute_loss_fn = functools.partial(
    model_lib.compute_loss,
    num_classes=num_classes,
    c_weight=classification_loss_weight,
    r_weight=regression_loss_weight,
    neg_label_value=neg_label_value,
    ignore_label_value=ignore_label_value,
    negative_ratio=negative_ratio)

# Initialize parameters for training loop.
val_iter = 100   # Run validation every `val_iter` training steps.
val_batch = 5    # Number of validation batches used per validation run.
test_iter = 500
test_batch = 10
score_threshold = 0.5   # Passed to model_lib.predict in the testing cell.
max_prediction = 100    # Passed to model_lib.predict in the testing cell.

train_loss_sum = 0

model_dir = "models"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)



### Training loop

In [None]:
# Look the step counter up once instead of on every iteration.
global_step = tf.train.get_or_create_global_step()

# Running totals for the training loss that is reported every `val_iter`
# steps. They are reset here so that re-running this cell starts clean.
train_loss_sum = 0
train_step_count = 0

for train_index, train_item in enumerate(train_ds):
    # Only the forward pass and the loss need to be recorded on the tape;
    # computing and applying the gradients happens after the context exits.
    with tf.GradientTape() as tape:
        train_network_output = od_model(train_item["image"], training=True)
        train_loss = compute_loss_fn(train_network_output,
                                     train_item["bboxes_preprocessed"],
                                     train_item["labels_preprocessed"])

    # build_model freezes the ResNet50 layers, so only the trainable
    # variables (the layers added on top) receive gradients.
    trainable_variables = od_model.trainable_variables
    grads = tape.gradient(train_loss, trainable_variables)
    optimizer.apply_gradients(
        zip(grads, trainable_variables), global_step=global_step)

    train_loss_sum += train_loss
    train_step_count += 1

    # Every `val_iter` steps, evaluate on at most `val_batch` validation
    # batches and report the mean losses.
    if train_index != 0 and train_index % val_iter == 0:
        val_loss_sum = 0
        val_batch_count = 0
        for val_index, val_item in enumerate(val_ds):
            if val_index >= val_batch:
                break
            val_network_output = od_model(val_item["image"], training=False)
            val_loss = compute_loss_fn(val_network_output,
                                       val_item["bboxes_preprocessed"],
                                       val_item["labels_preprocessed"])
            val_loss_sum += val_loss
            val_batch_count += 1

        # Divide by the number of batches actually accumulated, not by the
        # configured interval: the first report covers one extra step, and
        # the validation set may yield fewer than `val_batch` batches.
        mean_train_loss = train_loss_sum / train_step_count
        mean_val_loss = val_loss_sum / max(val_batch_count, 1)

        print("Loss at step {:04d}: train loss: {:.3f}, val loss: {:.3f}".format(
            train_index, mean_train_loss, mean_val_loss))

        train_loss_sum = 0
        train_step_count = 0

# Persist the trained weights so the testing cell can reload them.
od_model.save_weights(os.path.join(model_dir, "od_model"))

## Testing
Load the model saved from previous step. The results will be saved in the "results" directory. By default it will save 20 results. You can adjust the save_image_number to change the number of images to save. 

In [None]:
od_model.load_weights(os.path.join(model_dir, "od_model"))
test_ds = dataset_builder_fn(val_filepath)

# Number of images saved in the testing process.
save_image_number = 20

save_image_count = 0
save_image_dir = "results"
if not os.path.exists(save_image_dir):
    os.makedirs(save_image_dir)

for test_item in test_ds:
    if save_image_count >= save_image_number:
        break

    test_network_output = od_model(test_item["image"], training=False)
    bbox_list, label_list = model_lib.predict(
        test_network_output,
        mask=test_item["mask"],
        score_threshold=score_threshold,
        neg_label_value=neg_label_value,
        anchors=anchors,
        max_prediction=max_prediction,
        num_classes=num_classes)

    # One (image, boxes, labels) triple per image in the batch.
    for image, bbox, label in zip(test_item["image"], bbox_list, label_list):
        if save_image_count >= save_image_number:
            break

        # label is converted to [0, 9] for training. +1 to match the original
        # label map [1, 10]. Computed per image; it is not drawn below but is
        # available for readers who want to annotate the boxes.
        original_label = label + 1

        # Image is whitened in the preprocess.
        image = image + 0.5
        normalized_bboxes = bbox_lib.normalizing_bbox(
            bbox, input_shape_h, input_shape_w)
        image_with_bboxes = tf.image.draw_bounding_boxes(
            image[tf.newaxis, ...], normalized_bboxes[tf.newaxis, ...])

        # Clip to [0, 1] before scaling so out-of-range values cannot wrap
        # around in the uint8 cast. Assumes the un-whitened image is meant to
        # lie in [0, 1] - TODO confirm against the preprocessing code.
        pixels = tf.clip_by_value(image_with_bboxes[0], 0.0, 1.0)
        encoded_png = tf.image.encode_png(tf.cast(pixels * 255, tf.uint8))

        filepath = tf.constant(
            os.path.join(save_image_dir, '{}.png'.format(save_image_count)))
        tf.write_file(filepath, encoded_png)
        save_image_count += 1

## Results

|result 0 |result 1 | result 2 
|:- | :- | :-
|![alt](images/od_result_0.png) | ![alt](images/od_result_1.png) | ![alt](images/od_result_2.png)