# SSD: Single Shot MultiBox Detector

```
@misc{liu2015ssd,
    title={SSD: Single Shot MultiBox Detector},
    author={Wei Liu and Dragomir Anguelov and Dumitru Erhan and Christian Szegedy and Scott Reed and Cheng-Yang Fu and Alexander C. Berg},
    year={2015},
    eprint={1512.02325},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

# Print the TensorFlow version this notebook is running against.
print('TensorFlow:', tf.__version__)

TensorFlow: 2.2.0-rc2


In [2]:
# The distribution strategy reports how many replicas run in sync; the
# global batch size below is scaled by that count.
strategy = tf.distribute.MirroredStrategy()
print('\nNumber of Accelerators :', strategy.num_replicas_in_sync)

# Shared hyper-parameters; the whole dict is splatted into DefaultBoxes.
config = dict(
    # Network input size; DefaultBoxes reads index 0 as height, 1 as width.
    input_shape=[300, 300],
    # One scale per feature map plus a trailing one, because each level
    # also reads the scale of the following level.
    scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
    # Aspect ratios per feature map, in the same order as `feature_sizes`.
    aspect_ratios=[
        [1 / 2, 1, 2],
        [1 / 2, 1 / 3, 1, 2, 3],
        [1 / 2, 1 / 3, 1, 2, 3],
        [1 / 2, 1 / 3, 1, 2, 3],
        [1 / 2, 1, 2],
        [1 / 2, 1, 2],
    ],
    # Side length of each (square) prediction grid.
    feature_sizes=[38, 19, 10, 5, 3, 1],
    clip_default_boxes=False,
    num_classes=2,
    # 4 samples per replica.
    batch_size=4 * strategy.num_replicas_in_sync,
)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)

Number of Accelerators : 1


In [3]:
def change_box_format(boxes, return_format='xywh'):
    """Convert boxes between corner and centre representations.

    Args:
        boxes: array-like or tensor whose last axis has the four box
            coordinates. Any leading dimensions are preserved.
        return_format: the format to convert *to*.
            'xywh'     - input is [x1, y1, x2, y2]; output is
                         [cx, cy, w, h].
            'x1y1x2y2' - input is [cx, cy, w, h]; output is
                         [x1, y1, x2, y2].

    Returns:
        A float32 tensor with the same shape as `boxes`.

    Raises:
        ValueError: if `return_format` is not one of the two supported
            values (previously this silently returned None).
    """
    # Validate first so a typo fails loudly instead of propagating None.
    if return_format not in ('xywh', 'x1y1x2y2'):
        raise ValueError(
            "return_format must be 'xywh' or 'x1y1x2y2', got {!r}".format(
                return_format))

    boxes = tf.cast(boxes, dtype=tf.float32)
    first, second = boxes[..., 0], boxes[..., 1]
    third, fourth = boxes[..., 2], boxes[..., 3]

    if return_format == 'xywh':
        # Corners -> centre / size.
        return tf.stack([
            (third + first) / 2.0,
            (fourth + second) / 2.0,
            third - first,
            fourth - second,
        ],
                        axis=-1)

    # Centre / size -> corners.
    return tf.stack([
        first - third / 2.0,
        second - fourth / 2.0,
        first + third / 2.0,
        second + fourth / 2.0,
    ],
                    axis=-1)


def compute_iou(boxes1, boxes2):
    """Pairwise intersection-over-union of two sets of centre-format boxes.

    Args:
        boxes1: rank-2 array-like, one [cx, cy, w, h] box per row.
        boxes2: rank-2 array-like, one [cx, cy, w, h] box per row.

    Returns:
        A float32 tensor with one row per box in `boxes1` and one column
        per box in `boxes2`, with every value clipped to [0, 1].
    """
    boxes1 = tf.cast(boxes1, dtype=tf.float32)
    boxes2 = tf.cast(boxes2, dtype=tf.float32)

    corners1 = change_box_format(boxes1, return_format='x1y1x2y2')
    corners2 = change_box_format(boxes2, return_format='x1y1x2y2')

    # Broadcasting the first set against the second yields every pair.
    top_left = tf.maximum(corners1[:, None, :2], corners2[:, :2])
    bottom_right = tf.minimum(corners1[:, None, 2:], corners2[:, 2:])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_wh = tf.maximum(0.0, bottom_right - top_left)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]

    area1 = boxes1[:, 2] * boxes1[:, 3]
    area2 = boxes2[:, 2] * boxes2[:, 3]

    # The floor keeps the division finite for degenerate (zero-area) pairs.
    union_area = tf.maximum(
        tf.expand_dims(area1, axis=1) + area2 - overlap_area, 1e-10)
    return tf.clip_by_value(overlap_area / union_area, 0.0, 1.0)

In [4]:
# Reference only (intentionally not executed): how the 'scales' list in
# `config` was derived.
# smin = 20
# smax = 90
# m = 5
# scales = [0.1]
# for k in range(1, m+2):
#     sl = smin + (smax - smin)//(m - 1) * (k - 1)
#     scales.append(sl/100)
# assert scales == [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]

In [5]:
class DefaultBoxes:
    """Builds the default boxes for every cell of every feature map.

    The boxes are exposed through `boxes` as a float32 tensor of shape
    (num_boxes, 4), each row being [cx, cy, w, h] in input-image pixels.
    Levels are concatenated in the order of `feature_sizes`; within a
    level the boxes of one grid cell are contiguous, listed in the order
    of that level's aspect ratios followed by one extra square box.

    Args:
        input_shape: [height, width] of the network input.
        scales: box scale per feature map, plus one trailing entry; level
            `i` reads both `scales[i]` and `scales[i + 1]`.
        aspect_ratios: one list of width/height ratios per feature map.
        feature_sizes: side length of each feature map (each map is
            treated as a square `size x size` grid).
        clip_boxes: if true, clip the boxes to the image bounds. When
            omitted, the value of the `clip_default_boxes` keyword is used
            (default False), so the notebook's `config` dict can be
            splatted in directly.
        **kwargs: extra keys are accepted and ignored.

    Raises:
        ValueError: if `aspect_ratios` or `scales` is too short for the
            number of feature maps.
    """

    def __init__(self,
                 input_shape,
                 scales,
                 aspect_ratios,
                 feature_sizes,
                 clip_boxes=None,
                 **kwargs):
        if clip_boxes is None:
            # The config dict names this option 'clip_default_boxes'.
            clip_boxes = kwargs.get('clip_default_boxes', False)

        num_levels = len(feature_sizes)
        if len(aspect_ratios) < num_levels:
            raise ValueError(
                'aspect_ratios needs one entry per feature map: got {} for '
                '{} feature maps'.format(len(aspect_ratios), num_levels))
        if len(scales) < num_levels + 1:
            raise ValueError(
                'scales needs len(feature_sizes) + 1 entries: got {} for '
                '{} feature maps'.format(len(scales), num_levels))

        self._input_shape = input_shape
        self._scales = scales
        self._feature_sizes = feature_sizes
        # Pixel distance between neighbouring cell centres as (x, y),
        # derived from the actual input size rather than a fixed 300.
        height, width = input_shape[0], input_shape[1]
        self._strides = [(float(np.ceil(width / size)),
                          float(np.ceil(height / size)))
                         for size in feature_sizes]
        self._aspect_ratios = aspect_ratios
        self._clip_boxes = clip_boxes
        self._default_boxes = []
        self._build_default_boxes()

    def _build_meshgrid(self, feature_size):
        """Return a (feature_size, feature_size, 2) float32 grid of indices.

        The last axis holds the two grid indices, (x, y) under
        tf.meshgrid's default 'xy' indexing.
        """
        meshgrid = tf.stack(tf.meshgrid(tf.range(feature_size),
                                        tf.range(feature_size)),
                            axis=-1)
        return tf.cast(meshgrid, dtype=tf.float32)

    def _get_dims(self, scale, ratio):
        """Return the [w, h] of a box as a float32 tensor of shape (1, 1, 2).

        `ratio` is width / height: the width grows and the height shrinks
        by sqrt(ratio).
        """
        h = self._input_shape[0] * scale / np.sqrt(ratio)
        w = self._input_shape[1] * scale * np.sqrt(ratio)
        wh = tf.constant([w, h], dtype=tf.float32, shape=[1, 1, 2])
        return wh

    def _clip_to_image(self, boxes):
        """Clip [cx, cy, w, h] boxes to the image and return them as xywh."""
        height, width = self._input_shape[0], self._input_shape[1]
        corners = change_box_format(boxes, return_format='x1y1x2y2')
        upper = tf.constant([width, height, width, height], dtype=tf.float32)
        corners = tf.minimum(tf.maximum(corners, 0.0), upper)
        return change_box_format(corners, return_format='xywh')

    def _build_default_boxes(self):
        """Generate the boxes of every level and store them on the instance."""
        per_level = []
        for i, feature_size in enumerate(self._feature_sizes):
            scale = self._scales[i]
            # The extra square box uses the geometric mean of this level's
            # scale and the next level's scale.
            extra_scale = np.sqrt(scale * self._scales[i + 1])

            stride = tf.constant(self._strides[i], dtype=tf.float32)
            centers = (self._build_meshgrid(feature_size) + 0.5) * stride

            dims = [
                self._get_dims(scale, ratio)
                for ratio in self._aspect_ratios[i]
            ]
            # A ratio of 1.0 leaves both sides at input size * extra_scale.
            dims.append(self._get_dims(extra_scale, 1.0))

            level_boxes = []
            for wh in dims:
                wh = tf.tile(wh, multiples=[feature_size, feature_size, 1])
                level_boxes.append(tf.concat([centers, wh], axis=-1))

            # (size, size, boxes_per_cell, 4) -> (size*size*boxes_per_cell, 4)
            level_boxes = tf.stack(level_boxes, axis=2)
            per_level.append(tf.reshape(level_boxes, shape=[-1, 4]))

        boxes = tf.concat(per_level, axis=0)
        if self._clip_boxes:
            boxes = self._clip_to_image(boxes)
        self._default_boxes = boxes

    @property
    def boxes(self):
        """All default boxes as a (num_boxes, 4) tensor of [cx, cy, w, h]."""
        return self._default_boxes


class LabelEncoder:
    """Work-in-progress encoder that will match ground truth to default boxes.

    Args:
        default_boxes: the default boxes to match against; stored as given.
        iou_threshold: IoU threshold stored for the matching step
            (default 0.5).
    """

    def __init__(self, default_boxes, iou_threshold=0.5):
        self._default_boxes = default_boxes
        self._iou_threshold = iou_threshold

    def _match_gt(self, gt_boxes):
        """Match ground-truth boxes to default boxes (not implemented yet).

        Raises:
            NotImplementedError: always, so callers do not mistake the
                unfinished stub's None for a real matching result.
        """
        # TODO: implement the matching step.
        raise NotImplementedError('LabelEncoder._match_gt is not implemented')

In [6]:
def conv_layer(tensor,
               filters,
               kernel_size,
               stride,
               padding='same',
               w_init=tf.initializers.glorot_normal):
    """Create a new Conv2D layer and apply it to `tensor`.

    Args:
        tensor: input to the convolution.
        filters: number of output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: Keras padding mode (default 'same').
        w_init: kernel initializer (default Glorot normal).

    Returns:
        The output of the convolution. No activation is configured on the
        layer.
    """
    layer = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        kernel_initializer=w_init,
    )
    return layer(tensor)


def build_model(num_classes=2):
    """Assemble the SSD network on an ImageNet-pretrained ResNet50 backbone.

    Six feature maps are used: three taken from the backbone and three
    produced by extra convolution pairs stacked on the backbone output.
    A 3x3 convolution head on each map emits `4 + num_classes` values per
    default box.

    Args:
        num_classes: number of class values predicted per box.

    Returns:
        A Keras model named 'SSD' taking 300x300x3 inputs and returning a
        tensor whose last axis has `4 + num_classes` entries, with the
        boxes of all feature maps concatenated along axis 1.
    """
    # Default boxes predicted per cell, one entry per feature map.
    boxes_per_cell = [4, 6, 6, 6, 4, 4]

    backbone = tf.keras.applications.ResNet50(input_shape=[300, 300, 3],
                                              include_top=False,
                                              weights='imagenet')
    tap_names = ['conv3_block4_out', 'conv4_block6_out', 'conv5_block3_out']
    feature_maps = [backbone.get_layer(name).output for name in tap_names]

    # First extra map: 1x1 then a strided 3x3, both 'same'-padded.
    x = conv_layer(backbone.output, 256, 1, 1)
    x = conv_layer(x, 256, 3, 2)
    feature_maps.append(x)

    # Two more extra maps, each from a 1x1 followed by an unpadded 3x3.
    for _ in range(2):
        x = conv_layer(x, 128, 1, 1, padding='valid')
        x = conv_layer(x, 256, 3, 1, padding='valid')
        feature_maps.append(x)

    values_per_box = 4 + num_classes
    outputs = []
    for num_boxes, feature_map in zip(boxes_per_cell, feature_maps):
        head = conv_layer(feature_map, num_boxes * values_per_box, 3, 1)
        # Flatten the grid so every row describes a single default box.
        head = tf.keras.layers.Reshape([-1, values_per_box])(head)
        outputs.append(head)

    prediction = tf.keras.layers.Concatenate(axis=1)(outputs)
    return tf.keras.Model(inputs=[backbone.input],
                          outputs=[prediction],
                          name='SSD')

In [7]:
# Instantiate the network and generate the default boxes from the shared
# `config` dict.
model = build_model(num_classes=2)
default_boxes = DefaultBoxes(**config)

In [8]:
# Smoke test: push one random 300x300x3 input through the network and show
# the prediction tensor.
model(tf.random.normal([1, 300, 300, 3]))

<tf.Tensor: shape=(1, 8732, 6), dtype=float32, numpy=
array([[[ 0.20844515, -1.3278862 ,  0.83444494, -0.65102065,
         -2.9813375 , -1.2271886 ],
        [ 0.08294801, -0.8132075 ,  1.6412677 , -0.8853096 ,
          1.2581884 ,  0.5719473 ],
        [ 1.4424982 , -0.58966994, -1.5887465 , -2.7098904 ,
         -0.600199  ,  0.05868107],
        ...,
        [-0.436692  , -0.65218467, -0.69499856, -0.76058984,
          0.23132701,  0.2934894 ],
        [-0.894344  , -0.15233374,  0.394225  ,  0.47983184,
          0.2652601 ,  0.7091137 ],
        [-0.23398116, -1.0034909 ,  0.75078976, -0.23263337,
         -0.5205121 , -0.6275463 ]]], dtype=float32)>

In [9]:
# Display the generated default boxes (one [cx, cy, w, h] row per box).
default_boxes.boxes

<tf.Tensor: shape=(8732, 4), dtype=float32, numpy=
array([[  4.      ,   4.      ,  21.213203,  42.426407],
       [  4.      ,   4.      ,  30.      ,  30.      ],
       [  4.      ,   4.      ,  42.426407,  21.213203],
       ...,
       [150.      , 150.      , 264.      , 264.      ],
       [150.      , 150.      , 373.3524  , 186.6762  ],
       [150.      , 150.      , 288.37476 , 288.37476 ]], dtype=float32)>

In [None]:
class DecodePredictions(tf.keras.layers.Layer):
    """Work-in-progress layer that will decode raw SSD predictions.

    Args:
        default_boxes: default boxes the predictions refer to; stored as
            given.
        iou_threshold: IoU threshold stored for the decoding step.
        score_threshold: score threshold stored for the decoding step.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, default_boxes, iou_threshold, score_threshold,
                 **kwargs):
        super(DecodePredictions, self).__init__(**kwargs)
        # Keep the constructor arguments so `call` can use them once written.
        self._default_boxes = default_boxes
        self._iou_threshold = iou_threshold
        self._score_threshold = score_threshold

    def call(self, predictions):
        """Decode `predictions` (not implemented yet).

        Raises:
            NotImplementedError: always, until the decoding is written.
        """
        # TODO: implement the decoding step.
        raise NotImplementedError('DecodePredictions.call is not implemented')
    
    
class SSDLocLoss(tf.losses.Loss):
    """Placeholder for the SSD 'loc' loss; `call` is not implemented yet.

    NOTE(review): presumably the box-localization term - confirm once the
    body is written.
    """

class SSDClsLoss(tf.losses.Loss):
    """Placeholder for the SSD 'cls' loss; `call` is not implemented yet.

    NOTE(review): presumably the classification term - confirm once the
    body is written.
    """