In [None]:
# https://github.com/pierluigiferrari/ssd_keras/blob/master/models/keras_ssd7.py

In [1]:
import numpy as np
from keras.models import Model
from keras.layers import Input, Lambda, Conv2D, MaxPooling2D, BatchNormalization, ELU, Reshape, Concatenate, Activation
from keras.regularizers import l2
import keras.backend as K


In [2]:
# ---------------------------------------------------------------------------
# Anchor box configuration
# ---------------------------------------------------------------------------

# Optional list with one aspect-ratio list per predictor layer.
# While this is falsy, the global list below is used for every predictor layer.
aspect_ratios_per_layer = None

# Aspect ratios for which anchor boxes are generated; valid for all predictor layers.
# Choose them so the anchor shapes roughly match the shapes of the objects to detect.
# (Previously tried: [0.5, 1.0, 2.0].)
aspect_ratios_global = [2.65, 2.85, 3.25]

# Number of predictor conv layers in the network.
n_predictor_layers = 4

# 5 object classes plus 1 background class.
n_classes = 5 + 1

# Only relevant when an aspect-ratio list contains 1. If True, a second box is generated for
# aspect ratio 1, using the geometric mean of the layer's scaling factor and the next bigger one.
two_boxes_for_ar1 = True

steps = None
offsets = None

# Resolve the aspect ratios of each predictor layer (only needed by the anchor box layers).
aspect_ratios = aspect_ratios_per_layer or [aspect_ratios_global] * n_predictor_layers

print('aspect ratios = ', aspect_ratios)
print('==============================================================================')

# Number of boxes predicted per cell for each predictor layer; this determines how many
# channels the predictor layers need.
if aspect_ratios_per_layer:
    n_boxes = []
    for ar in aspect_ratios_per_layer:
        # One additional box when aspect ratio 1 is present and the second AR-1 box is enabled.
        n_boxes.append(len(ar) + (1 if (1 in ar and two_boxes_for_ar1) else 0))
else:
    # Only a global list was given, so every predictor layer gets the same number of boxes.
    if 1 in aspect_ratios_global and two_boxes_for_ar1:
        boxes_per_cell = len(aspect_ratios_global) + 1
    else:
        boxes_per_cell = len(aspect_ratios_global)
        print('++++++++++++++++++++++++  n_boxes = ', boxes_per_cell)
    n_boxes = [boxes_per_cell] * n_predictor_layers
    print('------------------------  n_boxes = ', n_boxes)

# Unset steps/offsets become one `None` placeholder per predictor layer.
steps = [None] * n_predictor_layers if steps is None else steps
offsets = [None] * n_predictor_layers if offsets is None else offsets
print(steps)
print(offsets)

aspect ratios =  [[2.65, 2.85, 3.25], [2.65, 2.85, 3.25], [2.65, 2.85, 3.25], [2.65, 2.85, 3.25]]
++++++++++++++++++++++++  n_boxes =  3
------------------------  n_boxes =  [3, 3, 3, 3]
[None, None, None, None]
[None, None, None, None]


In [3]:
## Base Model

# L2 regularization factor passed to every convolution kernel (a factor of 0.0 adds no penalty).
l2_reg = 0.0

# Fixed 608x608, 3-channel input.
# Generic form: Input(shape=(img_height, img_width, img_channels)).
x = Input(shape=(608, 608, 3))

# NOTE: the reference implementation optionally inserts Lambda layers at this point
# (identity, mean subtraction, stddev division, channel swap). None of them is applied here;
# the convolutions consume `x` directly.


def _conv_bn_elu(inputs, filters, kernel_size, index):
    """Apply one backbone stage: Conv2D -> BatchNormalization -> ELU.

    Args:
        inputs: Tensor fed into the stage.
        filters: Number of output channels of the convolution.
        kernel_size: Convolution kernel size as a (height, width) tuple.
        index: Stage number; the layers are named 'conv<index>', 'bn<index>' and 'elu<index>'.

    Returns:
        The output tensor of the ELU activation.
    """
    stage = Conv2D(filters, kernel_size, strides=(1, 1), padding="same", kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg), name='conv{}'.format(index))(inputs)
    # axis=3 is the channel axis of the channels-last (batch, height, width, channels) activations.
    stage = BatchNormalization(axis=3, momentum=0.99, name='bn{}'.format(index))(stage)
    return ELU(name='elu{}'.format(index))(stage)


# Seven stages; every stage except the last is followed by a 2x2 max-pool.
conv1 = _conv_bn_elu(x, 32, (5, 5), 1)
pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1)

conv2 = _conv_bn_elu(pool1, 48, (3, 3), 2)
pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2)

conv3 = _conv_bn_elu(pool2, 64, (3, 3), 3)
pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3)

conv4 = _conv_bn_elu(pool3, 64, (3, 3), 4)
pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4)

conv5 = _conv_bn_elu(pool4, 48, (3, 3), 5)
pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5)

conv6 = _conv_bn_elu(pool5, 48, (3, 3), 6)
pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6)

conv7 = _conv_bn_elu(pool6, 32, (3, 3), 7)

model = Model(inputs=x, outputs=conv7)

# `summary()` prints the table itself and returns None, so it must not be wrapped in print()
# (doing so appends a stray "None" line to the output).
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 608, 608, 3)]     0         
_________________________________________________________________
conv1 (Conv2D)               (None, 608, 608, 32)      2432      
_________________________________________________________________
bn1 (BatchNormalization)     (None, 608, 608, 32)      128       
_________________________________________________________________
elu1 (ELU)                   (None, 608, 608, 32)      0         
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 304, 304, 32)      0         
_________________________________________________________________
conv2 (Conv2D)               (None, 304, 304, 48)      13872     
_________________________________________________________________
bn2 (BatchNormalization)     (None, 304, 304, 48)     

In [4]:

# New input tensor: the whole graph below is rebuilt on this input, with the same layer
# names and hyper-parameters as the base-model cell.
x = Input(shape=(608, 608, 3))

# The reference implementation's optional Lambda preprocessing layers (identity, mean
# subtraction, stddev division, channel swap) are not applied; `x` feeds conv1 directly.


def _ssd7_backbone(inputs):
    """Build the seven Conv2D -> BatchNormalization -> ELU stages with 2x2 max-pooling in between.

    Args:
        inputs: Input tensor of the network.

    Returns:
        A pair (stage_outputs, pooled_outputs): the seven ELU output tensors in stage order,
        and the six max-pooled tensors that follow stages 1 to 6.
    """
    # (filters, kernel size) of each stage, in order.
    stage_spec = [(32, (5, 5)), (48, (3, 3)), (64, (3, 3)), (64, (3, 3)),
                  (48, (3, 3)), (48, (3, 3)), (32, (3, 3))]
    stage_outputs, pooled_outputs = [], []
    tensor = inputs
    for idx, (filters, kernel_size) in enumerate(stage_spec, start=1):
        tensor = Conv2D(filters, kernel_size, strides=(1, 1), padding="same", kernel_initializer='he_normal',
                        kernel_regularizer=l2(l2_reg), name='conv%d' % idx)(tensor)
        # axis=3: channel axis of the (batch, height, width, channels) activations.
        tensor = BatchNormalization(axis=3, momentum=0.99, name='bn%d' % idx)(tensor)
        tensor = ELU(name='elu%d' % idx)(tensor)
        stage_outputs.append(tensor)
        # No pooling layer after the final stage.
        if idx < len(stage_spec):
            tensor = MaxPooling2D(pool_size=(2, 2), name='pool%d' % idx)(tensor)
            pooled_outputs.append(tensor)
    return stage_outputs, pooled_outputs


(conv1, conv2, conv3, conv4, conv5, conv6, conv7), (pool1, pool2, pool3, pool4, pool5, pool6) = _ssd7_backbone(x)

# Convolutional predictor layers on top of stages 4, 5, 6 and 7.
# Each of these stages gets two predictors: one for class prediction (classification)
# and one for box coordinate prediction (localization).
_predictor_inputs = (conv4, conv5, conv6, conv7)

# Classification: `n_classes` confidence values are predicted per box, hence depth `n_boxes * n_classes`.
# Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
classes4, classes5, classes6, classes7 = [
    Conv2D(n_boxes[i] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal',
           kernel_regularizer=l2(l2_reg), name='classes%d' % (i + 4))(feature_map)
    for i, feature_map in enumerate(_predictor_inputs)
]

# Localization: 4 box coordinates are predicted per box, hence depth `n_boxes * 4`.
# Output shape of `boxes`: `(batch, height, width, n_boxes * 4)`
boxes4, boxes5, boxes6, boxes7 = [
    Conv2D(n_boxes[i] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal',
           kernel_regularizer=l2(l2_reg), name='boxes%d' % (i + 4))(feature_map)
    for i, feature_map in enumerate(_predictor_inputs)
]

# One model per predictor output, all sharing the input `x`, so each head can be inspected on its own.
model_1_class, model_2_class, model_3_class, model_4_class = [
    Model(inputs=x, outputs=head) for head in (classes4, classes5, classes6, classes7)
]

model_1_boxes, model_2_boxes, model_3_boxes, model_4_boxes = [
    Model(inputs=x, outputs=head) for head in (boxes4, boxes5, boxes6, boxes7)
]

In [5]:
# Print the layer-by-layer summary of the model that maps the input `x` to the `classes4` predictor output.
model_1_class.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 608, 608, 3)]     0         
_________________________________________________________________
conv1 (Conv2D)               (None, 608, 608, 32)      2432      
_________________________________________________________________
bn1 (BatchNormalization)     (None, 608, 608, 32)      128       
_________________________________________________________________
elu1 (ELU)                   (None, 608, 608, 32)      0         
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 304, 304, 32)      0         
_________________________________________________________________
conv2 (Conv2D)               (None, 304, 304, 48)      13872     
_________________________________________________________________
bn2 (BatchNormalization)     (None, 304, 304, 48)     

In [14]:
# Anchor box scaling factors, linearly spaced between `min_scale` and `max_scale`.
# The array holds one more element than there are predictor layers: the first `k` values are
# the scaling factors of the `k` predictor layers, and the final value is only used for the
# second aspect-ratio-1 box of the last predictor layer when `two_boxes_for_ar1` is `True`.
# That trailing value must be present even if it ends up unused.
# All scaling factors must be greater than zero.

min_scale = 0.1
max_scale = 0.9
scales = np.linspace(start=min_scale, stop=max_scale, num=n_predictor_layers + 1)
print(scales)

[0.1 0.3 0.5 0.7 0.9]


In [6]:
# Per-coordinate variances: 4 floats > 0. According to the reference description, the anchor
# box offset of each coordinate is divided by its respective variance value.
# Built directly as an array so the name never refers to a plain list.
variances = np.array([1.0, 1.0, 1.0, 1.0])

In [7]:
# Sanity check: show the resolved per-layer aspect ratios, then `steps` and `offsets` on separate lines.
print('aspect ratios = ', aspect_ratios)
print(steps, offsets, sep='\n')

aspect ratios =  [[2.65, 2.85, 3.25], [2.65, 2.85, 3.25], [2.65, 2.85, 3.25], [2.65, 2.85, 3.25]]
[None, None, None, None]
[None, None, None, None]


In [8]:
# Show the aspect-ratio list of each predictor layer, one list per output line.
for ar in aspect_ratios:
    print(ar)

[2.65, 2.85, 3.25]
[2.65, 2.85, 3.25]
[2.65, 2.85, 3.25]
[2.65, 2.85, 3.25]


In [None]:
def anchorboxes(img_height,img_width,this_scale,next_scale,aspect_ratiostwo_boxes_for_ar1=True,
                 this_steps=None,this_offsets=None,clip_boxes=False,variancescoords='centroids',
                 normalize_coords=False):
    
    # Compute box width and height for each aspect ratio
    # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
    size = min(img_height, img_width)
    # Compute the box widths and heights for all aspect ratios
    wh_list = []
    for ar in aspect_ratios:
        if (ar == 1):
            # Compute the regular anchor box for aspect ratio 1.
            box_height = box_width = this_scale * size
            wh_list.append((box_width, box_height))
            if two_boxes_for_ar1:
                
                # Compute one slightly larger version using the geometric mean of this scale value and the next.
                box_height = box_width = np.sqrt(this_scale * next_scale) * size
                wh_list.append((box_width, box_height))
        else:
            box_height = this_scale * size / np.sqrt(ar)
            box_width = this_scale * size * np.sqrt(ar)
            wh_list.append((box_width, box_height))
    wh_list = np.array(wh_list)