In [1]:
######################################################################################
# Copyright (c) 2023, 2024, Prof. Radhamadhab Dalai, ITER, Siksha O Anusandhan University,
# Odisha, India
# Author's email address: radhamadhabdalai@soa.ac.in
#####################################################################################

import tensorflow as tf


def xywh_to_x1x2y1y2(box):
    """
    Convert centre-format boxes into corner-format boxes.

    inputs:
    box: tensor whose last axis starts with (x_center, y_center, width, height);
         leading axes are kept untouched

    outputs:
    tensor with a last axis of size 4 laid out as (x1, y1, x2, y2).
    Note: despite the function name, the minimum corner (x1, y1) comes first
    and the maximum corner (x2, y2) second.
    """
    center = box[..., 0:2]
    half_size = box[..., 2:4] / 2

    min_corner = center - half_size
    max_corner = center + half_size
    return tf.concat([min_corner, max_corner], axis=-1)


def xywh_to_y1x1y2x2(box):
    """
    Convert centre-format boxes into corner-format boxes with y listed first.

    inputs:
    box: tensor whose last axis starts with (x_center, y_center, width, height)

    outputs:
    tensor with a last axis of size 4 laid out as (y1, x1, y2, x2)
    """
    # swap the axis order up front so the corner arithmetic is a single step
    center_yx = tf.concat([box[..., 1:2], box[..., 0:1]], axis=-1)
    size_hw = tf.concat([box[..., 3:4], box[..., 2:3]], axis=-1)
    half_hw = size_hw / 2

    return tf.concat([center_yx - half_hw, center_yx + half_hw], axis=-1)


def broadcast_iou(box_a, box_b):
    """
    Calculate the IoU between every box in box_a and every box in box_b in a
    broadcast way.
    Used this implementation as reference:
    https://github.com/dmlc/gluon-cv/blob/c3dd20d4b1c1ef8b7d381ad2a7d04a68c5fa1221/gluoncv/nn/bbox.py#L206

    inputs:
    box_a: a tensor full of boxes, eg. (B, N, 4), box is in x1y1x2y2
    box_b: another tensor full of boxes, eg. (B, M, 4), same layout

    outputs:
    iou: tensor of shape (B, N, M); iou[b, n, m] is the IoU of box_a[b, n]
         and box_b[b, m]. A 1e-7 term in the denominator keeps the ratio
         finite when both boxes have zero area.
    """
    # (B, N, 1, 4) and (B, 1, M, 4): every elementwise op below broadcasts the
    # pair to (B, N, M, ...) implicitly, so no explicit broadcast copy is needed
    box_a = tf.expand_dims(box_a, -2)
    box_b = tf.expand_dims(box_b, -3)

    # (B, N, 1, 1) and (B, 1, M, 1)
    al, at, ar, ab = tf.split(box_a, 4, -1)
    bl, bt, br, bb = tf.split(box_b, 4, -1)

    # (B, N, M, 1)
    left = tf.math.maximum(al, bl)
    right = tf.math.minimum(ar, br)
    top = tf.math.maximum(at, bt)
    bot = tf.math.minimum(ab, bb)

    # (B, N, M, 1)
    # only the lower bound is clamped: disjoint boxes give a negative extent.
    # An upper clamp (previously 1) would shrink the intersection, but not the
    # areas, for boxes wider/taller than 1 and therefore under-report the IoU.
    iw = tf.math.maximum(right - left, 0)
    ih = tf.math.maximum(bot - top, 0)
    i = iw * ih

    # (B, N, 1, 1) + (B, 1, M, 1) - (B, N, M, 1) -> (B, N, M, 1)
    area_a = (ar - al) * (ab - at)
    area_b = (br - bl) * (bb - bt)
    union = area_a + area_b - i

    # (B, N, M)
    iou = tf.squeeze(i / (union + 1e-7), axis=-1)

    return iou


def binary_cross_entropy(logits, labels):
    """
    Element-wise binary cross entropy.

    inputs:
    logits: predicted values; they are clipped into [1e-7, 1 - 1e-7] and fed
            straight to log(), i.e. they are treated as probabilities rather
            than pre-sigmoid scores despite the parameter name
    labels: target values, broadcastable against `logits`

    outputs:
    tensor of per-element losses (no reduction is applied)
    """
    epsilon = 1e-7
    # keep log() away from 0 so the loss stays finite
    probs = tf.clip_by_value(logits, epsilon, 1 - epsilon)
    positive_term = labels * tf.math.log(probs)
    negative_term = (1 - labels) * tf.math.log(1 - probs)
    return -(positive_term + negative_term)


2024-05-11 10:40:19.670044: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 10:40:19.750406: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 10:40:20.332318: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 10:40:20.336666: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import tensorflow as tf

# from utils import broadcast_iou, xywh_to_x1x2y1y2


class Postprocessor(object):
    """
    Merges the decoded outputs of all YOLO scales and runs non-maximum
    suppression, producing a fixed number of detection slots per image.
    """

    def __init__(self, iou_thresh, score_thresh, max_detection=100):
        self.max_detection = max_detection
        self.score_thresh = score_thresh
        self.iou_thresh = iou_thresh

    def __call__(self, raw_yolo_outputs):
        """
        inputs:
        raw_yolo_outputs: iterable with one entry per scale; entry[0] holds
            boxes in xywh, entry[1] objectness and entry[2] class probabilities

        outputs:
        (boxes, scores, classes, valid_detections) as returned by
        `batch_non_maximum_suppression`
        """
        per_scale_boxes = []
        per_scale_objectness = []
        per_scale_class_probs = []

        for scale_output in raw_yolo_outputs:
            scale_boxes = scale_output[0]
            scale_objectness = scale_output[1]
            scale_class_probs = scale_output[2]

            batch_size = tf.shape(scale_boxes)[0]
            num_classes = tf.shape(scale_class_probs)[-1]

            # flatten grid and anchor axes: (batch, grid * grid * anchor, ...)
            per_scale_boxes.append(
                tf.reshape(scale_boxes, (batch_size, -1, 4)))
            per_scale_objectness.append(
                tf.reshape(scale_objectness, (batch_size, -1, 1)))
            per_scale_class_probs.append(
                tf.reshape(scale_class_probs, (batch_size, -1, num_classes)))

        # all scales side by side; boxes go from xywh to (x1, y1, x2, y2)
        boxes = xywh_to_x1x2y1y2(tf.concat(per_scale_boxes, axis=1))
        objectness = tf.concat(per_scale_objectness, axis=1)
        class_probs = tf.concat(per_scale_class_probs, axis=1)

        # the detection score is the objectness alone
        scores = tf.reshape(
            objectness,
            (tf.shape(objectness)[0], -1, tf.shape(objectness)[-1]))

        return self.batch_non_maximum_suppression(
            boxes, scores, class_probs, self.iou_thresh, self.score_thresh,
            self.max_detection)

    @staticmethod
    def batch_non_maximum_suppression(boxes, scores, classes, iou_threshold,
                                      score_threshold, max_detection):
        """
        Class-agnostic greedy NMS applied to every image of the batch.

        Unlike tf.image.combined_non_max_suppression, the full class
        probability vector of each surviving box is kept (multi-label
        classification on the detection).

        inputs:
        boxes: (batch, num_boxes, 4) in x1y1x2y2
        scores: (batch, num_boxes, 1)
        classes: (batch, num_boxes, num_classes)
        iou_threshold: candidates overlapping a picked box by more than this
            are discarded
        score_threshold: candidates scoring below this are discarded up front
        max_detection: number of output slots per image

        outputs:
        nms_boxes: (batch, max_detection, 4)
        nms_scores: (batch, max_detection, 1)
        nms_classes: (batch, max_detection, num_classes)
        valid_counts: (batch, 1) int32, number of filled slots per image;
            unused slots are all zeros
        """

        def single_batch_nms(candidates):
            # each row is [x1, y1, x2, y2, score, class probabilities...]
            # drop predictions scoring below score_threshold
            confident = candidates[..., 4] >= score_threshold
            candidates = tf.boolean_mask(candidates, confident)

            # one extra row at index `max_detection` carries the detection count
            outputs = tf.zeros((max_detection + 1, tf.shape(candidates)[-1]))
            slot_indices = []
            picked_rows = []

            count = 0
            # greedy loop: runs until candidates are exhausted or every slot is used
            while tf.shape(candidates)[0] > 0 and count < max_detection:
                # highest scoring candidate wins this round
                best_idx = tf.math.argmax(candidates[..., 4], axis=0)
                best_box = candidates[best_idx]
                slot_indices.append([count])
                picked_rows.append(best_box)
                count += 1
                # everything except the winner
                remaining = tf.concat([
                    candidates[0:best_idx],
                    candidates[best_idx + 1:tf.shape(candidates)[0]],
                ], axis=0)
                # overlap between the winner and every remaining candidate
                iou = broadcast_iou(best_box[0:4], remaining[..., 0:4])
                # keep only candidates that do not overlap the winner too much
                candidates = tf.boolean_mask(remaining,
                                             iou[0] <= iou_threshold)

            if count > 0:
                # the count is written across the whole extra row
                count_row = [tf.fill([tf.shape(candidates)[-1]], count)]
                scatter_indices = tf.concat(
                    [slot_indices, [[max_detection]]], axis=0)
                scatter_updates = tf.concat([picked_rows, count_row], axis=0)
                outputs = tf.tensor_scatter_nd_update(
                    outputs, scatter_indices, scatter_updates)
            return outputs

        combined_boxes = tf.concat([boxes, scores, classes], axis=2)
        result = tf.map_fn(single_batch_nms, combined_boxes)

        # row `max_detection` stores the count, rows before it the detections
        valid_counts = tf.expand_dims(result[:, max_detection, 0], axis=-1)
        final_result = result[:, 0:max_detection]

        nms_boxes, nms_scores, nms_classes = tf.split(
            final_result, [4, 1, -1], axis=-1)
        return nms_boxes, nms_scores, nms_classes, tf.cast(
            valid_counts, tf.int32)


In [12]:
#################### YOLOv3



import numpy as np
import tensorflow as tf
#import utils

from tensorflow.keras.layers import (
    Add,
    Concatenate,
    Conv2D,
    Input,
    Lambda,
    LeakyReLU,
    MaxPool2D,
    UpSampling2D,
    ZeroPadding2D,
    BatchNormalization,
)
# from utils import xywh_to_x1x2y1y2, xywh_to_y1x1y2x2, broadcast_iou, binary_cross_entropy

# Anchor box priors as (width, height), divided by 416 so they are expressed
# as a fraction of a 416-pixel image side. Rows 0-2, 3-5 and 6-8 are the
# anchor triplets used by the small, medium and large detection heads.
anchors_wh = np.array(
    [
        [10, 13], [16, 30], [33, 23],
        [30, 61], [62, 45], [59, 119],
        [116, 90], [156, 198], [373, 326],
    ],
    np.float32,
) / 416


def DarknetConv(inputs, filters, kernel_size, strides, name):
    """
    Basic Darknet unit: Conv2D (no bias, 'same' padding) -> BatchNormalization
    -> LeakyReLU with slope 0.1.

    inputs:
    inputs: tensor fed to the convolution
    filters, kernel_size, strides: forwarded to Conv2D
    name: prefix for the three layer names ('_conv2d', '_bn', '_leakyrelu')

    outputs:
    the activated feature map
    """
    convolution = Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='same',
        use_bias=False,
        name=f'{name}_conv2d',
    )
    # YoloV2:
    # "By adding batch normalization on all of the convolutional layers in
    #  YOLO we get more than 2% improvement in mAP."
    batch_norm = BatchNormalization(name=f'{name}_bn')
    # YoloV1:
    # "We use a linear activation function for the final layer and all other
    #  layers use the following leaky rectified linear activation"
    activation = LeakyReLU(alpha=0.1, name=f'{name}_leakyrelu')

    return activation(batch_norm(convolution(inputs)))


def DarknetResidual(inputs, filters1, filters2, name):
    """
    Darknet residual block: a 1x1 DarknetConv followed by a 3x3 DarknetConv,
    whose result is added back onto the block input.

    inputs:
    inputs: tensor entering the block (also the skip connection)
    filters1: filters of the 1x1 convolution
    filters2: filters of the 3x3 convolution
    name: prefix for the layer names ('_1x1', '_3x3', '_add')

    outputs:
    the sum of `inputs` and the convolved branch
    """
    squeezed = DarknetConv(
        inputs, filters=filters1, kernel_size=1, strides=1,
        name=f'{name}_1x1')
    expanded = DarknetConv(
        squeezed, filters=filters2, kernel_size=3, strides=1,
        name=f'{name}_3x3')
    return Add(name=f'{name}_add')([inputs, expanded])


def Darknet(shape=(256, 256, 3)):
    """
    Build the Darknet-53 backbone (YoloV3 paper, Table 1).

    inputs:
    shape: input shape passed to `Input`, without the batch axis

    outputs:
    a tf.keras.Model named 'darknet_53' whose outputs are the feature maps
    taken after the 3rd, 4th and 5th stride-2 stages, in that order
    """
    inputs = Input(shape=shape)
    x = DarknetConv(inputs, 32, kernel_size=3, strides=1, name='conv2d_0')

    # one entry per stride-2 stage:
    # (conv filters, number of residual blocks, exported as an output?)
    stages = (
        (64, 1, False),
        (128, 2, False),
        (256, 8, True),
        (512, 8, True),
        (1024, 4, True),
    )

    feature_maps = []
    for stage_idx, (filters, num_blocks, is_output) in enumerate(stages):
        # stride-2 convolution halves the spatial resolution
        x = DarknetConv(
            x, filters, kernel_size=3, strides=2,
            name='conv2d_' + str(stage_idx + 1))
        # residual blocks squeeze to half the filters and expand back
        for block_idx in range(num_blocks):
            x = DarknetResidual(
                x, filters // 2, filters,
                'residual_' + str(stage_idx) + '_' + str(block_idx))
        if is_output:
            feature_maps.append(x)

    return tf.keras.Model(inputs, tuple(feature_maps), name='darknet_53')


def YoloV3(shape=(416, 416, 3), num_classes=2, training=False):
    """
    Build the YOLOv3 detector: Darknet-53 backbone plus three detection heads.

    inputs:
    shape: input image shape without the batch axis
    num_classes: number of object classes predicted per anchor
    training: when True the model outputs the raw head tensors; when False
        each head is decoded with `get_absolute_yolo_box`

    outputs:
    a tf.keras.Model. With training=True its outputs are
    (y_small, y_medium, y_large), each reshaped to
    (batch, grid, grid, 3, 4 + 1 + num_classes). With training=False its
    outputs are (box_small, box_medium, box_large), each being the
    (box, objectness, classes) result of `get_absolute_yolo_box`.
    """
    # YoloV3:
    # "In our experiments with COCO [10] we predict 3 boxes at each scale so
    #  the tensor is N x N x [3 * (4 + 1 + 80)] for the 4 bounding box offsets,
    #  1 objectness prediction, and 80 class predictions."
    # 3 * (4 + 1 + num_classes), e.g. 21 when num_classes == 2
    final_filters = 3 * (4 + 1 + num_classes)

    inputs = Input(shape=shape)

    backbone = Darknet(shape)
    x_small, x_medium, x_large = backbone(inputs)

    # large scale detection
    # https://github.com/pjreddie/darknet/blob/61c9d02ec461e30d55762ec7669d6a1d3c356fb2/cfg/yolov3.cfg#L549-L788
    x = DarknetConv(
        x_large,
        512,
        kernel_size=1,
        strides=1,
        name='detector_scale_large_1x1_1')
    x = DarknetConv(
        x, 1024, kernel_size=3, strides=1, name='detector_scale_large_3x3_1')
    x = DarknetConv(
        x, 512, kernel_size=1, strides=1, name='detector_scale_large_1x1_2')
    x = DarknetConv(
        x, 1024, kernel_size=3, strides=1, name='detector_scale_large_3x3_2')
    x = DarknetConv(
        x, 512, kernel_size=1, strides=1, name='detector_scale_large_1x1_3')

    y_large = DarknetConv(
        x, 1024, kernel_size=3, strides=1, name='detector_scale_large_3x3_3')
    y_large = Conv2D(
        filters=final_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        name='detector_scale_large_final_conv2d',
    )(y_large)

    # medium scale detection: shrink the channels, upsample 2x and merge with
    # the medium backbone feature map
    x = DarknetConv(
        x, 256, kernel_size=1, strides=1, name='detector_scale_medium_1x1_0')
    x = UpSampling2D(size=(2, 2), name='detector_scale_1_upsampling')(x)
    x = Concatenate(name='detector_scale_1_concat')([x, x_medium])

    x = DarknetConv(
        x, 256, kernel_size=1, strides=1, name='detector_scale_medium_1x1_1')
    x = DarknetConv(
        x, 512, kernel_size=3, strides=1, name='detector_scale_medium_3x3_1')
    x = DarknetConv(
        x, 256, kernel_size=1, strides=1, name='detector_scale_medium_1x1_2')
    x = DarknetConv(
        x, 512, kernel_size=3, strides=1, name='detector_scale_medium_3x3_2')
    x = DarknetConv(
        x, 256, kernel_size=1, strides=1, name='detector_scale_medium_1x1_3')

    y_medium = DarknetConv(
        x, 512, kernel_size=3, strides=1, name='detector_scale_medium_3x3_3')
    y_medium = Conv2D(
        filters=final_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        name='detector_scale_medium_final_conv2d',
    )(y_medium)

    # small scale detection: same pattern, merged with the small backbone
    # feature map
    x = DarknetConv(
        x, 128, kernel_size=1, strides=1, name='detector_scale_small_1x1_0')
    x = UpSampling2D(size=(2, 2), name='detector_scale_small_upsampling')(x)
    x = Concatenate(name='detector_scale_small_concat')([x, x_small])

    x = DarknetConv(
        x, 128, kernel_size=1, strides=1, name='detector_scale_small_1x1_1')
    x = DarknetConv(
        x, 256, kernel_size=3, strides=1, name='detector_scale_small_3x3_1')
    x = DarknetConv(
        x, 128, kernel_size=1, strides=1, name='detector_scale_small_1x1_2')
    x = DarknetConv(
        x, 256, kernel_size=3, strides=1, name='detector_scale_small_3x3_2')
    x = DarknetConv(
        x, 128, kernel_size=1, strides=1, name='detector_scale_small_1x1_3')

    y_small = DarknetConv(
        x, 256, kernel_size=3, strides=1, name='detector_scale_small_3x3_3')
    y_small = Conv2D(
        filters=final_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        name='detector_scale_small_final_conv2d',
    )(y_small)

    # reshape (N, grid, grid, 3 * (5 + num_classes)) into
    # (N, grid, grid, 3, 5 + num_classes) to separate the predictions of each
    # anchor
    y_small_shape = tf.shape(y_small)
    y_medium_shape = tf.shape(y_medium)
    y_large_shape = tf.shape(y_large)

    y_small = tf.reshape(
        y_small, (y_small_shape[0], y_small_shape[1], y_small_shape[2], 3, -1),
        name='detector_reshape_small')
    # NOTE: the op name below keeps its original spelling so existing graph
    # names do not change
    y_medium = tf.reshape(
        y_medium,
        (y_medium_shape[0], y_medium_shape[1], y_medium_shape[2], 3, -1),
        name='detector_reshape_meidum')
    y_large = tf.reshape(
        y_large, (y_large_shape[0], y_large_shape[1], y_large_shape[2], 3, -1),
        name='detector_reshape_large')

    if training:
        return tf.keras.Model(inputs, (y_small, y_medium, y_large))

    # inference: decode every head with the anchor triplet that belongs to it
    box_small = Lambda(
        lambda x: get_absolute_yolo_box(x, anchors_wh[0:3], num_classes),
        name='detector_final_box_small')(y_small)
    box_medium = Lambda(
        lambda x: get_absolute_yolo_box(x, anchors_wh[3:6], num_classes),
        name='detector_final_box_medium')(y_medium)
    box_large = Lambda(
        lambda x: get_absolute_yolo_box(x, anchors_wh[6:9], num_classes),
        name='detector_final_box_large')(y_large)

    outputs = (box_small, box_medium, box_large)
    return tf.keras.Model(inputs, outputs)


def get_absolute_yolo_box(y_pred, valid_anchors_wh, num_classes):
    """
    Decode raw head output (tx, ty, tw, th, objectness, classes) into boxes
    expressed relative to the whole image.

    inputs:
    y_pred: raw prediction for one scale; axis 1 is used as the grid size and
        the last axis must hold 4 + 1 + num_classes values
        (assumes a square grid, i.e. axis 2 has the same length - TODO confirm)
    valid_anchors_wh: anchor (width, height) pairs of this scale, broadcast
        against the tw/th slice
    num_classes: number of class scores on the last axis

    outputs:
    y_box: (bx, by, bw, bh) with the centre divided by the grid size
    objectness: sigmoid of the objectness logit
    classes: sigmoid of the class logits
    """
    t_xy, t_wh, objectness, classes = tf.split(
        y_pred, (2, 2, 1, num_classes), axis=-1)

    objectness = tf.sigmoid(objectness)
    classes = tf.sigmoid(classes)

    grid_size = tf.shape(y_pred)[1]
    cell_range = tf.range(grid_size)

    # Cx and Cy from the YoloV3 paper. For a grid of size 3, meshgrid yields
    #
    # grid_x = [[0, 1, 2],      grid_y = [[0, 0, 0],
    #           [0, 1, 2],                [1, 1, 1],
    #           [0, 1, 2]]                [2, 2, 2]]
    #
    # (real grids are e.g. 13, 26 or 52 cells wide)
    grid_x, grid_y = tf.meshgrid(cell_range, cell_range)

    # interleave the two grids on a new last axis, then insert an axis of
    # size 1 so the offsets broadcast across the anchor axis:
    #
    # [[[[0, 0]], [[1, 0]], [[2, 0]]],
    #  [[[0, 1]], [[1, 1]], [[2, 1]]],
    #  [[[0, 2]], [[1, 2]], [[2, 2]]]]
    #
    # so C_xy[row][col] == [[col, row]], i.e. the (x, y) offset of that cell
    C_xy = tf.stack([grid_x, grid_y], axis=-1)
    C_xy = tf.expand_dims(C_xy, axis=2)  # [gy, gx, 1, 2]

    # YoloV2, YoloV3:
    # bx = sigmoid(tx) + Cx
    # by = sigmoid(ty) + Cy
    # then divide by the grid size so the centre is expressed as a fraction
    # of the image: (batch_size, grid_size, grid_size, num_anchor, 2)
    cell_relative_xy = tf.sigmoid(t_xy) + tf.cast(C_xy, tf.float32)
    b_xy = cell_relative_xy / tf.cast(grid_size, tf.float32)

    # YoloV2:
    # "If the cell is offset from the top left corner of the image by (cx , cy)
    # and the bounding box prior has width and height pw , ph , then the predictions correspond to: "
    #
    # https://github.com/pjreddie/darknet/issues/568#issuecomment-469600294
    # "It's OK for the predicted box to be wider and/or taller than the original image, but
    # it does not make sense for the box to have a negative width or height. That's why
    # we take the exponent of the predicted number."
    b_wh = tf.exp(t_wh) * valid_anchors_wh

    return tf.concat([b_xy, b_wh], axis=-1), objectness, classes


def get_relative_yolo_box(y_true, valid_anchors_wh):
    """
    Inverse of `get_absolute_yolo_box`: turn (bx, by, bw, bh) into the
    cell-relative (tx, ty, tw, th) encoding.

    inputs:
    y_true: tensor whose last axis starts with (bx, by, bw, bh); axis 1 is
        used as the grid size
    valid_anchors_wh: anchor (width, height) pairs of this scale

    outputs:
    y_box: (tx, ty, tw, th) concatenated on the last axis
    """
    grid_size = tf.shape(y_true)[1]
    cell_range = tf.range(grid_size)
    # per-cell (x, y) offsets with an axis of size 1 for the anchors
    C_xy = tf.stack(tf.meshgrid(cell_range, cell_range), axis=-1)
    C_xy = tf.expand_dims(C_xy, axis=2)

    b_xy, b_wh = y_true[..., 0:2], y_true[..., 2:4]

    # tx, ty: position measured in grid cells, minus the cell offset
    t_xy = b_xy * tf.cast(grid_size, tf.float32) - tf.cast(C_xy, tf.float32)

    # tw, th: log of the size relative to the anchor
    raw_t_wh = tf.math.log(b_wh / valid_anchors_wh)
    # cells whose b_wh is 0 produce inf/nan through the log; zero those out
    is_invalid = tf.logical_or(
        tf.math.is_inf(raw_t_wh), tf.math.is_nan(raw_t_wh))
    t_wh = tf.where(is_invalid, tf.zeros_like(raw_t_wh), raw_t_wh)

    return tf.concat([t_xy, t_wh], axis=-1)


class YoloLoss(object):
    """
    YOLOv3 loss for one detection scale.

    The total loss is the sum of a centre (xy) term, a size (wh) term, a class
    term and an objectness term; every term is reduced to one value per image.
    """

    def __init__(self, num_classes, valid_anchors_wh):
        self.num_classes = num_classes
        # predictions whose best IoU with a ground-truth box reaches this
        # value are not penalised as "no object"
        self.ignore_thresh = 0.5
        self.valid_anchors_wh = valid_anchors_wh
        # weight of the xy and wh terms
        self.lambda_coord = 5.0
        # weight of the no-object part of the objectness term
        # (attribute spelling kept as-is for backward compatibility)
        self.lamda_noobj = 0.5

    def __call__(self, y_true, y_pred):
        """
        calculate the loss of model prediction for one scale

        inputs:
        y_true: ground truth, last axis is (bx, by, bw, bh, obj, one-hot classes)
        y_pred: raw model output, last axis is (tx, ty, tw, th, obj, classes)

        outputs:
        (total_loss, (xy_loss, wh_loss, class_loss, obj_loss)), each of shape
        (batch,)
        """
        # xy and wh come in two flavours, marked by their suffix:
        # _rel (relative): coordinates relative to the grid cell, i.e. the
        #   (tx, ty, tw, th) format from the paper; used to calculate the loss
        # _abs (absolute): coordinates within the whole image, i.e. the
        #   (bx, by, bw, bh) format from the paper; used to calculate iou and
        #   the ignore mask

        # pred_xy_rel: (batch, grid, grid, anchor, 2)
        # pred_wh_rel: (batch, grid, grid, anchor, 2)
        # the sigmoid matches the bx = sigmoid(tx) + Cx decoding, so the xy
        # loss compares in-cell offsets
        pred_xy_rel = tf.sigmoid(y_pred[..., 0:2])
        pred_wh_rel = y_pred[..., 2:4]

        # this box is used to calculate iou, NOT loss, so the cell offsets
        # have to be turned into image coordinates.
        # both pred_obj and pred_class have been sigmoid'ed here
        # pred_box_abs: (batch, grid, grid, anchor, 4)
        # pred_obj: (batch, grid, grid, anchor, 1)
        # pred_class: (batch, grid, grid, anchor, num_classes)
        pred_box_abs, pred_obj, pred_class = get_absolute_yolo_box(
            y_pred, self.valid_anchors_wh, self.num_classes)
        pred_box_abs = xywh_to_x1x2y1y2(pred_box_abs)

        # split y_true into xy, wh, objectness and one-hot classes
        # true_xy_abs: (batch, grid, grid, anchor, 2)
        # true_wh_abs: (batch, grid, grid, anchor, 2)
        # true_obj: (batch, grid, grid, anchor, 1)
        # true_class: (batch, grid, grid, anchor, num_classes)
        true_xy_abs, true_wh_abs, true_obj, true_class = tf.split(
            y_true, (2, 2, 1, self.num_classes), axis=-1)
        true_box_abs = tf.concat([true_xy_abs, true_wh_abs], axis=-1)
        true_box_abs = xywh_to_x1x2y1y2(true_box_abs)

        # true_box_rel: (batch, grid, grid, anchor, 4)
        true_box_rel = get_relative_yolo_box(y_true, self.valid_anchors_wh)
        true_xy_rel = true_box_rel[..., 0:2]
        true_wh_rel = true_box_rel[..., 2:4]

        # some adjustment to improve small box detection, note the (2-truth.w*truth.h) below
        # https://github.com/pjreddie/darknet/blob/f6d861736038da22c9eb0739dca84003c5a5e275/src/yolo_layer.c#L190
        weight = 2 - true_wh_abs[..., 0] * true_wh_abs[..., 1]

        # YoloV2:
        # "This ground truth value can be easily computed by inverting the equations above."
        #
        # to calculate loss and differentiation, the ground truth is compared
        # in cell-offset form like demonstrated here:
        # https://github.com/pjreddie/darknet/blob/f6d861736038da22c9eb0739dca84003c5a5e275/src/yolo_layer.c#L93
        xy_loss = self.calc_xy_loss(true_obj, true_xy_rel, pred_xy_rel, weight)
        wh_loss = self.calc_wh_loss(true_obj, true_wh_rel, pred_wh_rel, weight)
        class_loss = self.calc_class_loss(true_obj, true_class, pred_class)

        # use the absolute yolo box to calculate iou and ignore mask
        ignore_mask = self.calc_ignore_mask(true_obj, true_box_abs,
                                            pred_box_abs)
        obj_loss = self.calc_obj_loss(true_obj, pred_obj, ignore_mask)

        # YoloV1: Function (3)
        return xy_loss + wh_loss + class_loss + obj_loss, (xy_loss, wh_loss,
                                                           class_loss,
                                                           obj_loss)

    def calc_ignore_mask(self, true_obj, true_box, pred_box):
        """
        build the mask that removes well-overlapping predictions from the
        no-object penalty

        inputs:
        true_obj: ground truth objectness in shape of (batch, grid, grid, anchor, 1)
        true_box: ground truth boxes (x1y1x2y2) in shape of (batch, grid, grid, anchor, 4)
        pred_box: predicted boxes (x1y1x2y2) in shape of (batch, grid, grid, anchor, 4)

        outputs:
        ignore_mask: (batch, grid, grid, anchor, 1) float tensor; 1 where the
            best IoU with any kept ground-truth box is below
            `self.ignore_thresh`, 0 otherwise
        """
        # YOLOv3:
        # "If the bounding box prior is not the best but does overlap a ground
        # truth object by more than some threshold we ignore the prediction,
        # following [17]. We use the threshold of .5."
        # calculate the iou for each pair of pred bbox and true bbox, then find the best among them

        # only use at most this many boxes per image to calculate IOU, otherwise
        # GPU memory consumption would explode for a matrix like
        # (16, 52*52*3, 52*52*3, 4)
        max_true_boxes = 100

        # (None, 13, 13, 3, 4)
        pred_box_shape = tf.shape(pred_box)
        batch_size = tf.shape(true_box)[0]

        # (None, 507, 4)
        true_box = tf.reshape(true_box, [batch_size, -1, 4])
        # (None, 507)
        true_obj_flat = tf.reshape(true_obj, [batch_size, -1])

        # rank the cells so the ones that hold an object come first, then move
        # whole box rows with that order. Sorting the box tensor itself would
        # sort each of the 4 coordinate columns independently and mix the
        # coordinates of different boxes.
        order = tf.argsort(true_obj_flat, axis=-1, direction="DESCENDING")
        order = order[:, 0:max_true_boxes]
        # (None, 100, 4)
        true_box = tf.gather(true_box, order, batch_dims=1)

        # (None, 507, 4)
        pred_box = tf.reshape(pred_box, [pred_box_shape[0], -1, 4])

        # https://github.com/dmlc/gluon-cv/blob/06bb7ec2044cdf3f433721be9362ab84b02c5a90/gluoncv/model_zoo/yolo/yolo_target.py#L198
        # (None, 507, 100)
        iou = broadcast_iou(pred_box, true_box)
        # (None, 507)
        best_iou = tf.reduce_max(iou, axis=-1)
        # (None, 13, 13, 3)
        best_iou = tf.reshape(best_iou, [
            pred_box_shape[0], pred_box_shape[1], pred_box_shape[2],
            pred_box_shape[3]
        ])
        # ignore_mask = 1 => don't ignore
        # ignore_mask = 0 => should ignore
        ignore_mask = tf.cast(best_iou < self.ignore_thresh, tf.float32)
        # (None, 13, 13, 3, 1)
        ignore_mask = tf.expand_dims(ignore_mask, axis=-1)
        return ignore_mask

    def calc_obj_loss(self, true_obj, pred_obj, ignore_mask):
        """
        calculate loss of objectness: binary cross entropy, with the
        no-object part masked by `ignore_mask` and scaled by `lamda_noobj`

        inputs:
        true_obj: objectness from ground truth in shape of (batch, grid, grid, anchor, 1)
        pred_obj: objectness from model prediction in shape of (batch, grid, grid, anchor, 1)
        ignore_mask: mask in shape of (batch, grid, grid, anchor, 1), 0 where
            the no-object penalty is skipped

        outputs:
        obj_loss: objectness loss in shape of (batch,)
        """
        obj_entropy = binary_cross_entropy(pred_obj, true_obj)

        obj_loss = true_obj * obj_entropy
        noobj_loss = (1 - true_obj) * obj_entropy * ignore_mask

        obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3, 4))
        noobj_loss = tf.reduce_sum(
            noobj_loss, axis=(1, 2, 3, 4)) * self.lamda_noobj

        return obj_loss + noobj_loss

    def calc_class_loss(self, true_obj, true_class, pred_class):
        """
        calculate loss of class prediction: binary cross entropy per class

        inputs:
        true_obj: if the object present from ground truth in shape of (batch, grid, grid, anchor, 1)
        true_class: one-hot class from ground truth in shape of (batch, grid, grid, anchor, num_classes)
        pred_class: class probabilities from model prediction in shape of (batch, grid, grid, anchor, num_classes)

        outputs:
        class_loss: class loss in shape of (batch,)
        """
        # Yolov1:
        # "Note that the loss function only penalizes classification error
        # if an object is present in that grid cell (hence the conditional
        # class probability discussed earlier)."
        class_loss = binary_cross_entropy(pred_class, true_class)
        class_loss = true_obj * class_loss
        class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3, 4))
        return class_loss

    def calc_xy_loss(self, true_obj, true_xy, pred_xy, weight):
        """
        calculate loss of the centroid coordinate: sum of squared differences

        inputs:
        true_obj: if the object present from ground truth in shape of (batch, grid, grid, anchor, 1)
        true_xy: centroid x and y from ground truth in shape of (batch, grid, grid, anchor, 2)
        pred_xy: centroid x and y from model prediction in shape of (batch, grid, grid, anchor, 2)
        weight: weight adjustment in shape of (batch, grid, grid, anchor), reward smaller bounding box

        outputs:
        xy_loss: centroid loss in shape of (batch,)
        """
        # shape (batch, grid, grid, anchor), eg. (32, 13, 13, 3)
        xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)

        # in order to element-wise multiply the result from tf.reduce_sum
        # we need to squeeze one dimension for objectness here
        true_obj = tf.squeeze(true_obj, axis=-1)

        # YoloV1:
        # "It also only penalizes bounding box coordinate error if that
        # predictor is "responsible" for the ground truth box (i.e. has the
        # highest IOU of any predictor in that grid cell)."
        xy_loss = true_obj * xy_loss * weight

        xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3)) * self.lambda_coord

        return xy_loss

    def calc_wh_loss(self, true_obj, true_wh, pred_wh, weight):
        """
        calculate loss of the width and height: sum of squared differences

        inputs:
        true_obj: if the object present from ground truth in shape of (batch, grid, grid, anchor, 1)
        true_wh: width and height from ground truth in shape of (batch, grid, grid, anchor, 2)
        pred_wh: width and height from model prediction in shape of (batch, grid, grid, anchor, 2)
        weight: weight adjustment in shape of (batch, grid, grid, anchor), reward smaller bounding box

        outputs:
        wh_loss: width and height loss in shape of (batch,)
        """
        # shape (batch, grid, grid, anchor), eg. (32, 13, 13, 3)
        wh_loss = tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)
        # drop the trailing axis so it multiplies against the reduced loss
        true_obj = tf.squeeze(true_obj, axis=-1)
        wh_loss = true_obj * wh_loss * weight
        wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3)) * self.lambda_coord
        return wh_loss


##  PREPROCESS

import tensorflow as tf
import numpy as np




class Preprocessor(object):
    """
    Callable that turns one serialized tf.train.Example into an (image, label)
    pair for the three-scale YOLO model.

    The image is decoded as RGB, augmented when `is_train` is true (random
    horizontal flip, random box-preserving crop), resized to `output_shape` and
    scaled to [-1, 1]. The label is a tuple of three grids (52x52, 26x26, 13x13),
    each in shape of (grid, grid, 3, 5 + num_classes).

    `find_best_anchor` reads the module-level `anchors_wh` table, which is defined
    elsewhere in this file.
    """

    def __init__(self, is_train, num_classes, output_shape=(416, 416)):
        """
        inputs:
        is_train: when true, random flip/crop augmentation is applied
        num_classes: number of classes used for the one-hot class vector
        output_shape: size passed to tf.image.resize
        """
        self.is_train = is_train
        self.num_classes = num_classes
        self.output_shape = output_shape

    def __call__(self, example):
        """
        inputs:
        example: one serialized tf.train.Example

        outputs:
        image: float32 image resized to output_shape, values in [-1, 1]
        label: tuple of three label grids, for grid sizes 52, 26 and 13
        """
        features = self.parse_tfexample(example)

        # force three channels: a grayscale JPEG would otherwise decode to a single
        # channel and could not be batched together with RGB images
        image = tf.io.decode_jpeg(features['image/encoded'], channels=3)
        image = tf.cast(image, tf.float32)

        classes, bboxes = self.parse_y_features(features)

        # augmentation must not touch validation / inference data
        if self.is_train:
            image, bboxes = self.random_flip_image_and_label(image, bboxes)
            image, bboxes = self.random_crop_image_and_label(image, bboxes)

        image = tf.image.resize(image, self.output_shape)
        image = tf.cast(image, tf.float32) / 127.5 - 1

        # small anchors go to the finest grid, big anchors to the coarsest one
        label = (
            self.preprocess_label_for_one_scale(classes, bboxes, 52,
                                                np.array([0, 1, 2])),
            self.preprocess_label_for_one_scale(classes, bboxes, 26,
                                                np.array([3, 4, 5])),
            self.preprocess_label_for_one_scale(classes, bboxes, 13,
                                                np.array([6, 7, 8])),
        )
        return image, label

    def random_flip_image_and_label(self, image, bboxes):
        """
        flip left and right for 50% of images

        inputs:
        image: image tensor
        bboxes: boxes as (xmin, ymin, xmax, ymax) in shape of (num_boxes, 4); the
                mirroring (1 - x) treats x as normalized to [0, 1]

        outputs:
        image, bboxes: the flipped (or untouched) image and boxes
        """
        # a scalar draw gives a scalar predicate in both eager and graph mode
        if tf.random.uniform([]) < 0.5:
            image = tf.image.flip_left_right(image)
            xmin, ymin, xmax, ymax = tf.split(bboxes, [1, 1, 1, 1], -1)
            # note that we need to switch here: the old right edge becomes the new left edge
            xmin, xmax = 1 - xmax, 1 - xmin
            bboxes = tf.concat([xmin, ymin, xmax, ymax], axis=-1)

        return image, bboxes

    def get_random_crop_delta(self, bboxes):
        """
        get a random crop which includes all bounding boxes. Since all bboxes here belong to one image,
        we can calculate the minimum of all xmin and ymin, and the maximum of all xmax and ymax to get
        an area that includes all boxes. The crop is randomly picked between this area boundary and
        the boundary of the whole image.

        inputs:
        bboxes: boxes as (xmin, ymin, xmax, ymax) in shape of (num_boxes, 4), must not be empty

        outputs:
        xmin_delta, ymin_delta, xmax_delta, ymax_delta: normalized margins to cut from the
        left, top, right and bottom side, each in shape of (1)
        """
        min_xmin = tf.math.reduce_min(bboxes[..., 0])
        min_ymin = tf.math.reduce_min(bboxes[..., 1])
        max_xmax = tf.math.reduce_max(bboxes[..., 2])
        max_ymax = tf.math.reduce_max(bboxes[..., 3])

        # free room on every side; clamped at 0 so that a box reaching outside the
        # image can never produce a negative margin
        left_room = tf.maximum(min_xmin, 0.0)
        top_room = tf.maximum(min_ymin, 0.0)
        right_room = tf.maximum(1 - max_xmax, 0.0)
        bottom_room = tf.maximum(1 - max_ymax, 0.0)

        # delta is the normalized margin from bboxes boundary the crop boundary
        # ____________________________________
        # |         ________________         |
        # |image    |crop ______   |         |
        # |<-DELTA->|     |bbox|   |<-DELTA->|
        # |         |     |____|   |         |
        # |         |______________|         |
        # |__________________________________|
        xmin_delta = tf.random.uniform([1], 0, left_room)
        ymin_delta = tf.random.uniform([1], 0, top_room)
        xmax_delta = tf.random.uniform([1], 0, right_room)
        ymax_delta = tf.random.uniform([1], 0, bottom_room)

        return xmin_delta, ymin_delta, xmax_delta, ymax_delta

    def random_crop_image_and_label(self, image, bboxes):
        """
        crop images randomly at 50% chance but preserve all bounding boxes. the crop is guaranteed to include
        all bounding boxes. Images without any box are never cropped, because the crop area
        is derived from the boxes.

        inputs:
        image: image tensor in shape of (height, width, channels)
        bboxes: boxes as (xmin, ymin, xmax, ymax) in shape of (num_boxes, 4)

        outputs:
        image, bboxes: the cropped (or untouched) image and the boxes re-normalized to it
        """
        has_boxes = tf.shape(bboxes)[0] > 0
        if tf.logical_and(tf.random.uniform([]) < 0.5, has_boxes):
            xmin_delta, ymin_delta, xmax_delta, ymax_delta = self.get_random_crop_delta(
                bboxes)

            # fraction of the original width / height that survives the crop
            kept_width = 1 - xmin_delta - xmax_delta
            kept_height = 1 - ymin_delta - ymax_delta

            xmin, ymin, xmax, ymax = tf.split(bboxes, [1, 1, 1, 1], -1)
            # before crop: |_0.1_|_0.1_|____________0.5___________|_0.1_|___0.2___|
            # after crop:  |_0.1_|____________0.5___________|_0.1_|
            # imagine old xmin is 0.2 (0.1+0.1), old xmax is 0.8 (0.1+0.1+0.5+0.1)
            # if we cut both left 0.1 (xmin_delta) and right 0.2 (xmax_delta)
            # the new xmin will be (0.2 - 0.1) / (1 - 0.1 - 0.2) = 1/7
            # the new xmax will be (0.8 - 0.1) / (1 - 0.1 - 0.2) = 6/7
            # same thing for y
            xmin = (xmin - xmin_delta) / kept_width
            ymin = (ymin - ymin_delta) / kept_height
            xmax = (xmax - xmin_delta) / kept_width
            ymax = (ymax - ymin_delta) / kept_height
            bboxes = tf.concat([xmin, ymin, xmax, ymax], axis=-1)

            h = tf.cast(tf.shape(image)[0], dtype=tf.float32)
            w = tf.cast(tf.shape(image)[1], dtype=tf.float32)

            # convert the normalized margins into a pixel window
            offset_height = tf.cast(ymin_delta[0] * h, dtype=tf.int32)
            offset_width = tf.cast(xmin_delta[0] * w, dtype=tf.int32)
            target_height = tf.cast(
                tf.math.ceil(kept_height[0] * h), dtype=tf.int32)
            target_width = tf.cast(
                tf.math.ceil(kept_width[0] * w), dtype=tf.int32)

            image = image[offset_height:offset_height +
                          target_height, offset_width:offset_width +
                          target_width, :]
        return image, bboxes

    def parse_y_features(self, features):
        """
        extract the annotations from a parsed example

        inputs:
        features: dictionary returned by parse_tfexample

        outputs:
        classes: one-hot classes in shape of (num_boxes, num_classes)
        bboxes: boxes as (xmin, ymin, xmax, ymax) in shape of (num_boxes, 4)
        """
        classes = tf.sparse.to_dense(features['image/object/class/label'])
        classes = tf.one_hot(classes, self.num_classes)

        # bboxes shape (None, 4)
        bboxes = tf.stack([
            tf.sparse.to_dense(features['image/object/bbox/xmin']),
            tf.sparse.to_dense(features['image/object/bbox/ymin']),
            tf.sparse.to_dense(features['image/object/bbox/xmax']),
            tf.sparse.to_dense(features['image/object/bbox/ymax']),
        ],
                          axis=1)
        return classes, bboxes

    def preprocess_label_for_one_scale(self,
                                       classes,
                                       bboxes,
                                       grid_size=13,
                                       valid_anchors=None):
        """
        preprocess the class and bounding boxes annotations into model desired format for one scale
        (grid, grid, anchor, (centroid x, centroid y, width, height, objectness, ...one-hot classes...))

        inputs:
        classes: one-hot classes in shape of (num_boxes, num_classes)
        bboxes: boxes as (xmin, ymin, xmax, ymax) in shape of (num_boxes, 4)
        grid_size: a scalar grid size to use
        valid_anchors: indices of the anchors that belong to this scale, eg. [6, 7, 8]

        outputs:
        y: the desired label format to calculate loss
        """
        # construct an empty placeholder for the final output y first
        y = tf.zeros((grid_size, grid_size, 3, 5 + self.num_classes))

        # find the best anchor indices for each ground truth box
        anchor_indices = self.find_best_anchor(bboxes)

        # necessary assertion, otherwise the steps later would fail. The dynamic
        # shapes are compared because the static ones are unknown (None) inside
        # dataset.map, which would make the check always pass
        tf.debugging.assert_equal(
            tf.shape(classes)[0],
            tf.shape(bboxes)[0],
            message='classes and bboxes must describe the same number of boxes')

        # this has to be tf.shape instead of classes.shape, otherwise would be None
        num_boxes = tf.shape(classes)[0]

        indices = tf.TensorArray(tf.int32, 1, dynamic_size=True)
        updates = tf.TensorArray(tf.float32, 1, dynamic_size=True)

        valid_count = 0
        for i in tf.range(num_boxes):
            curr_class = tf.cast(classes[i], tf.float32)
            curr_box = bboxes[i]
            curr_anchor = anchor_indices[i]

            # only use the anchor when it belongs to current scale (grid_size)
            # for example, when grid size is 13, only anchor 6, 7, 8 (big anchors) are valid
            # because the reception field of this grid size is the biggest
            # however, if grid size is 52, the finest grained grid, we can only use anchor
            # 0, 1, 2 (small anchors)
            anchor_found = tf.reduce_any(curr_anchor == valid_anchors)
            if anchor_found:
                # now that we found the anchor, we need to set it in our final output y
                # we only have three anchor boxes in y, so we need to mod by 3 first to get
                # adjusted index. eg. anchor 7 will have index 1
                adjusted_anchor_index = tf.math.floormod(curr_anchor, 3)

                # we need to turn (xmin, ymin, xmax, ymax) box format into
                # (centroid x, centroid y, width, height) to be able to
                # calculate yolo loss later
                curr_box_xy = (curr_box[..., 0:2] + curr_box[..., 2:4]) / 2
                curr_box_wh = curr_box[..., 2:4] - curr_box[..., 0:2]

                # calculate which grid cell should we use
                # eg. when curr_box_xy = [0.25, 0.25], and grid size = 26, which is a quarter of the image
                # the index of grid cell is floor(0.25 * 26) = 6
                grid_cell_xy = tf.cast(
                    curr_box_xy // tf.cast((1 / grid_size), dtype=tf.float32),
                    tf.int32)
                # a centroid on the right/bottom image border would give index
                # grid_size, which is out of bounds for the scatter below
                grid_cell_xy = tf.clip_by_value(grid_cell_xy, 0, grid_size - 1)

                # for this box, we need to update y at location (grid_size, grid_size, adjusted_anchor_index)
                # eg. shape in (13, 13, 1)
                # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class)
                # note that it's not grid[x][y]
                index = tf.stack(
                    [grid_cell_xy[1], grid_cell_xy[0], adjusted_anchor_index])

                # this is the value we use to update the above location
                # eg. shape in (7)
                # note that we need to make this one-hot classes in order to use categorical crossentropy later
                update = tf.concat(
                    values=[
                        curr_box_xy, curr_box_wh,
                        tf.constant([1.0]), curr_class
                    ],
                    axis=0)
                # add to final indices and updates to be written into y
                indices = indices.write(valid_count, index)
                updates = updates.write(valid_count, update)
                valid_count = 1 + valid_count

        y = tf.tensor_scatter_nd_update(y, indices.stack(), updates.stack())
        return y

    def find_best_anchor(self, y_box):
        """
        find the best anchor for num_boxes ground truth boxes in y_box. Return a tensor in shape
        of (num_boxes) that indicates the indices of best anchor for each box. Boxes and anchors
        are compared by width and height only, as if they shared the same top-left corner.

        inputs:
        y_box: ground truth boxes in shape of (num_boxes, 4)

        outputs:
        anchor_idx: anchor indices in shape of (num_boxes)
        """
        # (num_boxes, 1, 2): the extra axis broadcasts every box against every anchor
        box_wh = tf.expand_dims(y_box[..., 2:4] - y_box[..., 0:2], -2)

        # eg. intersection -> (2, 9)
        intersection = tf.minimum(box_wh[..., 0],
                                  anchors_wh[..., 0]) * tf.minimum(
                                      box_wh[..., 1], anchors_wh[..., 1])

        # box_area is the width*height for each box
        # eg box_area -> (2, 1)
        box_area = box_wh[..., 0] * box_wh[..., 1]

        # anchor area is the width*height for each anchor
        # eg anchor_area -> (9)
        anchor_area = anchors_wh[..., 0] * anchors_wh[..., 1]

        # eg. iou -> (2, 9)
        iou = intersection / (box_area + anchor_area - intersection)

        # find the best anchor for each box, there should be num_boxes indices
        # in the result
        # eg. anchor_idx -> (2)
        anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.int32)
        return anchor_idx

    def parse_tfexample(self, example_proto):
        """
        parse one serialized tf.train.Example

        inputs:
        example_proto: the serialized example

        outputs:
        dictionary of parsed features; the per-object features are sparse tensors
        """
        image_feature_description = {
            'image/height': tf.io.FixedLenFeature([], tf.int64),
            'image/width': tf.io.FixedLenFeature([], tf.int64),
            'image/depth': tf.io.FixedLenFeature([], tf.int64),
            'image/object/class/label': tf.io.VarLenFeature(tf.int64),
            'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
            'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
            'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
            'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
            'image/encoded': tf.io.FixedLenFeature([], tf.string),
            'image/filename': tf.io.FixedLenFeature([], tf.string),
        }
        return tf.io.parse_single_example(example_proto,
                                          image_feature_description)



In [17]:
import argparse
import math
import datetime
import os
import time

import tensorflow as tf
import numpy as np



# Per-replica batch size; main() multiplies it by the number of replicas in sync.
BATCH_SIZE = 16
# Number of object classes, handed to the Preprocessor, YoloLoss and YoloV3.
TOTAL_CLASSES = 80
# Last epoch number the Trainer runs (its epoch range is inclusive).
TOTAL_EPOCHS = 300
# Size the Preprocessor resizes every image to.
OUTPUT_SHAPE = (416, 416)
# Directory main() searches for 'train*' and 'val*' TFRecord files.
TF_RECORDS = 'Datasets/yolo/tfrecords'

# Fix the global TensorFlow random seed.
tf.random.set_seed(1)


class Trainer(object):
    """
    Custom training loop for the three-scale YOLO model under a tf.distribute strategy.

    It owns one YoloLoss per output scale, an Adam optimizer and a hand-written
    reduce-on-plateau learning rate schedule. Weights are saved whenever the
    validation loss reaches a new minimum and once more when training finishes.
    """

    def __init__(self,
                 model,
                 initial_epoch,
                 epochs,
                 global_batch_size,
                 strategy,
                 initial_learning_rate=0.01):
        """
        inputs:
        model: the model to train; called as model(images, training=...) and expected to
               return one output per scale, in the same order as self.loss_objects
        initial_epoch: number of the first epoch to run
        epochs: number of the last epoch to run (inclusive)
        global_batch_size: batch size summed over all replicas, used to scale the losses
        strategy: the tf.distribute strategy the steps are run with
        initial_learning_rate: learning rate of the first epoch
        """
        self.model = model
        self.initial_epoch = initial_epoch
        self.epochs = epochs
        self.strategy = strategy
        self.global_batch_size = global_batch_size
        self.loss_objects = [
            YoloLoss(
                num_classes=TOTAL_CLASSES,
                valid_anchors_wh=anchors_wh[0:3]),  # small scale 52x52
            YoloLoss(
                num_classes=TOTAL_CLASSES,
                valid_anchors_wh=anchors_wh[3:6]),  # medium scale 26x26
            YoloLoss(
                num_classes=TOTAL_CLASSES,
                valid_anchors_wh=anchors_wh[6:9]),  # large scale 13x13
        ]
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=initial_learning_rate)

        # for learning rate schedule
        self.current_learning_rate = initial_learning_rate
        self.last_val_loss = math.inf
        self.lowest_val_loss = math.inf
        self.patience_count = 0
        self.max_patience = 10

    def lr_decay(self):
        """
        This effectively simulates a ReduceOnPlateau learning rate schedule. It is called once
        at the start of every epoch. The patience counter restarts whenever the previous epoch
        produced the lowest validation loss so far; once the counter exceeds [max_patience]
        the learning rate is divided by 10 and the counter restarts.
        """
        if self.last_val_loss == self.lowest_val_loss:
            # the previous epoch set a new best: this must win over the patience check,
            # otherwise the learning rate would be cut right after an improvement
            self.patience_count = 0
        elif self.patience_count > self.max_patience:
            self.current_learning_rate /= 10.0
            self.patience_count = 0
        self.patience_count += 1

        self.optimizer.learning_rate = self.current_learning_rate

    def train_step(self, inputs):
        """
        run one optimization step on one replica

        inputs:
        inputs: tuple of (images, labels), labels holding one tensor per scale

        outputs:
        total_loss: scalar loss of this replica, already divided by the global batch size
        breakdown: tuple of scalar (xy, wh, class, obj) losses, scaled the same way
        """
        images, labels = inputs
        # dividing by the global batch size makes the SUM over replicas a mean over the batch
        scale = 1. / self.global_batch_size

        with tf.GradientTape() as tape:
            outputs = self.model(images, training=True)
            total_losses = []
            xy_losses = []
            wh_losses = []
            class_losses = []
            obj_losses = []
            # iterate over all three scales
            for loss_object, y_pred, y_true in zip(self.loss_objects, outputs,
                                                   labels):
                total_loss, loss_breakdown = loss_object(y_true, y_pred)
                xy_loss, wh_loss, class_loss, obj_loss = loss_breakdown
                total_losses.append(total_loss * scale)
                xy_losses.append(xy_loss * scale)
                wh_losses.append(wh_loss * scale)
                class_losses.append(class_loss * scale)
                obj_losses.append(obj_loss * scale)

            total_loss = tf.reduce_sum(total_losses)
            total_xy_loss = tf.reduce_sum(xy_losses)
            total_wh_loss = tf.reduce_sum(wh_losses)
            total_class_loss = tf.reduce_sum(class_losses)
            total_obj_loss = tf.reduce_sum(obj_losses)

        grads = tape.gradient(
            target=total_loss, sources=self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return total_loss, (total_xy_loss, total_wh_loss, total_class_loss,
                            total_obj_loss)

    def val_step(self, inputs):
        """
        evaluate one batch on one replica without updating the model

        inputs:
        inputs: tuple of (images, labels), labels holding one tensor per scale

        outputs:
        total_loss: scalar loss of this replica, already divided by the global batch size
        """
        images, labels = inputs

        outputs = self.model(images, training=False)
        losses = []
        # iterate over all three scales
        for loss_object, y_pred, y_true in zip(self.loss_objects, outputs,
                                               labels):
            loss, _ = loss_object(y_true, y_pred)
            losses.append(loss * (1. / self.global_batch_size))
        total_loss = tf.reduce_sum(losses)

        return total_loss

    def get_current_time(self):
        """return the local time formatted as YYYYMMDD-HHMMSS"""
        return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    def run(self, train_dist_dataset, val_dist_dataset):
        """
        train from initial_epoch to epochs (inclusive), validating after every epoch

        inputs:
        train_dist_dataset: distributed dataset of training batches
        val_dist_dataset: distributed dataset of validation batches
        """
        total_steps = tf.constant(0, dtype=tf.int64)

        # Strategy.experimental_run_v2 was renamed to Strategy.run; prefer the new
        # name and fall back to the old one on TensorFlow versions that lack it
        run_on_replicas = getattr(self.strategy, 'run', None)
        if run_on_replicas is None:
            run_on_replicas = self.strategy.experimental_run_v2

        def sum_over_replicas(per_replica_value):
            # combine the per-replica scalars into one value
            return self.strategy.reduce(
                tf.distribute.ReduceOp.SUM, per_replica_value, axis=None)

        @tf.function
        def distributed_train_epoch(dataset, train_summary_writer,
                                    total_steps):
            total_loss = 0.0
            num_train_batches = tf.constant(0, dtype=tf.int64)
            for one_batch in dataset:
                per_replica_losses, per_replica_losses_breakdown = run_on_replicas(
                    self.train_step, args=(one_batch, ))
                per_replica_xy_losses, per_replica_wh_losses, per_replica_class_losses, per_replica_obj_losses = per_replica_losses_breakdown
                batch_loss = sum_over_replicas(per_replica_losses)
                batch_xy_loss = sum_over_replicas(per_replica_xy_losses)
                batch_wh_loss = sum_over_replicas(per_replica_wh_losses)
                batch_class_loss = sum_over_replicas(per_replica_class_losses)
                batch_obj_loss = sum_over_replicas(per_replica_obj_losses)
                total_loss += batch_loss
                num_train_batches += 1
                tf.print('Trained batch:', num_train_batches, 'batch loss:',
                         batch_loss, 'batch xy loss', batch_xy_loss,
                         'batch wh loss', batch_wh_loss, 'batch obj loss',
                         batch_obj_loss, 'batch_class_loss', batch_class_loss,
                         'epoch total loss:', total_loss)
                # summaries are indexed by the number of batches trained so far
                # across all epochs
                step = total_steps + num_train_batches
                with train_summary_writer.as_default():
                    tf.summary.scalar(
                        'batch train loss', batch_loss, step=step)
                    tf.summary.scalar(
                        'batch xy loss', batch_xy_loss, step=step)
                    tf.summary.scalar(
                        'batch wh loss', batch_wh_loss, step=step)
                    tf.summary.scalar(
                        'batch obj loss', batch_obj_loss, step=step)
                    tf.summary.scalar(
                        'batch class loss', batch_class_loss, step=step)
            return total_loss, num_train_batches

        @tf.function
        def distributed_val_epoch(dataset):
            total_loss = 0.0
            num_val_batches = tf.constant(0, dtype=tf.int64)
            for one_batch in dataset:
                per_replica_losses = run_on_replicas(
                    self.val_step, args=(one_batch, ))
                batch_loss = sum_over_replicas(per_replica_losses)
                total_loss += batch_loss
                num_val_batches += 1
            return total_loss, num_val_batches

        current_time = self.get_current_time()
        train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
        val_log_dir = 'logs/gradient_tape/' + current_time + '/val'
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        val_summary_writer = tf.summary.create_file_writer(val_log_dir)

        tf.print('{} Start training...'.format(current_time))
        for epoch in range(self.initial_epoch, self.epochs + 1):
            t0 = time.time()
            self.lr_decay()

            tf.print(
                '{} Started epoch {} with learning rate {}. Current LR patience count is {} epochs. Last lowest val loss is {}.'
                .format(self.get_current_time(), epoch,
                        self.current_learning_rate, self.patience_count,
                        self.lowest_val_loss))

            train_total_loss, num_train_batches = distributed_train_epoch(
                train_dist_dataset, train_summary_writer, total_steps)
            t1 = time.time()
            train_loss = train_total_loss / tf.cast(
                num_train_batches, dtype=tf.float32)
            tf.print(
                '{} Epoch {} train loss {}, total train batches {}, {} examples per second'
                .format(
                    self.get_current_time(), epoch, train_loss,
                    num_train_batches,
                    tf.cast(num_train_batches, dtype=tf.float32) *
                    self.global_batch_size / (t1 - t0)))
            with train_summary_writer.as_default():
                tf.summary.scalar('epoch train loss', train_loss, step=epoch)
            total_steps += num_train_batches

            val_total_loss, num_val_batches = distributed_val_epoch(
                val_dist_dataset)

            t2 = time.time()
            val_loss = val_total_loss / tf.cast(
                num_val_batches, dtype=tf.float32)
            tf.print(
                '{} Epoch {} val loss {}, total val batches {}, {} examples per second'
                .format(
                    self.get_current_time(), epoch, val_loss, num_val_batches,
                    tf.cast(num_val_batches, dtype=tf.float32) *
                    self.global_batch_size / (t2 - t1)))
            with val_summary_writer.as_default():
                tf.summary.scalar('epoch val loss', val_loss, step=epoch)

            # save model when reach a new lowest validation loss
            if val_loss < self.lowest_val_loss:
                self.save_model(epoch, val_loss)
                self.lowest_val_loss = val_loss
            self.last_val_loss = val_loss

        self.save_model(self.epochs, self.last_val_loss)
        print('{} Finished.'.format(self.get_current_time()))

    def save_model(self, epoch, loss):
        """
        save the model weights; the epoch and loss are encoded in the file name

        inputs:
        epoch: epoch number to put into the file name
        loss: loss value to put into the file name
        """
        # https://github.com/tensorflow/tensorflow/issues/33565
        model_name = 'Datasets/yolo/models/model-v1.0.1-epoch-{}-loss-{:.4f}.tf'.format(
            epoch, loss)
        self.model.save_weights(model_name)
        print("Model {} saved.".format(model_name))

In [22]:
def create_dataset(tfrecords, batch_size, is_train):
    """
    build the input pipeline: TFRecord files -> (shuffle) -> preprocess -> batch -> prefetch

    inputs:
    tfrecords: file pattern of the TFRecord files, eg. 'Datasets/yolo/tfrecords/train*'
    batch_size: number of examples per batch
    is_train: when true the examples are shuffled; the flag is also handed to the Preprocessor

    outputs:
    dataset: batched and prefetched dataset of (image, label) pairs
    """
    preprocess = Preprocessor(is_train, TOTAL_CLASSES, OUTPUT_SHAPE)

    dataset = tf.data.Dataset.list_files(tfrecords)
    dataset = tf.data.TFRecordDataset(dataset)

    if is_train:
        # shuffle the serialized records, before decoding: the buffer then holds
        # 512 compact byte strings instead of 512 float images plus label grids
        dataset = dataset.shuffle(512)

    dataset = dataset.map(
        preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset


def main(checkpoint=None):
    """
    build the datasets and the model under a MirroredStrategy and run the Trainer

    inputs:
    checkpoint: optional path of saved weights to resume from. The file name must follow
                the pattern written by Trainer.save_model ('...-epoch-<N>-loss-<L>.tf'),
                because the epoch to resume at is parsed out of it. When omitted, training
                starts from scratch at epoch 1.
    """
    strategy = tf.distribute.MirroredStrategy()
    global_batch_size = strategy.num_replicas_in_sync * BATCH_SIZE
    train_dataset = create_dataset(
        '{}/train*'.format(TF_RECORDS), global_batch_size, is_train=True)
    val_dataset = create_dataset(
        '{}/val*'.format(TF_RECORDS), global_batch_size, is_train=False)
    # Trainer.save_model writes into this directory
    os.makedirs('Datasets/yolo/models/', exist_ok=True)

    with strategy.scope():
        train_dist_dataset = strategy.experimental_distribute_dataset(
            train_dataset)
        val_dist_dataset = strategy.experimental_distribute_dataset(
            val_dataset)
        model = YoloV3(
            shape=(416, 416, 3), num_classes=TOTAL_CLASSES, training=True)
        model.summary()

        initial_epoch = 1
        if checkpoint:
            model.load_weights(checkpoint)
            # '...-epoch-56-loss-42.0143.tf' -> third field from the end is the epoch
            initial_epoch = int(checkpoint.split('-')[-3]) + 1
            print('Resuming from epoch {}'.format(initial_epoch))

        trainer = Trainer(
            model=model,
            initial_epoch=initial_epoch,
            epochs=TOTAL_EPOCHS,
            global_batch_size=global_batch_size,
            strategy=strategy,
        )
        trainer.run(train_dist_dataset, val_dist_dataset)


# Notebook cell entry point: the `if __name__ == '__main__':` guard below is left
# disabled, so training starts as soon as this cell is executed.
# if __name__ == '__main__':
main()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


InvalidArgumentError: Expected 'tf.Tensor(False, shape=(), dtype=bool)' to be true. Summarized data: b'No files matched pattern: Datasets/yolo/tfrecords/train*'

In [15]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import tensorflow as tf



TOTAL_CLASSES = 80
BATCH_SIZE = 1


# Fetch one preprocessed batch from a single validation shard.
preprocess = Preprocessor(False, TOTAL_CLASSES, (416, 416))
dataset = tf.data.TFRecordDataset(
    tf.data.Dataset.list_files('Datasets/yolo/tfrecords/val_0004_of_0008.tfrecords'))
dataset = dataset.map(preprocess, num_parallel_calls=1).batch(BATCH_SIZE)
x, y_true = next(iter(dataset))


# class index -> class name; the index is the line number in the names file
with open('Datasets/yolo/MSCOCO/mscoco_2017_names.txt') as fp:
    lines = fp.read().splitlines()
class_names = dict(enumerate(lines))


# Restore the trained weights into an inference-mode model.
model = YoloV3(shape=(416, 416, 3), num_classes=TOTAL_CLASSES, training=False)
model.load_weights('Datasets/yolo/models/model-v1.0.1-epoch-56-loss-42.0143.tf')


# Predict, then post-process the raw outputs with the thresholds given here.
postprocess = Postprocessor(iou_thresh=0.5, score_thresh=0.5)
y_pred = model(x, training=False)
y_pred_nms = postprocess(y_pred)

# as indexed below, y_pred_nms holds [0] boxes, [1] scores, [2] class
# probabilities and [3] the number of predictions
num_predictions = y_pred_nms[3][0][0].numpy()
print('Number of predictions: ', num_predictions)

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
# map the first image of the batch from [-1, 1] back to 0..255 for display
im = tf.cast((x[0] + 1) * 127.5, tf.int32)

plt.rcParams['figure.figsize'] = (20,20)
fig, ax = plt.subplots(1)
ax.imshow(im)
h, w, d = im.shape

for i in range(num_predictions):
    box = y_pred_nms[0][0][i].numpy()
    score = y_pred_nms[1][0][i][0].numpy()
    class_prob = y_pred_nms[2][0][i].numpy()

    # caption: the three most probable classes with their percentages
    values, indices = tf.math.top_k(class_prob, k=3)
    text = ', '.join(
        '{} {:.1f}%'.format(class_names[index], prob*100)
        for index, prob in zip(indices.numpy(), values.numpy()))

    # box is used as (xmin, ymin, xmax, ymax) relative to the image size
    xmin = box[0] * w
    ymin = box[1] * h
    width = (box[2] - box[0]) * w
    height = (box[3] - box[1]) * h

    # cycle through the palette so that neighbouring boxes differ in color
    color = colors[i % 8]
    rect = patches.Rectangle(
        (xmin,ymin), width, height,
        linewidth=2, edgecolor=color, facecolor='none')
    ax.add_patch(rect)
    ax.annotate(text, (xmin, ymin - 5), color=color, weight='bold',
                fontsize=18)

plt.show()


InvalidArgumentError: Expected 'tf.Tensor(False, shape=(), dtype=bool)' to be true. Summarized data: b'No files matched pattern: Datasets/yolo/tfrecords/val_0004_of_0008.tfrecords'