In [1]:
from datetime import datetime
import os
import cv2
import tensorflow as tf
import numpy as np
import random

2021-11-29 23:46:36.819273: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Configure TensorFlow's GPU usage before any op initialises the devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Expose only the first physical GPU (index 0) to TensorFlow
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


2021-11-29 23:46:38.449284: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcuda.so.1
2021-11-29 23:46:38.505113: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 0 with properties: 
pciBusID: 0000:1c:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-11-29 23:46:38.505164: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-11-29 23:46:38.512135: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcublas.so.11
2021-11-29 23:46:38.512240: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcublasLt.so.11
2021-11-29 23:46:38.514916: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcufft.

In [3]:
# Model options
# Downsampling factor of each of the three detection scales (small, medium, large boxes).
YOLO_STRIDES                = [8, 16, 32]
# NOTE(review): not referenced in the visible cells; presumably the IoU threshold used by the confidence loss -- confirm.
YOLO_IOU_LOSS_THRESH        = 0.5
# Number of anchor boxes predicted per grid cell at each scale.
YOLO_ANCHOR_PER_SCALE       = 3
# Capacity of the per-scale ground-truth box buffer built for each image.
YOLO_MAX_BBOX_PER_SCALE     = 100
# Anchor [width, height] pairs, one row of three anchors per scale; they are divided by the
# matching stride wherever they are used, so they are expressed in input-image pixels.
YOLO_ANCHORS                = [[[12,  16], [19,   36], [40,   28]],
                               [[36,  75], [76,   55], [72,  146]],
                               [[142,110], [192, 243], [459, 401]]]
# Side length of the square network input.
IMAGE_SIZE                  = 448
# Class index -> class name; the list length defines the number of classes.
CLASS_NAMES                 = ["aeroplane", "bicycle", "bird", "boat", "bottle", 
                               "bus", "car", "cat", "chair", "cow", "diningtable", 
                               "dog", "horse", "motorbike", "person", "pottedplant", 
                               "sheep", "sofa", "train","tvmonitor"]
BATCH_SIZE                  = 4
LEARNING_RATE               = 1e-4

# TRAIN options
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
TRAIN_IMAGE_DIR             = '/work/michael1017/data/comp2/VOCdevkit_train/AUG_VOC2007_NEW/Image/'
TRAIN_ANNOT_PATH            = "/work/michael1017/data/comp2/VOCdevkit_train/AUG_VOC2007_NEW/pascal_voc_aug_training_data.txt"
TRAIN_CHECKPOINTS_FOLDER    = "/work/michael1017/chiahan_work/ckpts_RES_cpypst/YOLOv4/"
TRAIN_INPUT_SIZE            = IMAGE_SIZE
TRAIN_EPOCHS                = 100

# TEST options
TEST_ANNOT_PATH             = "/work/michael1017/data/comp2/pascal_voc_testing_data.txt"
TEST_IMAGE_DIR              = "/work/michael1017/data/comp2/VOCdevkit_test/VOC2007/JPEGImages/"

# Dataset Loader

### image preprocess:<br>
因為model input shape要一樣，但是dataset並不是每張圖片的長寬比都固定，因此需要把圖片跟bbox等比例縮放後，再透過padding把圖片填滿

In [4]:
def image_preprocess(image, target_size, gt_boxes=None):
    """Letterbox an image to ``target_size`` and map its boxes into the new frame.

    The image is resized with its aspect ratio preserved, centred on a canvas
    filled with the value 128, and finally divided by 255.

    Args:
        image: array of shape (height, width, channels).
        target_size: ``(target_height, target_width)`` of the output canvas.
        gt_boxes: optional 2-D array whose first four columns are
            ``xmin, ymin, xmax, ymax`` in pixels of ``image``; any further
            columns are passed through untouched.

    Returns:
        The padded image alone when ``gt_boxes`` is None, otherwise the tuple
        ``(padded_image, boxes)`` where ``boxes`` is a rescaled *copy* of
        ``gt_boxes``. The caller's array is never modified.
    """
    ih, iw = target_size
    h, w, _ = image.shape

    # One scale factor for both axes keeps the aspect ratio intact.
    scale = min(iw/w, ih/h)
    nw, nh = int(scale * w), int(scale * h)
    image_resized = cv2.resize(image, (nw, nh))

    # Centre the resized image on a canvas filled with 128.
    image_paded = np.full(shape=[ih, iw, 3], fill_value=128.0)
    dw, dh = (iw - nw) // 2, (ih-nh) // 2
    image_paded[dh:nh+dh, dw:nw+dw, :] = image_resized
    image_paded = image_paded / 255.

    if gt_boxes is None:
        return image_paded

    # Work on a copy so the caller's boxes are not rescaled behind its back.
    # The dtype is kept, so integer boxes are still truncated on assignment.
    gt_boxes = np.array(gt_boxes)
    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + dw
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + dh
    return image_paded, gt_boxes


### Dataset Class<br>
將Dataset寫成可以iterate的class，將所有preprocessing相關的步驟寫在裡面

In [5]:
class Dataset(object):
    """Iterable that yields preprocessed YOLO training/testing batches.

    Every annotation line is parsed and its image is decoded into memory once,
    at construction time. Iterating produces ``(batch_image, targets)`` where
    ``targets`` holds, for the small/medium/large scale respectively, a
    ``(label_grid, boxes_xywh)`` pair. After ``len(self)`` batches the
    iterator raises ``StopIteration``, rewinds and reshuffles itself.
    """

    def __init__(self, dataset_type):
        """Load the annotation file selected by ``dataset_type``.

        Args:
            dataset_type: ``'train'`` selects the training annotation file and
                image directory and enables augmentation; any other value
                selects the test files without augmentation.
        """
        is_train = dataset_type == 'train'
        self.annot_path  = TRAIN_ANNOT_PATH if is_train else TEST_ANNOT_PATH
        self.input_sizes = IMAGE_SIZE
        self.batch_size  = BATCH_SIZE
        self.data_aug    = is_train
        self.image_dir = TRAIN_IMAGE_DIR if is_train else TEST_IMAGE_DIR
        self.train_input_sizes = TRAIN_INPUT_SIZE
        self.strides = np.array(YOLO_STRIDES)
        self.classes = CLASS_NAMES
        self.num_classes = len(self.classes)
        # Anchors are divided by the stride of their scale, i.e. expressed in grid cells.
        self.anchors = (np.array(YOLO_ANCHORS).T/self.strides).T
        self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE
        self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE

        self.annotations = self.load_annotations(dataset_type)
        self.num_samples = len(self.annotations)
        self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
        self.batch_count = 0

    @staticmethod
    def _split_annotation(line):
        """Split one annotation line into ``(image_name, box_tokens)``.

        Leading tokens that are not numeric (commas ignored) are joined with
        single spaces to form the image name, so names containing spaces are
        supported; everything from the first numeric token on is box data.
        """
        tokens = line.split()
        image_name, index = "", 1
        for i, token in enumerate(tokens):
            if not token.replace(",", "").isnumeric():
                if image_name != "": image_name += " "
                image_name += token
            else:
                index = i
                break
        return image_name, tokens[index:]

    def load_annotations(self, dataset_type):
        """Read the annotation file and decode every referenced image.

        Lines carrying nothing after their first token are ignored.

        Args:
            dataset_type: unused; kept for interface compatibility.

        Returns:
            A shuffled list of ``[image_name, box_tokens, image]`` entries,
            where ``box_tokens`` is the bbox + class part of the line and
            ``image`` is whatever ``cv2.imread`` returned.

        Raises:
            KeyError: if an image file referenced by a line does not exist.
        """
        final_annotations = []
        with open(self.annot_path, 'r') as f:
            txt = f.read().splitlines()
            annotations = [line.strip() for line in txt if len(line.strip().split()[1:]) != 0]
        np.random.shuffle(annotations)

        for annotation in annotations:
            image_name, box_tokens = self._split_annotation(annotation)
            image_path = self.image_dir + image_name
            if not os.path.exists(image_path):
                raise KeyError("%s does not exist ... " %image_path)
            image = cv2.imread(image_path)
            final_annotations.append([image_name, box_tokens, image])
        return final_annotations

    def __iter__(self):
        return self

    def Delete_bad_annotation(self, bad_annotation):
        """Rewrite the annotation file without the lines of one image.

        Args:
            bad_annotation: an entry as produced by ``load_annotations``; only
                its first element (the image name) is used.
        """
        print(f'Deleting {bad_annotation} annotation line')
        bad_image_name = bad_annotation[0].split('/')[-1]

        # Compare whole image names: a substring test would also drop e.g.
        # "11.jpg" while deleting "1.jpg".
        with open(self.annot_path, "r+") as f:
            lines = f.readlines()
            f.seek(0)
            for line in lines:
                line_image_name = self._split_annotation(line)[0].split('/')[-1]
                if line_image_name != bad_image_name:
                    f.write(line)
            f.truncate()

    def __next__(self):
        """Assemble and return the next batch.

        Returns:
            ``(batch_image, (small_target, medium_target, large_target))``
            where each target is a ``(label_grid, boxes_xywh)`` pair.

        Raises:
            StopIteration: once ``len(self)`` batches have been produced; the
                counter is reset and the annotations are reshuffled first.
            Exception: if any sample of the batch triggered an ``IndexError``
                while its labels were built. The offending lines are removed
                from the annotation file before raising.
        """
        with tf.device('/cpu:0'):
            self.train_input_size = random.choice([self.train_input_sizes])
            self.train_output_sizes = self.train_input_size // self.strides

            if self.batch_count >= self.num_batchs:
                self.batch_count = 0
                np.random.shuffle(self.annotations)
                raise StopIteration

            batch_image = np.zeros((self.batch_size, self.train_input_size, self.train_input_size, 3), dtype=np.float32)

            batch_label_sbbox = np.zeros((self.batch_size, self.train_output_sizes[0], self.train_output_sizes[0], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32)
            batch_label_mbbox = np.zeros((self.batch_size, self.train_output_sizes[1], self.train_output_sizes[1], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32)
            batch_label_lbbox = np.zeros((self.batch_size, self.train_output_sizes[2], self.train_output_sizes[2], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32)

            batch_sbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32)
            batch_mbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32)
            batch_lbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32)

            exceptions = False
            for num in range(self.batch_size):
                # Wrap around so the last, partial batch is topped up from the start.
                index = (self.batch_count * self.batch_size + num) % self.num_samples
                annotation = self.annotations[index]
                image, bboxes = self.parse_annotation(annotation)
                try:
                    label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = self.preprocess_true_boxes(bboxes)
                except IndexError:
                    exceptions = True
                    self.Delete_bad_annotation(annotation)
                    print("IndexError, something wrong with", annotation[0], "removed this line from annotation file")
                    # No labels exist for this sample; the batch is rejected
                    # below, so do not fall through to undefined/stale labels.
                    continue

                batch_image[num, :, :, :] = image
                batch_label_mbbox[num, :, :, :, :] = label_mbbox
                batch_label_lbbox[num, :, :, :, :] = label_lbbox
                batch_mbboxes[num, :, :] = mbboxes
                batch_lbboxes[num, :, :] = lbboxes
                batch_label_sbbox[num, :, :, :, :] = label_sbbox
                batch_sbboxes[num, :, :] = sbboxes

            if exceptions:
                print('\n')
                raise Exception("There were problems with dataset, I fixed them, now restart the training process.")
            self.batch_count += 1
            batch_smaller_target = batch_label_sbbox, batch_sbboxes
            batch_medium_target  = batch_label_mbbox, batch_mbboxes
            batch_larger_target  = batch_label_lbbox, batch_lbboxes

            return batch_image, (batch_smaller_target, batch_medium_target, batch_larger_target)

    # Horizontal flip
    def random_horizontal_flip(self, image, bboxes):
        """With probability 0.5 mirror the image left-right and its boxes with it.

        ``bboxes`` is modified in place when the flip happens.
        """
        if random.random() < 0.5:
            _, w, _ = image.shape
            image = image[:, ::-1, :]
            # The new xmin is w - old xmax and vice versa.
            bboxes[:, [0,2]] = w - bboxes[:, [2,0]]

        return image, bboxes

    # Randomly crop away part of the image
    def random_crop(self, image, bboxes):
        """With probability 0.5 crop the image to a random window containing every box.

        ``bboxes`` is modified in place when the crop happens.
        """
        if random.random() < 0.5:
            h, w, _ = image.shape
            # Tightest rectangle enclosing all boxes: [xmin, ymin, xmax, ymax].
            max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

            # Free margin between that rectangle and each image border.
            max_l_trans = max_bbox[0]
            max_u_trans = max_bbox[1]
            max_r_trans = w - max_bbox[2]
            max_d_trans = h - max_bbox[3]

            crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
            crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
            # Clamp the far edges to the image with min(); max() pinned them to
            # the full width/height, so the right and bottom were never cropped.
            crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))
            crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))

            image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax]

            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin

        return image, bboxes

    # Translate the image by (tx, ty)
    def random_translate(self, image, bboxes):
        """With probability 0.5 shift the image by a random offset chosen from the free margins around the boxes.

        ``bboxes`` is modified in place when the shift happens.
        """
        if random.random() < 0.5:
            h, w, _ = image.shape
            max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

            max_l_trans = max_bbox[0]
            max_u_trans = max_bbox[1]
            max_r_trans = w - max_bbox[2]
            max_d_trans = h - max_bbox[3]

            tx = random.uniform(-(max_l_trans - 1), (max_r_trans - 1))
            ty = random.uniform(-(max_u_trans - 1), (max_d_trans - 1))

            # 2x3 affine matrix describing a pure translation.
            M = np.array([[1, 0, tx], [0, 1, ty]])
            image = cv2.warpAffine(image, M, (w, h))

            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty

        return image, bboxes

    # Turn one annotation entry into an image and its bboxes
    def parse_annotation(self, annotation, mAP = False):
        """Build the (augmented, preprocessed) image and boxes of one sample.

        Args:
            annotation: ``[image_name, box_tokens, image]`` entry as produced
                by ``load_annotations``.
            mAP: when true, return the RGB image and boxes before letterboxing.

        Returns:
            ``(image, bboxes)`` with one ``[xmin, ymin, xmax, ymax, class]``
            row per box.
        """
        image = annotation[2]

        # Builtin int replaces the np.int alias that NumPy 1.24 removed.
        bboxes = np.array(annotation[1], dtype=int)
        bboxes = bboxes.reshape((len(bboxes)//5, 5))
        if self.data_aug:
            # Copies keep the cached image and the parsed boxes pristine.
            image, bboxes = self.random_horizontal_flip(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if mAP:
            return image, bboxes

        image, bboxes = image_preprocess(np.copy(image), [self.input_sizes, self.input_sizes], np.copy(bboxes))
        return image, bboxes

    # Build the target labels used by the loss. One label grid is produced per
    # stride, shaped cell * cell * anchors_per_scale * (5 + num_classes); the
    # 5 holds the box (x, y, w, h) and the objectness flag.
    def preprocess_true_boxes(self, bboxes):
        """Assign every ground-truth box to grid cells/anchors on each scale.

        An anchor is positive for a box when its IoU with the box (both in
        grid-cell units, anchor centred in the box's cell) exceeds 0.3. A box
        with no positive anchor on any scale is given to the single anchor
        with the highest IoU.

        Args:
            bboxes: rows of ``[xmin, ymin, xmax, ymax, class_index]``.

        Returns:
            ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
            lbboxes)``: the per-scale label grids followed by the per-scale
            ``(max_bbox_per_scale, 4)`` arrays of assigned xywh boxes.

        Raises:
            IndexError: if a class index or a grid cell falls outside the
                label arrays.
        """
        OUTPUT_LEVELS = len(self.strides)

        label = [np.zeros((self.train_output_sizes[i], self.train_output_sizes[i], self.anchor_per_scale,
                           5 + self.num_classes)) for i in range(OUTPUT_LEVELS)]
        bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(OUTPUT_LEVELS)]
        bbox_count = np.zeros((OUTPUT_LEVELS,))

        for bbox in bboxes:
            bbox_coor = bbox[:4]
            bbox_class_ind = bbox[4]

            # Label smoothing: blend the one-hot vector with a uniform distribution.
            onehot = np.zeros(self.num_classes, dtype=float)
            onehot[bbox_class_ind] = 1.0
            uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes)
            deta = 0.01
            smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

            # Corner form -> (centre x, centre y, width, height), then one row per stride.
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis]

            iou = []
            exist_positive = False
            for i in range(OUTPUT_LEVELS):
                # Anchors of this scale, centred in the cell containing the box centre.
                anchors_xywh = np.zeros((self.anchor_per_scale, 4))
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
                anchors_xywh[:, 2:4] = self.anchors[i]

                iou_scale = bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3

                if np.any(iou_mask):
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)

                    label[i][yind, xind, iou_mask, :] = 0
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    label[i][yind, xind, iou_mask, 5:] = smooth_onehot

                    # The box buffer is a ring: beyond max_bbox_per_scale it overwrites old entries.
                    bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale)
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
                    bbox_count[i] += 1

                    exist_positive = True

            if not exist_positive:
                # Fall back to the best anchor over all scales (flattened scale-major).
                best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
                best_detect = int(best_anchor_ind / self.anchor_per_scale)
                best_anchor = int(best_anchor_ind % self.anchor_per_scale)
                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                label[best_detect][yind, xind, best_anchor, :] = 0
                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot

                bbox_ind = int(bbox_count[best_detect] % self.max_bbox_per_scale)
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
                bbox_count[best_detect] += 1

        label_sbbox, label_mbbox, label_lbbox = label
        sbboxes, mbboxes, lbboxes = bboxes_xywh
        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

    def __len__(self):
        """Number of batches per epoch."""
        return self.num_batchs

# Model (YOLOv4)

In [6]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Conv2D, Input, LeakyReLU, ZeroPadding2D, BatchNormalization, MaxPool2D

In [7]:
# Module-level copies of the per-scale strides and anchors used by decode().
STRIDES = np.array(YOLO_STRIDES)
# Divide each scale's anchors by that scale's stride: the transpose moves the scale
# axis last so the division broadcasts over it, then the layout is restored.
ANCHORS = (np.array(YOLO_ANCHORS).T/STRIDES).T

In [8]:
def conv_leaky_relu(inputs, filters, size, stride):
    """Apply a 'same'-padded Conv2D followed by LeakyReLU(0.1).

    The convolution kernel uses a TruncatedNormal initializer.
    """
    conv_layer = layers.Conv2D(
        filters, size, stride,
        padding="same",
        kernel_initializer=tf.keras.initializers.TruncatedNormal(),
    )
    activation = layers.LeakyReLU(0.1)
    return activation(conv_layer(inputs))

In [9]:
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``."""
    softplus = tf.math.softplus(x)
    return x * tf.math.tanh(softplus)

In [10]:
def convolutional(input_layer, filters_shape, downsample=False, activate=True, bn=True, activate_type='leaky'):
    """Conv2D building block with optional batch norm and activation.

    Args:
        input_layer: input tensor.
        filters_shape: sequence whose first entry is the kernel size and whose
            last entry is the number of output filters; entries in between
            are not read.
        downsample: when true, pad one pixel on the top and left and convolve
            with stride 2 and 'valid' padding; otherwise stride 1 and 'same'.
        activate: apply the activation only when this equals True.
        bn: add BatchNormalization; the convolution has a bias only when
            this is false.
        activate_type: ``"leaky"`` (LeakyReLU with alpha 0.1) or ``"mish"``;
            any other value leaves the output without activation.

    Returns:
        The output tensor.
    """
    strides, padding = (2, 'valid') if downsample else (1, 'same')
    if downsample:
        top_left_pad = ZeroPadding2D(((1, 0), (1, 0)))
        input_layer = top_left_pad(input_layer)

    conv_layer = Conv2D(
        filters=filters_shape[-1],
        kernel_size=filters_shape[0],
        strides=strides,
        padding=padding,
        use_bias=not bn,
        kernel_regularizer=l2(0.0005),
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        bias_initializer=tf.constant_initializer(0.),
    )
    conv = conv_layer(input_layer)

    if bn:
        conv = BatchNormalization()(conv)

    if activate == True:
        if activate_type == "mish":
            conv = mish(conv)
        elif activate_type == "leaky":
            conv = LeakyReLU(alpha=0.1)(conv)

    return conv

In [11]:
def residual_block(input_layer, input_channel, filter_num1, filter_num2, activate_type='leaky'):
    """Residual unit: a 1x1 conv then a 3x3 conv, added back onto the input.

    Args:
        input_layer: input tensor, also used as the shortcut.
        input_channel: channel count recorded in the first filter shape.
        filter_num1: output filters of the 1x1 convolution.
        filter_num2: output filters of the 3x3 convolution.
        activate_type: activation forwarded to both convolutions.

    Returns:
        ``input_layer + conv`` where ``conv`` is the output of the two stacked
        convolutions.
    """
    reduced = convolutional(input_layer, filters_shape=(1, 1, input_channel, filter_num1),
                            activate_type=activate_type)
    transformed = convolutional(reduced, filters_shape=(3, 3, filter_num1, filter_num2),
                                activate_type=activate_type)
    return input_layer + transformed

In [12]:
def upsample(input_layer):
    """Double axes 1 and 2 of ``input_layer`` with nearest-neighbour resizing."""
    height, width = input_layer.shape[1], input_layer.shape[2]
    doubled_size = (height * 2, width * 2)
    return tf.image.resize(input_layer, doubled_size, method='nearest')

![](https://i.imgur.com/pra2YPE.png)
![](https://i.imgur.com/PwSLmkd.png)


In [13]:
def cspdarknet53(input_data):
    """Build the backbone and return three feature maps for the detection neck.

    The network is five stages that share one pattern: a stride-2 3x3
    convolution, a split into a 1x1 ``route`` branch and a main branch of
    residual blocks, then a concat of both branches and a 1x1 transition.
    All stage convolutions use the mish activation. After the last stage, three
    convolutions, a block of parallel stride-1 max-pools (sizes 13, 9, 5)
    concatenated with their input, and three more convolutions follow, all
    with the default leaky activation.

    Note that ``convolutional`` only reads the first (kernel size) and last
    (output filters) entry of each filter shape; the input-channel entries
    are informational.

    Args:
        input_data: input image tensor.

    Returns:
        ``(route_1, route_2, input_data)``: the 256-filter map taken after
        three stride-2 convolutions, the 512-filter map taken after four, and
        the final 512-filter map after five.
    """
    #===============================================================================================================
    # 1 Part 
    input_data = convolutional(input_data, (3, 3,  3,  32), activate_type="mish")
    input_data = convolutional(input_data, (3, 3, 32,  64), downsample=True, activate_type="mish")
    
    # Split
    #------------------------------
    route = input_data
    route = convolutional(route, (1, 1, 64, 64), activate_type="mish")
    #------------------------------
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")
    #------------------------------
    #===============================================================================================================
    # Convolution, Filter = 32, Size = 1x1
    # Convolution, Filter = 64, Size = 3x3
    for i in range(1):
        input_data = residual_block(input_data,  64,  32, 64, activate_type="mish")
    #===============================================================================================================
    # Merge the residual branch with the route branch, then a 1x1 transition.
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 128, 64), activate_type="mish")
    #===============================================================================================================
    # 2 Part 
    input_data = convolutional(input_data, (3, 3, 64, 128), downsample=True, activate_type="mish")
    
    # Split
    #------------------------------
    route = input_data
    route = convolutional(route, (1, 1, 128, 64), activate_type="mish")
    #------------------------------
    input_data = convolutional(input_data, (1, 1, 128, 64), activate_type="mish")
    #------------------------------
    #===============================================================================================================
    # Convolution, Filter = 64, Size = 1x1
    # Convolution, Filter = 64, Size = 3x3
    for i in range(2):
        input_data = residual_block(input_data, 64,  64, 64, activate_type="mish")
    #===============================================================================================================
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 128, 128), activate_type="mish")
    #===============================================================================================================
    # 3 Part 
    input_data = convolutional(input_data, (3, 3, 128, 256), downsample=True, activate_type="mish")
    
    # Split
    #------------------------------
    route = input_data
    route = convolutional(route, (1, 1, 256, 128), activate_type="mish")
    #------------------------------
    input_data = convolutional(input_data, (1, 1, 256, 128), activate_type="mish")
    #------------------------------
    #===============================================================================================================
    # Convolution, Filter = 128, Size = 1x1
    # Convolution, Filter = 128, Size = 3x3
    for i in range(8):
        input_data = residual_block(input_data, 128, 128, 128, activate_type="mish")
    #===============================================================================================================
    input_data = convolutional(input_data, (1, 1, 128, 128), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 256, 256), activate_type="mish")
    #===============================================================================================================
    # 4 Part
    # First returned feature map: taken before the fourth stride-2 convolution.
    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 256, 512), downsample=True, activate_type="mish")
    
    # Split
    #------------------------------
    route = input_data
    route = convolutional(route, (1, 1, 512, 256), activate_type="mish")
    #------------------------------
    input_data = convolutional(input_data, (1, 1, 512, 256), activate_type="mish")
    #------------------------------
    #===============================================================================================================
    # Convolution, Filter = 256, Size = 1x1
    # Convolution, Filter = 256, Size = 3x3
    for i in range(8):
        input_data = residual_block(input_data, 256, 256, 256, activate_type="mish")
    #===============================================================================================================
    input_data = convolutional(input_data, (1, 1, 256, 256), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 512, 512), activate_type="mish")
    #===============================================================================================================
    # 5 Part
    # Second returned feature map: taken before the fifth stride-2 convolution.
    route_2 = input_data
    input_data = convolutional(input_data, (3, 3, 512, 1024), downsample=True, activate_type="mish")
    
    # Split
    #------------------------------
    route = input_data
    route = convolutional(route, (1, 1, 1024, 512), activate_type="mish")
    #------------------------------
    input_data = convolutional(input_data, (1, 1, 1024, 512), activate_type="mish")
    #------------------------------
    #===============================================================================================================
    # Convolution, Filter = 512, Size = 1x1
    # Convolution, Filter = 512, Size = 3x3
    for i in range(4):
        input_data = residual_block(input_data, 512, 512, 512, activate_type="mish")
    #===============================================================================================================
    input_data = convolutional(input_data, (1, 1, 512, 512), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 1024, 1024), activate_type="mish")
    #===============================================================================================================
    
    # From here on the default (leaky) activation is used.
    input_data = convolutional(input_data, (1, 1, 1024, 512))
    input_data = convolutional(input_data, (3, 3, 512, 1024))
    input_data = convolutional(input_data, (1, 1, 1024, 512))

    # Parallel stride-1 max-pools with 'SAME' padding keep the spatial size, so their
    # outputs can be concatenated with the un-pooled input along the channel axis.
    max_pooling_1 = tf.keras.layers.MaxPool2D(pool_size=13, padding='SAME', strides=1)(input_data)
    max_pooling_2 = tf.keras.layers.MaxPool2D(pool_size=9, padding='SAME', strides=1)(input_data)
    max_pooling_3 = tf.keras.layers.MaxPool2D(pool_size=5, padding='SAME', strides=1)(input_data)
    input_data = tf.concat([max_pooling_1, max_pooling_2, max_pooling_3, input_data], axis=-1)

    input_data = convolutional(input_data, (1, 1, 2048, 512))
    input_data = convolutional(input_data, (3, 3, 512, 1024))
    input_data = convolutional(input_data, (1, 1, 1024, 512))

    return route_1, route_2, input_data

# Architecture
![](https://i.imgur.com/qOa1WJr.png)
from [YOLOv4 paper](https://arxiv.org/pdf/2004.10934.pdf)

In this code, we use
* Backbone: CSPdarknet53
* Neck: SAM + FPN + PAN
* Head (Dense Prediction): YOLO
* Sparse Prediction: Not in YOLOv4
![](https://i.imgur.com/YMouIj6.png)

Propose 3 kinds of boxes like YOLOv3 implementation
1. conv_sbbox
2. conv_mbbox
3. conv_lbbox
![](https://i.imgur.com/RKCG1Dj.png)

The head block in YOLO can be described as a combination of a DBL block and a conv block, shown in the right part of the image.
![](https://i.imgur.com/JOqkOGc.png)

In [14]:
"""
From cspdarknet, we get 3 different layer outputs
1. route_1
2. route_2
3. conv
And connects to Neck and Dense Prediction blocks
here we use three techs
1. Feature Pyramid Networks (FPN)
2. Spatial Attention Module (SAM)
3. Path Aggregation Network (PAN), to accelerate the computing use concat instead of '+'.
the architech is shown above.
"""
def YOLOv4(input_layer, NUM_CLASS):
    """Build the neck and the three detection heads on top of ``cspdarknet53``.

    The deepest backbone map is upsampled twice and concatenated with the two
    shallower backbone maps; the result is then downsampled twice again and
    concatenated with the intermediate maps saved on the way up. A head (one
    3x3 convolution plus a linear 1x1 convolution without batch norm) is
    attached at each of the three resolutions.

    NOTE(review): the notes above list a Spatial Attention Module, but this
    function only calls ``convolutional``, ``upsample`` and ``tf.concat``.

    Args:
        input_layer: input image tensor.
        NUM_CLASS: number of object classes.

    Returns:
        ``[conv_sbbox, conv_mbbox, conv_lbbox]``: raw head outputs ordered
        from the highest-resolution map to the lowest, each with
        ``3 * (NUM_CLASS + 5)`` filters.
    """
    route_1, route_2, conv = cspdarknet53(input_layer)

    # Keep the deepest map for the last concat of the downsampling path.
    route = conv
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = upsample(conv)
    
    route_2 = convolutional(route_2, (1, 1, 512, 256))
    conv = tf.concat([route_2, conv], axis=-1) # PAN
    # DBL block * 5
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    
    # Saved for the concat on the way back down.
    route_2 = conv
    # DBL block
    conv = convolutional(conv, (1, 1, 256, 128))
    # Upsample to the resolution of route_1
    conv = upsample(conv)
    route_1 = convolutional(route_1, (1, 1, 256, 128))
    
    conv = tf.concat([route_1, conv], axis=-1) # PAN
    # DBL block * 5
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))
    
    route_1 = conv
    # DBL block
    conv = convolutional(conv, (3, 3, 128, 256))
    # One conv block: linear 1x1 head (no batch norm, no activation)
    conv_sbbox = convolutional(conv, (1, 1, 256, 3 * (NUM_CLASS + 5)), activate=False, bn=False)
#======================================================================
    # Downsample the highest-resolution map and merge it with the middle one.
    conv = convolutional(route_1, (3, 3, 128, 256), downsample=True)
    conv = tf.concat([conv, route_2], axis=-1) # PAN
    # DBL block * 5
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    
    route_2 = conv
    # DBL block
    conv = convolutional(conv, (3, 3, 256, 512))
    # One conv block: linear 1x1 head (no batch norm, no activation)
    conv_mbbox = convolutional(conv, (1, 1, 512, 3 * (NUM_CLASS + 5)), activate=False, bn=False)
#======================================================================
    # Downsample once more and merge with the deepest backbone map.
    conv = convolutional(route_2, (3, 3, 256, 512), downsample=True)
    conv = tf.concat([conv, route], axis=-1) # PAN
    # DBL block * 5 
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))
    # DBL block
    conv = convolutional(conv, (3, 3, 512, 1024))
    # One conv block: linear 1x1 head (no batch norm, no activation)
    conv_lbbox = convolutional(conv, (1, 1, 1024, 3 * (NUM_CLASS + 5)), activate=False, bn=False)
#======================================================================
    return [conv_sbbox, conv_mbbox, conv_lbbox]

In [15]:
def decode(conv_output, NUM_CLASS, i=0):
    """Turn a raw head output into box predictions in input-image pixels.

    Args:
        conv_output: raw head tensor whose last axis holds
            ``3 * (5 + NUM_CLASS)`` values per grid cell.
        NUM_CLASS: number of object classes.
        i: index of the detection scale (0, 1 or 2), selecting the entry of
            ``STRIDES`` and ``ANCHORS`` to use.

    Returns:
        Tensor of shape ``(batch, size, size, 3, 5 + NUM_CLASS)`` holding,
        along the last axis, the box centre (x, y), the box size (w, h), the
        objectness confidence and the per-class probabilities.
    """
    shape = tf.shape(conv_output)
    batch_size, output_size = shape[0], shape[1]

    # Separate the three anchors of each cell into their own axis.
    conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    # Per anchor: centre offsets, size offsets, objectness logit, class logits.
    raw_dxdy, raw_dwdh, raw_conf, raw_prob = tf.split(conv_output, (2, 2, 1, NUM_CLASS), axis=-1)

    # Grid of (column, row) cell indices, repeated for every image and anchor.
    grid = tf.meshgrid(tf.range(output_size), tf.range(output_size))
    grid = tf.stack(grid, axis=-1)
    grid = tf.expand_dims(grid, axis=2)
    grid = tf.expand_dims(grid, axis=0)
    grid = tf.tile(grid, [batch_size, 1, 1, 3, 1])
    grid = tf.cast(grid, tf.float32)

    # Centre: sigmoid offset inside the cell plus the cell index, scaled to pixels.
    pred_xy = (tf.sigmoid(raw_dxdy) + grid) * STRIDES[i]
    # Size: exponential scaling of the anchor, scaled to pixels.
    pred_wh = (tf.exp(raw_dwdh) * ANCHORS[i]) * STRIDES[i]
    pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)

    pred_conf = tf.sigmoid(raw_conf)
    pred_prob = tf.sigmoid(raw_prob)

    return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)

In [16]:
def Create_Yolov4(input_size=IMAGE_SIZE, channels=3, training=True, CLASSES=CLASS_NAMES):
    """Assemble the YOLOv4 Keras model.

    Args:
        input_size: side length of the square input image.
        channels: number of input channels.
        training: when true, the raw head output of each scale is emitted
            right before its decoded prediction (six outputs in total);
            otherwise only the three decoded predictions are emitted.
        CLASSES: sequence of class names; its length sets the number of
            classes the heads predict.

    Returns:
        A ``tf.keras.Model`` mapping the input image to the list of outputs
        described above.
    """
    # Honour the CLASSES argument instead of always reading the global CLASS_NAMES.
    NUM_CLASS = len(CLASSES)
    input_layer  = Input([input_size, input_size, channels])

    conv_tensors = YOLOv4(input_layer, NUM_CLASS)

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training: output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    Yolo = tf.keras.Model(input_layer, output_tensors)
    return Yolo

In [17]:
# Build the training model: with training=True every scale contributes both its raw
# head output and its decoded prediction.
yolo = Create_Yolov4(input_size=IMAGE_SIZE, training=True)

In [18]:
yolo.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 448, 448, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 448, 448, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 448, 448, 32) 128         conv2d[0][0]                     
__________________________________________________________________________________________________
tf.math.softplus (TFOpLambda)   (None, 448, 448, 32) 0           batch_normalization[0][0]        
______________________________________________________________________________________________

# Define Loss
## GIOU
    IoU has a disadvantage: it provides no gradient when the two bounding boxes do not overlap. Therefore, GIoU also uses the smallest enclosing box, spanned by the top-most, bottom-most, left-most and right-most corners of the two boxes. GIoU is the IoU minus the ratio |enclosing box - union of the two boxes| / |enclosing box|.
    In the loss computation, each prediction is weighted by (2 - (area of ground truth / area of image)).
    
##  Conf loss
    For each prediction, calculate its largest IoU with any ground-truth box as max_IOU.
    Respond_bgd marks whether the predicted box is background: if the label has an object, the value is 0; if the IoU with one of the ground truths is higher than the threshold (here 0.5), it is also 0; otherwise, the value is 1.
    Calculate the cross entropy for both the respond boxes and respond_bgd.
    
##  Prob loss
    Using the respond bounding box as mask, and calculating the cross entropy. Respond bounding box represents whether a category object is contained.


In [19]:
def bbox_iou(boxes1, boxes2):
    """Compute the IoU of boxes given as (center_x, center_y, width, height).

    The last axis of each input holds the box values; the remaining axes are
    combined elementwise, so the inputs must be broadcastable against each
    other.

    Args:
        boxes1: float tensor whose last axis is (x, y, w, h).
        boxes2: float tensor whose last axis is (x, y, w, h).

    Returns:
        Tensor of IoU values with the last axis removed. Pairs whose union
        area is zero yield 0.0 instead of NaN.
    """
    boxes1_area = boxes1[..., 2] * boxes1[..., 3]
    boxes2_area = boxes2[..., 2] * boxes2[..., 3]

    # (x, y, w, h) -> (xmin, ymin, xmax, ymax)
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    # Clamp at zero so non-overlapping boxes get an empty intersection.
    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # A plain division returns NaN when both boxes have zero area;
    # divide_no_nan returns 0 for a zero denominator instead.
    return tf.math.divide_no_nan(inter_area, union_area)

def bbox_giou(boxes1, boxes2):
    """Compute the Generalized IoU of boxes given as (center_x, center_y, width, height).

    GIoU = IoU - (enclosing area - union area) / enclosing area, where the
    enclosing box is the smallest axis-aligned box containing both inputs.

    Args:
        boxes1: float tensor whose last axis is (x, y, w, h).
        boxes2: float tensor whose last axis is (x, y, w, h), broadcastable
            against boxes1.

    Returns:
        Tensor of GIoU values with the last axis removed. Divisions by a zero
        union or enclosing area contribute 0.0 instead of NaN, so a later
        multiplication by a 0/1 mask cannot be poisoned (0 * NaN is NaN).
    """
    # (x, y, w, h) -> (xmin, ymin, xmax, ymax)
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # Reorder corners so that min <= max even when a width/height is negative.
    boxes1 = tf.concat([tf.minimum(boxes1[..., :2], boxes1[..., 2:]),
                        tf.maximum(boxes1[..., :2], boxes1[..., 2:])], axis=-1)
    boxes2 = tf.concat([tf.minimum(boxes2[..., :2], boxes2[..., 2:]),
                        tf.maximum(boxes2[..., :2], boxes2[..., 2:])], axis=-1)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # IoU between the two bounding boxes (0 when the union is empty).
    iou = tf.math.divide_no_nan(inter_area, union_area)

    # Corners of the smallest box enclosing both inputs.
    enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)

    # Area of the enclosing box C.
    enclose_area = enclose[..., 0] * enclose[..., 1]

    # GIoU formula; the penalty term is 0 when the enclosing box is empty.
    giou = iou - tf.math.divide_no_nan(enclose_area - union_area, enclose_area)

    return giou

def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=CLASS_NAMES):
    """Compute the GIoU, confidence and class-probability losses for one output scale.

    Args:
        pred: decoded predictions, indexed as a 5-D tensor whose last axis
            starts with (x, y, w, h, confidence).
        conv: raw network output for the same scale; reshaped here to
            (batch, output_size, output_size, 3, 5 + number of classes).
        label: ground-truth tensor indexed like `pred`; its last axis is
            (x, y, w, h, object mask, class probabilities...).
        bboxes: ground-truth boxes, indexed as (batch, box, box values).
        i: index into STRIDES selecting the stride of this scale.
        CLASSES: collection of class names; its length sets the number of
            classes used to reshape `conv`.

    Returns:
        Tuple (giou_loss, conf_loss, prob_loss). Each term is summed over
        axes 1-4 and then averaged over the batch axis.
    """
    # Honour the CLASSES argument instead of always reading the global list.
    NUM_CLASS = len(CLASSES)
    conv_shape  = tf.shape(conv)
    batch_size  = conv_shape[0]
    output_size = conv_shape[1]
    input_size  = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh     = pred[:, :, :, :, 0:4]
    pred_conf     = pred[:, :, :, :, 4:5]

    label_xywh    = label[:, :, :, :, 0:4]
    respond_bbox  = label[:, :, :, :, 4:5]
    label_prob    = label[:, :, :, :, 5:]

    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    # Weight in (1, 2]: the smaller the ground-truth box relative to the
    # input image, the larger its contribution to the box loss.
    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    # IoU of every predicted box against every ground-truth box of its image.
    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    # Best IoU each predicted box achieves with any ground-truth box.
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # A position counts as background when it holds no labelled object and
    # its best IoU stays below the threshold.
    respond_bgd = (1.0 - respond_bbox) * tf.cast( max_iou < YOLO_IOU_LOSS_THRESH, tf.float32 )

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence loss: the target is 1 where the grid contains an object and
    # 0 on background positions. The cross-entropy is the same tensor for
    # both masks, so it is computed once.
    conf_ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
    conf_loss = conf_focal * (
            respond_bbox * conf_ce
            +
            respond_bgd * conf_ce
    )

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1,2,3,4]))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1,2,3,4]))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1,2,3,4]))

    return giou_loss, conf_loss, prob_loss

## Start Training

In [20]:
# Adam optimizer used by train_step to apply the gradients.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Running mean of the total loss.
train_loss_metric = tf.keras.metrics.Mean(name='loss')

# Checkpoint object tracking an epoch counter and the model weights.
ckpt = tf.train.Checkpoint(epoch=tf.Variable(0), net=yolo)

# Manager writing 'yolo'-prefixed checkpoints and keeping at most three.
manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=TRAIN_CHECKPOINTS_FOLDER,
    max_to_keep=3,
    checkpoint_name='yolo',
)

In [21]:
@tf.function
def train_step(image_data, target):
    """Run one optimisation step on a batch and record its total loss.

    Args:
        image_data: batch of input images fed to the `yolo` model.
        target: per-scale targets; `target[k]` is unpacked into the
            (label, bboxes) arguments of `compute_loss` for scale k.

    Side effects:
        Updates `train_loss_metric` with the summed loss and applies the
        gradients to `yolo.trainable_variables` through `optimizer`.
    """
    num_scales = 3
    with tf.GradientTape() as tape:
        outputs = yolo(image_data, training=True)
        giou_loss = conf_loss = prob_loss = 0

        # The model output interleaves, per scale, the raw conv tensor and
        # the decoded prediction tensor.
        for scale in range(num_scales):
            conv = outputs[2 * scale]
            pred = outputs[2 * scale + 1]
            scale_giou, scale_conf, scale_prob = compute_loss(
                pred, conv, *target[scale], scale, CLASSES=CLASS_NAMES)
            giou_loss += scale_giou
            conf_loss += scale_conf
            prob_loss += scale_prob

        total_loss = giou_loss + conf_loss + prob_loss
        train_loss_metric(total_loss)

        trainable = yolo.trainable_variables
        gradients = tape.gradient(total_loss, trainable)
        optimizer.apply_gradients(zip(gradients, trainable))

In [22]:
# Training data source built with the 'train' argument; the training loop
# iterates over it and unpacks each item as (image_data, target).
trainset = Dataset('train')

In [23]:
# Resume from a previously saved checkpoint. os.path.join keeps the path valid
# whether or not TRAIN_CHECKPOINTS_FOLDER ends with a path separator.
checkpoint = os.path.join(TRAIN_CHECKPOINTS_FOLDER, 'yolo-1')
# NOTE(review): this rebinds `ckpt` to a new Checkpoint object, while `manager`
# was built around the earlier one. Both share the same `yolo` weights, but the
# epoch counter incremented during training is not the one `manager` saves.
ckpt = tf.train.Checkpoint(epoch=tf.Variable(2), net=yolo)
ckpt.restore(checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5f0c0b26d0>

In [None]:
# Best (lowest) mean epoch loss seen so far. Starting from infinity ensures
# the first epoch always produces a checkpoint.
lowest = float('inf')
print("{}, start training.".format(datetime.now()))
for epoch in range(TRAIN_EPOCHS):
    train_loss_metric.reset_states()
    ckpt.epoch.assign_add(1)

    for image_data, target in trainset:
        train_step(image_data, target)

    epoch_loss = train_loss_metric.result()
    print("{}, Epoch {}: loss {:.2f}".format(datetime.now(), epoch + 1, epoch_loss))

    # Save only when the loss improved, and report a save only when one
    # actually happened (previously the message was printed every epoch and
    # `save_path` could be undefined or stale).
    if epoch_loss < lowest:
        save_path = manager.save()
        lowest = epoch_loss
        print("Saved checkpoint for epoch {}: {}".format(int(ckpt.epoch), save_path))

2021-11-29 23:47:53.494552, start training.


2021-11-29 23:48:00.740022: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-29 23:48:00.817562: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3000000000 Hz
2021-11-29 23:48:00.822884: W tensorflow/core/grappler/optimizers/meta_optimizer.cc:153] TF_ENABLE_AUTO_MIXED_PRECISION has no effect.
2021-11-29 23:48:00.823164: W tensorflow/core/grappler/optimizers/meta_optimizer.cc:153] TF_ENABLE_AUTO_MIXED_PRECISION has no effect.
2021-11-29 23:48:00.823196: W tensorflow/core/grappler/optimizers/meta_optimizer.cc:153] TF_ENABLE_AUTO_MIXED_PRECISION has no effect.
2021-11-29 23:48:00.823203: W tensorflow/core/grappler/optimizers/meta_optimizer.cc:153] TF_ENABLE_AUTO_MIXED_PRECISION has no effect.
2021-11-29 23:48:04.339155: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f5e58017de0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devi

In [None]:
def postprocess_boxes(pred_bbox, original_image, input_size, score_threshold):
    """Map raw predictions back to original-image coordinates and filter them.

    Args:
        pred_bbox: array-like of rows (x, y, w, h, confidence, class probs...)
            expressed in the square network input of side `input_size`.
        original_image: array whose first two dimensions are (height, width).
        input_size: side length of the network input.
        score_threshold: boxes whose score (confidence times best class
            probability) is not above this value are dropped.

    Returns:
        Array of rows (xmin, ymin, xmax, ymax, score, class index) for the
        boxes that survive clipping and filtering.
    """
    min_scale, max_scale = 0, np.inf
    predictions = np.array(pred_bbox)

    centers = predictions[:, 0:2]
    half_sizes = predictions[:, 2:4] * 0.5
    objectness = predictions[:, 4]
    class_probs = predictions[:, 5:]

    # Centre/size representation -> corner representation.
    corners = np.concatenate([centers - half_sizes, centers + half_sizes], axis=-1)

    # Undo the aspect-preserving resize and the symmetric padding.
    image_h, image_w = original_image.shape[:2]
    ratio = min(input_size / image_w, input_size / image_h)
    pad_w = (input_size - ratio * image_w) / 2
    pad_h = (input_size - ratio * image_h) / 2
    corners[:, 0::2] = (corners[:, 0::2] - pad_w) / ratio
    corners[:, 1::2] = (corners[:, 1::2] - pad_h) / ratio

    # Clip to the image and zero out boxes whose corners ended up inverted.
    top_left = np.maximum(corners[:, :2], [0, 0])
    bottom_right = np.minimum(corners[:, 2:], [image_w - 1, image_h - 1])
    corners = np.concatenate([top_left, bottom_right], axis=-1)
    inverted = (corners[:, 0] > corners[:, 2]) | (corners[:, 1] > corners[:, 3])
    corners[inverted] = 0

    # Keep boxes whose scale (square root of the area) is inside the valid range.
    extents = corners[:, 2:4] - corners[:, 0:2]
    box_scale = np.sqrt(extents[:, 0] * extents[:, 1])
    keep_scale = (min_scale < box_scale) & (box_scale < max_scale)

    # Score each box by its best class and drop the low-scoring ones.
    best_class = np.argmax(class_probs, axis=-1)
    best_score = objectness * class_probs[np.arange(len(corners)), best_class]
    keep = keep_scale & (best_score > score_threshold)

    kept_scores = best_score[keep]
    kept_classes = best_class[keep]
    return np.concatenate(
        [corners[keep], kept_scores[:, np.newaxis], kept_classes[:, np.newaxis]],
        axis=-1,
    )

In [None]:
def bboxes_iou(boxes1, boxes2):
    """Compute the IoU of corner-format boxes (xmin, ymin, xmax, ymax) with NumPy.

    Args:
        boxes1: array-like whose last axis is (xmin, ymin, xmax, ymax).
        boxes2: array-like with the same layout, broadcastable against boxes1.

    Returns:
        Array of IoU values, floored at the float32 machine epsilon.
    """
    first = np.array(boxes1)
    second = np.array(boxes2)

    def _area(boxes):
        # Width times height of corner-format boxes.
        return (boxes[..., 2] - boxes[..., 0]) * (boxes[..., 3] - boxes[..., 1])

    overlap_min = np.maximum(first[..., :2], second[..., :2])
    overlap_max = np.minimum(first[..., 2:], second[..., 2:])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_wh = np.maximum(overlap_max - overlap_min, 0.0)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]
    total_area = _area(first) + _area(second) - overlap_area

    return np.maximum(1.0 * overlap_area / total_area, np.finfo(np.float32).eps)

def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """Per-class non-maximum suppression.

    :param bboxes: array of rows (xmin, ymin, xmax, ymax, score, class)
    :param iou_threshold: with method 'nms', boxes overlapping the selected
        box by more than this IoU are discarded
    :param sigma: decay parameter used by method 'soft-nms'
    :param method: either 'nms' or 'soft-nms'
    :return: list of the selected box rows

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    selected = []

    for class_id in list(set(bboxes[:, 5])):
        remaining = bboxes[bboxes[:, 5] == class_id]

        # Repeat until every box of this class is either selected or suppressed.
        while len(remaining) > 0:
            # Take the highest-scoring box and remove it from the pool.
            top = np.argmax(remaining[:, 4])
            winner = remaining[top]
            selected.append(winner)
            remaining = np.delete(remaining, top, axis=0)

            # Overlap of the selected box with every box still in the pool.
            overlap = bboxes_iou(winner[np.newaxis, :4], remaining[:, :4])

            assert method in ['nms', 'soft-nms']

            if method == 'nms':
                # Hard suppression: zero the score of heavily overlapping boxes.
                decay = np.where(overlap > iou_threshold, 0.0, 1.0).astype(np.float32)
            else:
                # Soft suppression: shrink scores by a Gaussian of the overlap.
                decay = np.exp(-(1.0 * overlap ** 2 / sigma))

            remaining[:, 4] = remaining[:, 4] * decay
            remaining = remaining[remaining[:, 4] > 0.]

    return selected

In [None]:
def detect_image(Yolo, original_image, input_size=IMAGE_SIZE, score_threshold=0.3, iou_threshold=0.45):
    """Run the detector on one image and return its boxes after NMS.

    Args:
        Yolo: model object exposing `predict`.
        original_image: image array as accepted by `cv2.cvtColor`.
        input_size: side length the image is preprocessed to.
        score_threshold: minimum score passed to `postprocess_boxes`.
        iou_threshold: IoU threshold passed to `nms`.

    Returns:
        List of box rows (xmin, ymin, xmax, ymax, score, class) produced by `nms`.
    """
    # NOTE(review): the BGR<->RGB channel swap is applied twice, so the channel
    # order ends up as it started. Kept unchanged because the channel order the
    # model was trained with is not visible here -- confirm before removing.
    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

    resized = image_preprocess(np.copy(original_image), [input_size, input_size])
    batch = resized[np.newaxis, ...].astype(np.float32)

    # Flatten every output to (num_boxes, values) and stack them into one tensor.
    flattened = []
    for output in Yolo.predict(batch):
        flattened.append(tf.reshape(output, (-1, tf.shape(output)[-1])))
    pred_bbox = tf.concat(flattened, axis=0)

    candidates = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold)
    return nms(candidates, iou_threshold, method='nms')

### Build Test dataset Iterator

In [None]:
def Load_Yolo_model():
    """Build an inference-mode YOLO model and load weights from the 'yolo-1' checkpoint.

    Returns:
        The model created by `Create_Yolov4(input_size=IMAGE_SIZE, training=False)`
        with the checkpoint restore applied.
    """
    # os.path.join keeps the path valid whether or not the folder constant
    # ends with a path separator.
    checkpoint = os.path.join(TRAIN_CHECKPOINTS_FOLDER, 'yolo-1')
    print("Loading weights from:", checkpoint)
    yolo = Create_Yolov4(input_size=IMAGE_SIZE, training=False)
    ckpt = tf.train.Checkpoint(net=yolo)
    # Only `net` is requested here, while the training checkpoints also hold
    # an epoch counter; expect_partial() marks that as intentional and
    # silences the warnings about unused checkpoint values.
    ckpt.restore(checkpoint).expect_partial()

    return yolo

### Make Prediction and Output to txt file
output = image_name   {xmin_i    ymin_i    xmax_i    ymax_i    class_i    confidence_score}

In [None]:
yolo = Load_Yolo_model()

# Each line of the test annotation file is used as an image path relative to
# TEST_IMAGE_DIR.
with open(TEST_ANNOT_PATH, 'r') as f:
    images_path = [line.strip() for line in f]

with open('./test_prediction_RES_cpypst_34.txt', 'w') as f:
    for image_path in images_path:
        image = cv2.imread(TEST_IMAGE_DIR + image_path)
        bboxes = detect_image(yolo, image)
        # One output line per image:
        # img filename, then xmin ymin xmax ymax class confidence for each box.
        fields = [image_path]
        for xmin, ymin, xmax, ymax, conf, class_num in bboxes:
            fields.append(f" {xmin} {ymin} {xmax} {ymax} {class_num} {conf}")
        line = "".join(fields) + '\n'
        f.write(line)

In [None]:
# Quick check: run the detector on a single image from the VOCdevkit_train
# folder and print the returned list of boxes.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this directory exists.
image_path = "/work/michael1017/data/comp2/VOCdevkit_train/VOC2007/JPEGImages/000007.jpg"
image = cv2.imread(image_path)
print(detect_image(yolo, image))

### Run Evaluation Metric
 mean Average Precision(mAP)

In [None]:
import sys
sys.path.insert(0, "./evaluate")

In [None]:
import evaluate
# evaluate.evaluate("input prediction file name", "desired output csv file name")
# NOTE(review): the prediction cell in this notebook writes
# './test_prediction_RES_cpypst_34.txt', not './test_prediction.txt' --
# confirm that this is the file meant to be evaluated.
evaluate.evaluate("./test_prediction.txt", "./output_file.csv")

# Visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Draw the detections on every image under the test folder, one at a time.
for root, dirs, files in os.walk('/work/michael1017/data/comp2/VOCdevkit_test/VOC2007/JPEGImages/'):
    for file_name in files:
        origin_image = cv2.imread(os.path.join(root, file_name))
        if origin_image is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing inside detect_image.
            continue
        bboxes = detect_image(yolo, origin_image)
        for xmin, ymin, xmax, ymax, conf, class_num in bboxes:
            class_name = CLASS_NAMES[int(class_num)]
            cv2.rectangle(origin_image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 255), 3)
            cv2.putText(origin_image, class_name, (int(xmin), int(ymin)),
                        cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 255), 2)
        # cv2 loads images in BGR order while matplotlib expects RGB; convert
        # so the colours are displayed correctly.
        plt.imshow(cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB))
        plt.show()
        # Wait for Enter before moving on to the next image.
        input()