https://www.kaggle.com/c/datalab-cup2-object-detection-2020/overview

# 一、Data preprocessing

由於資料量不足、資料類別間有數量不平衡的問題以及預防overfitting，我們對data做了下列處理：

## (一)、解決imbalance問題

我們將training data增加到了四萬多筆，以及盡量讓各組的類別數量平衡一點。

## (二)、Data Augmentation
同時，我們對圖片做了以下的處理：

* 隨機抽樣圖片，把其中一個物件剪下來後，貼上在其他圖片上，增加圖片的多樣性。
* 圖片的翻轉以及裁切。
* 灰階處理。
* gaussian_noise。
* 亮度及不同顏色對比調整。

但我們發現，如果對所有圖片都套用這些方法，模型會學得非常慢，而且也會造成某些物件完全學不到；所以後來改成不對每一張圖片都做所有 augmentation，而是用類似抽樣的方式（依機率）決定每個 augmentation 方法是否被採用。

In [None]:
#%%
from __future__ import absolute_import, division, print_function, unicode_literals


import tensorflow as tf
# import tensorflow_addons as tfa
import numpy as np

from datetime import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import cv2
import os
import random

from tensorflow import keras
from tensorflow.keras import layers

import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

from shutil import copyfile
# %%
# Names of the 20 object classes. Presumably the list index is the integer
# class id used in the annotation files (e.g. index 14 is "person") -- TODO confirm.
classes_name =  ["aeroplane", "bicycle", "bird", "boat", "bottle", 
                 "bus", "car", "cat", "chair", "cow", "diningtable", 
                 "dog", "horse", "motorbike", "person", "pottedplant", 
                 "sheep", "sofa", "train","tvmonitor"]

# GPU setup: expose a single GPU to TensorFlow and enable on-demand memory growth.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first listed GPU (gpus[0])
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
# %%
# common params
# Images are resized to IMAGE_SIZE x IMAGE_SIZE by DatasetGenerator._data_preprocess.
IMAGE_SIZE = 448
# Batch size passed to dataset.batch() in DatasetGenerator.generate.
BATCH_SIZE = 6
NUM_CLASSES = 20
# Each annotation record is zero-padded or truncated to this many boxes.
MAX_OBJECTS_PER_IMAGE = 20

# dataset params

# Annotation list read by DatasetGenerator and directory prefixed to each image name.
DATA_PATH = './train_1.txt'
IMAGE_DIR = './VOCdevkit_train/VOC2007/JPEGImages/'

# model params
# NOTE(review): presumably the YOLO grid size, boxes per cell and loss weights;
# they are not used in the code shown here -- confirm against the model/loss code.
CELL_SIZE = 7
BOXES_PER_CELL = 2
OBJECT_SCALE = 1
NOOBJECT_SCALE = 0.5
CLASS_SCALE = 1
COORD_SCALE = 5

# training params
LEARNING_RATE = 1e-5
EPOCHS = 15

# NOTE(review): the flags, checkpoint location and thresholds below are not
# used in the code shown here -- confirm their meaning against the training
# and inference code.
is_fine_tune = False

CKT_Dir = "../ckpts/17_resnet_balance"
checkpoint_name = 'yolo_v1_17_resnet_balance'

is_load_best = False

PROB_THRES = 0.01
IOU_THRES = 0.3
# %%
def list_add(a,b):
    """Return the element-wise sums ``a[i] + b[i]`` for every index of ``a``.

    ``b`` must be at least as long as ``a``; any extra trailing items of
    ``b`` are ignored.
    """
    return [a[pos] + b[pos] for pos in range(len(a))]

# Annotation file iterated by imbalance.__init__; the handle is opened when this
# cell runs and is not closed anywhere in the code shown.
training_data_file = open("./pascal_voc_training_data.txt", "r")
class imbalance:
    """Oversample annotation lines to reduce class imbalance.

    Every annotation line has the form
    ``<image> x1 y1 x2 y2 cls [x1 y1 x2 y2 cls ...]``, so the class id of each
    object is every fifth token starting at index 5.  Lines are counted per
    class, then lines matching the oversampling rules are appended again.

    Args:
        data_file: optional iterable of annotation lines.  When omitted, the
            module-level ``training_data_file`` handle is read (the original
            behaviour).

    Attributes:
        each_pic_count: one 20-element list of per-class object counts per line.
        each_class_count: 20-element list of total object counts per class.
        add_pic_index: indices of the original lines that are duplicated, in
            the order they are appended to ``new``.
        new: the original stripped lines followed by the duplicated lines.
    """

    _NUM_CLASSES = 20

    # (target class, copies per matching line, {class id: max allowed count}).
    # A line matches when it contains the target class and no other listed
    # class exceeds its cap (cap 0 means "must be absent"; class 14 is
    # "person", the dominant class).  Order defines the order of add_pic_index.
    _OVERSAMPLING_RULES = (
        (5, 26, {14: 0}),
        (10, 60, {14: 1, 8: 2}),
        (0, 10, {14: 0, 5: 0, 6: 0, 8: 0}),
        (18, 15, {14: 0, 5: 0, 6: 0, 8: 0, 10: 0}),
        (19, 20, {14: 0, 8: 0, 10: 0, 0: 0, 18: 0}),
        (9, 15, {14: 0}),
        (1, 40, {14: 0}),
        (2, 6, {14: 0}),
        (3, 10, {2: 0, 14: 0}),
        (7, 10, {14: 0}),
        (11, 10, {14: 0}),
        (12, 40, {14: 0}),
        (13, 40, {14: 0, 6: 0, 10: 0}),
        (16, 10, {14: 0}),
        (17, 10, {14: 0, 15: 0, 8: 0}),
        (4, 6, {14: 0}),
        (15, 3, {14: 0}),
    )

    def __init__(self, data_file=None):
        if data_file is None:
            data_file = training_data_file

        self.each_class_count = []
        self.each_pic_count = []
        self.add_pic_index = []
        self.new = []

        class_ids = {str(c): c for c in range(self._NUM_CLASSES)}
        for line in data_file:
            line = line.strip()
            self.new.append(line)
            tokens = line.split()
            # Counts are built fresh for every line, so a line without any
            # object gets all zeros instead of reusing the previous line's counts.
            counts = [0] * self._NUM_CLASSES
            for token in tokens[5::5]:
                class_id = class_ids.get(token)
                if class_id is not None:
                    counts[class_id] += 1
            self.each_pic_count.append(counts)

        # Totals over however many lines were actually read (no fixed row count).
        totals = [0] * self._NUM_CLASSES
        for counts in self.each_pic_count:
            totals = [t + c for t, c in zip(totals, counts)]
        self.each_class_count = totals

        for target, copies, caps in self._OVERSAMPLING_RULES:
            matches = [
                idx for idx, counts in enumerate(self.each_pic_count)
                if counts[target] > 0
                and all(counts[cls] <= cap for cls, cap in caps.items())
            ]
            # np.repeat keeps copies of the same index adjacent: [i, i, ..., j, j, ...]
            self.add_pic_index.extend(np.repeat(matches, copies))

        for idx in self.add_pic_index:
            self.new.append(self.new[idx])
# %%
# a = imbalance()

# num = []
# for line in a.new:
#   line = line.strip()
#   temp = line.split()
#   for j in range(5,len(temp),5):
#     num.append(temp[j])
# #result = Counter(num)

# new_data = a.new

# with open('./train_1.txt', 'w') as f:
#     for item in new_data:
#         f.write("%s\n" % item)
#%%
@tf.function
def random_flip(image,xcenter,ycenter):
    """Randomly flip ``image`` vertically and/or horizontally and mirror the box centers.

    The flip probabilities are the module-level ``up_down_flip_p`` and
    ``left_right_flip_p``.  Box widths/heights are not passed in because a
    flip does not change them.

    Returns:
        ``(image, xcenter, ycenter)`` after the sampled flips.
    """
    up_down_outcome = tf.random.uniform([1],0,1)
    right_left_outcome = tf.random.uniform([1],0,1)

    # `grand` is IMAGE_SIZE for rows where either center is non-zero and 0
    # otherwise, so all-zero (padding) rows stay 0 after `grand - center`.
    x_0 = tf.not_equal(xcenter,0.)
    y_0 = tf.not_equal(ycenter,0.)
    grand = tf.cast(tf.where(tf.math.logical_or(x_0,y_0),IMAGE_SIZE,0),
                    tf.float32)

    if up_down_outcome<up_down_flip_p:
        image = tf.image.flip_up_down(image)
        ycenter = grand-ycenter

    if right_left_outcome<left_right_flip_p:
        image = tf.image.flip_left_right(image)
        xcenter = grand-xcenter

    return image, xcenter, ycenter

@tf.function
def to_gray(image):
    """With probability ``gray_p``, replace ``image`` by its grayscale version
    expanded back to three channels; otherwise return it unchanged."""
    draw = tf.random.uniform([1],0,1)

    if draw<gray_p:
        image = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))

    return image

@tf.function
def gaussian_noise(image):
    """With probability ``noise_p``, add zero-mean Gaussian noise (stddev 5)
    of the image's static shape; otherwise return ``image`` unchanged."""
    draw = tf.random.uniform([1],0,1)

    if draw<noise_p:
        image = tf.math.add(image, tf.random.normal(image.shape,stddev=5))

    return image

@tf.function
def brightness(image):
    """With probability ``brightness_p``, apply ``tf.image.random_brightness``
    with ``max_delta=5``; otherwise return ``image`` unchanged."""
    draw = tf.random.uniform([1],0,1)

    if draw<brightness_p:
        image = tf.image.random_brightness(image, max_delta=5)

    return image

@tf.function
def hue(image):
    """With probability ``hue_p``, apply ``tf.image.random_hue`` with
    ``max_delta=0.5``; otherwise return ``image`` unchanged."""
    draw = tf.random.uniform([1], 0, 1)

    if draw < hue_p:
        image = tf.image.random_hue(image, max_delta=0.5)

    return image

@tf.function
def saturation(image):
    """With probability ``saturation_p``, apply ``tf.image.random_saturation``
    with a factor drawn from [0, 1.5]; otherwise return ``image`` unchanged."""
    draw = tf.random.uniform([1], 0, 1)

    if draw < saturation_p:
        image = tf.image.random_saturation(image, lower=0, upper=1.5)

    return image
# %%
# data augumentation parameter
# Probabilities of the vertical / horizontal flip in random_flip.
up_down_flip_p = 0.05
left_right_flip_p = 0.5

# data_aug augments a sample only when its uniform draw exceeds theSame,
# i.e. theSame is the probability of leaving the sample unchanged.
theSame = 0.4
# Probabilities given to iaa.Sometimes in imgaug_trans (crop/pad, geometric group).
crop_p = 0.6
geom_p = 0.8
# Probabilities used by the brightness / hue / saturation helpers.
brightness_p = 0.3
hue_p = 0.8
saturation_p = 0.8
# Probability of the contrast / channel-shuffle group in imgaug_trans.
contrast = 0.3
# Probabilities used by to_gray and gaussian_noise.
gray_p = 0.1
noise_p = 0.4
# Probability of the Gaussian blur in imgaug_trans.
blur = 0.2

# Parameter ranges handed to iaa.Affine / iaa.CropAndPad in imgaug_trans.
rotate_range=(-45, 45)
scale_range=(0.2, 1.2)
translate_range=(-0.2, 0.2)
shear_range=(-20, 20)
crop_pad_range=(-0.2, 0.2)
#%%
def imgaug_trans(image,labels):
    """Apply imgaug geometric/contrast/blur augmentation to an image and its boxes.

    Runs eagerly (it calls ``.numpy()``), so it is meant to be wrapped in
    ``tf.py_function``.

    Args:
        image: eager image tensor.
        labels: eager tensor whose rows are ``[x_center, y_center, w, h, class]``;
            rows with a zero x-center are treated as padding.

    Returns:
        ``(image_aug, output)`` where ``output`` has the same shape as
        ``labels``: the boxes that survive the augmentation are written,
        clipped to the image, into the first rows and the remaining rows are zero.
    """
    # Number of real boxes = number of rows with a non-zero x-center.
    n = tf.math.count_nonzero(labels[:,0]).numpy()
    image = image.numpy()
    labels = labels.numpy()
    output = np.zeros_like(labels)

    center_x = labels[:,0]
    center_y = labels[:,1]
    w_half = labels[:,2] / 2
    h_half = labels[:,3] / 2

    # Convert center/size boxes to the corner format imgaug expects.
    tempbb = [BoundingBox(x1=center_x[i] - w_half[i], y1=center_y[i] - h_half[i],
                          x2=center_x[i] + w_half[i], y2=center_y[i] + h_half[i]) for i in range(n)]
    bbs = BoundingBoxesOnImage(tempbb, shape=image.shape)

    # Each stage is applied with its own probability (crop_p, geom_p, contrast, blur).
    seq = iaa.Sequential(
      [
       iaa.Sometimes(crop_p,iaa.CropAndPad(percent=(crop_pad_range[0], crop_pad_range[1]))),
       iaa.Sometimes(geom_p,
                     iaa.SomeOf((1, 5),[
                                       iaa.Affine(translate_percent={"x":(translate_range[0], translate_range[1])}),
                                       iaa.Affine(translate_percent={"y":(translate_range[0], translate_range[1])}),
                                       iaa.Affine(scale=(scale_range[0], scale_range[1])),
                                       iaa.Affine(rotate=(rotate_range[0], rotate_range[1])),
                                       iaa.Affine(shear=(shear_range[0],shear_range[1]))
                                       ],
                                random_order=True)
                     ),
       iaa.Sometimes(contrast,
                     iaa.OneOf([
                                iaa.contrast.LinearContrast(alpha=(1.25, 1.5),per_channel=True),
                                iaa.contrast.LinearContrast(alpha=(0.25, 0.5),per_channel=True),
                                iaa.contrast.LinearContrast(alpha=(0.25, 0.5)),
                                iaa.contrast.LinearContrast(alpha=(1.25, 1.5)),
                                iaa.ChannelShuffle()
                                ]
                               )
                     ),
       iaa.Sometimes(blur, iaa.GaussianBlur(sigma=(0.1,3)))
    ])

    image_aug, bbs_aug = seq(image=image, bounding_boxes=bbs)

    # theIdx = original indices of the boxes kept by remove_out_of_image(),
    # needed to carry each surviving box's class id over.
    if len(bbs_aug.remove_out_of_image().bounding_boxes)==n:
        theIdx = [i for i in range(n)]
    else:
        # NOTE(review): membership relies on remove_out_of_image() returning the
        # same BoundingBox objects it was given -- confirm against imgaug.
        set_bb = set(bbs_aug.remove_out_of_image().bounding_boxes)
        theIdx = [i for i in range(n) if bbs_aug.bounding_boxes[i] in set_bb]

    remain_labels = labels[theIdx, 4]
    clip_bbs = bbs_aug.remove_out_of_image().clip_out_of_image().bounding_boxes
    # Write the surviving boxes back in center/size format, compacted at the top.
    for i in range(len(theIdx)):
        theBox = clip_bbs[i]
        output[i, 0] = (theBox.x1 + theBox.x2) / 2 # x center
        output[i, 1] = (theBox.y1 + theBox.y2) / 2 # y center
        output[i, 2] = (theBox.x2 - theBox.x1) # w
        output[i, 3] = (theBox.y2 - theBox.y1) # h
        output[i, 4] = remain_labels[i]

    return image_aug, output
#%%
@tf.function
def data_aug(image,labels):
    """Randomly augment one sample.

    A uniform draw is compared with ``theSame``: when the draw is larger the
    sample goes through noise, the imgaug transform and the colour helpers
    (each of which applies itself with its own probability); otherwise the
    sample is returned unchanged.

    Returns:
        ``(image, labels)``.
    """
    same_sample = tf.random.uniform([1],0,1)
    if same_sample > theSame:
        # not the same
        image = gaussian_noise(image)
        # imgaug_trans uses .numpy() and imgaug, so it is run through tf.py_function.
        image, labels = tf.py_function(
            func=imgaug_trans,
            inp=[image, labels],
            Tout=[tf.float32, tf.float32]
            )
        image = brightness(image)
        image = hue(image)
        image = saturation(image)
        image = to_gray(image)

    return image, labels
#%%
class DatasetGenerator:
    """
    Read the annotation list at DATA_PATH and create an input pipeline ready
    to be fed into a model.
    - pads / truncates every record to MAX_OBJECTS_PER_IMAGE boxes
    - resizes images to IMAGE_SIZE x IMAGE_SIZE and rescales the boxes
    - applies random flips and data_aug augmentation
    - applies tf.keras.applications.resnet.preprocess_input
    - shuffles the input and builds batches
    """

    def __init__(self):
        self.image_names = []
        self.record_list = []
        self.object_num_list = []
        # filling the record_list; the context manager guarantees the
        # annotation file is closed once it has been read
        with open(DATA_PATH, 'r') as input_file:
            for line in input_file:
                line = line.strip()
                ss = line.split(' ')

                self.image_names.append(ss[0])
                record = [float(num) for num in ss[1:]]
                self.object_num_list.append(min(len(record)//5, MAX_OBJECTS_PER_IMAGE))

                # pad with all-zero boxes, or truncate, to MAX_OBJECTS_PER_IMAGE boxes
                if len(record) < MAX_OBJECTS_PER_IMAGE*5:
                    record = record +\
                    [0., 0., 0., 0., 0.]*\
                    (MAX_OBJECTS_PER_IMAGE-len(record)//5)
                elif len(record) > MAX_OBJECTS_PER_IMAGE*5:
                    record = record[:MAX_OBJECTS_PER_IMAGE*5]
                self.record_list.append(record)

        ## shuffle the three parallel lists with one shared permutation
        idx = random.sample(range(len(self.image_names)), len(self.image_names))
        self.image_names = [self.image_names[i] for i in idx]
        self.record_list = [self.record_list[i] for i in idx]
        self.object_num_list = [self.object_num_list[i] for i in idx]

    def _data_preprocess(self, image_name, raw_labels, object_num):
        """Load one image, resize it, convert its boxes and augment the sample.

        Returns ``(image, labels, object_num)`` where each row of ``labels`` is
        ``[x_center, y_center, w, h, class]`` in resized-image pixels.
        """
        image_file = tf.io.read_file(IMAGE_DIR+image_name)
        image = tf.io.decode_jpeg(image_file, channels=3)

        h = tf.shape(image)[0]
        w = tf.shape(image)[1]

        # scale factors from original pixels to the resized image
        width_rate = IMAGE_SIZE * 1.0 / tf.cast(w, tf.float32) 
        height_rate = IMAGE_SIZE * 1.0 / tf.cast(h, tf.float32) 

        image = tf.image.resize(image, size=[IMAGE_SIZE, IMAGE_SIZE])

        raw_labels = tf.cast(tf.reshape(raw_labels, [-1, 5]), tf.float32)

        xmin = raw_labels[:, 0]
        ymin = raw_labels[:, 1]
        xmax = raw_labels[:, 2]
        ymax = raw_labels[:, 3]
        class_num = raw_labels[:, 4]

        xcenter = (xmin + xmax) * 1.0 / 2.0 * width_rate
        ycenter = (ymin + ymax) * 1.0 / 2.0 * height_rate

        box_w = (xmax - xmin) * width_rate
        box_h = (ymax - ymin) * height_rate

        image, xcenter, ycenter = random_flip(image, xcenter, ycenter)

        labels = tf.stack([xcenter, ycenter, box_w, box_h, class_num], axis = 1)

        image, labels = data_aug(image, labels)
        image = tf.keras.applications.resnet.preprocess_input(image)

        return image, labels, tf.cast(object_num, tf.int32)

    def generate(self):
        """Return the mapped, shuffled (buffer 1000), batched and prefetched dataset."""
        dataset = tf.data.Dataset.from_tensor_slices((self.image_names, 
                                                      np.array(self.record_list), 
                                                      np.array(self.object_num_list)))
        dataset = dataset.map(self._data_preprocess, num_parallel_calls = tf.data.experimental.AUTOTUNE)
        dataset = dataset.shuffle(1000)
        dataset = dataset.batch(BATCH_SIZE)
        
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        return dataset


In [None]:
#%%
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cv2
#%%
# Settings for the cut-and-paste data synthesis cell.
# Original (un-oversampled) annotation list and the directory prefixed to image names.
DATA_PATH = './pascal_voc_training_data.txt'
IMAGE_DIR = './VOCdevkit_train/VOC2007/JPEGImages/'
# NOTE(review): IMAGE_SIZE, BATCH_SIZE and NUM_CLASSES are not read by the
# synthesis code shown below -- confirm whether they are still needed here.
IMAGE_SIZE = 224
BATCH_SIZE = 6
NUM_CLASSES = 20
# Each annotation record is zero-padded or truncated to this many boxes.
MAX_OBJECTS_PER_IMAGE = 20

#%%
image_names = []
record_list = []
object_num_list = []
#%%
# filling the record_list; the context manager closes the annotation file
# as soon as every line has been parsed
with open(DATA_PATH, 'r') as input_file:
    for line in input_file:
        line = line.strip()
        ss = line.split(' ')
        image_names.append(ss[0])
        record_list.append([float(num) for num in ss[1:]])

        object_num_list.append(min(len(record_list[-1])//5, 
                                        MAX_OBJECTS_PER_IMAGE))
        # pad with all-zero boxes, or truncate, to MAX_OBJECTS_PER_IMAGE boxes
        if len(record_list[-1]) < MAX_OBJECTS_PER_IMAGE*5:
            record_list[-1] = record_list[-1] +\
            [0., 0., 0., 0., 0.]*\
            (MAX_OBJECTS_PER_IMAGE-len(record_list[-1])//5)

        elif len(record_list[-1]) > MAX_OBJECTS_PER_IMAGE*5:
            record_list[-1] = record_list[-1][:MAX_OBJECTS_PER_IMAGE*5]
#%%
# Decode every image at its native resolution and keep it, together with its
# (MAX_OBJECTS_PER_IMAGE, 5) label tensor and object count, in memory.
# Label rows are [xmin, ymin, xmax, ymax, class] in original-image pixels.
image_collect = []
label_collect = []
object_num_collect = []
for image_name,raw_labels,obj_n in zip(image_names,record_list,object_num_list):
    image_file = tf.io.read_file(IMAGE_DIR+image_name)
    image = tf.io.decode_jpeg(image_file, channels=3)
    raw_labels = tf.cast(tf.reshape(raw_labels, [-1, 5]), tf.float32)
    image_collect.append(image)
    label_collect.append(raw_labels)
    object_num_collect.append(obj_n)

#%%
def sample_cut(image_collect,label_collect,object_num_collect):
    """Cut one object of a uniformly chosen class out of a random image.

    Args:
        image_collect: sequence of image tensors (objects with ``.numpy()``).
        label_collect: sequence of label tensors, rows ``[xmin, ymin, xmax, ymax, class]``.
        object_num_collect: number of real (non-padding) rows per image.

    Returns:
        ``(sub_obj, label)``: the cropped pixels and the int32 label row of the
        first object of the chosen class in the sampled image.

    Note:
        Images are redrawn until one contains the chosen class, so this does
        not terminate if no image contains it.
    """
    # choose the wanted class uniformly among the 20 classes
    p = [1/20 for i in range(20)]
    wanted = np.where(np.random.multinomial(1,p, size=None))[0][0]

    # randint's upper bound is exclusive, so len(...) covers every image
    # regardless of the dataset size
    n_images = len(label_collect)
    while True:
        random_object = np.random.randint(0, n_images)
        cut_obj_n = object_num_collect[random_object]
        # only the real rows; padding rows would look like class 0
        cut_label = label_collect[random_object].numpy()[0:cut_obj_n]
        if wanted in cut_label[:,4]:
            break
    cut_img = image_collect[random_object].numpy()

    # xmin, ymin, xmax, ymax, class of the first matching object
    label = cut_label[wanted==cut_label[:,4]].astype("int32")[0]
    # crop the object out of the image
    sub_obj = cut_img[label[1]:label[3],label[0]:label[2]]
    return sub_obj,label
def sample_paste(image_collect,label_collect,object_num_collect):
    """Sample the image an object will be pasted onto.

    Images are redrawn until one is found that contains no class-14 object
    and still has a free label row for the pasted object.

    Returns:
        ``(paste_img, paste_label, paste_obj_n)``: the image array, its full
        padded label array and its number of real objects.

    Note:
        Does not terminate if no image satisfies both conditions.
    """
    # randint's upper bound is exclusive, so len(...) covers every image
    n_images = len(label_collect)
    while True:
        random_paste = np.random.randint(0, n_images)
        paste_label = label_collect[random_paste].numpy()
        paste_obj_n = object_num_collect[random_paste]
        has_class_14 = 14 in paste_label[:,4]
        # the label array is padded to a fixed number of rows, so "full" must
        # be judged from the object count, not from the array shape
        is_full = paste_obj_n >= paste_label.shape[0]
        if not has_class_14 and not is_full:
            break
    paste_img = image_collect[random_paste].numpy()
    return paste_img,paste_label,paste_obj_n
def calculate_area(label):
    """Return the area of a box given as ``(xmin, ymin, xmax, ymax, ...)``."""
    x_extent = label[2]-label[0]
    y_extent = label[3]-label[1]
    return x_extent*y_extent
def max_label_sample(paste_label,paste_obj_n,p_1,p_2):
    """Sample a top-left corner for the pasted object relative to the largest box.

    The largest of the first ``paste_obj_n`` boxes (by ``calculate_area``) is
    located; the horizontal coordinate is drawn from whichever side of that
    box has more room, and the vertical coordinate is drawn as coded below.

    Args:
        paste_label: label array, rows ``[xmin, ymin, xmax, ymax, class]``.
        paste_obj_n: number of real rows in ``paste_label``.
        p_1: upper bound used for the vertical coordinate (image height).
        p_2: upper bound used for the horizontal coordinate (image width).

    Returns:
        ``(top_left_1, top_left_2)`` = (vertical, horizontal) coordinate.
    """
    max_pos = 0
    if paste_obj_n>1:
        area = [calculate_area(paste_label[row,0:4]) for row in range(paste_obj_n)]
        # first index holding the maximal area
        max_pos = np.where(area==np.max(area))[0][0]

    xmin, ymin, xmax, ymax = (paste_label[max_pos,col] for col in range(4))

    # horizontal: left of the box when there is more room there, else right of it
    room_left, room_right = xmin-0, p_2-xmax
    if room_left>room_right:
        top_left_2 = np.random.randint(0,xmin)
    else:
        top_left_2 = np.random.randint(xmax,p_2)

    # vertical: above the box when there is more room there, else anywhere.
    # NOTE(review): unlike the horizontal case, the fallback samples from
    # [0, p_1) rather than [ymax, p_1) -- confirm this asymmetry is intended.
    room_above, room_below = ymin-0, p_1-ymax
    if room_above>room_below:
        top_left_1 = np.random.randint(0,ymin)
    else:
        top_left_1 = np.random.randint(0,p_1)

    return top_left_1,top_left_2

def iou(boxes1,boxes2):
    """Return the fraction of ``boxes2``'s area that is covered by ``boxes1``.

    Despite its name this is not a symmetric intersection-over-union: the
    intersection area is divided by the area of ``boxes2`` only.

    Args:
        boxes1: one box as ``(xmin, ymin, xmax, ymax)``.
        boxes2: one box as ``(xmin, ymin, xmax, ymax)``.

    Returns:
        ``intersection_area / area(boxes2)``; 0 when the boxes do not overlap.
    """
    # top-left and bottom-right corners of the overlap rectangle
    lu = np.maximum(boxes1[0:2], boxes2[0:2])
    rd = np.minimum(boxes1[2:], boxes2[2:])

    # overlap extent along x and y; a non-positive value means no overlap
    intersection = rd - lu
    mask = float(intersection[0] > 0 and intersection[1] > 0)

    # the mask zeroes the product when both extents are negative (which
    # would otherwise multiply to a positive "area")
    inter_square = mask * (intersection[0] * intersection[1])

    square2 = (boxes2[2] - boxes2[0]) * (boxes2[3] - boxes2[1])

    return inter_square/square2
def iou_procedure(new_sub_1,new_sub_2,top_left_1,top_left_2,p_1,p_2,paste_label,paste_obj_n):
    """Find a paste position inside the image and score its overlap with existing boxes.

    While the object placed at ``(top_left_1, top_left_2)`` would extend past
    the image bounds, a new corner is sampled with ``max_label_sample``; on
    every fifth retry the object size is shrunk by 10%.

    Args:
        new_sub_1, new_sub_2: object extent along the vertical / horizontal axis.
        top_left_1, top_left_2: initial vertical / horizontal corner coordinate.
        p_1, p_2: image extent along the vertical / horizontal axis.
        paste_label: label array, rows ``[xmin, ymin, xmax, ymax, class]``.
        paste_obj_n: number of real rows in ``paste_label``.

    Returns:
        ``(iou_out, new_pos)``: the ``iou`` score of the new box against each
        existing box, and the new box as ``[xmin, ymin, xmax, ymax]``.  After
        more than 1000 failed retries the sentinel ``([5], [0, 0, 0, 0])`` is
        returned so that callers thresholding the score reject the placement.
    """
    attempts = 0
    while (new_sub_1+top_left_1)>p_1 or (new_sub_2+top_left_2)>p_2:
        if attempts%5==4:
            new_sub_1,new_sub_2 = new_sub_1*(0.9),new_sub_2*(0.9)
        top_left_1,top_left_2 = max_label_sample(paste_label,paste_obj_n,p_1,p_2)
        print(attempts)
        attempts += 1
        if attempts>1000:
            # give up: return the sentinel directly so it is not overwritten
            # by a box computed from an out-of-bounds position
            return np.array([5]),np.array([0.,0.,0.,0.])

    new_pos = np.array([top_left_2,top_left_1,top_left_2+new_sub_2,top_left_1+new_sub_1],dtype=float)
    iou_out = [iou(new_pos,paste_label[row,:4]) for row in range(paste_obj_n)]
    return np.array(iou_out),new_pos
def add_data(image_collect,label_collect,object_num_collect):
    """Propose one cut-and-paste sample: an object, a target image and a position.

    Returns:
        ``(iou_out, new_pos, area, label, paste_img, paste_label, paste_obj_n, sub_obj)``
        where ``iou_out`` / ``new_pos`` come from ``iou_procedure``, ``area``
        is the area of ``new_pos``, ``label`` / ``sub_obj`` describe the cut
        object and the ``paste_*`` values describe the target image.  The
        caller decides whether to accept the proposal.
    """
    # sample cut
    sub_obj,label = sample_cut(image_collect,label_collect,object_num_collect)
    sub_1,sub_2,_ = sub_obj.shape
    print(1)
    # sample paste
    paste_img,paste_label,paste_obj_n = sample_paste(image_collect,label_collect,object_num_collect)
    p_1,p_2,_ = paste_img.shape
    print(2)
    # An object nearly as large as the target image is halved first; the
    # halved size is then used as the base for the random rescale (previously
    # the halved values were discarded and the full size was rescaled).
    if (sub_1>p_1-20) or (sub_2>p_2-20):
        sub_1,sub_2 = sub_1*0.5,sub_2*0.5
    # scale height and width by one randomly sampled ratio and record the new size
    ratio = np.random.uniform(0.5,1.5,1)[0]
    new_sub_1,new_sub_2 = np.around(sub_1*ratio,0).astype('int32'),np.around(sub_2*ratio,0).astype('int32')
    top_left_1,top_left_2 = max_label_sample(paste_label,paste_obj_n,p_1,p_2)
    print(3)
    # overlap scores against the existing boxes (the caller thresholds them at 0.2)
    iou_out,new_pos = iou_procedure(new_sub_1,new_sub_2,top_left_1,top_left_2,p_1,p_2,paste_label,paste_obj_n)
    area = calculate_area(new_pos)
    return iou_out,new_pos,area,label,paste_img,paste_label,paste_obj_n,sub_obj
# %%

# Generate 5000 synthetic images: each pastes one cut-out object onto another
# image, writes the image to disk and appends its annotation line to
# somefile.txt.  The context manager closes the file even if an iteration fails.
with open('somefile.txt', 'w') as f:
    for k in range(5000):
        proposal = add_data(image_collect,label_collect,object_num_collect)
        print(4)
        while True:
            iou_out,new_pos,area,label,paste_img,paste_label,paste_obj_n,sub_obj = proposal
            # The minimum pasted area is 1/49 of the *current* target image,
            # so it is recomputed for every proposal.
            h,w,_ = paste_img.shape
            thd = w*h/49
            # accept only when no existing box is covered by 0.2 or more and
            # the pasted object is large enough
            if not (np.sum(iou_out>=0.2)!=0 or area<thd):
                break
            proposal = add_data(image_collect,label_collect,object_num_collect)

        # append the pasted object's label row (class kept, box replaced)
        label[0:4]=new_pos
        paste_label[paste_obj_n]=label
        paste_obj_n+=1
        new_pos = new_pos.astype('int32')
        # cv2.resize takes (width, height)
        sub_obj = cv2.resize(sub_obj, (new_pos[2]-new_pos[0], new_pos[3]-new_pos[1]), interpolation=cv2.INTER_CUBIC)
        paste_img[new_pos[1]:new_pos[3],new_pos[0]:new_pos[2]]=sub_obj
        words = np.reshape(paste_label[:paste_obj_n].astype('int32'),[5*paste_obj_n]).astype("str").tolist()

        f.write(str(k)+".jpg "+" ".join(words)+"\n")  # python will convert \n to os.linesep
        paste_img = cv2.cvtColor(paste_img, cv2.COLOR_RGB2BGR) 
        # No trailing space after ".jpg": the file name must match the name
        # written to the annotation line above.
        cv2.imwrite("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_2\\img_file\\"+str(k)+".jpg",paste_img)
# %%


# 二、 Model Building
做完Data preprocessing後，接下來我們嘗試用各種model建模，並切validation data來判斷模型訓練過程當中的好壞。

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np 
# Names of the 20 object classes. Presumably the list index is the integer
# class id used in the annotation files -- TODO confirm.
classes_name =  ["aeroplane", "bicycle", "bird", "boat", "bottle", 
                 "bus", "car", "cat", "chair", "cow", "diningtable", 
                 "dog", "horse", "motorbike", "person", "pottedplant", 
                 "sheep", "sofa", "train","tvmonitor"]

import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
# GPU setup: enable on-demand memory growth and expose a single GPU to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Make only the GPU at index 5 of the physical device list visible.
        # NOTE(review): with fewer than six GPUs, gpus[5] raises IndexError,
        # which the RuntimeError handler below does not catch.
        tf.config.experimental.set_visible_devices(gpus[5], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


In [None]:
# common params
# Images are resized to IMAGE_SIZE x IMAGE_SIZE by the input pipeline.
IMAGE_SIZE = 448
BATCH_SIZE = 100
VAL_BATCH_SIZE = 200
NUM_CLASSES = 20
# Each annotation record is zero-padded or truncated to this many boxes.
MAX_OBJECTS_PER_IMAGE = 42

# dataset params
DATA_PATH = './train_1.txt'
IMAGE_DIR = './VOCdevkit_train/VOC2007/JPEGImages/'

# model params
CELL_SIZE = 7
BOXES_PER_CELL = 2
OBJECT_SCALE = 1
NOOBJECT_SCALE = 0.5
CLASS_SCALE = 1
COORD_SCALE = 5
# Supported backbones; pretrain_model selects the one used for both the
# input preprocessing and the base model.
pretrain_model_list =  ['EfficientNetB0','Xception','Resnet','DenseNet121']
# Index the list of names (indexing `pretrain_model` itself raised NameError
# because it was not defined yet).
pretrain_model = pretrain_model_list[0]
# training params
LEARNING_RATE = 1e-5
EPOCHS = 40


In [None]:
def image_flip_and_coordinate(image,labels,num_classes):
    """Randomly (when a uniform draw is >= 0.4) flip the image left-right and
    mirror the x-centers of its boxes.

    Args:
        image: image tensor with a static ``(height, width, channels)`` shape.
        labels: ``(MAX_OBJECTS_PER_IMAGE, 5)`` tensor, rows
            ``[x_center, y_center, w, h, class]``.
        num_classes: despite its name, the number of real (non-padding) rows
            in ``labels``; only those rows are mirrored, padding rows get x = 0.

    Returns:
        ``(image, labels)``, flipped or unchanged.
    """
    if tf.random.uniform([1],0,1)>=0.4:
        # The static shape is (height, width, channels); a horizontal flip
        # mirrors x around the image *width*.
        _,w,_ = image.shape
        x_center = tf.concat([w-labels[0:(num_classes),0],tf.zeros(MAX_OBJECTS_PER_IMAGE-num_classes)],axis=0)
        labels = tf.stack([x_center,labels[:,1],labels[:,2],labels[:,3],labels[:,4]],
                            axis=1)
        image = tf.image.flip_left_right(image)
    return  image,labels
def random_noise(image,k):
    """Multiply randomly placed square patches of ``image`` by Gaussian noise.

    A random number of patches (an integer drawn with minval 1 and maxval k)
    is processed; for each one a mask of ones containing a block of
    normal(0, 0.8) values is built and multiplied into the image.

    NOTE(review): the grid step is the literal ``448//k`` while the noise block
    is ``(w//k, h//k)``, so this presumably assumes a 448x448 input -- confirm.
    """
    num = tf.random.uniform([1],1,k,tf.dtypes.int32)[0]
    h,w,_ = image.shape
    # grid line positions along one axis, spaced 448//k apart
    coordinate = tf.range(0,h+1,448//k)
    for j in range(num):
        # grid cell indices of the patch along the two axes
        w_loc = tf.random.uniform([1],1,k,tf.dtypes.int32)[0]
        h_loc = tf.random.uniform([1],1,k,tf.dtypes.int32)[0]
        # step 1: scatter the noise block into a strip of ones along the first axis
        indices = tf.reshape(tf.range(coordinate[w_loc-1],coordinate[w_loc]),[coordinate[w_loc]-coordinate[w_loc-1],1])
        noise = tf.random.normal((w//k,h//k,3),0,0.8)
        tensor = tf.ones([w,h//k,3],dtype=tf.dtypes.float32)
        noise = tf.tensor_scatter_nd_update(tensor, indices, noise)
        noise = tf.transpose(noise,[1,0,2])
        # step 2: scatter that strip into a full-size mask of ones
        indices = tf.reshape(tf.range(coordinate[h_loc-1],coordinate[h_loc]),[coordinate[h_loc]-coordinate[h_loc-1],1])
        tensor = tf.ones([w,h,3],dtype=tf.dtypes.float32)
        noise = tf.tensor_scatter_nd_update(tensor, indices, noise)
        noise = tf.transpose(noise,[1,0,2])
        # pixels outside the patch are multiplied by 1 and stay unchanged
        image = image*noise
    return(image)


In [None]:
class DatasetGenerator:
    """
    Read the annotation list at DATA_PATH, split it into training and
    validation parts and create the input pipelines.
    - pads / truncates every record to MAX_OBJECTS_PER_IMAGE boxes
    - resizes images to IMAGE_SIZE x IMAGE_SIZE and rescales the boxes
    - applies the preprocess_input of the backbone named by pretrain_model
    - training only: colour augmentation, random noise and random flip
    - shuffles the training input and builds batches
    """

    def __init__(self,test_size=0.2,random_state=0):
        image_names = []
        record_list = []
        object_num_list = []
        # filling the record_list; the context manager guarantees the
        # annotation file is closed once it has been read
        with open(DATA_PATH, 'r') as input_file:
            for line in input_file:
                line = line.strip()
                ss = line.split(' ')
                image_names.append(ss[0])

                record = [float(num) for num in ss[1:]]

                object_num_list.append(min(len(record)//5, 
                                                MAX_OBJECTS_PER_IMAGE))
                if len(record) < MAX_OBJECTS_PER_IMAGE*5:
                    # if there are objects less than MAX_OBJECTS_PER_IMAGE, pad the list
                    record = record + [0., 0., 0., 0., 0.]*(MAX_OBJECTS_PER_IMAGE-len(record)//5)
                elif len(record) > MAX_OBJECTS_PER_IMAGE*5:
                    # if there are objects more than MAX_OBJECTS_PER_IMAGE, crop the list
                    record = record[:MAX_OBJECTS_PER_IMAGE*5]
                record_list.append(record)

        # one shared split keeps names, records and object counts aligned
        (self.image_names_train, self.image_names_val,
         self.record_list_train, self.record_list_val,
         self.object_num_list_train, self.object_num_list_val) = train_test_split(
            image_names, record_list, object_num_list,
            test_size=test_size, random_state=random_state)

    def _data_preprocess(self, image_name, raw_labels, object_num):
        """Load one image, resize and preprocess it, and convert its boxes.

        Returns ``(image, labels, object_num)`` where each row of ``labels`` is
        ``[x_center, y_center, w, h, class]`` in resized-image pixels.
        """
        image_file = tf.io.read_file(IMAGE_DIR+image_name)
        image = tf.io.decode_jpeg(image_file, channels=3)

        h = tf.shape(image)[0]
        w = tf.shape(image)[1]

        # scale factors from original pixels to the resized image
        width_ratio  = IMAGE_SIZE * 1.0 / tf.cast(w, tf.float32) 
        height_ratio = IMAGE_SIZE * 1.0 / tf.cast(h, tf.float32) 

        image = tf.image.resize(image, size=[IMAGE_SIZE, IMAGE_SIZE])
        # image = (image/255) * 2 - 1
        # backbone-specific input normalisation; an unlisted name leaves the
        # resized image as it is
        if pretrain_model == 'Resnet':
            image = tf.keras.applications.resnet.preprocess_input(image)
        elif pretrain_model == 'Xception':
            image = tf.keras.applications.xception.preprocess_input(image)
        elif pretrain_model == 'EfficientNetB0':
            image = tf.keras.applications.efficientnet.preprocess_input(image)
        elif pretrain_model == 'DenseNet121':
            image = tf.keras.applications.densenet.preprocess_input(image)
        raw_labels = tf.cast(tf.reshape(raw_labels, [-1, 5]), tf.float32)

        xmin = raw_labels[:, 0]
        ymin = raw_labels[:, 1]
        xmax = raw_labels[:, 2]
        ymax = raw_labels[:, 3]
        class_num = raw_labels[:, 4]

        xcenter = (xmin + xmax) * 1.0 / 2.0 * width_ratio
        ycenter = (ymin + ymax) * 1.0 / 2.0 * height_ratio

        box_w = (xmax - xmin) * width_ratio
        box_h = (ymax - ymin) * height_ratio

        labels = tf.stack([xcenter, ycenter, box_w, box_h, class_num], axis=1)

        return image, labels, tf.cast(object_num, tf.int32)

    def _map_fun(self,image,labels,object_num):
        """Training-time augmentation: colour jitter, random noise, random flip.

        NOTE(review): these run on the output of preprocess_input, i.e. after
        the backbone-specific normalisation -- confirm this order is intended.
        """
        image= tf.image.random_brightness(image, max_delta=2)
        image = tf.image.random_contrast(image, lower=0.4, upper=2)
        image = tf.image.random_hue(image,max_delta=0.2)
        image = tf.image.random_saturation(image,lower=0,upper=1)
        image = random_noise(image,k=10)
        image,labels = image_flip_and_coordinate(image,labels,object_num)
        
        return image,labels,object_num

    def generate(self):
        """Return the shuffled, augmented, batched and prefetched training dataset."""
        
        # train data
        train_dataset = tf.data.Dataset.from_tensor_slices((self.image_names_train, 
                                                      np.array(self.record_list_train), 
                                                      np.array(self.object_num_list_train)))
        # shuffle file names before loading so the buffer holds no images
        train_dataset = train_dataset.shuffle(100000)
            
        train_dataset = train_dataset.map(self._data_preprocess, 
                              num_parallel_calls = tf.data.experimental.AUTOTUNE)
        train_dataset = train_dataset.map(self._map_fun, 
                              num_parallel_calls = tf.data.experimental.AUTOTUNE)

        # second argument: drop_remainder=True
        train_dataset = train_dataset.batch(BATCH_SIZE,True)
        train_dataset = train_dataset.prefetch(buffer_size=1000)

        return train_dataset

    def generate_val(self):
        """Return the batched and prefetched validation dataset (no augmentation)."""

        val_dataset = tf.data.Dataset.from_tensor_slices((self.image_names_val, 
                                                      np.array(self.record_list_val), 
                                                      np.array(self.object_num_list_val)))
        val_dataset = val_dataset.map(self._data_preprocess, 
                              num_parallel_calls = tf.data.experimental.AUTOTUNE)

        # second argument: drop_remainder=True
        val_dataset = val_dataset.batch(VAL_BATCH_SIZE,True)

        val_dataset = val_dataset.prefetch(buffer_size=1000)
        return val_dataset

## (一)、Pre-trained model 
* 我們曾經嘗試不同的 pre-trained model，包含 ResNet152、Xception、EfficientNetB0、DenseNet121。
* 由於時間因素，我們暫時將 pre-trained model 的 layers 凍結住。雖然 pre-trained model 中的係數並不是針對 YOLO 做最佳化的，我們目前只將 pre-trained model 作為類似 feature 提取（feature extraction）的方式，然後對我們自己設計的 YOLO 模型做最佳化。

In [None]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications import ResNet152,Xception,EfficientNetB0,DenseNet121

def conv_leaky_relu(inputs, filters, size, stride):
    """Apply a same-padded 2-D convolution followed by LeakyReLU(0.1).

    Args:
        inputs: input tensor passed to the convolution.
        filters: number of convolution filters.
        size: convolution kernel size.
        stride: convolution stride.

    Returns:
        The activated feature map.
    """
    convolution = layers.Conv2D(
        filters, size, stride, padding="same",
        kernel_initializer=tf.keras.initializers.TruncatedNormal())
    activation = layers.LeakyReLU(0.1)
    return activation(convolution(inputs))

# Map every supported `pretrain_model` value to its Keras application
# constructor so an unsupported name fails here with a clear message instead
# of leaving `base_model` undefined and raising a NameError further down.
_BACKBONE_BUILDERS = {
    'Resnet': ResNet152,
    'Xception': Xception,
    'EfficientNetB0': EfficientNetB0,
    'DenseNet121': DenseNet121,
}

if pretrain_model not in _BACKBONE_BUILDERS:
    raise ValueError(
        "Unsupported pretrain_model %r; expected one of %s"
        % (pretrain_model, sorted(_BACKBONE_BUILDERS)))

# ImageNet weights without the classification head; no global pooling, so the
# backbone returns a spatial feature map.
base_model = _BACKBONE_BUILDERS[pretrain_model](
    include_top=False,
    weights='imagenet',
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    pooling=None)

# Freeze the backbone: only layers whose name appears in this list stay
# trainable ('No' is a placeholder, so effectively every layer is frozen).
for layer in base_model.layers:
    layer.trainable = layer.name in ['No']
        
    

## (二)、Model construction
我們在pre-trained model 後仿造paper 前三層 接上三層 Convolution+Leaky ReLU ，其中第四層開始，與助教提供的示範 code 不一樣的是，依照我們自己認為預測的重要程度 從 class_prob , object_prob 到位置 location 分別依序把多維的輸入壓扁為一維輸出(接上扁平層)後，接上全連接階層(Dense)， sequential 將計算Loss 所需要的維度作為模型的輸出。

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
class network(keras.Model):
    """YOLO-style detection head stacked on the pre-trained backbone.

    The backbone feature map goes through three Conv2D + LeakyReLU stages and
    is flattened. Three chained dense stages then produce, in order, the
    class scores, the box confidences and the box coordinates, each reshaped
    onto a 7x7 grid.
    """

    def __init__(self):
        super(network,self).__init__()
        # Backbone created at module level; stored as an attribute so it is
        # tracked (and checkpointed) as part of this model.
        self.base_model = base_model
        self.conv_1 = layers.Conv2D(1024,3,1)
        self.conv_2 = layers.Conv2D(1024,3,2)
        self.conv_3 = layers.Conv2D(1024,3,1)
        # Output layers: 98 = 7*7*2 confidences, 392 = 7*7*8 box values,
        # 980 = 7*7*20 class scores.
        self.dense_1 = layers.Dense(98,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
        self.dense_2 = layers.Dense(392,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
        self.dense_3 = layers.Dense(980,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
        # Hidden layers feeding the class, confidence and box outputs.
        self.dense_4 = layers.Dense(2048,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
        self.dense_5 = layers.Dense(2048,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
        self.dense_6 = layers.Dense(2048,kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))

    def call(self,inputs):
        """Run the forward pass.

        Args:
            inputs: batch of images accepted by the backbone.

        Returns:
            list: ``[class_prob, object_prob, bbox]`` with shapes
            ``[batch, 7, 7, 20]``, ``[batch, 7, 7, 2]`` and
            ``[batch, 7, 7, 8]``. No activation is applied to these outputs.
        """
        leaky_relu = layers.LeakyReLU(0.1)

        # Use the tracked attribute rather than the module-level global so the
        # forward pass always runs the backbone this model owns.
        x = self.base_model(inputs)
        for conv in (self.conv_1, self.conv_2, self.conv_3):
            x = leaky_relu(conv(x))
        x = layers.Flatten()(x)

        # Class scores.
        c = leaky_relu(self.dense_4(x))
        class_prob = tf.reshape(self.dense_3(c), [-1,7,7,20])

        # Box confidences, computed from the class-stage features.
        c = leaky_relu(self.dense_5(c))
        object_prob = tf.reshape(self.dense_1(c), [-1,7,7,2])

        # Box coordinates, computed from the confidence-stage features.
        c = leaky_relu(self.dense_6(c))
        bbox = tf.reshape(self.dense_2(c), [-1,7,7,8])

        return [class_prob,object_prob,bbox]

In [None]:
# Symbolic image input matching the IMAGE_SIZE x IMAGE_SIZE RGB images.
img_inputs = keras.Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))


# Instantiate the detector and call it once on the symbolic input before
# printing the summary (presumably so that all layers are built first).
YOLO = network()
outputs=YOLO(img_inputs)
YOLO.summary()

## (三)、Loss
我們有嘗試改寫yolo loss 的定義，我們認為在計算probability時，並不應該使用 L2 norm, 應該改採用 entropy 的方式計算，但在幾次試驗之後發現，如此更改Loss 會導致原先yolo loss 的比重需要做相當程度的調整，最後我們還是採用助教提供的loss 定義方式。

In [None]:
# Per-cell box origins (used by the loss and by output decoding).
# base_boxes[row, col, box] = (x of the cell's left edge, y of its top edge,
# 0, 0) in pixels: channel 0 follows the column index, channel 1 the row index.
base_boxes = np.zeros([CELL_SIZE, CELL_SIZE, 1, 4])
base_boxes[:, :, 0, 0] = IMAGE_SIZE / CELL_SIZE * np.arange(CELL_SIZE)[np.newaxis, :]
base_boxes[:, :, 0, 1] = IMAGE_SIZE / CELL_SIZE * np.arange(CELL_SIZE)[:, np.newaxis]

# Repeat the origin for every box predicted by a cell.
base_boxes = np.tile(base_boxes, [1, 1, BOXES_PER_CELL, 1])
def yolo_loss(predicts, labels, objects_num):
    """Average the YOLO loss over a batch.

    For every image the per-object loss from ``losses_calculation`` is summed
    over its ``objects_num[i]`` ground-truth objects; the total is divided by
    the number of images in the batch.

    Args:
        predicts: 4-D tensor [batch_size, cell_size, cell_size,
            num_classes + 5 * boxes_per_cell], laid out as
            (num_classes, boxes_per_cell, 4 * boxes_per_cell).
        labels: 3-D tensor [batch_size, max_objects, 5].
        objects_num: 1-D tensor [batch_size].

    Returns:
        Scalar loss tensor.
    """
    # Take the batch size from the data instead of the global BATCH_SIZE:
    # this function is also used on validation batches, whose size
    # (VAL_BATCH_SIZE) may differ.
    batch_size = tf.shape(predicts)[0]

    loss = 0.

    # NOTE: this could be parallelised with tf.map_fn or tf.vectorized_map.
    for i in tf.range(batch_size):
        predict = predicts[i, :, :, :]
        label = labels[i, :, :]
        object_num = objects_num[i]

        for j in tf.range(object_num):
            loss = loss + losses_calculation(predict, label[j:j+1, :])

    return loss / tf.cast(batch_size, tf.float32)

In [None]:
def iou(boxes1, boxes2):
    """Compute the IoU of every predicted box with one reference box.

    Args:
        boxes1: 4-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] holding
            (x_center, y_center, w, h).
        boxes2: 1-D tensor [4] holding (x_center, y_center, w, h).

    Returns:
        3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] with the IoU of
        each box in ``boxes1`` against ``boxes2``.
    """
    # Convert both inputs from centre/size to (xmin, ymin, xmax, ymax).
    centers1 = boxes1[:, :, :, 0:2]
    half_sizes1 = boxes1[:, :, :, 2:4] / 2
    corners1 = tf.concat([centers1 - half_sizes1, centers1 + half_sizes1], axis=-1)

    half_sizes2 = boxes2[2:4] / 2
    corners2 = tf.concat([boxes2[0:2] - half_sizes2, boxes2[0:2] + half_sizes2], axis=0)

    # Top-left and bottom-right corners of the overlap rectangle.
    overlap_min = tf.maximum(corners1[:, :, :, 0:2], corners2[0:2])
    overlap_max = tf.minimum(corners1[:, :, :, 2:], corners2[2:])
    overlap_wh = overlap_max - overlap_min

    # A non-positive extent along either axis means the boxes do not overlap,
    # so the overlap area is forced to zero there.
    overlaps = tf.cast(overlap_wh[:, :, :, 0] > 0, tf.float32) * \
        tf.cast(overlap_wh[:, :, :, 1] > 0, tf.float32)
    overlap_area = overlaps * (overlap_wh[:, :, :, 0] * overlap_wh[:, :, :, 1])

    area1 = (corners1[:, :, :, 2] - corners1[:, :, :, 0]) * \
        (corners1[:, :, :, 3] - corners1[:, :, :, 1])
    area2 = (corners2[2] - corners2[0]) * (corners2[3] - corners2[1])

    union_area = area1 + area2 - overlap_area
    return overlap_area / union_area

In [None]:
def losses_calculation(predict, label):
    """
    Compute the YOLO loss of one image against one ground-truth object.

    The result is the sum of four weighted L2 terms: class loss, object
    (confidence) loss, no-object loss and coordinate loss.

    Args:
      predict: 3-D tensor [cell_size, cell_size, num_classes + 5 * boxes_per_cell];
               the last axis holds the class logits, then the confidence
               logits, then 4 box values per box.
      label : [1, 5]  (x_center, y_center, w, h, class) in pixels

    Returns:
      Scalar loss tensor.
    """
    label = tf.reshape(label, [-1])

    # Step A. Build object_mask [CELL_SIZE, CELL_SIZE]: 1 for every cell
    # touched by the ground-truth box, 0 elsewhere.
    # Convert the box corners from pixel units to cell units.
    min_x = (label[0] - label[2] / 2) / (IMAGE_SIZE / CELL_SIZE)
    max_x = (label[0] + label[2] / 2) / (IMAGE_SIZE / CELL_SIZE)

    min_y = (label[1] - label[3] / 2) / (IMAGE_SIZE / CELL_SIZE)
    max_y = (label[1] + label[3] / 2) / (IMAGE_SIZE / CELL_SIZE)

    # Snap outwards to whole cells and clamp to the grid.
    min_x = tf.floor(min_x)
    min_x = tf.maximum(min_x,0)
    min_y = tf.floor(min_y)
    min_y = tf.maximum(min_y,0)
    max_x = tf.minimum(tf.math.ceil(max_x), CELL_SIZE)
    max_y = tf.minimum(tf.math.ceil(max_y), CELL_SIZE)

    # A block of ones the size of the covered region ...
    onset = tf.cast(tf.stack([max_y - min_y, max_x - min_x]), dtype=tf.int32)
    object_mask = tf.ones(onset, tf.float32)

    # ... zero-padded on every side back to the full grid.
    offset = tf.cast(tf.stack([min_y, CELL_SIZE - max_y, min_x, CELL_SIZE - max_x]), tf.int32)
    offset = tf.reshape(offset, (2, 2))
    object_mask = tf.pad(object_mask, offset, "CONSTANT")

    # Step B. Build response [CELL_SIZE, CELL_SIZE]: 1 only at the cell that
    # contains the object centre.
    # Convert the centre from pixel units to a cell index.
    center_x = label[0] / (IMAGE_SIZE / CELL_SIZE)
    center_x = tf.floor(center_x)

    center_y = label[1] / (IMAGE_SIZE / CELL_SIZE)
    center_y = tf.floor(center_y)

    response = tf.ones([1, 1], tf.float32)

    # Padding amounts that place the single 1 at (center_y, center_x).
    # NOTE(review): a centre exactly on the right/bottom image edge would
    # yield a negative padding amount - confirm labels never reach that edge.
    objects_center_coord = tf.cast(tf.stack([center_y, CELL_SIZE - center_y - 1, 
                             center_x, CELL_SIZE - center_x - 1]), 
                             tf.int32)
    objects_center_coord = tf.reshape(objects_center_coord, (2, 2))

    # Pad to the full grid.
    response = tf.pad(response, objects_center_coord, "CONSTANT")

    # Step C. Compute iou_predict_truth [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL].
    predict_boxes = predict[:, :, NUM_CLASSES + BOXES_PER_CELL:]

    predict_boxes = tf.reshape(predict_boxes, [CELL_SIZE, 
                                               CELL_SIZE, 
                                               BOXES_PER_CELL, 4])
    # Scale to pixels: x/y offsets by the cell size, w/h by the image size.
    predict_boxes = predict_boxes * [IMAGE_SIZE / CELL_SIZE, 
                                     IMAGE_SIZE / CELL_SIZE, 
                                     IMAGE_SIZE, IMAGE_SIZE]

    # Add each cell's pixel origin so the centres become absolute positions.
    predict_boxes = base_boxes + predict_boxes

    iou_predict_truth = iou(predict_boxes, label[0:4])

    # C: confidence target [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] - the IoU in
    # the responsible cell, 0 elsewhere.
    C = iou_predict_truth * tf.reshape(response, [CELL_SIZE, CELL_SIZE, 1])

    # I: starts as the same masked IoU, then becomes a 0/1 indicator below.
    I = iou_predict_truth * tf.reshape(response, [CELL_SIZE, CELL_SIZE, 1])

    max_I = tf.reduce_max(I, 2, keepdims=True)

    # Keep only the box(es) with the highest IoU inside the responsible cell.
    I = tf.cast((I >= max_I), tf.float32) * tf.reshape(response, (CELL_SIZE, CELL_SIZE, 1))

    # no_I: complement of I [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL].
    no_I = tf.ones_like(I, dtype=tf.float32) - I

    # Predicted confidences (sigmoid of the confidence logits).
    p_C = tf.nn.sigmoid(predict[:, :, NUM_CLASSES:NUM_CLASSES + BOXES_PER_CELL])
    #tf.print(p_C)
    # Ground-truth x, y, sqrt(w), sqrt(h) as scalars.
    x = label[0]
    y = label[1]

    sqrt_w = tf.sqrt(tf.abs(label[2]))
    sqrt_h = tf.sqrt(tf.abs(label[3]))

    # Predicted x, y, sqrt(w), sqrt(h) [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL].
    p_x = predict_boxes[:, :, :, 0]
    p_y = predict_boxes[:, :, :, 1]

    # Clamp w/h to [0, IMAGE_SIZE] before the square root.
    p_sqrt_w = tf.sqrt(tf.minimum(IMAGE_SIZE * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2])))
    p_sqrt_h = tf.sqrt(tf.minimum(IMAGE_SIZE * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3])))

    # One-hot ground-truth class vector [NUM_CLASSES].
    P = tf.one_hot(tf.cast(label[4], tf.int32), NUM_CLASSES, dtype=tf.float32)

    # Predicted class probabilities [CELL_SIZE, CELL_SIZE, NUM_CLASSES].
    p_P = tf.nn.sigmoid(predict[:, :, 0:NUM_CLASSES])

    # Class loss over every cell covered by the ground-truth box.
    class_loss = tf.nn.l2_loss(tf.reshape(object_mask, (CELL_SIZE, CELL_SIZE, 1)) * (p_P - P)) * CLASS_SCALE

    # Confidence loss for the responsible box (target = its IoU).
    object_loss = tf.nn.l2_loss(I * (p_C - C)) * OBJECT_SCALE

    # Confidence loss for all other boxes (target = 0).
    noobject_loss = tf.nn.l2_loss(no_I * (p_C)) * NOOBJECT_SCALE

    # Coordinate loss for the responsible box; centres are normalised by the
    # cell size, the square-root size terms by the image size.
    coord_loss = (tf.nn.l2_loss(I * (p_x - x)/(IMAGE_SIZE/CELL_SIZE)) +
                  tf.nn.l2_loss(I * (p_y - y)/(IMAGE_SIZE/CELL_SIZE)) +
                  tf.nn.l2_loss(I * (p_sqrt_w - sqrt_w))/IMAGE_SIZE +
                  tf.nn.l2_loss(I * (p_sqrt_h - sqrt_h))/IMAGE_SIZE) * COORD_SCALE

    return class_loss + object_loss + noobject_loss + coord_loss

In [None]:
# Build the training and validation pipelines.
# NOTE(review): 0.1 is presumably the validation split ratio - confirm against
# DatasetGenerator.__init__.
ddd = DatasetGenerator(0.1)
dataset = ddd.generate()
dataset_val = ddd.generate_val()
# NOTE(review): the result of take(1) is discarded, so this line has no effect.
dataset.take(1)
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
train_loss_metric = tf.keras.metrics.Mean(name='loss')

# Checkpoint the epoch counter together with the model, keeping the 10 most
# recent checkpoints under ./ckpts/<pretrain_model>/yolo-*.
ckpt = tf.train.Checkpoint(epoch=tf.Variable(0), net=YOLO)
manager = tf.train.CheckpointManager(ckpt, './ckpts/'+pretrain_model, max_to_keep=10,
                                    checkpoint_name='yolo')
# Resume from the latest checkpoint managed in that directory.
ckpt.restore(manager.latest_checkpoint)

#  ckpt.restore('./ckpts/'+pretrain_model+'/yolo-10')
valid_loss_metric = tf.keras.metrics.Mean(name='lossval')

## (四)、Epochs 
我們不僅對 Train step 觀察在梯度下降過程中的loss 下降趨勢，同時觀察validation set上loss的下降趨勢。
根據我們運算的資源和測試，在epoch設定為 40 時，可以在運算速度以及loss下降趨勢上有較好的平衡。

In [None]:
@tf.function
def train_step(image, labels, objects_num):
    """Run one optimisation step on a training batch.

    The three model outputs (class scores, confidences, boxes) are
    concatenated along the last axis, scored with ``yolo_loss``, recorded in
    ``train_loss_metric`` and used to update the model's trainable weights.

    Args:
        image: batch of input images.
        labels: ground-truth labels of the batch.
        objects_num: number of valid objects for each image in the batch.
    """
    with tf.GradientTape() as tape:
        class_probs, confs, boxes = YOLO(image)
        predicts = tf.concat([class_probs, confs, boxes], axis=3)
        loss = yolo_loss(predicts, labels, objects_num)

    train_loss_metric(loss)

    weights = YOLO.trainable_weights
    gradients = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    

@tf.function
def valid_step(image, labels, objects_num):
    """Evaluate one validation batch and record its loss.

    The model is called with ``training=False``; the three outputs are
    concatenated along the last axis, scored with ``yolo_loss`` and the
    result is accumulated in ``valid_loss_metric``. No weights are updated.

    Args:
        image: batch of input images.
        labels: ground-truth labels of the batch.
        objects_num: number of valid objects for each image in the batch.
    """
    class_probs, confs, boxes = YOLO(image, training=False)
    predicts = tf.concat([class_probs, confs, boxes], axis=3)
    valid_loss_metric(yolo_loss(predicts, labels, objects_num))

In [None]:
from datetime import datetime

print("{}, start training.".format(datetime.now()))
for i in range(EPOCHS):
    # Start every epoch with empty running means and advance the epoch
    # counter stored in the checkpoint.
    train_loss_metric.reset_states()
    valid_loss_metric.reset_states()
    ckpt.epoch.assign_add(1)
    for idx, (image, labels, objects_num) in enumerate(dataset):


        train_step(image, labels, objects_num)

        # Running mean of the training loss so far in this epoch.
        tf.print(train_loss_metric.result())
    print("{}, Epoch {}: train loss {:.2f}".format(datetime.now(), i+1, train_loss_metric.result()))
    for idx, (image, labels, objects_num) in enumerate(dataset_val):

        valid_step(image, labels, objects_num)
        # Running mean of the validation loss so far in this epoch.
        tf.print(valid_loss_metric.result())
    print("{}, Epoch {}: valid loss {:.2f}".format(datetime.now(), i+1, valid_loss_metric.result()))
    # Save a checkpoint after every epoch.
    save_path = manager.save()
    print("Saved checkpoint for epoch {}: {}".format(int(ckpt.epoch), save_path))
#%%



# (五)、Non-max Suppression
在助教提供的程式碼中，只會預測機率最大的Bounding Box與其類別，但在一張圖中可能會有許多不同的Object，因此我們後來採取下列方式：
* 將confidence大於此confidence threshold的預測 bounding box留著，其餘刪掉。
* 由於同一個物體可能被許多個 bounding box 框住，所以要從多個高度重疊的 bounding box 中選擇出一個。(Non-max suppression)
* 以 object probability 最高的 bounding box 作為基準，計算其他 bounding box 與它的 IOU。當 IOU 高過某一個 iou threshold 時，視其為框住同一個物體並將其刪除，重複此步驟直到結束。

我們先寫出了自己的版本，但後來發現有套件可以達到同樣的效果，因此改採用套件來實現 Non-max suppression。

In [None]:
def process_outputs(outputs):
    """
    Process YOLO outputs of one image into NMS-filtered bounding boxes.

    Args:
        outputs: sequence ``(class_logits, confidence_logits, boxes)`` for a
            single image, reshapeable to [CELL_SIZE, CELL_SIZE, NUM_CLASSES],
            [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] and
            [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL * 4].

    Returns:
        Tuple ``(xmin, ymin, xmax, ymax, class_num, conf)`` of 1-D tensors,
        one entry per kept detection, with coordinates in pixels of the
        IMAGE_SIZE x IMAGE_SIZE network input.
    """
    # Sigmoid turns the raw class and confidence outputs into probabilities.
    p_classes = np.reshape(outputs[0], (CELL_SIZE, CELL_SIZE, 1, NUM_CLASSES))
    p_classes = 1/(1+np.exp(-p_classes))

    C = np.reshape(outputs[1], (CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 1))
    C = 1/(1+np.exp(-C))

    # Decode boxes to absolute pixels: x/y offsets are scaled by the cell size
    # and shifted by the cell origin, w/h are scaled by the image size.
    coordinate = np.reshape(outputs[2],
                            (CELL_SIZE,
                             CELL_SIZE,
                             BOXES_PER_CELL,
                             4))
    coordinate = coordinate * [IMAGE_SIZE / CELL_SIZE,
                               IMAGE_SIZE / CELL_SIZE,
                               IMAGE_SIZE, IMAGE_SIZE]
    coordinate = base_boxes + coordinate

    x_center = coordinate[:, :, :, 0]
    y_center = coordinate[:, :, :, 1]
    width = np.clip(coordinate[:, :, :, 2], 0.0, IMAGE_SIZE * 1.0)
    height = np.clip(coordinate[:, :, :, 3], 0.0, IMAGE_SIZE * 1.0)

    # Corner form (xmin, ymin, xmax, ymax). The vertical extent must be built
    # from the y centre; building it from the x centre misplaces every box.
    coordinate_tips = np.stack([x_center - width / 2.,
                                y_center - height / 2.,
                                x_center + width / 2.,
                                y_center + height / 2.], axis=-1)

    # combined_non_max_suppression expects boxes as [batch, num_boxes, q, 4]
    # and scores as [batch, num_boxes, num_classes]. IoU does not depend on
    # whether x or y comes first, so the (x, y) corner order is kept.
    coordinate_tips = np.reshape(coordinate_tips, (1, -1, 1, 4)).astype("float32")

    # Class-specific score = box confidence * class probability.
    P = np.reshape(C * p_classes, (1, -1, NUM_CLASSES))

    # At most 5 boxes per class and 2 boxes in total are kept.
    nms = tf.image.combined_non_max_suppression(coordinate_tips, P, 5, 2,
                                                iou_threshold=0.2,
                                                clip_boxes=False,
                                                score_threshold=0.02)
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = nms[0], nms[1], nms[2], nms[3]

    # Entries past valid_detections are zero padding.
    end = int(valid_detections[0])

    return (nmsed_boxes[0, 0:end, 0], nmsed_boxes[0, 0:end, 1],
            nmsed_boxes[0, 0:end, 2], nmsed_boxes[0, 0:end, 3],
            nmsed_classes[0, 0:end], nmsed_scores[0, 0:end])

### 手刻版

In [None]:
def process_outputs(outputs):
    """
    Process flat YOLO outputs into bounding boxes with a hand-written
    non-max suppression.

    NOTE(review): this definition reuses the name of the
    combined_non_max_suppression based ``process_outputs`` defined earlier in
    the file and expects a different input layout (one flat vector per image
    whose values are already probabilities). Only one of the two should be
    active.

    Args:
        outputs: 2-D array [batch, CELL_SIZE * CELL_SIZE *
            (NUM_CLASSES + 5 * BOXES_PER_CELL)] laid out as class
            probabilities, box confidences, box coordinates. Only the first
            image of the batch is processed.

    Returns:
        Tuple ``(xmin, ymin, xmax, ymax, class_num, conf)`` of 1-D arrays with
        one entry per kept detection (at least one box is always returned).
    """
    class_end = CELL_SIZE * CELL_SIZE * NUM_CLASSES
    conf_end = class_end + CELL_SIZE * CELL_SIZE * BOXES_PER_CELL

    flat = np.asarray(outputs)[0]
    p_classes = np.reshape(flat[0:class_end], (CELL_SIZE, CELL_SIZE, 1, NUM_CLASSES))
    C = np.reshape(flat[class_end:conf_end], (CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 1))
    raw_boxes = np.reshape(flat[conf_end:], (CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4))

    # Decode every box to pixels (x_center, y_center, w, h) once, so that the
    # IoU below is computed in the same coordinate space for all boxes.
    cell_pixels = IMAGE_SIZE / float(CELL_SIZE)
    col_index = np.arange(CELL_SIZE).reshape(1, CELL_SIZE, 1)
    row_index = np.arange(CELL_SIZE).reshape(CELL_SIZE, 1, 1)
    pixel_boxes = np.stack([(col_index + raw_boxes[..., 0]) * cell_pixels,
                            (row_index + raw_boxes[..., 1]) * cell_pixels,
                            raw_boxes[..., 2] * IMAGE_SIZE,
                            raw_boxes[..., 3] * IMAGE_SIZE], axis=-1)

    P_threshold = 0.6
    iou_threshold = 0.8
    C_threshold = 0.4

    # Drop low-confidence boxes, then score = confidence * class probability.
    C = C * (C > C_threshold)
    P = C * p_classes  # shape [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, NUM_CLASSES]

    detections = []

    num_of_candidator = np.sum(P > P_threshold)
    # Always report at least one box.
    if num_of_candidator == 0:
        num_of_candidator = 1

    while num_of_candidator > 0:
        # Pick the highest-scoring (cell, box, class) combination.
        max_conf = np.max(P)
        row, col, box, class_num = np.unravel_index(np.argmax(P), P.shape)

        xcenter, ycenter, w, h = pixel_boxes[row, col, box]
        xmin = xcenter - w/2.0
        ymin = ycenter - h/2.0
        xmax = xmin + w
        ymax = ymin + h
        detections.append([xmin, ymin, xmax, ymax, class_num, max_conf])

        # Suppress every box that overlaps the selected one too much. iou()
        # takes the full box grid first and the single reference box second,
        # and works in float32.
        overlap = iou(tf.constant(pixel_boxes, dtype=tf.float32),
                      tf.constant(pixel_boxes[row, col, box], dtype=tf.float32)).numpy()
        C = C * (overlap < iou_threshold)[..., np.newaxis]
        # Remove the selected box explicitly: for degenerate boxes the IoU
        # with itself is not >= the threshold, which would loop forever.
        C[row, col, box, 0] = 0.0

        P = C * p_classes
        num_of_candidator = np.sum(P > P_threshold)

    # Convert once, after the loop, so that append keeps working on a list.
    output = np.array(detections)

    return output[:,0],output[:,1],output[:,2],output[:,3],output[:,4],output[:,5]


# 三、Prediction

In [None]:
test_img_dir = 'VOCdevkit_test/VOC2007/JPEGImages/'
test_images = []

# Each non-empty line starts with an image file name; only that first field is
# used. The context manager closes the list file once it has been read.
with open('pascal_voc_testing_data.txt') as test_img_files:
    for line in test_img_files:
        line = line.strip()
        if not line:
            # A blank line would otherwise add an empty file name.
            continue
        ss = line.split(' ')
        test_images.append(ss[0])

test_dataset = tf.data.Dataset.from_tensor_slices(test_images)

def load_img_data(image_name):
    """Load one test image and prepare it for the selected backbone.

    Args:
        image_name: file name of a JPEG inside ``test_img_dir``.

    Returns:
        Tuple ``(image_name, image, h, w)`` where ``image`` is resized to
        IMAGE_SIZE x IMAGE_SIZE and passed through the ``preprocess_input``
        matching ``pretrain_model``, and ``h`` / ``w`` are the height and
        width of the image before resizing.
    """
    encoded = tf.io.read_file(test_img_dir+image_name)
    image = tf.image.decode_jpeg(encoded, channels=3)

    # Remember the original size before resizing.
    original_shape = tf.shape(image)
    h, w = original_shape[0], original_shape[1]

    image = tf.image.resize(image, size=[IMAGE_SIZE, IMAGE_SIZE])

    # Apply the input normalisation of the chosen backbone; for any other
    # value of pretrain_model the resized image is returned as is.
    if pretrain_model == 'Resnet':
        image = tf.keras.applications.resnet.preprocess_input(image)
    elif pretrain_model == 'Xception':
        image = tf.keras.applications.xception.preprocess_input(image)
    elif pretrain_model == 'EfficientNetB0':
        image = tf.keras.applications.efficientnet.preprocess_input(image)
    elif pretrain_model == 'DenseNet121':
        image = tf.keras.applications.densenet.preprocess_input(image)

    return image_name, image, h, w

# Decode and preprocess the test images in parallel, 32 images per batch.
test_dataset = test_dataset.map(load_img_data, num_parallel_calls = tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(32)
# Load the model weights from a fixed checkpoint (index 16) for prediction.
ckpt = tf.train.Checkpoint(net=YOLO)
ckpt.restore('./ckpts/'+pretrain_model+'/yolo-16')

In [None]:
@tf.function
def prediction_step(img):
    """Run the detector on a batch of images with ``training=False``.

    Args:
        img: batch of preprocessed images.

    Returns:
        The raw model outputs for the batch.
    """
    predictions = YOLO(img, training=False)
    return predictions

In [None]:
# Write one line per test image:
#   <image name> xmin ymin xmax ymax class confidence [xmin ymin ...]
# The context manager closes the file even if inference fails part-way.
with open('./test_predictionJJ.txt', 'w') as output_file:
    for img_name, test_img, img_h, img_w in test_dataset:
        batch_num = img_name.shape[0]
        for i in range(batch_num):
            xmin, ymin, xmax, ymax, class_num, conf = process_outputs(prediction_step(test_img[i:i+1]))
            xmin = tf.cast(xmin, tf.float64)
            xmax = tf.cast(xmax, tf.float64)
            ymin = tf.cast(ymin, tf.float64)
            ymax = tf.cast(ymax, tf.float64)
            class_num = tf.cast(class_num, tf.int32)

            # Map boxes from the network input size back to the original
            # image size.
            scale_x = img_w[i:i+1] / IMAGE_SIZE
            scale_y = img_h[i:i+1] / IMAGE_SIZE
            xmin, xmax = xmin * scale_x, xmax * scale_x
            ymin, ymax = ymin * scale_y, ymax * scale_y

            output_file.write(img_name[i:i+1].numpy()[0].decode('ascii'))
            # img filename, xmin, ymin, xmax, ymax, class, confidence
            # A single loop covers one, several or zero detections.
            for detection in zip(xmin, ymin, xmax, ymax, class_num, conf):
                output_file.write(" %d %d %d %d %d %f" % detection)
            output_file.write("\n")


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
import sys
# Make the local ./evaluate directory importable before importing from it.
sys.path.insert(0, './evaluate')
import evaluate
# os.chdir('/home/rabbit2/contest2/datalab-cup2-object-detection-2020/JBR')

# evaluate.evaluate(<prediction file name>, <output csv file name>)
evaluate.evaluate('./test_predictionJJ.txt', './output_fileJJ.csv')

In [None]:
import matplotlib.pyplot as plt
import cv2

image_path = './VOCdevkit_test/VOC2007/JPEGImages/000179.jpg'
np_img = cv2.imread(image_path)
if np_img is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise FileNotFoundError("Cannot read image: " + image_path)

# Resize to the network input size and convert OpenCV's BGR order to RGB.
resized_img = cv2.resize(np_img, (IMAGE_SIZE, IMAGE_SIZE))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
resized_img = np_img  # uint8 copy kept for drawing
np_img = np_img.astype(np.float32)

# Apply the input normalisation of the selected backbone.
if pretrain_model == 'Resnet':
    np_img = tf.keras.applications.resnet.preprocess_input(np_img)
elif pretrain_model == 'Xception':
    np_img = tf.keras.applications.xception.preprocess_input(np_img)
elif pretrain_model == 'EfficientNetB0':
    np_img = tf.keras.applications.efficientnet.preprocess_input(np_img)
elif pretrain_model == 'DenseNet121':
    np_img = tf.keras.applications.densenet.preprocess_input(np_img)

# The model works on batches, so add a leading batch axis to the single image.
y_pred = YOLO(np_img[np.newaxis, ...], training=False)
xmin, ymin, xmax, ymax, class_num, conf = process_outputs(y_pred)

class_num = tf.cast(class_num,tf.int32)

class_name = [classes_name[i] for i in class_num]


for i in range(len(xmin)):
    left, top = int(xmin[i]), int(ymin[i])
    cv2.rectangle(resized_img, (left, top), (int(xmax[i]), int(ymax[i])), (0, 255, 255), 3)
    # Draw each label next to its own box (kept inside the image) instead of
    # stacking every label at one fixed position.
    cv2.putText(resized_img, class_name[i], (max(left, 0), max(top - 5, 25)), 2, 1.5, (0, 255, 255), 2)

plt.imshow(resized_img)
plt.show()

# 四、Summary
* 我們曾經嘗試用ResNet50、ResNet152、Xception、EfficientNetB0、DenseNet121、VGG16、VGG19等模型進行預測，在只做上述data preprocessing的情況下，直接去train的效果表現一般。
* 最後我們在Kaggle上private score最好的是用EfficientNetB0的模型train出來的。
* 由於運算資源和時間因素，我們的模型其實迭代次數並不多，相信若迭代次數更多，模型表現應該可以更好。
* 由於不同的pre-trained model 其實也可以代表不同模型預測出的不同結果，也就意味著，我們或許可以再利用Non-max suppression 的方式，將模型做 ensemble，可惜因時間因素，並不能實現此階段。