# This notebook implements YOLOv5 in TensorFlow (GPU), replicating the pipeline of the official repository: https://github.com/ultralytics/yolov5

The pretrained weights have been converted to TensorFlow format.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os,shutil,math , yaml
import matplotlib.pyplot as plt
import ast
from tqdm import tqdm
import argparse
import sys, math
from copy import deepcopy
from pathlib import Path
from tensorflow import keras

In [None]:
# from module import Conv2d, Conv, Bottleneck, SPP, SPPF, Focus, BottleneckCSP, C3,Upsample,Concat,Detect, BN
# from yololoss import YoloLoss
# from anchorlabel import AnchorLabeler
# from lrscheduler import LrScheduler
# from preprocess_data import DataReader

In [None]:
def auto_select_accelerator():
    """Pick a ``tf.distribute`` strategy, preferring a TPU when one resolves.

    Tries to resolve, connect to and initialise a TPU cluster. If any of
    those steps raises ``ValueError`` the current default strategy returned
    by ``tf.distribute.get_strategy()`` is used instead.

    Returns:
        tuple: ``(strategy, num_replicas)`` where ``num_replicas`` is
        ``strategy.num_replicas_in_sync``.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print("Running on TPU:", tpu.master())
    except ValueError:
        # no TPU could be resolved: fall back to the default strategy
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy, strategy.num_replicas_in_sync

In [None]:
def coco2yolo(image_height, image_width, bboxes):
    """
    Convert COCO-style boxes to normalized YOLO-style boxes.

    coco => [xmin, ymin, w, h] (pixels)
    yolo => [xmid, ymid, w, h] (normalized by image width / height)

    Args:
        image_height: image height in pixels; divides the y and h columns.
        image_width: image width in pixels; divides the x and w columns.
        bboxes: array-like of shape (..., 4) holding COCO boxes. Lists and
            integer arrays are accepted; the input is never modified.

    Returns:
        Nested list with the same layout as ``bboxes`` holding the converted
        boxes, or ``[]`` when ``bboxes`` is empty.
    """
    # np.array always copies and forces float, so integer inputs are not
    # truncated to 0 by the normalization below and the caller's data is
    # left untouched.
    bboxes = np.array(bboxes, dtype=float)
    if bboxes.size == 0:
        return []

    # normalizing
    bboxes[..., [0, 2]] = bboxes[..., [0, 2]] / image_width
    bboxes[..., [1, 3]] = bboxes[..., [1, 3]] / image_height

    # conversion (xmin, ymin) => (xmid, ymid)
    bboxes[..., [0, 1]] = bboxes[..., [0, 1]] + bboxes[..., [2, 3]] / 2

    return bboxes.tolist()

In [None]:
def get_bbox(annots, image_height=720, image_width=1280, class_id=0):
    """Turn one frame's annotation dicts into YOLO label rows.

    Args:
        annots: list of dicts, one per box. The dict *values* are taken in
            insertion order and passed to ``coco2yolo`` as
            [xmin, ymin, w, h] -- presumably the keys are x, y, width,
            height; verify against the dataset.
        image_height: image height in pixels used for normalization.
        image_width: image width in pixels used for normalization.
        class_id: value appended to every box as the class column.

    Returns:
        List of ``[xmid, ymid, w, h, class_id]`` lists (normalized), or
        ``[]`` when ``annots`` is empty.
    """
    if not annots:
        return []
    bboxes = np.array([list(annot.values()) for annot in annots])
    bboxes = coco2yolo(image_height=image_height, image_width=image_width, bboxes=bboxes)
    return [bbox + [class_id] for bbox in bboxes]


FOLD = 1  # rows with this fold id go to val_df, all others to train_df
df = pd.read_csv('../input/reef-cv-strategy-subsequences-dataframes/cross-validation/train-10folds.csv')
# the annotations column holds Python literals stored as strings
df['annotations'] = df['annotations'].apply(ast.literal_eval)
# Keep only frames with at least one box. The explicit copy makes the column
# assignment below write to an independent frame instead of a filtered slice
# (avoids pandas' SettingWithCopyWarning).
df = df[df['n_annotations'] > 0].copy()
df['bboxes'] = df['annotations'].apply(get_bbox)
train_df = df.query('fold != @FOLD')
val_df = df.query('fold == @FOLD')

# input structure consumed by DataReader: parallel lists of paths and labels
train_annotations_dict = {
    'image_dir': train_df['image_path'].values.tolist(),
    'labels': train_df['bboxes'].values.tolist(),
}

In [None]:
class Params:
    """Training / augmentation hyper-parameters, read as class attributes."""

    # --- schedule and optimisation ---
    n_epochs = 3
    warmup_epochs = 2
    warmup_steps = 500
    batch_size = 4
    buffer_size = 256
    len_train_dataset = 5000
    optimizer = 'adam'
    momentum = 0.93
    init_learning_rate = 1e-4
    warmup_learning_rate = 1e-5

    # --- model / data ---
    img_size = 640
    num_classes = 1
    label_smoothing = 0.0
    weight_dir = '/kaggle/input/yolov5-cv-045/yolov5s6.h5'
    yaml_dir = '/kaggle/input/yolov5-lib-ds/models/hub/yolov5s6.yaml'

    # --- loss weights (presumably box / objectness / class gains -- confirm
    # against the loss implementation) ---
    box = 0.05
    obj = 1.0
    cls = 1.0

    # --- augmentation: probabilities compared against random.random() ---
    mosaic = 1.0
    mixup = 0.1
    fliplr = 0.5
    flipud = 0.5

    # --- augmentation: HSV gains passed to augment_hsv ---
    hsv_h = 0.017
    hsv_s = 0.7
    hsv_v = 0.4

    # --- augmentation: arguments forwarded to random_perspective ---
    degrees = 0.0
    translate = 0.1
    scale = 0.9
    shear = 0.0
    perspective = 0.0

# Preprocessing is kept the same as in the PyTorch version so that training gives similar results.

In [None]:
import cv2
import math
import random
import numpy as np
import tensorflow as tf
# random.seed(1919)
class DataReader(object):
    '''
    Serve (image, labels) samples with YOLOv5-style augmentation.

    Image paths and labels are read from ``annotations_dict`` under the keys
    'image_dir' and 'labels'. Each label row is [x, y, w, h, cls] with
    normalized centre/size coordinates. A sample is either a 4-image mosaic
    (optionally mixed up with a second mosaic) or a single letterboxed image;
    the returned labels are always normalized xywh + cls.

    Args:
        hyp: object exposing the hyper-parameters read here as attributes
            (mosaic, mixup, degrees, translate, scale, shear, perspective,
            hsv_h, hsv_s, hsv_v, flipud, fliplr).
        annotations_dict: dict with parallel lists 'image_dir' and 'labels'.
        img_size: target image size.
        transforms: stored but not used by this class.
        mosaic: enable mosaic loading (applied with probability hyp.mosaic).
        augment: enable perspective / HSV / flip augmentation.
        filter_idx: optional collection of sample indices to keep.
        test: stored but not used by this class.
    '''
    def __init__(self, hyp, annotations_dict, img_size=640, transforms=None, mosaic=False, augment=False, filter_idx=None, test=False):
        self.hyp = hyp
        self.img_size = img_size  # image_target_size
        self.transforms = transforms
        self.mosaic = mosaic
        self.augment = augment
        self.test = test
        self.image_paths = annotations_dict['image_dir']
        self.labels = annotations_dict['labels']
        self.n = len(self.image_paths)
        self.idx = range(self.n)
        if filter_idx is not None:  # filter some samples
            keep = set(filter_idx)  # O(1) membership instead of a scan per index
            self.idx = [i for i in self.idx if i in keep]
            print('filter {} from {}'.format(len(self.idx), self.n))

    def iter(self):
        '''Yield the sample for every (possibly filtered) index in order.'''
        for i in self.idx:
            yield self[i]

    def __len__(self):
        # number of label entries; not reduced by filter_idx
        return len(self.labels)

    def __getitem__(self, index):
        '''Return ``(img, labels)`` for ``index``; labels are normalized xywh + cls.'''
        hyp = self.hyp
        use_mosaic = self.mosaic and random.random() < hyp.mosaic
        if use_mosaic:
            # Load mosaic
            img, labels = load_mosaic(self, index)

            # MixUp augmentation with a second mosaic drawn from the active indices
            if random.random() < hyp.mixup:
                img, labels = mixup(img, labels, *load_mosaic(self, random.choice(self.idx)))
        else:
            # Load image; load_image returns only the image, so the size is
            # read from the array itself
            img = load_image(self, index)
            h, w = img.shape[:2]

            # Letterbox to the square target size (no rectangular batching here)
            img, ratio, pad = letterbox(img, self.img_size, auto=False, scaleup=self.augment)
            img = np.asarray(img)  # letterbox returns a TF tensor; the ops below need numpy

            # labels are stored as nested lists; an empty list becomes shape (0, 5)
            labels = np.array(self.labels[index], dtype=np.float64)
            if labels.size == 0:
                labels = labels.reshape(0, 5)
            if labels.size:  # normalized xywh to pixel xyxy format
                labels[:, 0:4] = xywhn2xyxy(labels[:, 0:4], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])

            if self.augment:
                img, labels = random_perspective(img, labels,
                                                 degrees=hyp.degrees,
                                                 translate=hyp.translate,
                                                 scale=hyp.scale,
                                                 shear=hyp.shear,
                                                 perspective=hyp.perspective)

        nl = len(labels)  # number of labels
        if nl:
            labels[:, 0:4] = xyxy2xywhn(labels[:, 0:4], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3)

        if self.augment:
            # HSV color-space; augment_hsv returns a new image, so keep the result
            img = augment_hsv(img, hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v)

            # Flip up-down
            if random.random() < hyp.flipud:
                img = np.flipud(img)
                if nl:
                    labels[:, 1] = 1 - labels[:, 1]

            # Flip left-right
            if random.random() < hyp.fliplr:
                img = np.fliplr(img)
                if nl:
                    labels[:, 0] = 1 - labels[:, 0]
        return img, labels
                        

                # Cutouts
                # labels = cutout(img, labels, p=0.5)
                # nl = len(labels)  # update after cutout

#             labels_out = torch.zeros((nl, 6))
#             if nl:
#                 labels_out[:, 1:] = torch.from_numpy(labels)

                
def load_image(self, idx):
    """Read image ``idx`` of the reader from disk as a float32 RGB array.

    Args:
        self: reader object exposing ``image_paths``.
        idx: position in ``self.image_paths``.

    Returns:
        np.ndarray: the image converted from OpenCV's BGR order to RGB and
        cast to float32 (values are not rescaled).

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
    """
    image_path = self.image_paths[idx]
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None; fail here with the
        # path instead of an opaque cvtColor error further down
        raise FileNotFoundError(f'Image could not be read: {image_path}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

def load_mosaic(self, index):
    """Build a YOLOv5 4-image mosaic: image ``index`` plus 3 random images.

    The four images are pasted around a random centre on a (2s, 2s) canvas
    filled with 114, their labels are shifted into canvas pixel coordinates,
    and the canvas is passed through ``random_perspective`` with a border of
    -s/2 per side.

    Args:
        self: reader exposing ``img_size``, ``idx``, ``labels``, ``hyp``.
        index: index of the image that is guaranteed to be in the mosaic.

    Returns:
        tuple: ``(img4, labels4)`` as returned by ``random_perspective``;
        labels are pixel xyxy + cls.
    """
    labels4 = []
    s = self.img_size
    mosaic_border = [-self.img_size // 2, -self.img_size // 2]
    yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in mosaic_border)  # mosaic center x, y
    indices = [index] + random.choices(self.idx, k=3)  # 3 additional image indices
    random.shuffle(indices)
    for i, img_index in enumerate(indices):
        # Load image
        img = load_image(self, img_index)
        h, w, _ = img.shape
        # place img in img4
        if i == 0:  # top left
            img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
        padw = x1a - x1b
        padh = y1a - y1b

        # Labels: an empty list becomes shape (0, 5) so that the concatenate
        # below still works when some tiles have no boxes
        labels = np.array(self.labels[img_index], dtype=np.float64)
        if labels.size == 0:
            labels = labels.reshape(0, 5)
        else:
            labels[:, 0:4] = xywhn2xyxy(labels[:, 0:4], w, h, padw, padh)  # normalized xywh to pixel xyxy format
        labels4.append(labels)

    # Concat/clip labels to the canvas in one vectorized call
    labels4 = np.concatenate(labels4, 0)
    np.clip(labels4[:, 0:4], 0, 2 * s, out=labels4[:, 0:4])  # clip when using random_perspective()

    # Augment
    img4, labels4 = random_perspective(img4, labels4,
                                       degrees=self.hyp.degrees,
                                       translate=self.hyp.translate,
                                       scale=self.hyp.scale,
                                       shear=self.hyp.shear,
                                       perspective=self.hyp.perspective,
                                       border=mosaic_border)  # border to remove

    return img4, labels4



def random_perspective(img, label=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)):
    """Apply one random perspective/affine warp to an image and its boxes.

    Args:
        img: image array, shape (h, w, c); it is cast to uint8 first.
        label: array of rows [x1, y1, x2, y2, cls]. If the largest box
            coordinate is <= 1.0 the boxes are treated as normalized and
            scaled to pixels in place.
        degrees: rotation is drawn from [-degrees, degrees].
        translate: translation is drawn from [0.5 - translate, 0.5 + translate]
            times the output size.
        scale: the scale factor is drawn from [1 - scale, 1 + scale].
        shear: x / y shear angles (degrees) are drawn from [-shear, shear].
        perspective: perspective terms are drawn from
            [-perspective, perspective]; a non-zero value selects
            cv2.warpPerspective, otherwise cv2.warpAffine is used.
        border: (y, x) border added to each side of the output size; negative
            values crop (used by the mosaic loader).

    Returns:
        tuple: ``(img, label)``. Boxes that become too small, too elongated
        or mostly clipped are dropped; if none survive, a single all-zero row
        is returned instead.
    """
    # labels style: pixel, [xyxy, cls]
    img = img.astype(np.uint8)

    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
    width = img.shape[1] + border[1] * 2

    # Center
    C = np.eye(3)
    C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -img.shape[0] / 2  # y translation (pixels)

    # Perspective
    P = np.eye(3)
    P[2, 0] = random.uniform(-perspective, perspective)  # x perspective (about y)
    P[2, 1] = random.uniform(-perspective, perspective)  # y perspective (about x)

    # Rotation and Scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    s = random.uniform(1 - scale, 1 + scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Translation
    T = np.eye(3)
    T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width  # x translation (pixels)
    T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height  # y translation (pixels)

    # Combined rotation matrix
    M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        if perspective:
            img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114))
        else:  # affine
            img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114))

    # Transform label coordinates
    n = len(label)
    if n:
        if np.max(label[:, 0:4]) <= 1.0:  # transfer to pixel level
            label[:, [0, 2]] = label[:, [0, 2]] * img.shape[1]
            label[:, [1, 3]] = label[:, [1, 3]] * img.shape[0]
        # warp points
        xy = np.ones((n * 4, 3))
        xy[:, :2] = label[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1

        xy = (xy @ M.T)[:, :2].reshape(n, 8)

        # create new boxes
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

        # reject warped points outside of image
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
        w = xy[:, 2] - xy[:, 0]
        h = xy[:, 3] - xy[:, 1]
        area = w * h
        area0 = (label[:, 2] - label[:, 0]) * (label[:, 3] - label[:, 1])
        ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))  # aspect ratio
        # Compare the clipped area with the area the box would have after the
        # sampled scaling (area0 * s^2). Using the `scale` hyper-parameter
        # here would reject every box whenever the sampled factor is small.
        i = (w > 2) & (h > 2) & (area / (area0 * s * s + 1e-16) > 0.2) & (ar < 20)

        label = label[i]
        label[:, 0:4] = xy[i]

        if label.size == 0:  # in case, all labels is out
            label = np.array([[0, 0, 0, 0, 0]], np.float32)
    return img, label


def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5):
    """Return a copy of ``img`` with randomly scaled hue, saturation and value.

    Each channel gets an independent factor drawn from
    ``1 + U(-1, 1) * gain`` and is remapped through a 256-entry lookup table
    (hue wraps modulo 180, saturation and value are clipped to [0, 255]).

    Args:
        img: RGB image as produced by ``load_image`` in this file; the
            lookup tables need an 8-bit image -- assumes uint8, TODO confirm
            for callers that skip ``random_perspective``.
        hgain, sgain, vgain: maximum relative change per channel.

    Returns:
        New RGB image with the dtype of ``img``; the input is not modified,
        so callers must use the return value.
    """
    rand = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1
    # images in this pipeline are RGB (load_image converts from BGR), so the
    # RGB<->HSV codes are used to keep the hue axis correct
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_RGB2HSV))
    dtype = img.dtype

    x = np.arange(0, 256, dtype=np.int16)
    lut_hue = ((x * rand[0]) % 180).astype(dtype)
    lut_sat = np.clip(x * rand[1], 0, 255).astype(dtype)
    lut_val = np.clip(x * rand[2], 0, 255).astype(dtype)

    img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype)
    return cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB)


def mixup(im, labels, im2, labels2):
    """Blend two images and stack their labels (MixUp).

    See https://arxiv.org/pdf/1710.09412.pdf. The blend weight is drawn from
    Beta(32, 32); the blended image is cast to uint8 and the two label arrays
    are concatenated along axis 0.
    """
    ratio = np.random.beta(32.0, 32.0)  # weight of the first image
    blended = im * ratio + im2 * (1 - ratio)
    merged_labels = np.concatenate((labels, labels2), 0)
    return blended.astype(np.uint8), merged_labels


def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
    """Convert normalized [x, y, w, h] boxes to pixel [x1, y1, x2, y2].

    Columns 0-3 of ``x`` are read as centre x, centre y, width, height;
    the result is scaled by ``w`` / ``h`` and shifted by ``padw`` / ``padh``.
    xy1 is the top-left corner and xy2 the bottom-right corner. Any further
    columns are copied through unchanged.
    """
    boxes = np.copy(x)
    center_x, center_y = x[:, 0], x[:, 1]
    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
    boxes[:, 0] = w * (center_x - half_w) + padw  # top left x
    boxes[:, 1] = h * (center_y - half_h) + padh  # top left y
    boxes[:, 2] = w * (center_x + half_w) + padw  # bottom right x
    boxes[:, 3] = h * (center_y + half_h) + padh  # bottom right y
    return boxes


def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
    """Convert pixel [x1, y1, x2, y2] boxes to normalized [x, y, w, h].

    xy1 is the top-left corner and xy2 the bottom-right corner.

    Args:
        x: array with the boxes in columns 0-3; further columns are copied
            through unchanged. The input is not modified.
        w, h: image width / height used for normalization.
        clip: when True, x coordinates are clipped to [0, w - eps] and y
            coordinates to [0, h - eps] before the conversion.
        eps: margin subtracted from the upper clip bound.

    Returns:
        Array shaped like ``x`` with centre x, centre y, width, height in
        columns 0-3.
    """
    boxes = np.copy(x)
    if clip:
        # clip a private copy so the caller's array is left untouched
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, w - eps)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, h - eps)
    y = np.copy(boxes)
    y[:, 0] = ((boxes[:, 0] + boxes[:, 2]) / 2) / w  # x center
    y[:, 1] = ((boxes[:, 1] + boxes[:, 3]) / 2) / h  # y center
    y[:, 2] = (boxes[:, 2] - boxes[:, 0]) / w  # width
    y[:, 3] = (boxes[:, 3] - boxes[:, 1]) / h  # height
    return y

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True, stride=32):
    """Resize an image keeping its aspect ratio and pad it to the target size.

    Args:
        im: image of shape (h, w, c).
        new_shape: target (height, width), or one int for a square target.
        color: kept for signature compatibility. NOTE(review): the padding is
            done with tf.image.resize_with_crop_or_pad, which pads with
            zeros, so this value is currently not applied.
        auto: pad only up to the next multiple of ``stride`` instead of the
            full ``new_shape`` (minimum rectangle).
        scaleFill: stretch to ``new_shape`` without padding.
        scaleup: when False the image is only ever scaled down.
        stride: stride multiple used by ``auto``.

    Returns:
        tuple: ``(im, ratio, (dw, dh))`` where ``ratio`` is the
        (width, height) scale factor and ``dw`` / ``dh`` are the padding
        added on *each* side along width / height. The image is resized only
        when its size actually changes.
    """
    shape = tuple(im.shape[:2])  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    # (height, width) order, as expected by tf.image.resize
    new_unpad = int(round(shape[0] * r)), int(round(shape[1] * r))
    # total padding: width uses index 1 and height index 0 of both tuples
    dw, dh = new_shape[1] - new_unpad[1], new_shape[0] - new_unpad[0]
    if auto:  # minimum rectangle
        dw, dh = dw % stride, dh % stride
    elif scaleFill:  # stretch
        dw, dh = 0, 0
        new_unpad = (new_shape[0], new_shape[1])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    # final canvas size; equals new_shape unless `auto` shrank the padding
    out_h, out_w = new_unpad[0] + dh, new_unpad[1] + dw

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape != new_unpad:  # resize
        im = tf.image.resize(im, size=new_unpad, method='bilinear')

    # pads evenly on both sides, matching the (dw, dh) reported to the caller
    im = tf.image.resize_with_crop_or_pad(im, out_h, out_w)
    return im, ratio, (dw, dh)

# This is the main part of YOLO that differs from other detectors: the true labels must be assigned to each grid cell of the model output.
# Since TensorFlow does not support this kind of indexed assignment, the labels are assigned to the grid cells before the data is fed to the model.

In [None]:
class AnchorLabeler(object):
    """Encode annotated boxes into per-scale dense targets by anchor matching.

    For every detection scale a ``[grid, grid, n_anchor, 6]`` tensor is
    produced in which matched cells hold ``[x, y, w, h, obj, cls]``; the
    result is meant for an anchor based loss in a later step.
    """
    def __init__(self, anchors, grids, img_size=640, assign_method='wh', extend_offset=True, rect_style='rect4', anchor_match_threshold=4.0):  # threshold: 4.0 for 'wh', 0.3 for 'iou'
        self.anchors = anchors  # from yaml.anchors to Detect.anchors, w/h based on grid coordinates
        self.grids = grids  # grid size per scale (indexed by scale in encode)
        self.img_size = img_size
        self.assign_method = assign_method  # 'wh' or 'iou'
        self.extend_offset = extend_offset  # also assign neighbouring cells, see enrich_pos_by_position
        self.rect_style = rect_style  # stored only; encode() does not forward it
        self.anchor_match_threshold = anchor_match_threshold

    def encode(self, labels):
        """Build the target tensors for all scales.

        Args:
            labels: tensor of rows [x, y, w, h, cls] (split as 2/2/1 columns
                below) -- assumes xywh are normalized to [0, 1]; TODO confirm
                against the data reader.

        Returns:
            tuple with one float32 tensor of shape
            ``[grid, grid, n_anchor_per_scale, 6]`` per scale.
        """
        self.num_scales = self.anchors.shape[0]
        self.n_anchor_per_scale = self.anchors.shape[1]
        y_anchor_encode = []
        gain = tf.ones(5, tf.float32)  # per-column multiplier; the class column stays 1

        for i in range(self.num_scales):
            anchor = self.anchors[i]
            grid_size = tf.cast(self.grids[i], tf.int32)
            y_true = tf.zeros([grid_size, grid_size, self.n_anchor_per_scale, 6], tf.float32)
            # set the four box columns of gain to this scale's grid size
            gain = tf.tensor_scatter_nd_update(gain, [[0], [1], [2], [3]], [grid_size] * 4)
            scaled_labels = labels * gain  # label coordinates are now in the same units as the anchors

            # NOTE(review): labels has already been multiplied above, so a
            # None value would fail before this check is reached
            if labels is not None:
                gt_wh = scaled_labels[..., 2:4]  # n_gt * 2
                if self.assign_method == 'wh':
                    assert self.anchor_match_threshold > 1, 'threshold is totally different for wh and iou assign'
                    matched_matrix = self.assign_criterion_wh(gt_wh, anchor, self.anchor_match_threshold)
                elif self.assign_method == 'iou':
                    assert self.anchor_match_threshold < 1, 'threshold is totally different for wh and iou assign'
                    matched_matrix = self.assign_criterion_iou(gt_wh, anchor, self.anchor_match_threshold)
                else:
                    raise ValueError

                n_gt = tf.shape(gt_wh)[0]
                # anchor index grid of shape n_anchor * n_gt (row r is filled with r)
                assigned_anchor = tf.tile(tf.reshape(tf.range(self.n_anchor_per_scale), (self.n_anchor_per_scale, 1)),
                                          (1, n_gt))

                assigned_anchor = tf.expand_dims(assigned_anchor[matched_matrix], 1)  # keep matched pairs only
                assigned_anchor = tf.cast(assigned_anchor, tf.int32)

                # repeat the labels per anchor, then keep the matched (anchor, gt) pairs
                assigned_label = tf.tile(tf.expand_dims(scaled_labels, 0), [self.n_anchor_per_scale, 1, 1])
                assigned_label = assigned_label[matched_matrix]

                if self.extend_offset:
                    assigned_label, assigned_anchor, grid_offset = self.enrich_pos_by_position(
                        assigned_label, assigned_anchor, gain, matched_matrix)
                else:
                    grid_offset = tf.zeros_like(assigned_label[:, 0:2])

                # cell index = truncated (position - offset), kept inside the grid
                assigned_grid = tf.cast(assigned_label[..., 0:2] - grid_offset, tf.int32)  # n_matched * 2
                assigned_grid = tf.clip_by_value(assigned_grid, clip_value_min=0, clip_value_max=grid_size-1)
                
                # tensor: grid * grid * 3 * 6, indices (sparse index): ~n_gt * gr * gr * 3, updates: ~n_gt * 6
                # index order is (row = y cell, column = x cell, anchor)
                assigned_indices = tf.concat([assigned_grid[:, 1:2], assigned_grid[:, 0:1], assigned_anchor],
                                             axis=1)

                xy, wh, clss = tf.split(assigned_label, (2, 2, 1), axis=-1)
                # undo the grid scaling and express xy / wh in img_size units
                xy = xy / gain[0] * self.img_size
                wh = wh / gain[1] * self.img_size
                obj = tf.ones_like(clss)
                assigned_updates = tf.concat([xy, wh, obj, clss], axis=-1)

                y_true = tf.tensor_scatter_nd_update(y_true, assigned_indices, assigned_updates)
            y_anchor_encode.append(y_true)
        return tuple(y_anchor_encode)  # add a tuple is important here, otherwise raise an error

    def assign_criterion_wh(self, gt_wh, anchors, anchor_threshold):
        """Match by width/height ratio.

        Returns a boolean ``n_anchor * n_gt`` matrix that is True where the
        larger of ratio and 1/ratio, over both w and h, is below
        ``anchor_threshold``.
        """
        # note: the v5 default anchor_threshold is 4.0, related to the positive sample augment
        gt_wh = tf.expand_dims(gt_wh, 0)  # => 1 * n_gt * 2
        anchors = tf.expand_dims(anchors, 1)  # => n_anchor * 1 * 2
        ratio = gt_wh / anchors  # => n_anchor * n_gt * 2
        matched_matrix = tf.reduce_max(tf.math.maximum(ratio, 1 / ratio),
                                       axis=2) < anchor_threshold  # => n_anchor * n_gt
        return matched_matrix

    def assign_criterion_iou(self, gt_wh, anchors, anchor_threshold):
        """Match by IoU of the width/height rectangles (positions ignored).

        Returns a boolean ``n_anchor * n_gt`` matrix that is True where the
        IoU exceeds ``anchor_threshold`` (expected to be < 1).
        """
        # by IOU, anchor_threshold < 1
        box_wh = tf.expand_dims(gt_wh, 0)  # => 1 * n_gt * 2
        box_area = box_wh[..., 0] * box_wh[..., 1]  # => 1 * n_gt

        anchors = tf.cast(anchors, tf.float32)  # => n_anchor * 2
        anchors = tf.expand_dims(anchors, 1)  # => n_anchor * 1 * 2
        anchors_area = anchors[..., 0] * anchors[..., 1]  # => n_anchor * 1

        # intersection of two rectangles sharing a corner: min width * min height
        inter = tf.math.minimum(anchors[..., 0], box_wh[..., 0]) * tf.math.minimum(anchors[..., 1],
                                                                                   box_wh[..., 1])  # n_anchor * n_gt
        iou = inter / (anchors_area + box_area - inter + 1e-9)

        iou = iou > anchor_threshold
        return iou

    def enrich_pos_by_position(self, assigned_label, assigned_anchor, gain, matched_matrix, rect_style='rect4'):
        """Add extra positives in the neighbouring cells of each matched box.

        With 'rect4', a box whose centre lies in the first half of a cell
        (and beyond the first cell) is duplicated for the cell to the left /
        above; a centre in the second half (and before the last cell) is
        duplicated for the cell to the right / below.

        Returns:
            tuple: ``(assigned_label, assigned_anchor, grid_offset)`` with the
            duplicates appended; ``grid_offset`` is subtracted from the box
            centre by the caller to obtain the target cell.

        NOTE(review): ``matched_matrix`` is unused here, and for 'rect2' only
        ``g`` is set, so the inputs are returned without any extension.
        """
        # using offset to extend more positive results
        assigned_xy = assigned_label[..., 0:2]  # n_matched * 2
        offset = tf.constant([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], tf.float32)
        grid_offset = tf.zeros_like(assigned_xy)

        if rect_style == 'rect2':
            g = 0.2  # offset
        elif rect_style == 'rect4':
            g = 0.5  # offset
            # centre in the first g-fraction of its cell, and not in the first cell
            matched = (assigned_xy % 1. < g) & (assigned_xy > 1.)
            matched_left = matched[:, 0]
            matched_up = matched[:, 1]
            # centre in the last g-fraction of its cell, and not in the last cell
            matched = (assigned_xy % 1. > (1 - g)) & (assigned_xy < tf.expand_dims(gain[0:2], 0) - 1.)
            matched_right = matched[:, 0]
            matched_down = matched[:, 1]

            assigned_anchor = tf.concat([assigned_anchor, assigned_anchor[matched_left], assigned_anchor[matched_up],
                                         assigned_anchor[matched_right], assigned_anchor[matched_down]], axis=0)
            assigned_label = tf.concat([assigned_label, assigned_label[matched_left], assigned_label[matched_up],
                                        assigned_label[matched_right], assigned_label[matched_down]], axis=0)

            # same concatenation order as above: original, left, up, right, down
            grid_offset = g * tf.concat(
                [grid_offset, grid_offset[matched_left] + offset[1], grid_offset[matched_up] + offset[2],
                 grid_offset[matched_right] + offset[3], grid_offset[matched_down] + offset[4]], axis=0)

        return assigned_label, assigned_anchor, grid_offset

# Data pipeline: from data_reader (image, label) to tf.data

In [None]:
class DataLoader(object):
    """Wrap a data reader's generator in a batched ``tf.data`` pipeline."""

    def __init__(self, data_reader, anchors, stride, img_size=640, anchor_assign_method='wh',
                 anchor_positive_augment=True):
        self.data_reader = data_reader
        # grid size per scale is the image size divided by that scale's stride
        grids_per_scale = img_size / stride
        self.anchor_label = AnchorLabeler(anchors,
                                          grids=grids_per_scale,
                                          img_size=img_size,
                                          assign_method=anchor_assign_method,
                                          extend_offset=anchor_positive_augment)
        self.img_size = img_size

    def __call__(self, batch_size=8, anchor_label=True):
        """Return the dataset; labels are anchor-encoded when ``anchor_label`` is True."""
        autotune = tf.data.experimental.AUTOTUNE
        image_shape = [self.img_size, self.img_size, 3]
        label_shape = [None, 5]
        dataset = tf.data.Dataset.from_generator(
            self.data_reader.iter,
            output_types=(tf.float32, tf.float32),
            output_shapes=(image_shape, label_shape))

        if anchor_label:  # when train
            dataset = dataset.map(self.transform, num_parallel_calls=autotune)
        return dataset.batch(batch_size).prefetch(autotune)

    def transform(self, image, label):
        """Replace the raw label tensor with its per-scale anchor encoding."""
        return image, self.anchor_label.encode(label)

# Modules for the YOLO model

In [None]:
# reset Keras' global state before the layer classes below are (re)defined
tf.keras.backend.clear_session()

class BN(keras.layers.Layer):
    """Thin wrapper around ``keras.layers.BatchNormalization``.

    ``w`` is accepted for signature compatibility and is not used.
    """
    def __init__(self, w=None):
        super().__init__()
        self.bn = keras.layers.BatchNormalization()

    def call(self, inputs):
        normalized = self.bn(inputs)
        return normalized


class Pad(keras.layers.Layer):
    """Zero-pad axes 1 and 2 of a 4-D input by ``pad`` on both sides."""
    def __init__(self, pad):
        super().__init__()
        both_sides = [pad, pad]
        # axes 0 and 3 are left unpadded
        self.pad = tf.constant([[0, 0], both_sides, both_sides, [0, 0]])

    def call(self, inputs):
        padded = tf.pad(inputs, self.pad, mode='constant', constant_values=0)
        return padded
    
def autopad(k, p=None):  # kernel, padding
    """Return ``p`` if given, else the 'same' padding ``k // 2`` (per element for sequences)."""
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [size // 2 for size in k]

class Conv(keras.layers.Layer):
    """Standard convolution block: Conv2D (no bias) -> BN -> optional swish.

    Args are ch_in, ch_out, kernel, stride, padding, groups, activation flag.
    ``c1`` and ``w`` are accepted for signature compatibility and not used.
    """
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
        super().__init__()
        assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
        assert isinstance(k, int), "Convolution with multiple kernels are not allowed."
        # TensorFlow convolution padding is inconsistent with PyTorch (e.g. k=3 s=2 'SAME' padding)
        # see https://stackoverflow.com/questions/52975843/comparing-conv2d-with-padding-between-tensorflow-and-pytorch
        unit_stride = s == 1
        conv = keras.layers.Conv2D(
            filters=c2,
            kernel_size=k,
            strides=s,
            padding='SAME' if unit_stride else 'VALID',
            use_bias=False)
        if unit_stride:
            self.conv = conv
        else:
            # strided case: pad explicitly, then convolve with 'VALID'
            self.conv = keras.Sequential([Pad(autopad(k, p)), conv])
        self.bn = BN()

        # YOLOv5 activation: swish (SiLU), or identity when act is falsy
        self.act = (lambda x: keras.activations.swish(x)) if act else tf.identity

    def call(self, inputs):
        features = self.conv(inputs)
        features = self.bn(features)
        return self.act(features)


class Focus(keras.layers.Layer):
    """Focus spatial information into channel space, then apply a Conv block.

    Args are ch_in, ch_out, kernel, stride, padding, groups, activation flag.
    """
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
        super().__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def call(self, inputs):  # x(b,w,h,c) -> y(b,w/2,h/2,4c)
        # the four pixel phases of every 2x2 block, stacked on the channel axis
        phases = [(0, 0), (1, 0), (0, 1), (1, 1)]
        patches = [inputs[:, a::2, b::2, :] for a, b in phases]
        return self.conv(tf.concat(patches, 3))


class Bottleneck(keras.layers.Layer):
    """Standard bottleneck: 1x1 Conv -> 3x3 Conv, with an optional residual add.

    Args are ch_in, ch_out, shortcut, groups, expansion. The residual is used
    only when ``shortcut`` is set and ``c1 == c2``.
    """
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None):
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def call(self, inputs):
        transformed = self.cv2(self.cv1(inputs))
        if self.add:
            return inputs + transformed
        return transformed


class Conv2d(keras.layers.Layer):
    """Substitution for PyTorch nn.Conv2D: a plain 'VALID' Keras Conv2D.

    ``c1`` and ``w`` are accepted for signature compatibility and not used.
    """
    def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None):
        super().__init__()
        assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
        self.conv = keras.layers.Conv2D(
            filters=c2,
            kernel_size=k,
            strides=s,
            padding='VALID',
            use_bias=bias)

    def call(self, inputs):
        convolved = self.conv(inputs)
        return convolved


class BottleneckCSP(keras.layers.Layer):
    """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks

    Args are ch_in, ch_out, number of bottlenecks, shortcut, groups,
    expansion. ``w`` is accepted for signature compatibility only: the
    sub-layers in this file ignore their ``w`` argument, so nothing is read
    from it and the default ``w=None`` can be used.
    """
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = Conv2d(c_, c_, 1, 1, bias=False)
        # no attribute access on `w` here: it used to fail for the default None
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = BN()
        self.act = lambda x: keras.activations.relu(x, alpha=0.1)
        self.m = keras.Sequential([Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def call(self, inputs):
        y1 = self.cv3(self.m(self.cv1(inputs)))  # bottleneck branch
        y2 = self.cv2(inputs)  # shortcut branch
        return self.cv4(self.act(self.bn(tf.concat((y1, y2), axis=3))))


class C3(keras.layers.Layer):
    """CSP Bottleneck with 3 convolutions.

    Args are ch_in, ch_out, number of bottlenecks, shortcut, groups,
    expansion. ``w`` is accepted for signature compatibility and not used.
    """
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1, 1)
        blocks = [Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n)]
        self.m = keras.Sequential(blocks)

    def call(self, inputs):
        deep = self.m(self.cv1(inputs))  # bottleneck branch
        skip = self.cv2(inputs)  # shortcut branch
        return self.cv3(tf.concat((deep, skip), axis=3))


class SPP(keras.layers.Layer):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    Args:
        c1: input channels.
        c2: output channels.
        k: pool sizes; one stride-1, 'SAME'-padded max-pool is built per entry.
        w: accepted for signature parity; not used here.
    """

    def __init__(self, c1, c2, k=(5, 9, 13), w=None):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        # cv2 receives the cv1 output plus one pooled copy per kernel size.
        self.cv2 = Conv(hidden * (len(k) + 1), c2, 1, 1)
        pools = []
        for size in k:
            pools.append(keras.layers.MaxPool2D(pool_size=size, strides=1, padding='SAME'))
        self.m = pools

    def call(self, inputs):
        """Concatenate ``cv1(inputs)`` with each pooled copy on axis 3, then apply ``cv2``."""
        x = self.cv1(inputs)
        branches = [x]
        for pool in self.m:
            branches.append(pool(x))
        return self.cv2(tf.concat(branches, 3))


class SPPF(keras.layers.Layer):
    """Spatial pyramid pooling - Fast layer.

    Args:
        c1: input channels.
        c2: output channels.
        k: pool size of the single max-pool that is applied three times in a chain.
        w: accepted for signature parity; not used here.
    """

    def __init__(self, c1, c2, k=5, w=None):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * 4, c2, 1, 1)
        self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding='SAME')

    def call(self, inputs):
        """Concatenate ``cv1(inputs)`` with its 1x, 2x and 3x pooled versions on axis 3, then apply ``cv2``."""
        stages = [self.cv1(inputs)]
        for _ in range(3):
            # each stage pools the output of the previous one
            stages.append(self.m(stages[-1]))
        return self.cv2(tf.concat(stages, 3))


class Detect(keras.layers.Layer):
    """Detection head: one 1x1 ``Conv2d`` per input feature map, plus box decoding at inference.

    Args:
        nc: number of classes.
        anchors: one flat sequence of anchor pairs per detection layer.
        ch: input channel count for each detection layer.
        imgsz: ``(height, width)`` of the network input; used to size the grids.
        w: accepted for signature parity; not used here.
    """

    def __init__(self, nc=80, anchors=(), ch=(), imgsz=(640, 640), w=None):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [tf.zeros(1)] * self.nl  # placeholders, replaced at the end of __init__
        # Strides are fixed: four values when there are four layers, three otherwise.
        stride_values = [8, 16, 32, 64] if self.nl == 4 else [8, 16, 32]
        self.stride = tf.convert_to_tensor(np.array(stride_values, np.float32))

        self.anchors = tf.cast(tf.reshape(anchors, [self.nl, -1, 2]), tf.float32)

        # anchors * stride here; call() divides by the stride again when decoding wh.
        stride_per_layer = tf.reshape(self.stride, [self.nl, 1, 1])
        self.anchor_grid = tf.reshape(self.anchors * stride_per_layer, [self.nl, 1, -1, 1, 2])

        self.m = [Conv2d(c_in, self.no * self.na, 1) for c_in in ch]
        self.imgsz = imgsz
        for layer_idx in range(self.nl):
            layer_stride = self.stride[layer_idx]
            ny, nx = self.imgsz[0] // layer_stride, self.imgsz[1] // layer_stride
            self.grid[layer_idx] = self._make_grid(nx, ny)

    def call(self, inputs, training=True):
        """Apply the per-layer heads.

        Returns the list of reshaped head outputs when ``training`` is truthy;
        otherwise ``(decoded, raw)`` where ``decoded`` is all layers' decoded
        predictions concatenated on axis 1 and ``raw`` is that same list.
        """
        decoded = []  # inference output
        raw = []
        for i in range(self.nl):
            head = self.m[i](inputs[i])
            stride = self.stride[i]
            ny, nx = self.imgsz[0] // stride, self.imgsz[1] // stride
            # reshape to (bs, ny * nx, na, no), then move anchors forward: (bs, na, ny * nx, no)
            head = tf.transpose(tf.reshape(head, [-1, ny * nx, self.na, self.no]), [0, 2, 1, 3])
            raw.append(head)

            if training:
                continue

            y = tf.sigmoid(head)
            xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * stride  # xy
            wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] / stride
            y = tf.concat([xy, wh, y[..., 4:]], -1)
            decoded.append(tf.reshape(y, [-1, self.na * ny * nx, self.no]))

        if training:
            return raw
        return tf.concat(decoded, 1), raw

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """Return the float32 ``(x, y)`` cell coordinates of an ``ny`` x ``nx`` grid, shaped ``[1, 1, ny * nx, 2]``."""
        cols, rows = tf.meshgrid(tf.range(nx), tf.range(ny))
        cells = tf.stack([cols, rows], 2)
        return tf.cast(tf.reshape(cells, [1, 1, ny * nx, 2]), dtype=tf.float32)


class Upsample(keras.layers.Layer):
    """Doubles the size of axes 1 and 2 of its input with ``tf.image.resize``.

    Args:
        size: accepted for signature parity; not used here.
        scale_factor: must be ``2``.
        mode: passed to ``tf.image.resize`` as ``method``.
        w: accepted for signature parity; not used here.
    """

    def __init__(self, size, scale_factor, mode, w=None):  # warning: all arguments needed including 'w'
        super().__init__()
        assert scale_factor == 2, "scale_factor must be 2"

        def _resize_2x(x):
            # target size comes from the static shape of the input
            height, width = x.shape[1], x.shape[2]
            return tf.image.resize(x, (height * 2, width * 2), method=mode)

        self.upsample = _resize_2x

    def call(self, inputs):
        """Return ``inputs`` resized to twice its height and width."""
        return self.upsample(inputs)


class Concat(keras.layers.Layer):
    """Concatenates a list of tensors along axis 3.

    Args:
        dimension: must be ``1``; per the assertion message this is the NCHW
            channel axis being converted to its NHWC counterpart.
        w: accepted for signature parity; not used here.
    """

    def __init__(self, dimension=1, w=None):
        super().__init__()
        assert dimension == 1, "convert only NCHW to NHWC concat"
        self.d = 3  # axis actually used for the concat

    def call(self, inputs):
        """Concatenate ``inputs`` along ``self.d``."""
        return tf.concat(inputs, axis=self.d)

In [None]:
def make_divisible(x, divisor):
    """Round ``x`` up to the next multiple of ``divisor``.

    Args:
        x: value to round up.
        divisor: the multiple to round to. A ``tf.Tensor`` is first reduced to
            its largest element and converted to ``int``.

    Returns:
        ``math.ceil(x / divisor) * divisor``.
    """
    if isinstance(divisor, tf.Tensor):
        # ``.max()`` is the PyTorch spelling; TensorFlow tensors need tf.reduce_max.
        divisor = int(tf.reduce_max(divisor))  # to int
    return math.ceil(x / divisor) * divisor

class Yolo(object):
    """Builds the layer list described by a YOLOv5 model YAML and wraps it as a ``tf.keras.Model``.

    Args:
        yaml_dir: path of the model YAML file.
        params: training/config object; stored on the instance as ``self.params``.
        img_size: ``(height, width)`` of the network input.
        tf_nms: when true, ``forward`` appends TensorFlow combined NMS to the output.
        training: flag forwarded to the ``Detect`` layer on every forward pass.
    """

    def __init__(self, yaml_dir, params, img_size, tf_nms = False, training = True):
        self.tf_nms = tf_nms
        # NOTE: yaml.FullLoader and the eval() calls in parse_model assume the
        # model YAML is a trusted file; do not point this at untrusted input.
        with open(yaml_dir) as f:
            yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
        yaml_dict['nc'] = 1  # the class count from the YAML is always overridden with 1
        self.params = params
        self.module_list = self.parse_model(yaml_dict, ch = [3], imgsz = [img_size[0], img_size[1]])
        module = self.module_list[-1]
        self.training = training
        if isinstance(module, Detect):
            # rescale the anchors by each layer's stride (grid units)
            module.anchors /= tf.reshape(module.stride, [-1, 1, 1])

    def __call__(self, img_size, name='yolo'):
        """Return a ``tf.keras.Model`` mapping an ``img_size`` x 3 input to the network output."""
        x = tf.keras.Input([img_size[0], img_size[1], 3])
        output = self.forward(x, self.tf_nms)
        return tf.keras.Model(inputs=x, outputs=output, name=name)

    def forward(self, x, tf_nms = False):
        """Run ``x`` through every module, routing inputs by each module's ``f`` attribute.

        ``f == -1`` feeds the previous output, an int feeds that saved output,
        and a list gathers several outputs (``-1`` again meaning the previous one).
        When ``tf_nms`` is true the result is ``(nms, x[1])``; the NMS branch
        indexes ``x[0]`` / ``x[1]``, i.e. it expects the tuple ``Detect``
        returns when it is not training.
        """
        y = []
        for module in self.module_list:
            if module.f != -1:  # if not from previous layer
                if isinstance(module.f, int):
                    x = y[module.f]
                else:
                    x = [x if j == -1 else y[j] for j in module.f]
            if isinstance(module, Detect):
                x = module(x, self.training)
            else:
                x = module(x)
            y.append(x)

        # Add TensorFlow NMS
        if tf_nms:
            boxes = self._xywh2xyxy(x[0][..., :4])
            probs = x[0][:, :, 4:5]
            classes = x[0][:, :, 5:]
            scores = probs * classes
            boxes = tf.expand_dims(boxes, 2)
            nms = tf.image.combined_non_max_suppression(
                boxes, scores, 100, 100, 0.5, 0.10, clip_boxes=False)
            return nms, x[1]

        return x  # output of the last module

    def parse_model(self, d, ch, imgsz):  # model_dict, input_channels(3)
        """Instantiate the layers listed under ``d['backbone'] + d['head']``.

        Args:
            d: model dict with ``anchors``, ``nc``, ``depth_multiple``,
                ``width_multiple``, ``backbone`` and ``head`` entries.
            ch: list seeded with the input channel count; the output channels
                of every layer are appended to it in place.
            imgsz: ``[height, width]`` appended to the ``Detect`` arguments.

        Returns:
            List of layers, each tagged with ``.i`` (index) and ``.f`` (from).
        """
        # ``anchors`` and ``nc`` must stay local names: string arguments in the
        # YAML such as 'nc' or 'anchors' are resolved by the eval() below.
        anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
        na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
        no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)
        layers, c2 = [], ch[-1]  # layers, ch out
        for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
            m = m.replace('nn.', '') if isinstance(m, str) else m
            # SECURITY: eval() on YAML content; only safe for trusted model files.
            m = eval(m) if isinstance(m, str) else m  # eval strings
            for j, a in enumerate(args):
                try:
                    args[j] = eval(a) if isinstance(a, str) else a  # eval strings
                except NameError:
                    # plain strings that are not names (e.g. a resize mode) stay as they are
                    pass

            n = max(round(n * gd), 1) if n > 1 else n  # depth gain
            if m in [Conv2d, Conv, Bottleneck, SPP, SPPF, Focus, BottleneckCSP, C3]:
                c1, c2 = ch[f], args[0]
                c2 = make_divisible(c2 * gw, 8) if c2 != no else c2

                args = [c1, c2, *args[1:]]
                if m in [BottleneckCSP, C3]:
                    # these blocks take the repeat count as an argument instead of being stacked
                    args.insert(2, n)
                    n = 1
            elif m is BN:
                args = [ch[f]]
            elif m is Concat:
                c2 = sum(ch[-1 if x == -1 else x + 1] for x in f)
            elif m is Detect:
                args.append([ch[x + 1] for x in f])
                if isinstance(args[1], int):  # number of anchors
                    args[1] = [list(range(args[1] * 2))] * len(f)
                args.append(imgsz)
            else:
                c2 = ch[f]
            tf_m = m
            if n > 1:
                # keras.Sequential takes a *list* of layers; unpacking them as
                # positional arguments would pass the second layer as ``name``.
                m_ = tf.keras.Sequential([tf_m(*args) for _ in range(n)])
            else:
                m_ = tf_m(*args)
            m_.i, m_.f = i, f
            layers.append(m_)
            ch.append(c2)
        return layers

    @staticmethod
    def _xywh2xyxy(xywh):
        """Convert boxes from [x, y, w, h] to [x1, y1, x2, y2] (xy1=top-left, xy2=bottom-right) on the last axis."""
        x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1)
        return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1)

In [None]:
def one_cycle(y1=0.0, y2=1.0, steps=100):
    """Return a function giving a sinusoidal ramp from ``y1`` (at x=0) to ``y2`` (at x=steps).

    See https://arxiv.org/pdf/1812.01187.pdf
    """
    def ramp(x):
        # half-cosine progress: 0 at x == 0, 1 at x == steps
        progress = (1 - math.cos(x * math.pi / steps)) / 2
        return progress * (y2 - y1) + y1

    return ramp

class LrScheduler(object):
    """Stateful step counter around a ``Cosine`` learning-rate schedule.

    ``scheduler_method`` is accepted but a ``Cosine`` schedule is always built.
    """

    def __init__(self, total_steps, params, scheduler_method='cosine'):
        self.scheduler = Cosine(total_steps, params)
        self.step_count = 0
        self.total_steps = total_steps

    def step(self):
        """Advance the internal counter by one and return the schedule value at the new count."""
        self.step_count += 1
        return self.scheduler(self.step_count)

    def plot(self):
        """Plot ``total_steps`` successive learning rates; this advances the internal counter."""
        rates = [self.step() for _ in range(self.total_steps)]
        plt.plot(range(self.total_steps), rates)
        plt.show()



class Cosine(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Cosine learning-rate decay with a linear warmup.

    Args:
        total_steps: total number of training steps.
        params: object providing ``init_learning_rate``,
            ``warmup_learning_rate`` and ``warmup_steps``; they are read on
            every call, so later changes to ``params`` take effect.
    """

    def __init__(self, total_steps, params):
        super(Cosine, self).__init__()
        self.total_steps = total_steps
        self.params = params

    def __call__(self, global_step):
        """Return the learning rate for ``global_step`` (Python number or tensor of any numeric dtype)."""
        init_lr = self.params.init_learning_rate
        warmup_lr = self.params.warmup_learning_rate
        warmup_steps = self.params.warmup_steps
        assert warmup_steps < self.total_steps, "warmup {}, total {}".format(warmup_steps, self.total_steps)

        # Cast once and reuse: an integer tensor step cannot be multiplied by
        # np.pi or mixed with float operands without this.
        step = tf.cast(global_step, tf.float32)

        # linear ramp from warmup_lr to init_lr over the first warmup_steps
        linear_warmup = warmup_lr + step / warmup_steps * (init_lr - warmup_lr)
        # half-cosine from init_lr down to 0 over the remaining steps
        cosine_learning_rate = init_lr * (
                    tf.cos(np.pi * (step - warmup_steps) / (self.total_steps - warmup_steps)) + 1.0) / 2.0
        learning_rate = tf.where(step < warmup_steps, linear_warmup, cosine_learning_rate)
        return learning_rate

In [None]:
class YoloLoss(object):
    """Box (CIoU) and objectness loss over the per-layer training outputs.

    Args:
        hyp: object providing the ``box``, ``obj`` and ``cls`` loss gains.
        anchors: per-layer anchors; ``anchors[i]`` scales the predicted wh of layer ``i``.
        ignore_iou_threshold: stored on the instance; not used by ``__call__``.
        num_classes: stored on the instance; not used by ``__call__``.
        img_size: stored on the instance; not used by ``__call__``.
        label_smoothing: passed to the (currently unused) class BCE object.
    """

    def __init__(self, hyp, anchors, ignore_iou_threshold, num_classes, img_size, label_smoothing=0):
        self.hyp = hyp
        self.anchors = anchors
        self.strides = [8, 16, 32] if len(anchors) == 3 else [8,16,32,64] 
        self.ignore_iou_threshold = ignore_iou_threshold
        self.num_classes = num_classes
        self.img_size = img_size
        # The two BCE objects are kept as attributes but __call__ does not use them.
        self.bce_conf = tf.keras.losses.BinaryCrossentropy(from_logits = True, reduction=tf.keras.losses.Reduction.NONE)
        self.bce_class = tf.keras.losses.BinaryCrossentropy(from_logits = True, reduction=tf.keras.losses.Reduction.NONE,
                                                            label_smoothing=label_smoothing)

    def __call__(self, y_true, y_pred):
        """Return ``(iou_loss, obj_loss, class_loss)`` summed over detection layers.

        ``class_loss`` is never accumulated, so it is always ``0``.
        """
        iou_loss_all = obj_loss_all = class_loss_all = 0
        balance = [4.0, 1.0, 0.4] if len(y_pred) == 3 else [4.0, 1.0, 0.4, 0.1]  # P3-5 or P3-6
        for i, (pred, true) in enumerate(zip(y_pred, y_true)):
            # Grid side length recovered from the flattened axis; assumes a square grid.
            grid = tf.cast(tf.sqrt(tf.cast(tf.shape(pred)[2], tf.float32)), tf.int32)
            # NOTE(review): the (3, 6) tail hard-codes 3 anchors and 6 outputs per
            # anchor (a single class); it must change if either of those does.
            pred = tf.reshape(tf.transpose(pred, [0,2,1,3]), (tf.shape(pred)[0], grid, grid, 3,6))
            true_box, true_obj, _ = tf.split(true, (4, 1, -1), axis=-1)
            pred_box_xy, pred_box_wh, pred_obj, _ = tf.split(pred, (2, 2, 1, -1), axis=-1)
            obj_mask = tf.squeeze(true_obj, -1)

            pxy = tf.nn.sigmoid(pred_box_xy) * 2 - 0.5
            pwh = (tf.nn.sigmoid(pred_box_wh) * 2) ** 2 * self.anchors[i]
            pbox = tf.concat([pxy, pwh], axis = -1)  # predicted box
            iou = bbox_iou(pbox, true_box, xyxy=False, ciou = True)  # iou(prediction, target)

            iou_loss = (1.0 - iou) * obj_mask  # iou loss, only on cells that hold an object
            iou_loss_all += tf.reduce_mean(iou_loss) * balance[i]

            # NOTE(review): the objectness loss is masked by obj_mask as well, so
            # cells without an object contribute nothing to it - confirm this is intended.
            obji = tf.nn.weighted_cross_entropy_with_logits(obj_mask,
                                                            tf.squeeze(pred_obj, axis = -1),
                                                            pos_weight = 1) * obj_mask
            obj_loss_all += tf.reduce_mean(obji) * balance[i]  # obj loss
        iou_loss_all *= self.hyp.box
        obj_loss_all *= self.hyp.obj
        class_loss_all *= self.hyp.cls

        return iou_loss_all, obj_loss_all, class_loss_all


def bbox_iou(bbox1, bbox2, xyxy=False, giou=False, diou=False, ciou=False, epsilon=1e-9):
    """Element-wise IoU (or GIoU / DIoU / CIoU) between two equally-shaped box tensors.

    Args:
        bbox1, bbox2: boxes on the last axis, ``[x1, y1, x2, y2]`` when ``xyxy``
            is true, otherwise ``[x_center, y_center, w, h]``.
        xyxy: selects the box format above.
        giou, diou, ciou: variant selectors, checked in that order; the first
            truthy one wins.
        epsilon: small constant guarding the divisions.

    Returns:
        Plain IoU clipped to [0, 1], GIoU clipped to [-1, 1], or the unclipped
        DIoU / CIoU value, with the leading shape of the inputs.
    """
    assert bbox1.shape == bbox2.shape
    # giou loss: https://arxiv.org/abs/1902.09630
    if xyxy:
        b1x1, b1y1, b1x2, b1y2 = bbox1[..., 0], bbox1[..., 1], bbox1[..., 2], bbox1[..., 3]
        b2x1, b2y1, b2x2, b2y2 = bbox2[..., 0], bbox2[..., 1], bbox2[..., 2], bbox2[..., 3]
    else:  # xywh -> xyxy
        b1x1, b1x2 = bbox1[..., 0] - bbox1[..., 2] / 2, bbox1[..., 0] + bbox1[..., 2] / 2
        b1y1, b1y2 = bbox1[..., 1] - bbox1[..., 3] / 2, bbox1[..., 1] + bbox1[..., 3] / 2
        b2x1, b2x2 = bbox2[..., 0] - bbox2[..., 2] / 2, bbox2[..., 0] + bbox2[..., 2] / 2
        b2y1, b2y2 = bbox2[..., 1] - bbox2[..., 3] / 2, bbox2[..., 1] + bbox2[..., 3] / 2

    # intersection area
    inter = tf.maximum(tf.minimum(b1x2, b2x2) - tf.maximum(b1x1, b2x1), 0) * \
            tf.maximum(tf.minimum(b1y2, b2y2) - tf.maximum(b1y1, b2y1), 0)

    # union area
    w1, h1 = b1x2 - b1x1 + epsilon, b1y2 - b1y1 + epsilon
    w2, h2 = b2x2 - b2x1+ epsilon, b2y2 - b2y1 + epsilon
    union = w1 * h1 + w2 * h2 - inter + epsilon

    # iou
    iou = inter / union

    if giou or diou or ciou:
        # enclosing box
        cw = tf.maximum(b1x2, b2x2) - tf.minimum(b1x1, b2x1)
        ch = tf.maximum(b1y2, b2y2) - tf.minimum(b1y1, b2y1)
        if giou:
            enclose_area = cw * ch + epsilon
            giou_value = iou - 1.0 * (enclose_area - union) / enclose_area
            return tf.clip_by_value(giou_value, -1, 1)

        # squared diagonal of the enclosing box and squared distance between centres
        c2 = cw ** 2 + ch ** 2 + epsilon
        rho2 = ((b2x1 + b2x2) - (b1x1 + b1x2)) ** 2 / 4 + ((b2y1 + b2y2) - (b1y1 + b1y2)) ** 2 / 4
        if diou:
            return iou - rho2 / c2

        # ciou: aspect-ratio consistency term
        v = (4 / math.pi ** 2) * tf.pow(tf.atan(w2 / h2) - tf.atan(w1 / h1), 2)
        # epsilon keeps this finite when iou == 1 and v == 0 (identical boxes),
        # which would otherwise be 0 / 0 = NaN.
        alpha = v / (1 - iou + v + epsilon)
        return iou - (rho2 / c2 + v * alpha)
    return tf.clip_by_value(iou, 0, 1)

In [None]:
class Trainer(object):
    """Builds the YOLO model, loss and optimizer from ``params`` and runs the training loop.

    # Usage
    trainer = Trainer(params)
    trainer.train(train_dataset)
    """

    def __init__(self, params):
        """ Constructor
        :param params: object exposing the directories and training settings as
            attributes (``yaml_dir``, ``img_size``, ``batch_size``, ``optimizer``, ...)
        """
        self.params = params
        self.global_step = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.build_model()

    def build_model(self):
        """ Build the model,
        define the training strategy and model, loss, optimizer
        :raises ValueError: if ``params.optimizer`` is neither 'sgd' nor 'adam'
        :return:
        """
        self.strategy, self.replicas = auto_select_accelerator()

        with self.strategy.scope():
            self.model = Yolo(yaml_dir=self.params.yaml_dir, img_size = [self.params.img_size, self.params.img_size], params = self.params)
            detect = self.model.module_list[-1]
            self.anchors = detect.anchors
            self.stride = detect.stride
            self.nl = detect.nl
            self.nc = detect.nc
            self.loss_sum = tf.keras.metrics.Sum(name='sum', dtype=None)
            self.loss_fn = YoloLoss(self.params, detect.anchors,
                                    ignore_iou_threshold=0.3,
                                    num_classes=self.params.num_classes,
                                    label_smoothing=self.params.label_smoothing,
                                    img_size=self.params.img_size)
            if self.params.optimizer == 'sgd':
                self.optimizer = tf.keras.optimizers.SGD(momentum = self.params.momentum)
            elif self.params.optimizer == 'adam':
                self.optimizer = tf.keras.optimizers.Adam()
            else:
                # fail here instead of with an AttributeError on the first train step
                raise ValueError("unsupported optimizer: {!r}".format(self.params.optimizer))
        # NOTE: these scale ``params`` in place, so building a second Trainer
        # from the same params object scales the gains again.
        self.params.box *= 3 / self.nl  # scale to layers
        self.params.cls *= self.nc / 80 * 3 / self.nl  # scale to classes and layers
        self.params.obj *= (self.params.img_size / 640) ** 2 * 3 / self.nl  # scale to image size and layers

    def train(self, train_dataset, valid_dataset=None, transfer='scratch'):
        """ train function
        :param train_dataset: train dataset built by tf.data
        :param valid_dataset: valid dataset built by tf.data, optional (currently unused)
        :param transfer: pretrain (currently unused)
        :return:
        """
        steps_per_epoch = int(self.params.len_train_dataset // self.params.batch_size)
        self.total_steps = int(self.params.n_epochs * steps_per_epoch)
        self.params.warmup_steps = self.params.warmup_epochs * steps_per_epoch

        with self.strategy.scope():
            self.lr_scheduler = LrScheduler(self.total_steps, self.params, scheduler_method='cosine')
            # Yolo builder => tf.keras.Model
            self.model = self.model([self.params.img_size, self.params.img_size])
            self.model.load_weights(self.params.weight_dir)

        train_dataset = self.strategy.experimental_distribute_dataset(train_dataset)

        for epoch in range(self.params.n_epochs):
            for image, target in tqdm(train_dataset, total = steps_per_epoch):
                self.dist_train_step(image, target)
            # running mean of the summed loss over all steps taken so far
            print('=> Epoch {}, Step {}, Loss {:.5f}'.format(epoch, self.global_step.numpy(), 
                                                                self.loss_sum.result() / (self.global_step.numpy())))
            self.model.save_weights(f'/kaggle/working/model epoch {epoch}.h5')

    @tf.function
    def train_step(self, image, target):
        """Run one optimisation step and return the batch-size-scaled total loss."""
        with tf.GradientTape() as tape:
            logit = self.model(image, training=True)
            iou_loss, conf_loss, prob_loss = self.loss_fn(target, logit)
            total_loss = iou_loss + conf_loss + prob_loss
            total_loss = tf.reduce_sum(total_loss) * self.params.batch_size
        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        self.loss_sum.update_state(total_loss)
        # Derive the learning rate from the step *variable*. A Python-side
        # counter (lr_scheduler.step()) only advances while tf.function traces,
        # which would freeze the rate at its first value.
        next_step = tf.cast(self.global_step + 1, tf.float32)
        lr = self.lr_scheduler.scheduler(next_step)
        self.optimizer.lr.assign(lr)
        self.global_step.assign_add(1)
        return total_loss

    @tf.function
    def dist_train_step(self, image, target):
        """Run ``train_step`` on every replica and return the per-replica loss (not reduced)."""
        with self.strategy.scope():
            loss = self.strategy.run(self.train_step, args=(image, target))
            return loss

    def validate(self, valid_dataset):
        """Return the mean of ``valid_step`` over all batches of ``valid_dataset``."""
        valid_loss = []
        for image, target in valid_dataset:
            valid_loss.append(self.valid_step(image, target))
        return np.mean(valid_loss)

    def valid_step(self, image, label):
        """Return the summed iou, objectness and class loss for one batch, without a gradient step."""
        logit = self.model(image, training=False)
        iou_loss, conf_loss, prob_loss = self.loss_fn(label, logit)
        return iou_loss + conf_loss + prob_loss


In [None]:
# Build the trainer first: the data loader is given its anchors and strides.
trainer = Trainer(Params)

# Reader over the training annotations with mosaic and augmentation enabled.
reader = DataReader(
    hyp=Params,
    annotations_dict=train_annotations_dict,
    img_size=Params.img_size,
    mosaic=True,
    augment=True,
)

dataloader = DataLoader(reader, trainer.anchors, trainer.stride, Params.img_size)
train_dataset = dataloader(Params.batch_size, anchor_label=True)

In [None]:
# Train the model on train_dataset.
trainer.train(train_dataset)