In [346]:
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
from itertools import product

import os

In [347]:
df = pd.read_csv('./res/train_df.csv')
image = cv2.imread('./res/train_imgs/001-1-1-01-Z17_A-0000001.jpg', cv2.COLOR_BGR2RGB)
inputs = tf.expand_dims(image, 0)

In [348]:
# plt.imshow(image)
# plt.axis('off')
# plt.show()

In [349]:
df['x_min'] = df.iloc[:, 1:49:2].apply(lambda x: int(min(x)), axis=1)
df['x_max'] = df.iloc[:, 1:49:2].apply(lambda x: int(max(x)), axis=1)
df['y_min'] = df.iloc[:, 2:49:2].apply(lambda x: int(min(x)), axis=1)
df['y_max'] = df.iloc[:, 2:49:2].apply(lambda x: int(max(x)), axis=1)

In [605]:
# df.head()

# Faster R-CNN

## backbone

### ResNet

In [351]:
# resnet = tf.keras.applications.resnet

### VGG

In [352]:
def get_base(img_size, model='vgg'):
    if model=='vgg':
        base_model = tf.keras.applications.VGG16(include_top=False, input_shape=img_size)
    elif model == 'resnet':
        pass
    else:
        raise Exception('vgg, resnet')
        
    base_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet', input_shape=img_size)
    feature_extractor = base_model.get_layer("block5_conv3")
    base_model = tf.keras.models.Model(inputs=base_model.input, outputs=feature_extractor.output)
    return base_model

In [353]:
# base_model = get_base(image.shape, model='vgg')

# inputs = tf.expand_dims(image, 0)

# output_map = base_model(inputs)
# imgArray = output_map.numpy().squeeze(0)
# # output_map

## Region Proposal Network

### Image resize

In [354]:
df_new = df.copy()

In [355]:
(1080 * 0.4)

432.0

In [356]:
(1920 * 0.4)

768.0

In [357]:
# Ry = 600/1080
# Rx = 1072/1920

# Ry = 600/1080
# Rx = 600/1920

# Ry = size[0]/1080
# Rx = size[0]/1920

Ry = 0.4
Rx = 0.4
size = (432, 768)

df_new.iloc[:, 1:49:2] = df_new.iloc[:, 1:49:2] * Rx
df_new.iloc[:, 2:49:2] = df_new.iloc[:, 2:49:2] * Ry

In [358]:
df_new['x_min'] = df_new.iloc[:, 1:49:2].apply(lambda x: int(min(x)), axis=1)
df_new['x_max'] = df_new.iloc[:, 1:49:2].apply(lambda x: int(max(x)), axis=1)
df_new['y_min'] = df_new.iloc[:, 2:49:2].apply(lambda x: int(min(x)), axis=1)
df_new['y_max'] = df_new.iloc[:, 2:49:2].apply(lambda x: int(max(x)), axis=1)

In [359]:
image = np.copy(cv2.resize(image, size[::-1]))
inputs = tf.expand_dims(image, 0)
image.shape

(432, 768, 3)

### Ground Truth Generating 

In [360]:
ground_truth = df_new.iloc[:,:1].copy() 
ground_truth['x_min'] = df_new['x_min'] - (df_new['x_max'] - df_new['x_min'])*.065
ground_truth['x_max'] = df_new['x_max'] + (df_new['x_max'] - df_new['x_min'])*.065
ground_truth['y_min'] = df_new['y_min'] - (df_new['y_max'] - df_new['y_min'])*.065
ground_truth['y_max'] = df_new['y_max'] + (df_new['y_max'] - df_new['y_min'])*.065

In [361]:
ground_truth['w'] = ground_truth['x_max'] - ground_truth['x_min']
ground_truth['h'] = ground_truth['y_max'] - ground_truth['y_min']
ground_truth['x'] = ground_truth['w']/2 + ground_truth['x_min']
ground_truth['y'] = ground_truth['h']/2 + ground_truth['y_min']

In [362]:
GT = np.array(ground_truth[['x', 'y', 'w', 'h']])

In [363]:
min(np.sqrt(GT[:, 2] * GT[:, 3])), np.mean(np.sqrt(GT[:, 2] * GT[:, 3])), max(np.sqrt(GT[:, 2] * GT[:, 3]))

(78.40278438933144, 160.93931523544822, 278.6612992505418)

In [364]:
np.mean(np.sqrt(GT[:, 2] * GT[:, 3])) - 2*np.std(np.sqrt(GT[:, 2] * GT[:, 3])), np.mean(np.sqrt(GT[:, 2] * GT[:, 3])) + 2*np.std(np.sqrt(GT[:, 2] * GT[:, 3]))

(97.5204133751619, 224.35821709573455)

In [365]:
sigma = 3
print(f'min : {int(np.min(np.sqrt(GT[:, 2] * GT[:, 3])))}')
print(f'mean - {sigma} std : {int(np.mean(np.sqrt(GT[:, 2] * GT[:, 3])) - sigma*np.std(np.sqrt(GT[:, 2] * GT[:, 3])))}')
print(f'mean : {int(np.mean(np.sqrt(GT[:, 2] * GT[:, 3])))}')
print(f'mean + {sigma} std : {int(np.mean(np.sqrt(GT[:, 2] * GT[:, 3])) + sigma*np.std(np.sqrt(GT[:, 2] * GT[:, 3])))}')
print(f'max : {int(np.max(np.sqrt(GT[:, 2] * GT[:, 3])))}')

min : 78
mean - 3 std : 65
mean : 160
mean + 3 std : 256
max : 278


In [366]:
# h가 w보다 큰 경우 h/w 비율의 평균
GT_h = GT[(GT[:, 2]/GT[:, 3] < 1)]
print('비율의 평균 :', np.mean(GT_h[:, 3] / GT_h[:, 2]))
print(f'w, h 에 곱해줄 비율 : w = {1/np.sqrt(2)}, h = {np.sqrt(3.5)}')
print(f'적용한 비율 : h/w = {np.sqrt(3.5) / (1/np.sqrt(2))}')

비율의 평균 : 2.6544662108614125
w, h 에 곱해줄 비율 : w = 0.7071067811865475, h = 1.8708286933869707
적용한 비율 : h/w = 2.6457513110645907


In [367]:
# w가 h보다 큰 경우 w/h 비율의 평균
GT_w = GT[(GT[:, 2]/GT[:, 3] > 1)]
print('비율의 평균 :', np.mean(GT_w[:, 2] / GT_w[:, 3]))
print(f'w, h 에 곱해줄 비율 : w = {np.sqrt(3)}, h = {1/np.sqrt(2)}')
print(f'적용한 비율 : w/h = {np.sqrt(3) / (1/np.sqrt(2))}')

비율의 평균 : 2.454610664879193
w, h 에 곱해줄 비율 : w = 1.7320508075688772, h = 0.7071067811865475
적용한 비율 : w/h = 2.4494897427831783


### Anchor boxes

In [368]:
def anchor_box_generator(x, y):
    scales = [97, 160, 256]
#     ratio = [(1/np.sqrt(2), np.sqrt(2)), (1, 1), (np.sqrt(2), 1/np.sqrt(2))]
    ratio = [(1/np.sqrt(2), np.sqrt(3.5)), (1, 1), (np.sqrt(3), 1/np.sqrt(2))]
    anchor_boxes = []
    for scale in scales:
        for w, h  in ratio:
            w *= scale
            h *= scale
            
            anchor_boxes.append([x, y, w, h])
    return anchor_boxes

In [369]:
# def anchor_box_generator(x, y):
# #     ratio = [(1/np.sqrt(2), np.sqrt(2)), (1, 1), (np.sqrt(2), 1/np.sqrt(2))]
# #     scales = [64, 128, 256, 512]
#     scales = [(188, 111), (113, 114), (70, 92), (416, 229), (261, 284), (174, 332), (768, 437), (499, 501), (355, 715)]
#     anchor_boxes = []
#     for w, h  in scales:
#           anchor_boxes.append([x, y, w, h])
#     return anchor_boxes

In [370]:
def Anchor_Boxes(img_shape, model='vgg'):
    '''
    input
    img_shape : image shape
    output 
    numpy array shape (w * h * 9, 4)
    '''
    if model == 'vgg':
        ratio = 2**4
        
    w=image.shape[1]//ratio
    h=image.shape[0]//ratio
    
    anchor_boxes = []
    for x in range(image.shape[1]//w//2, image.shape[1], image.shape[1]//w):
        for y in range(image.shape[0]//h//2, image.shape[0], image.shape[0]//h):
            anchor_boxes.append(anchor_box_generator(x, y))
    return np.array(anchor_boxes).reshape(-1, 4)

In [371]:
anchor_boxes = Anchor_Boxes(img_shape=image.shape, model='vgg')
len(anchor_boxes)

11664

In [602]:
ratio = 2**4

w=image.shape[1]//ratio
h=image.shape[0]//ratio
img_ = np.copy(image)

ground_truth_row = ground_truth.iloc[0]
x1 = int(ground_truth_row['x_min'])
x2 = int(ground_truth_row['x_max'])
y1 = int(ground_truth_row['y_min'])
y2 = int(ground_truth_row['y_max'])
cv2.rectangle(img_, (x1, y1), (x2, y2), (0, 0, 0), thickness=2)

plt.figure(figsize=(10, 10))
for x in range(img_.shape[1]//w//2, img_.shape[1], img_.shape[1]//w):
    for y in range(img_.shape[0]//h//2, img_.shape[0], img_.shape[0]//h):
            cv2.circle(img_, (x, y), radius=1, color=(255, 0, 0), thickness=2)
            
# plt.imshow(img_)
# plt.axis('off')
# plt.show()

<Figure size 720x720 with 0 Axes>

### IoU

In [373]:
# def anchor_to_coordinate(boxeses):    
#     x1 = boxes[:, 0] - boxes[:, 2]/2
#     x2 = boxes[:, 0] + boxes[:, 2]/2
#     y1 = boxes[:, 1] - boxes[:, 3]/2
#     y2 = boxes[:, 1] + boxes[:, 3]/2
#     return np.stack([x1, x2, y1, y2], axis=1)

In [374]:
# def coordinate_to_anxhor(boxeses):
#     w = boxes[:, 1] - boxes[:, 0]
#     h = boxes[:, 3] - boxes[:, 2]
#     x = boxes[:, 0] + w/2
#     y = boxes[:, 2] + h/2
#     return np.stack([x, y, w, h], axis=1)

In [375]:
# def IoU(box1, box2):
#     '''
#     anchor ver
#     '''
#     box1_area = box1[2] * box1[3]
#     box2_area = box2[2] * box2[3]
    
#     x1 = max(box1[0] - box1[2]/2, box2[0] - box2[2]/2)
#     x2 = min(box1[0] + box1[2]/2, box2[0] + box2[2]/2)
    
#     y1 = max(box1[1] - box1[3]/2, box2[1] - box2[3]/2)
#     y2 = min(box1[1] + box1[3]/2, box2[1] + box2[3]/2)
    
#     h = max(0.0, y2 - y1 + 1)
#     w = max(0.0, x2 - x1 + 1)
    
#     if (w <= 0) or (h <= 0):
#         return 0.0
    
#     intersect = h * w
#     union = box1_area + box2_area - intersect
#     return intersect / union 

In [376]:
# def IoU(box1, box2):
#     '''
#     coordinate ver
#     '''
#     box1_area = (box1[1] - box1[0] + 1) * (box1[3] - box1[2] + 1)
#     box2_area = (box2[1] - box2[0] + 1) * (box2[3] - box2[2] + 1)
    
#     x1 = max(box1[0], box2[0])
#     x2 = min(box1[1], box2[1])
    
#     y1 = max(box1[2], box2[2])
#     y2 = min(box1[3], box2[3])    
    
#     h = max(0.0, y2 - y1 + 1)
#     w = max(0.0, x2 - x1 + 1)
    
#     if (w <= 0) or (h <= 0):
#         return 0.0
    
#     intersect = h * w
#     union = box1_area + box2_area - intersect
    
#     iou = intersect / union
#     return iou

In [377]:
def IoU(box1, anchor_boxes):
    '''
    anchor ver
    inputs
    box1 : ground truth box
    anchor_boxes : anchor boxes
    '''
    broadcast = len(anchor_boxes)
    
    box1_area = box1[2] * box1[3]
    box2_area = anchor_boxes[:,2] * anchor_boxes[:,3]
    
    x1 = np.max([np.broadcast_to(box1[0] - box1[2]/2, broadcast), anchor_boxes[:, 0] - anchor_boxes[:, 2]/2], axis=0)
    x2 = np.min([np.broadcast_to(box1[0] + box1[2]/2, broadcast), anchor_boxes[:, 0] + anchor_boxes[:, 2]/2], axis=0)
    
    y1 = np.max([np.broadcast_to(box1[1] - box1[3]/2, broadcast), anchor_boxes[:, 1] - anchor_boxes[:, 3]/2], axis=0)
    y2 = np.min([np.broadcast_to(box1[1] + box1[3]/2, broadcast), anchor_boxes[:, 1] + anchor_boxes[:, 3]/2], axis=0)
    
    h = np.max([np.broadcast_to(0.0, broadcast), y2 - y1 + 1], axis=0)
    w = np.max([np.broadcast_to(0.0, broadcast), x2 - x1 + 1], axis=0)
    
    intersect = h * w
    union = np.broadcast_to(box1_area, broadcast) + box2_area - intersect
    return intersect / union 

In [385]:
def anchor_to_coordinate(box):    
    x1 = box[0] - box[2]/2
    x2 = box[0] + box[2]/2
    y1 = box[1] - box[3]/2
    y2 = box[1] + box[3]/2
    return (x1, x2, y1, y2)

In [604]:
ground_truth_row = ground_truth.iloc[0]

img_ = cv2.imread(f'./res/train_imgs/{ground_truth_row["image"]}', cv2.COLOR_BGR2RGB)
img_ = cv2.resize(img_, size[::-1])

max_output_size = 5
colors = {k: tuple(map(int, np.random.randint(0, 255, 3))) for k in range(max_output_size)}

x1 = int(ground_truth_row['x_min'])
x2 = int(ground_truth_row['x_max'])
y1 = int(ground_truth_row['y_min'])
y2 = int(ground_truth_row['y_max'])
cv2.rectangle(img_, (x1, y1), (x2, y2), (255, 0, 0), thickness=2)

gt = ground_truth_row[['x', 'y', 'w', 'h']]
ious = IoU(gt, anchor_boxes)
print(np.sum(ious>0.6))

selected_indices = tf.image.non_max_suppression(anchor_boxes, ious, max_output_size=max_output_size, score_threshold=0.01)
anchors = tf.gather(anchor_boxes, selected_indices)
print(IoU(gt, anchors))
print(gt)

for i, anchor_box in enumerate(anchors):
    anchor_box = anchor_to_coordinate(anchor_box.numpy())
    cv2.rectangle(
        img_, 
        (int(anchor_box[0]), int(anchor_box[2])), (int(anchor_box[1]), int(anchor_box[3])), 
        colors.get(i), 
        thickness=1
    )
    
# fig, ax = plt.subplots(dpi=200)
# ax.imshow(img_)
# ax.axis('off')
# plt.show()

### Label Generating

In [388]:
def label_generator(GT, anchor_boxes):
    cls_label = -np.ones(shape=(len(GT), anchor_boxes.shape[0]))
    pos_iou_threshold = 0.6
    neg_iou_threshold = 0.3
    n_sample = 128
    pos_ratio = 0.5
    n_pos = int(pos_ratio * n_sample)
    
    for i in tqdm(range(len(GT))):
        ious = np.apply_along_axis(IoU, 0, GT[i], anchor_boxes=anchor_boxes)
        cls_label[i][ious >= pos_iou_threshold] = 1
        cls_label[i][ious < neg_iou_threshold] = 0
        cls_label[i][np.argmax(ious)] = 1

        pos_index = np.where(cls_label[i] == 1)[0]
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(
                pos_index,
                size = (len(pos_index) - n_pos),
                replace=False
            )
            cls_label[i][disable_index] = -1

        n_neg = n_sample - np.sum(cls_label[i] == 1)
        neg_index = np.where(cls_label[i] == 0)[0]
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(
                neg_index, 
                size = (len(neg_index) - n_neg),             
                replace = False
            )
            cls_label[i][disable_index] = -1
            
    reg_label = np.zeros(shape=(len(GT), anchor_boxes.shape[0], 4))
    for i in tqdm(range(len(cls_label))):
        reg_label[i] = anchor_boxes * np.broadcast_to(tf.cast(cls_label[i] > 0, tf.int32), (4, len(cls_label[i]))).T
        indices = np.where(reg_label[i] != 0)[0][::4]
        x, y, w, h = GT[i][0], GT[i][1], GT[i][2], GT[i][3]

        tx = (x - reg_label[i][indices][:, 0]) / (reg_label[i][indices][:, 2])
        ty = (y - reg_label[i][indices][:, 1]) / (reg_label[i][indices][:, 3])
        tw = np.log(w / reg_label[i][indices][:, 2]) 
        th = np.log(h / reg_label[i][indices][:, 3]) 
        reg_label[i][indices] = np.stack([tx, ty, tw, th]).T
        
    return cls_label, reg_label

In [389]:
cls_label, reg_label = label_generator(GT, anchor_boxes)

100%|█████████████████████████████████████████████████████████████████████████████| 4195/4195 [00:09<00:00, 456.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 4195/4195 [00:04<00:00, 948.02it/s]


In [390]:
# index_inside = np.where(
#     (anchor_boxes[:, 0] - anchor_boxes[:, 2]/2 >= 0) &
#     (anchor_boxes[:, 0] + anchor_boxes[:, 2]/2 <= image.shape[1]) &
#     (anchor_boxes[:, 1] - anchor_boxes[:, 3]/2 >= 0) &
#     (anchor_boxes[:, 1] + anchor_boxes[:, 3]/2 <= image.shape[0]),
# )[0]

# valid_anchor_boxes = anchor_boxes[index_inside]

# valid_anchor_boxes

### Region Proposal Network

In [391]:
class RPN(tf.keras.models.Model):
    def __init__(self, img_size, anchor_boxes, k=9, backbone='vgg',**kwargs):
        super(RPN, self).__init__(**kwargs)
        self.backbone = backbone
        self.img_size = img_size
        self.anchor_boxes = anchor_boxes
        self.k = k
        self.base_model = get_base(self.img_size, model=self.backbone)
        self.window = tf.keras.layers.Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bbox_reg = tf.keras.layers.Conv2D(filters=self.k*4, kernel_size=1, activation='linear')
        self.bbox_reg_reshape = tf.keras.layers.Reshape((-1, 4), name='reg_out')
        self.cls = tf.keras.layers.Conv2D(filters=self.k, kernel_size=1, activation='sigmoid')
        self.cls_reshape = tf.keras.layers.Reshape((-1, 1), name='cls_out')

    def compile(self, optimizer):
        super(RPN, self).compile()
        self.optimizer = optimizer
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')
    
    def Cls_Loss(self, y_true, y_pred):
        indices = tf.where(tf.not_equal(y_true, tf.constant(-1.0, dtype=tf.float64)))
        target = tf.gather_nd(y_true, indices)
        output = tf.gather_nd(y_pred, indices)
        return tf.losses.BinaryCrossentropy()(target, output)

    def Reg_Loss(self, y_true, y_pred):
        indices = tf.reduce_any(tf.not_equal(y_true, 0), axis=-1)
        loss_fn = tf.losses.Huber(reduction=tf.losses.Reduction.NONE)
        loss_for_all = loss_fn(y_true[indices], y_pred[indices])
        loss_for_all = tf.reduce_mean(loss_for_all, axis=-1)
        return loss_for_all
    
    def train_step(self, data):
        x, y = data
        y_cls = y[0]
        y_reg = y[1]
        rpn_lambda = 10
        
        with tf.GradientTape() as tape:
            cls, bbox_reg = self(x, training=True)
            cls_loss = self.Cls_Loss(y_cls, cls)
            reg_loss = self.Reg_Loss(y_reg, bbox_reg)
            losses = cls_loss + rpn_lambda * reg_loss
            
        trainable_vars = self.trainable_variables
        grad = tape.gradient(losses, trainable_vars)
        self.optimizer.apply_gradients(zip(grad, trainable_vars))
        self.loss_tracker.update_state(losses)
        return {'rpn_loss': self.loss_tracker.result()}

    def call(self, inputs):
        feature_extractor = self.base_model(inputs)
        intermediate = self.window(feature_extractor)
        cls_ = self.cls(intermediate)
        cls = self.cls_reshape(cls_)
        bbox_reg_ = self.bbox_reg(intermediate)
        bbox_reg = self.bbox_reg_reshape(bbox_reg_)
        return cls, bbox_reg

In [392]:
tf.keras.backend.clear_session()
rpn = RPN(img_size=image.shape, anchor_boxes=anchor_boxes, k=9, backbone='vgg')
cls, bounding = rpn(inputs)

rpn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

rpn.fit(
    x=tf.expand_dims(image, 0), 
    y=(tf.expand_dims(cls_label[0], 0), tf.expand_dims(reg_label[0], 0))
)



<tensorflow.python.keras.callbacks.History at 0x1d30717c288>

In [562]:
score, rp = rpn(tf.expand_dims(image, 0))

In [563]:
score = tf.squeeze(score).numpy()

In [564]:
rp = tf.squeeze(rp)

In [566]:
rois = np.zeros(anchor_boxes.shape)

rois[:, 0] = anchor_boxes[:, 0] + anchor_boxes[:, 2] * rp[:, 0]
rois[:, 1] = anchor_boxes[:, 1] + anchor_boxes[:, 3] * rp[:, 1]
rois[:, 2] = anchor_boxes[:, 2] * tf.exp(rp[:, 2])
rois[:, 3] = anchor_boxes[:, 3] * tf.exp(rp[:, 3])

In [569]:
def anchor_to_coordinate(boxes):    
    x1 = boxes[:, 0] - boxes[:, 2]/2
    x2 = boxes[:, 0] + boxes[:, 2]/2
    y1 = boxes[:, 1] - boxes[:, 3]/2
    y2 = boxes[:, 1] + boxes[:, 3]/2
    return np.stack([x1, x2, y1, y2], axis=1)

In [570]:
rois = anchor_to_coordinate(rois)

In [571]:
rois[:, 0] = np.clip(rois[:, 0], 0, size[1])
rois[:, 1] = np.clip(rois[:, 1], 0, size[1])
rois[:, 2] = np.clip(rois[:, 2], 0, size[0])
rois[:, 3] = np.clip(rois[:, 3], 0, size[0])

In [572]:
min_size = 16
hs = rois[:, 3] - rois[:, 2]
ws = rois[:, 1] - rois[:, 0]

keep = np.where((hs >= min_size) & (ws >= min_size))[0]
rois = rois[keep, :]

scores = score[keep]
print(keep.shape, rois.shape, scores.shape)

(2817,) (2817, 4) (2817,)


In [573]:
order = scores.ravel().argsort()[::-1]

n_train_pre_nms = 2000
order = order[:n_train_pre_nms]
rois = rois[order, :]
print(order.shape, rois.shape)

(2000,) (2000, 4)


In [None]:
n_sample = 64 # number of samples from roi
pos_ratio = 0.25 # number of positive examples out of the n_samples
pos_iou_thresh = 0.5 # min iou of region proposal with any ground truth object to consider it as positive label
neg_iou_thresh_hi = 0.5 # iou 0~0.5 is considered as negative (0, background)
neg_iou_thresh_lo = 0.0

In [583]:
def coordinate_to_anxhor(boxes):
    w = boxes[:, 1] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 2]
    x = boxes[:, 0] + w/2
    y = boxes[:, 2] + h/2
    return np.stack([x, y, w, h], axis=1)

In [587]:
rois = coordinate_to_anxhor(rois)

In [595]:
ious = IoU(GT[0], rois)

## Detector

### Non-Maximum Suppression

In [None]:
def anchor_to_coordinate(box):    
    x1 = box[0] - box[2]/2
    x2 = box[0] + box[2]/2
    y1 = box[1] - box[3]/2
    y2 = box[1] + box[3]/2
    return (x1, x2, y1, y2)

In [None]:
max_output_size = 128
colors = {k: tuple(map(int, np.random.randint(0, 255, 3))) for k in range(max_output_size)}
x1 = int(ground_truth_row['x_min'])
x2 = int(ground_truth_row['x_max'])
y1 = int(ground_truth_row['y_min'])
y2 = int(ground_truth_row['y_max'])

img_ = image.copy()
cv2.rectangle(img_, (x1, y1), (x2, y2), (255, 0, 0), thickness=2)

selected_indices = tf.image.non_max_suppression(rois, ious, max_output_size=max_output_size, score_threshold=0.01)
anchors = tf.gather(rois, selected_indices)

for i, anchor in enumerate(anchors):
    anchor = anchor_to_coordinate(anchor.numpy())
    cv2.rectangle(
        img_, 
        (int(anchor[0]), int(anchor[2])), (int(anchor[1]), int(anchor[3])), 
        colors.get(i), 
        thickness=1
    )

fig, ax = plt.subplots(dpi=200)
ax.imshow(img_)
ax.axis('off')
plt.show()

In [1]:
(1080*1920) / (600*600)

5.76

In [4]:
(1080*1920) / ((1080/2) * (1920/2))

4.0

### Regional Interest Projection

In [None]:
class Regional_Interest_Projection(tf.keras.layers.Lyaer):
    def __init__(self, base_layer, regional_interest, **kwargs):
        super(Regional_Interest_Projection, self).__init__(**kwargs)
        self.base_layer = base_layer
        self.regional_interest = regional_interest
        
    def projection(self, x):
        pass
        
    def call(self, anchor_box):
        x = projection(anchor_box)
        return x

In [None]:
def projection(inputs):
    pass

def Regional_Interest_Projection(feature_map, regional_interest):
    return projection(feature_map, regional_interest)

### RoI pooling

In [106]:
class RoIPooling(tf.keras.layers.Layer):
    def __init__(self, pool_size, num_rois, **kwargs):
        super(RoIPooling, self).__init__(**kwargs)
        self.pool_size = pool_size
        self.num_rois = num_rois

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]   

    def compute_output_shape(self, input_shape):
        return (None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels)

    def call(self, x):
        
        assert(len(x) == 2)

        # x[0] is image with shape (rows, cols, channels)
        img = x[0]

        # x[1] is roi with shape (num_rois,4) with ordering (x,y,w,h)
        rois = x[1]

        input_shape = tf.shape(img)

        outputs = []
        for roi_idx in range(self.num_rois):

            x = rois[0, roi_idx, 0]
            y = rois[0, roi_idx, 1]
            w = rois[0, roi_idx, 2]
            h = rois[0, roi_idx, 3]

            x = tf.keras.backend.cast(x, 'int32')
            y = tf.keras.backend.cast(y, 'int32')
            w = tf.keras.backend.cast(w, 'int32')
            h = tf.keras.backend.cast(h, 'int32')

            # Resized roi of the image to pooling size (7x7)
            rs = tf.image.resize(img[:, y:y+h, x:x+w, :], (self.pool_size, self.pool_size))
            outputs.append(rs)
                

        final_output = tf.keras.backend.concatenate(outputs, axis=0)

        # Reshape to (1, num_rois, pool_size, pool_size, nb_channels)
        final_output = tf.keras.backend.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))

        # permute_dimensions is similar to transpose
        final_output = tf.keras.backend.permute_dimensions(final_output, (0, 1, 2, 3, 4))

        return final_output
    
    
    def get_config(self):
        config = {'pool_size': self.pool_size,
                  'num_rois': self.num_rois}
        base_config = super(RoiPoolingConv, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [27]:
class ROIPoolingLayer(tf.keras.layers.Layer):
    def __init__(self, h, w, **kwargs):
        super(RoIPooling, self).__init__(**kwargs)
        self.h = h
        self.w = w
        
    @staticmethod
    def _pool_rois(feature_map, rois, pooled_height, pooled_width):
        def curried_pool_roi(roi):
            return ROIPoolingLayer._pool_roi(feature_map, roi,
                                             pooled_height, pooled_width)

        pooled_areas = tf.map_fn(curried_pool_roi, rois, dtype=tf.float32)
        return pooled_areas

    @staticmethod
    def _pool_roi(feature_map, roi, pooled_height, pooled_width):

        feature_map_height = int(feature_map.shape[0])
        feature_map_width = int(feature_map.shape[1])

        h_start = tf.cast(feature_map_height * roi[0], 'int32')
        w_start = tf.cast(feature_map_width * roi[1], 'int32')
        h_end = tf.cast(feature_map_height * roi[2], 'int32')
        w_end = tf.cast(feature_map_width * roi[3], 'int32')

        region = feature_map[h_start:h_end, w_start:w_end, :]

        region_height = h_end - h_start
        region_width = w_end - w_start
        h_step = tf.cast(region_height / pooled_height, 'int32')
        w_step = tf.cast(region_width / pooled_width, 'int32')

        areas = [[(
            i*h_step,
            j*w_step,
            (i+1)*h_step if i+1 < pooled_height else region_height,
            (j+1)*w_step if j+1 < pooled_width else region_width
        )
            for j in range(pooled_width)]
            for i in range(pooled_height)]

        def pool_area(x):
            return tf.math.reduce_max(region[x[0]:x[2], x[1]:x[3], :], axis=[0, 1])

        pooled_features = tf.stack([[pool_area(x) for x in row] for row in areas])
        return pooled_features
        
    def call(self, x):
        def curried_pool_rois(x):
            return ROIPoolingLayer._pool_rois(x[0], x[1],
                                              self.pooled_height,
                                              self.pooled_width)

        pooled_areas = tf.map_fn(curried_pool_rois, x, dtype=tf.float32)
        return pooled_areas
        
    def compute_output_shape(self, input_shape):
        feature_map_shape, rois_shape = input_shape
        assert feature_map_shape[0] == rois_shape[0]
        batch_size = feature_map_shape[0]
        n_rois = rois_shape[1]
        n_channels = feature_map_shape[3]
        return (batch_size, n_rois, self.pooled_height,
                self.pooled_width, n_channels)

###  Classifier

In [None]:
class Classifie(tf.keras.layers.Layer):
    def __init__(self, base_layers, input_rois, **kwargs):
        super(Classifie, self).__init__(**kwargs)
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense()
        self.dense2 = tf.kersa.layers.Dense()
        
        self.dense3 = tf.keras.layers.Dense()
        
        self.dense4a = tf.keras.layers.Dense(activation='softmax')
        self.dense4b = tf.keras.layers.Dense()
        
    def call(self, inputs):
        x = self.flatten(inputs)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        
        softmax = self.dense4a(x)
        bbox_regressor = self.dense4b(x)

## Faster R-CNN

In [None]:
class Faster_RCNN(tf.kersa.Model):
    def __init__(self, train_step=0, **kwargs):
        super(Faster_RCNN, self).__init__(*kwargs)
        self.rpn = RPN()
        self.rois = RoI()
        self.detector = Detector()
        self.train_step = train_step

    def compile(self, optimizer, ...):
        super(Faster_RCNN, self).compile()
        self.optimizer = optimizer
        ...
        
        self.rpn_loss_tracker = tf.keras.metrics.Mean(name='rpn_loss')
        self.detector_loss_tracker = tf.keras.metrics.Mean(name='detector_loss')
        
    def RPN_Loss(self, z):
        pass

    def Detector_Loss(self, x, z):
        pass
        
    def train_step(self, data):
        batch_size = tf.shape(data)[0]

        with tf.GradientTape() as tape:
            generated_image = self.Generating(num=batch_size)
            discriminator_loss = self.Discriminator_Loss(data, generated_image)
        grad = tape.gradient(discriminator_loss, self.discriminator.trainable_weights)
        self.discriminator_optimizer.apply_gradients(zip(grad, self.discriminator.trainable_weights))

        with tf.GradientTape() as tape:
            generated_image = self.Generating(num=batch_size)
            generator_loss = self.Generator_Loss(generated_image)
        grad = tape.gradient(generator_loss, self.generator.trainable_weights)
        self.generator_optimizer.apply_gradients(zip(grad, self.generator.trainable_weights))

        self.generator_loss_tracker.update_state(generator_loss)
        self.discriminator_loss_tracker.update_state(discriminator_loss)

        return {
            'discriminator_loss': self.discriminator_loss_tracker.result(),
            'generator_loss' : self.generator_loss_tracker.result()
        }

## Loss

### classification Loss

### bounding box regression Loss

In [260]:
def rpn_loss_regr_fixed_num(y_true, y_pred):

        # x is the difference between true value and predicted vaue
        x = y_true[:, :, :, 4 * num_anchors:] - y_pred

        # absolute value of x
        x_abs = K.abs(x)

        # If x_abs &lt;= 1.0, x_bool = 1
        x_bool = K.cast(K.less_equal(x_abs, 1.0), tf.float32)

        return lambda_rpn_regr * K.sum(
            y_true[:, :, :, :4 * num_anchors] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :, :4 * num_anchors])


def rpn_loss_regr(num_anchors):
    """Loss function for rpn regression
    Args:
        num_anchors: number of anchors (9 in here)
    Returns:
        Smooth L1 loss function 
                           0.5*x*x (if x_abs &lt; 1)
                           x_abx - 0.5 (otherwise)
    """
    return rpn_loss_regr_fixed_num
  
def rpn_loss_cls_fixed_num(y_true, y_pred):

            return lambda_rpn_class * K.sum(y_true[:, :, :, :num_anchors] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, :, :, num_anchors:])) / K.sum(epsilon + y_true[:, :, :, :num_anchors])


def rpn_loss_cls(num_anchors):
    """Loss function for rpn classification
    Args:
        num_anchors: number of anchors (9 in here)
        y_true[:, :, :, :9]: [0,1,0,0,0,0,0,1,0] means only the second and the eighth box is valid which contains pos or neg anchor =&gt; isValid
        y_true[:, :, :, 9:]: [0,1,0,0,0,0,0,0,0] means the second box is pos and eighth box is negative
    Returns:
        lambda * sum((binary_crossentropy(isValid*y_pred,y_true))) / N
    """
    

    return rpn_loss_cls_fixed_num

tqdm.std.tqdm