# import libs

In [None]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
import random
import pprint
import sys
import time
import numpy as np
from optparse import OptionParser
import pickle
import math
import cv2
import copy
from matplotlib import pyplot as plt
import tensorflow as tf
import pandas as pd
import os

from sklearn.metrics import average_precision_score


In [None]:
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, TimeDistributed, Dense, Flatten, Dropout

### Config setting

In [None]:
class Config:
    """Plain container holding the default settings read by the model code."""

    def __init__(self):
        """Assign the default value of every setting."""
        root_two = math.sqrt(2)

        # Whether progress information should be printed
        self.verbose = True

        # Name of the base (feature extractor) network
        self.network = 'vgg'

        # Data augmentation switches (all disabled by default)
        self.use_horizontal_flips = False
        self.use_vertical_flips = False
        self.rot_90 = False

        # Anchor box scales.
        # Note: if im_size is made smaller, anchor_box_scales should be scaled
        # down as well. The original comment cites [128, 256, 512] as the
        # scales used in the paper.
        self.anchor_box_scales = [64, 128, 256]

        # Anchor box ratios: 1:1, 1:2 and 2:1, the last two divided by sqrt(2)
        self.anchor_box_ratios = [
            [1, 1],
            [1. / root_two, 2. / root_two],
            [2. / root_two, 1. / root_two],
        ]

        # Size the smallest side of the image is resized to.
        # The original comment cites 600 as the paper's setting; 300 is used
        # here to save training time.
        self.im_size = 300

        # Image channel-wise mean to subtract, and the scaling factor
        self.img_channel_mean = [103.939, 116.779, 123.68]
        self.img_scaling_factor = 1.0

        # Number of ROIs processed at once
        self.num_rois = 4

        # Stride at the RPN (this depends on the network configuration)
        self.rpn_stride = 16

        self.balanced_classes = False

        # Scaling of the stdev
        self.std_scaling = 4.0
        self.classifier_regr_std = [8.0, 8.0, 4.0, 4.0]

        # Overlap thresholds for the RPN
        self.rpn_min_overlap = 0.3
        self.rpn_max_overlap = 0.7

        # Overlap thresholds for the classifier ROIs
        self.classifier_min_overlap = 0.1
        self.classifier_max_overlap = 0.5

        # Placeholder for the class mapping, automatically generated by the parser
        self.class_mapping = None

        self.model_path = None

### Parse the data from the annotation file

In [None]:
def get_data(input_path):
    """Parse the data from annotation file

    Every non-blank line of the annotation file must have the form
    ``path_filename,x1,y1,x2,y2,class_name`` where:
      * one path_filename may appear on several lines (one line per box),
      * x1, y1, x2, y2 are integer pixel values of the original image
        (not ratios),
      * (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner.

    Blank lines are skipped. Each distinct image is opened once with
    ``cv2.imread`` to obtain its width and height.

    Args:
        input_path: annotation file path

    Returns:
        all_data: list of dicts, one per image, with keys
            'filepath', 'width', 'height' and 'bboxes'
            (each bbox is a dict with keys 'class', 'x1', 'y1', 'x2', 'y2')
        classes_count: dict{key: class_name, value: count_num}
            e.g. {'Car': 2383, 'Mobile phone': 1108, 'Person': 3745}
        class_mapping: dict{key: class_name, value: idx}
            e.g. {'Car': 0, 'Mobile phone': 1, 'Person': 2}

    Raises:
        ValueError: if a non-blank line does not have exactly six
            comma-separated fields, or a coordinate is not an integer.
        IOError: if an image referenced by the file cannot be read.
    """
    found_bg = False
    # Keyed by filename so all boxes of one image are grouped in O(1) per line
    all_imgs = {}
    classes_count = {}
    class_mapping = {}

    with open(input_path, 'r') as f:

        print('Parsing annotation files')

        for i, line in enumerate(f, 1):

            # Print process
            sys.stdout.write('\r' + 'idx=' + str(i))

            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            line_split = line.split(',')
            if len(line_split) != 6:
                raise ValueError(
                    'Line {} of {} must have 6 comma-separated fields '
                    '(path_filename,x1,y1,x2,y2,class_name), got {}'.format(
                        i, input_path, len(line_split)))

            filename, x1, y1, x2, y2, class_name = line_split

            classes_count[class_name] = classes_count.get(class_name, 0) + 1

            if class_name not in class_mapping:
                if class_name == 'bg' and not found_bg:
                    print('Found class name with special name bg. Will be treated as a background region (this is usually for hard negative mining).')
                    found_bg = True
                class_mapping[class_name] = len(class_mapping)

            if filename not in all_imgs:
                img = cv2.imread(filename)
                if img is None:
                    # cv2.imread signals failure by returning None, not by raising
                    raise IOError('Could not read image: {}'.format(filename))
                rows, cols = img.shape[:2]
                # Must be a dict: the entry is indexed by string keys below
                all_imgs[filename] = {
                    'filepath': filename,
                    'width': cols,
                    'height': rows,
                    'bboxes': [],
                }

            all_imgs[filename]['bboxes'].append(
                {'class': class_name, 'x1': int(x1), 'y1': int(y1), 'x2': int(x2), 'y2': int(y2)})

    all_data = list(all_imgs.values())

    # Make sure the bg class is last in the mapping: swap its index with
    # whichever class currently holds the last index.
    last_idx = len(class_mapping) - 1
    if found_bg and class_mapping['bg'] != last_idx:
        key_to_switch = [key for key in class_mapping if class_mapping[key] == last_idx][0]
        class_mapping[key_to_switch] = class_mapping['bg']
        class_mapping['bg'] = last_idx

    return all_data, classes_count, class_mapping

### Define ROI Pooling Convolutional Layer

In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

In [None]:
class RoiPoolingConv(Layer):
    '''ROI pooling layer for 2D inputs.
    See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition,
    K. He, X. Zhang, S. Ren, J. Sun
    # Arguments
        pool_size: int
            Size of pooling region to use. pool_size = 7 will result in a 7x7 region.
        num_rois: number of regions of interest to be used
    # Input shape
        list of two tensors [X_img, X_roi] with shape:
        X_img:
        `(1, rows, cols, channels)`
        X_roi:
        `(1, num_rois, 4)` list of rois, with ordering (x,y,w,h)
    # Output shape
        5D tensor with shape:
        `(1, num_rois, pool_size, pool_size, channels)`
    '''
    def __init__(self, pool_size, num_rois, **kwargs):

        # K.image_dim_ordering() is not available in tf.keras.backend;
        # image_data_format() is the supported query. The value is only
        # stored here and is not read by the other methods of this class.
        self.dim_ordering = K.image_data_format()
        self.pool_size = pool_size
        self.num_rois = num_rois

        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        # The layer is called on [X_img, X_roi], so a sequence of shapes is expected
        assert isinstance(input_shape, (list, tuple))
        # Channel count of the image/feature-map input (last axis of X_img)
        self.nb_channels = input_shape[0][3]

        super(RoiPoolingConv, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels

    def call(self, x, mask=None):

        assert(len(x) == 2)

        # x[0] is image with shape (batch_size, rows, cols, channels)
        img = x[0]

        # x[1] is roi with shape (batch_size, num_rois, 4) with ordering (x,y,w,h)
        rois = x[1]

        outputs = []

        for roi_idx in range(self.num_rois):

            # Only the first batch element of the rois is read (index 0)
            roi_x = K.cast(rois[0, roi_idx, 0], 'int32')
            roi_y = K.cast(rois[0, roi_idx, 1], 'int32')
            roi_w = K.cast(rois[0, roi_idx, 2], 'int32')
            roi_h = K.cast(rois[0, roi_idx, 3], 'int32')

            # Crop the roi and resize it to (pool_size, pool_size).
            # NOTE: this uses an image resize of the crop rather than a
            # max-pool over sub-windows.
            crop = img[:, roi_y:roi_y + roi_h, roi_x:roi_x + roi_w, :]
            rs = tf.image.resize(crop, (self.pool_size, self.pool_size))
            outputs.append(rs)

        final_output = K.concatenate(outputs, axis=0)

        # Reshape to (1, num_rois, pool_size, pool_size, nb_channels).
        # The axes are already in their final order, so no permutation is needed.
        final_output = K.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))

        return final_output

    def get_config(self):
        config = {'pool_size': self.pool_size,
                  'num_rois': self.num_rois}
        base_config = super(RoiPoolingConv, self).get_config()
        # Merge base and layer-specific settings; layer-specific keys win on clash
        return dict(list(base_config.items()) + list(config.items()))

### Vgg-16 model

In [None]:
def get_img_output_length(width, height):
    """Return the (width, height) pair with each value floor-divided by 16.

    Args:
        width: input width
        height: input height

    Returns:
        tuple (width // 16, height // 16)
    """
    return tuple(side // 16 for side in (width, height))

def nn_base(input_tensor=None, trainable=False):
    """Build the VGG-16 style convolutional base (blocks 1-5, no final pool).

    Args:
        input_tensor: optional tensor to use as the network input. When None,
            a new `Input` of shape (None, None, 3) is created. A tensor that is
            not a Keras tensor is wrapped in an `Input`; a Keras tensor is
            used as is.
        trainable: accepted for interface compatibility; not used by this
            function.

    Returns:
        Output tensor of the last convolution of block 5 (512 channels).
        Four 2x2/stride-2 poolings are applied before it.
    """
    input_shape = (None, None, 3)

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    elif not K.is_keras_tensor(input_tensor):
        # Wrap a raw backend tensor so it can start a Keras graph
        img_input = Input(tensor=input_tensor, shape=input_shape)
    else:
        img_input = input_tensor

    # Block 1
    x = Conv2D(64, (3, 3), activation="relu", padding="same", name="block1_conv1")(img_input)
    x = Conv2D(64, (3, 3), activation="relu", padding="same", name="block1_conv2")(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name="block1_pool")(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation="relu", padding="same", name="block2_conv1")(x)
    x = Conv2D(128, (3, 3), activation="relu", padding="same", name="block2_conv2")(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name="block2_pool")(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation="relu", padding="same", name="block3_conv1")(x)
    x = Conv2D(256, (3, 3), activation="relu", padding="same", name="block3_conv2")(x)
    x = Conv2D(256, (3, 3), activation="relu", padding="same", name="block3_conv3")(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name="block3_pool")(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block4_conv1")(x)
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block4_conv2")(x)
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block4_conv3")(x)
    # The pooling layer must be applied to x, not merely constructed
    x = MaxPooling2D((2, 2), strides=(2, 2), name="block4_pool")(x)

    # Block 5 (the block5_pool layer is intentionally left out)
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block5_conv1")(x)
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block5_conv2")(x)
    x = Conv2D(512, (3, 3), activation="relu", padding="same", name="block5_conv3")(x)

    return x

### RPN layer

In [None]:
def rpn_layer(base_layers, num_anchors):
    """Build the region proposal network head on top of the base feature map.

    A shared 3x3, 512-channel convolution ('same' padding, so the spatial
    size of the feature map is preserved) feeds two 1x1 convolutions that
    take the place of fully connected layers:
      * classification: num_anchors channels with sigmoid activation
      * regression: num_anchors * 4 channels with linear activation

    Args:
        base_layers: feature map produced by the base network
        num_anchors: number of anchors per feature-map location

    Returns:
        [x_class, x_regr, base_layers]
        x_class: per-anchor objectness output
        x_regr: per-anchor bounding-box regression output
        base_layers: the input feature map, passed through unchanged
    """
    shared_conv = Conv2D(
        512,
        (3, 3),
        padding="same",
        activation="relu",
        kernel_initializer="normal",
        name="rpn_conv1",
    )
    objectness_head = Conv2D(
        num_anchors,
        (1, 1),
        activation="sigmoid",
        kernel_initializer="uniform",
        name="rpn_out_class",
    )
    box_head = Conv2D(
        num_anchors * 4,
        (1, 1),
        activation="linear",
        kernel_initializer="zero",
        name="rpn_out_regress",
    )

    shared_features = shared_conv(base_layers)
    return [objectness_head(shared_features), box_head(shared_features), base_layers]

### Classifier layer

In [None]:
def classifier_layer(base_layers, input_rois, num_rois, nb_classes=4):
    """Create a classifier layer

    Args:
        base_layers: feature map from the base network (vgg)
        input_rois: `(1,num_rois,4)` list of rois, with ordering (x,y,w,h)
        num_rois: number of rois to be processed in one time (4 in here)
        nb_classes: number of classes; the class output has nb_classes units
            and the regression output has 4 * (nb_classes - 1) units

    Returns:
        list(out_class, out_regr)
        out_class: classifier layer output
        out_regr: regression layer output
    """
    pooling_regions = 7

    # out_roi_pool.shape = (1, num_rois, pool_size, pool_size, channels)
    # num_rois (4) 7x7 roi pooling
    out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])

    # Flatten each roi and pass it through 2 FC layers, each followed by dropout
    out = TimeDistributed(Flatten(name="flatten"))(out_roi_pool)
    out = TimeDistributed(Dense(4096, activation="relu", name="fc1"))(out)
    out = TimeDistributed(Dropout(0.5))(out)
    out = TimeDistributed(Dense(4096, activation="relu", name="fc2"))(out)
    out = TimeDistributed(Dropout(0.5))(out)

    # There are two output layers
    # out_class: softmax activation function for classifying the class name of the object
    # out_regr: linear activation function for bboxes coordinates regression
    out_class = TimeDistributed(Dense(nb_classes, activation="softmax", kernel_initializer="zero"), name="dense_class_{}".format(nb_classes))(out)
    # note: no regression target for bg class
    out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation="linear", kernel_initializer="zero"), name="dense_regress_{}".format(nb_classes))(out)

    return [out_class, out_regr]