In [19]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [20]:
import wget
import tempfile
import tarfile
from zipfile import ZipFile
from glob import glob 
import json
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import numpy as np
import cv2
import os
import pdb

In [21]:
from fastestimator.estimator.estimator import Estimator
from fastestimator.network.loss import Loss
from fastestimator.network.model import ModelOp, FEModel
from fastestimator.network.network import Network
from fastestimator.pipeline.pipeline import Pipeline
from fastestimator.pipeline.processing import Minmax
from fastestimator.record.preprocess import ImageReader, Resize
from fastestimator.record.record import RecordWriter
from fastestimator.util.op import NumpyOp, TensorOp
from fastestimator.estimator.trace import MeanAveragePrecision, LRController, ModelSaver

In [22]:
from fastestimator.util.compute_overlap import compute_overlap

In [23]:
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import layers, models


In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import layers, models

def classification_sub_net(num_classes, num_anchor=9):
    model = models.Sequential()
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01), bias_initializer='zeros'))
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01), bias_initializer='zeros'))
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01), bias_initializer='zeros'))
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01), bias_initializer='zeros'))
    model.add(
        layers.Conv2D(num_classes * num_anchor,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='sigmoid',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer=tf.initializers.constant(np.log(1 / 99))))
    model.add(layers.Reshape((-1, num_classes)))  # the output dimension is [batch, #anchor, #classes]
    return model

In [25]:
def regression_sub_net(num_anchor=9):
    model = models.Sequential()
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer='zeros'))
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer='zeros')) 
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer='zeros'))
    model.add(
        layers.Conv2D(256,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      activation='relu',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer='zeros'))
    model.add(
        layers.Conv2D(4 * num_anchor,
                      kernel_size=3,
                      strides=1,
                      padding='same',
                      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                      bias_initializer='zeros'))
    model.add(layers.Reshape((-1, 4)))  # the output dimension is [batch, #anchor, 4]
    return model

In [26]:
def RetinaNet(input_shape, num_classes, num_anchor=9):
    inputs = tf.keras.Input(shape=input_shape)
    # FPN
#     weights = '/home/ubuntu/ResNet-50-model.keras.h5'
    resnet50 = tf.keras.applications.ResNet50(weights= "imagenet", include_top=False, input_tensor=inputs, pooling=None)
#     resnet50.load_weights(weights, by_name=True)
    assert resnet50.layers[80].name == "conv3_block4_out"
    C3 = resnet50.layers[80].output
    assert resnet50.layers[142].name == "conv4_block6_out"
    C4 = resnet50.layers[142].output
    assert resnet50.layers[-1].name == "conv5_block3_out"
    C5 = resnet50.layers[-1].output
    P5 = layers.Conv2D(256, kernel_size=1, strides=1, padding='same')(C5)
    P5_upsampling = layers.UpSampling2D()(P5)
    P4 = layers.Conv2D(256, kernel_size=1, strides=1, padding='same')(C4)
    P4 = layers.Add()([P5_upsampling, P4])
    P4_upsampling = layers.UpSampling2D()(P4)
    P3 = layers.Conv2D(256, kernel_size=1, strides=1, padding='same')(C3)
    P3 = layers.Add()([P4_upsampling, P3])
    P6 = layers.Conv2D(256, kernel_size=3, strides=2, padding='same', name="P6")(C5)
    P7 = layers.Activation('relu')(P6)
    P7 = layers.Conv2D(256, kernel_size=3, strides=2, padding='same', name="P7")(P7)
    P5 = layers.Conv2D(256, kernel_size=3, strides=1, padding='same', name="P5")(P5)
    P4 = layers.Conv2D(256, kernel_size=3, strides=1, padding='same', name="P4")(P4)
    P3 = layers.Conv2D(256, kernel_size=3, strides=1, padding='same', name="P3")(P3)
    # classification subnet
    cls_subnet = classification_sub_net(num_classes=num_classes, num_anchor=num_anchor)
    P3_cls = cls_subnet(P3)
    P4_cls = cls_subnet(P4)
    P5_cls = cls_subnet(P5)
    P6_cls = cls_subnet(P6)
    P7_cls = cls_subnet(P7)
    cls_output = layers.Concatenate(axis=-2)([P3_cls, P4_cls, P5_cls, P6_cls, P7_cls])
    # localization subnet
    loc_subnet = regression_sub_net(num_anchor=num_anchor)
    P3_loc = loc_subnet(P3)
    P4_loc = loc_subnet(P4)
    P5_loc = loc_subnet(P5)
    P6_loc = loc_subnet(P6)
    P7_loc = loc_subnet(P7)
    loc_output = layers.Concatenate(axis=-2)([P3_loc, P4_loc, P5_loc, P6_loc, P7_loc])
    return tf.keras.Model(inputs=inputs, outputs=[cls_output, loc_output])



# def get_loc_offset(box_gt, box_anchor):
#     mean = 0 
#     std = 0.2
#     gt_x1, gt_y1, gt_x2, gt_y2 = tuple(box_gt)
#     ac_x1, ac_y1, ac_x2, ac_y2 = tuple(box_anchor)
#     anchor_width = ac_x2 - ac_x1
#     anchor_height = ac_y2 - ac_y1
#     dx1 = (gt_x1 - ac_x1) / anchor_width
#     dx1 = dx1 / std
#     dy1 = (gt_y1 - ac_y1) / anchor_height
#     dy1 = dy1 / std
#     dx2 = (gt_x2 - ac_x2) / anchor_width
#     dx2 = dx2 / std
#     dy2 = (gt_y2 - ac_y2) / anchor_height
#     dy2 = dy2 /std
#     return dx1, dy1, dx2, dy2


def get_loc_offset(box_gt, box_anchor):
    mean = 0 
    std = 0.2
    anchor_width_height = np.tile(box_anchor[:,2:] - box_anchor[:,:2],[1,2])
    delta =  (box_gt - box_anchor)/ anchor_width_height
    return delta/std


def get_iou(box1, box2):
    b1_x1, b1_y1, b1_x2, b1_y2 = tuple(box1)
    b2_x1, b2_y1, b2_x2, b2_y2 = tuple(box2)
    xA = max(b1_x1, b2_x1)
    yA = max(b1_y1, b2_y1)
    xB = min(b1_x2, b2_x2)
    yB = min(b1_y2, b2_y2)
    
    interArea = max(0, xB - xA) * max(0, yB - yA)
    if interArea == 0:
        iou = 0
    else:
        box1Area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
        box2Area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
        iou = interArea / (box1Area + box2Area - interArea)
    return iou

In [27]:
from ast import literal_eval
# SHAPE=(288*2,288*2,3)

class ResnetPreprocess(NumpyOp):
    def forward(self, data, state):
        image = data
        image = image.astype(np.float64)
        image[..., 0] -= 103.939
        image[..., 1] -= 116.779
        image[..., 2] -= 123.68
        return image        

class String2FloatArray(NumpyOp):
    # this thing converts '[1, 2, 3]' into np.array([1, 2, 3])
    def forward(self, data, state):
        for idx, elem in enumerate(data):
            elem = literal_eval(elem)
            data[idx] = np.array(elem, dtype=np.int32)
#             data[idx] = np.array([float(x) for x in elem[1:-1].split(',')])
        return data
    
class String2IntArray(NumpyOp):
    # this thing converts '[1, 2, 3]' into np.array([1, 2, 3])
    def forward(self, data, state):
        for idx, elem in enumerate(data):
            elem = literal_eval(elem)
            data[idx] = np.array(elem ,dtype=np.int32)
#             data[idx] = np.array([int(x) for x in elem[1:-1].split(',')])
        return data
    
    
class RelativeCoordinate(NumpyOp):
    def forward(self, data, state):
        image, x1, y1, x2, y2 = data
        height, width = image.shape[0], image.shape[1]
        x1, y1, x2, y2 = x1 / width, y1 / height, x2 / width, y2 / height
        return x1, y1, x2, y2
    
class ImageAdjustedBatchMax(TensorOp):
    def forward(self, data, state):
        images = data
        batch_size = len(images)
        max_shape = tuple(max(image.shape[idx] for image in images) for idx in range(3) )
        images_max_shape = np.zeros((batch_size,)+max_shape)
        for id, image in enumerate(images):
            images_max_shape[idx, :image.shape[0], :image.shape[1], :image.shape[2] ] = image
        return image_max_shape
    
class ResizeCocoStyle(Resize):
    def __init__(self, target_size, keep_ratio=False, inputs=None, outputs=None, mode=None):
        super().__init__(target_size, keep_ratio=keep_ratio, inputs=inputs, outputs=outputs, mode=mode)
        
        
    def forward(self, data, state):
        img, x1, y1, x2, y2 = data 
        if self.keep_ratio:
            original_ratio = img.shape[1] / img.shape[0]
            target_ratio = self.target_size[1] / self.target_size[0]
            if original_ratio >= target_ratio:
                pad = (img.shape[1] / target_ratio - img.shape[0]) / 2
                pad_boarder = (np.ceil(pad).astype(np.int), np.floor(pad).astype(np.int), 0, 0)
                y1 += np.ceil(pad).astype(np.int)
                y2 += np.ceil(pad).astype(np.int)
            else:
                pad = (img.shape[0] * target_ratio - img.shape[1]) / 2
                pad_boarder = (0, 0, np.ceil(pad).astype(np.int), np.floor(pad).astype(np.int))
                x1 += np.ceil(pad).astype(np.int)
                x2 += np.ceil(pad).astype(np.int)
                
            img = self._cv2.copyMakeBorder(img, *pad_boarder, self._cv2.BORDER_CONSTANT)
        img_resize = self._cv2.resize(img, (self.target_size[1], self.target_size[0]), self.resize_method)
        x1 = x1 * self.target_size[1]/img.shape[1]
        x2 = x2 * self.target_size[1]/img.shape[1]
        y1 = y1 * self.target_size[0]/img.shape[0]
        y2 = y2 * self.target_size[0]/img.shape[0]
        return img_resize, x1,y1,x2,y2
        
            
    
class GenerateTarget(NumpyOp):
    def __init__(self, inputs=None, outputs=None, mode=None, input_shape=(800,800,3)):
        super().__init__(inputs=inputs, outputs=outputs, mode=mode)
        self.pyramid_levels = [3,4,5,6,7]
        self.sizes   = [32, 64, 128, 256, 512]
        self.strides = [8, 16, 32, 64, 128]
        self.ratios  = np.array([0.5, 1, 2], dtype=np.float)
        self.scales  = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], dtype=np.float)
        
        
        self.anchors_list = np.zeros((0,4))
        image_shapes = [(np.array(input_shape[:2]) + 2**pyra_level-1)//(2**pyra_level) for pyra_level in self.pyramid_levels]
        for idx, pyra_level in enumerate(self.pyramid_levels) :
            base_size=self.sizes[idx]
            ratios=self.ratios
            scales=self.scales
            image_shape = image_shapes[idx]
            strides = self.strides[idx]
            anchors = self.generate_anchors_core(base_size, ratios, scales)
            shifted_anchors = self.shift(image_shape, strides, anchors)
            self.anchors_list = np.append(self.anchors_list, shifted_anchors, axis=0)
        
        
    def forward(self, data, state):
        label, x1, y1, x2, y2, image = data
        target_cls, target_loc = self.get_target(self.anchors_list, label, x1, y1, x2, y2, num_classes=81)
        return target_cls, target_loc, self.anchors_list
        
        
#     def get_target(self, anchorbox, label, x1, y1, x2, y2, num_classes=20):
#         bg_index = num_classes - 1
#         num_anchor = anchorbox.shape[0]
#         target_cls = np.zeros(shape=(num_anchor), dtype=np.int64) + (bg_index)  #initializing with bg_index
#         target_loc = np.zeros(shape=(num_anchor, 4), dtype=np.float32)
#         target_iou = np.zeros(shape=(num_anchor), dtype=np.float32)
#         for _label, _x1, _y1, _x2, _y2 in zip(label, x1, y1, x2, y2):
#             best_iou = 0.0
#             for anchor_idx in range(num_anchor):
#                 iou = get_iou((_x1, _y1, _x2, _y2), anchorbox[anchor_idx])

#                 if iou > best_iou:
#                     best_iou = iou
#                     best_anchor_idx = anchor_idx
#                 if iou > target_iou[anchor_idx]:    
#                     if iou > 0.5 :
#                         target_cls[anchor_idx] = _label
#                         target_loc[anchor_idx] = get_loc_offset((_x1, _y1, _x2, _y2), anchorbox[anchor_idx])
#                         target_iou[anchor_idx] = iou
#                     elif iou >0.4:
#                         target_cls[anchor_idx] = -2 #ignore this example
#                         target_iou[anchor_idx] = iou
#                     else:
#                         target_cls[anchor_idx] = bg_index
#                         target_iou[anchor_idx] = iou
#             if best_iou > 0 and best_iou < 0.5: #if gt has no >0.5 iou with any anchor
#                 target_cls[best_anchor_idx] = _label
#                 target_loc[best_anchor_idx] = get_loc_offset((_x1, _y1, _x2, _y2), anchorbox[best_anchor_idx])
#                 target_iou[best_anchor_idx] = 1.0  # 
#         return target_cls, target_loc
        
        
    def get_target(self, anchorbox, label, x1, y1, x2, y2, num_classes=20):
        bg_index = num_classes -1
        query_box= np.zeros((0,4))
        query_label = np.zeros((0))
        for _x1, _y1, _x2, _y2,_label in zip(x1, y1, x2, y2, label):
            query_box = np.append(query_box, np.array([[_x1,_y1,_x2,_y2]]), axis=0)
            query_label = np.append(query_label, _label)
        
        overlap = compute_overlap(anchorbox.astype(np.float64), query_box.astype(np.float64))
        argmax_overlaps_inds = np.argmax(overlap, axis=1)
        max_overlaps = overlap[ np.arange(overlap.shape[0]) , argmax_overlaps_inds]
        positive_index = (max_overlaps > 0.5)
        ignore_index = (max_overlaps > 0.4)  & ~positive_index
        negative_index = (max_overlaps <= 0.4)
        
        target_loc = get_loc_offset(query_box[ argmax_overlaps_inds, :], anchorbox)
        target_cls = query_label[argmax_overlaps_inds]
        target_cls[negative_index] = bg_index
        target_cls[ignore_index] = -2 # ignore this example
        
        return target_cls, target_loc
        
        
        
        
    def generate_anchors_core(self, base_size, ratios, scales):
        num_anchors = len(ratios) * len(scales)
        # initialize output anchors
        anchors = np.zeros((num_anchors, 4))
        # scale base_size
        anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T

        # compute areas of anchors
        areas = anchors[:, 2] * anchors[:, 3]

        # correct for ratios
        anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales)))
        anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales))

        # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2)
        anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
        anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T

        return anchors
    
    def shift(self, image_shape, stride, anchors):
        shift_x = (np.arange(0, image_shape[1]) + 0.5) * stride
        shift_y = (np.arange(0, image_shape[0]) + 0.5) * stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)

        shifts = np.vstack((
            shift_x.ravel(), shift_y.ravel(),
            shift_x.ravel(), shift_y.ravel()
        )).transpose()

        A = anchors.shape[0]
        K = shifts.shape[0]
        all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
        all_anchors = all_anchors.reshape((K * A, 4))

        return all_anchors
        
    

In [28]:


# sample_array = np.zeros((0, 4))
# sample_array
# for i in range(2):
#     sample_array = np.append(sample_array,np.array([[i,i,i,i]]), axis=0)
# sample_array

# sample_array.shape

In [29]:
class RetinaLoss(Loss):
    def focal_loss(self, cls_gt_example, cls_pred_example, num_classes, alpha=0.25, gamma=2.0):
        bg_index = num_classes - 1 
        # cls_gt has shape [A], cls_pred is in [A, K]
        # gather the objects and background, discard the rest
        obj_idx = tf.where(tf.logical_and( tf.greater_equal(cls_gt_example, 0), tf.less(cls_gt_example, bg_index)))
        obj_bg_idx = tf.where(tf.greater_equal(cls_gt_example, 0))
        obj_bg_count = tf.cast(tf.shape(obj_bg_idx)[0], tf.float32)
        obj_count = tf.cast(tf.maximum(tf.shape(obj_idx)[0],1), tf.float32)
        cls_gt_example = tf.one_hot(cls_gt_example, num_classes)
        cls_gt_example = tf.gather_nd(cls_gt_example, obj_bg_idx)
        cls_pred_example = tf.gather_nd(cls_pred_example, obj_bg_idx)
        cls_gt_example = tf.reshape(cls_gt_example, (-1, 1))
        cls_pred_example = tf.reshape(cls_pred_example, (-1, 1))
        # compute the focal weight on each selected anchor box
        alpha_factor = tf.ones_like(cls_gt_example) * alpha
        alpha_factor = tf.where(tf.equal(cls_gt_example, 1), alpha_factor, 1 - alpha_factor)
        focal_weight = tf.where(tf.equal(cls_gt_example, 1), 1 - cls_pred_example, cls_pred_example)
        focal_weight = alpha_factor * focal_weight**gamma / obj_count
        cls_loss = tf.losses.BinaryCrossentropy(reduction='sum')(cls_gt_example, cls_pred_example, sample_weight=focal_weight)
        return cls_loss, obj_idx

    def smooth_l1(self, loc_gt_example, loc_pred_example, obj_idx):
        # loc_gt anf loc_pred has shape [A, 4]
        sigma= 3
        sigma_squared = sigma ** 3
        obj_count = tf.cast(tf.maximum(tf.shape(obj_idx)[0],1), tf.float32)
        loc_gt = tf.gather_nd(loc_gt_example, obj_idx)
        loc_pred = tf.gather_nd(loc_pred_example, obj_idx)
        loc_gt = tf.reshape(loc_gt, (-1, 1))
        loc_pred = tf.reshape(loc_pred, (-1, 1))
        loc_diff = tf.abs(loc_gt - loc_pred)
        smooth_l1_loss = tf.where(tf.less(loc_diff, 1/sigma_squared), 0.5 * loc_diff**2 * sigma_squared, 
                                  loc_diff - 0.5/sigma_squared)
        smooth_l1_loss = tf.reduce_sum(smooth_l1_loss)/ obj_count
        return smooth_l1_loss

    def forward(self, data, state):
        cls_gt, loc_gt, cls_pred, loc_pred = data
        cls_gt = tf.cast(cls_gt, tf.int32 )
        batch_size = state["batch_size_per_device"]
        total_loss = []
        for idx in range(batch_size):
            cls_gt_example = cls_gt[idx]
            loc_gt_example = loc_gt[idx]
            cls_pred_example = cls_pred[idx]
            loc_pred_example = loc_pred[idx]
            focal_loss, obj_idx = self.focal_loss(cls_gt_example, cls_pred_example, num_classes=80+1)
            smooth_l1_loss = self.smooth_l1(loc_gt_example, loc_pred_example, obj_idx)
            total_loss.append(focal_loss + smooth_l1_loss)
        total_loss = tf.convert_to_tensor(total_loss)
        return total_loss
    

In [30]:
class PredictBox(TensorOp):
    def __init__(self, num_classes, inputs=None, outputs=None, mode=None):
        super().__init__(inputs=inputs, outputs=outputs, mode=mode)
        self.num_classes = num_classes
        self.bg_index = num_classes - 1
                   
    def forward(self, data, state):
        cls_pred, loc_pred, loc_base = data
        input_width = 1280
        input_height = 800
        top_n = 300
        score_threshold = 0.05   
        std = 0.2
        mean = 0
        # convert the residual prediction to absolute prediction in (x1, y1, x2, y2)
        anchor_w_h = tf.tile(loc_base[:,:, 2:], [1, 1, 2]) - tf.tile(loc_base[:,:,:2], [1, 1, 2])
        anchorbox =  loc_base
        loc_pred_abs = tf.map_fn(lambda x: (x[0]*std+mean) * x[1] + x[2],
                             elems=(loc_pred, anchor_w_h, anchorbox),
                             dtype=tf.float32,
                             back_prop=False)
        x1 = tf.clip_by_value(loc_pred_abs[:, :, 0], 0, input_width)
        y1 = tf.clip_by_value(loc_pred_abs[:, :, 1], 0, input_height)
        x2 = tf.clip_by_value(loc_pred_abs[:, :, 2], 0, input_width)
        y2 = tf.clip_by_value(loc_pred_abs[:, :, 3], 0, input_height)
        loc_pred_abs = tf.stack([x1, y1, x2, y2] ,axis=2)
        
        num_batch, num_anchor, _ = loc_pred_abs.shape
        cls_best_score = tf.reduce_max(cls_pred, axis=-1)
        cls_best_class = tf.argmax(cls_pred, axis=-1)
        
        cls_best_score = tf.where(tf.not_equal(cls_best_class, self.bg_index), cls_best_score,0)
        
        # select top n anchor boxes to proceed
        # Padded Nonmax suppression with threshold
        selected_indices_padded = tf.map_fn(
            lambda x: tf.image.non_max_suppression_padded(
                x[0], x[1], top_n, pad_to_max_output_size=True, score_threshold=score_threshold).selected_indices,
            (loc_pred_abs, cls_best_score),
            dtype=tf.int32,
            back_prop=False)
        valid_outputs = tf.map_fn(
            lambda x: tf.image.non_max_suppression_padded(
                x[0], x[1], top_n, pad_to_max_output_size=True, score_threshold=score_threshold).valid_outputs,
            (loc_pred_abs, cls_best_score),
            dtype=tf.int32,
            back_prop=False)
        return loc_pred_abs, selected_indices_padded, valid_outputs

In [31]:
path = '/home/ubuntu/coco/'
train_csv = os.path.join(path,'train_coco.csv')
val_csv = os.path.join(path,'val_coco.csv')

In [34]:
# 32*40, 32*25
writer = RecordWriter(
        train_data=train_csv,
        save_dir = '/home/ubuntu/coco/tf_records',
        validation_data=val_csv,
        ops=[
                ImageReader(inputs="image", parent_path=path, outputs="image"),
                String2IntArray(inputs=["label"], outputs=["label"]),
                String2FloatArray(inputs=["x1", "y1", "x2", "y2"], outputs=["x1", "y1", "x2", "y2"]),
                ResnetPreprocess(inputs="image", outputs="image"), 
                ResizeCocoStyle((800, 1280), keep_ratio=True, inputs=["image", "x1", "y1", "x2", "y2" ] , outputs=["image","x1","y1","x2","y2"]),
                GenerateTarget(inputs=["label","x1","y1","x2","y2","image"], outputs=["target_cls","target_loc","base_loc"], input_shape=(800,1280,3))
        ])

pipeline = Pipeline(batch_size=2, data=writer, read_feature=["image","image_id", "target_cls", "target_loc","base_loc"])
# pipeline = Pipeline(batch_size=2, data=writer, read_feature=["image","image_id", "x1","y1","x2","y2"], padded_batch=True)


In [33]:
show_batch = pipeline.show_results(mode='eval')

FastEstimator: Saving tfrecord to /home/ubuntu/coco/tf_records
FastEstimator: Converting Train TFRecords 0.0%, Speed: 0.00 record/sec
FastEstimator: Converting Train TFRecords 4.8%, Speed: 157.12 record/sec
FastEstimator: Converting Train TFRecords 9.6%, Speed: 110.50 record/sec
FastEstimator: Converting Train TFRecords 14.4%, Speed: 86.67 record/sec
FastEstimator: Converting Train TFRecords 19.1%, Speed: 47.79 record/sec
FastEstimator: Converting Train TFRecords 23.9%, Speed: 65.08 record/sec
FastEstimator: Converting Train TFRecords 28.7%, Speed: 9.41 record/sec
FastEstimator: Converting Train TFRecords 33.5%, Speed: 0.00 record/sec
FastEstimator: Converting Train TFRecords 38.3%, Speed: 6.81 record/sec
FastEstimator: Converting Train TFRecords 43.1%, Speed: 6.90 record/sec
FastEstimator: Converting Train TFRecords 47.9%, Speed: 6.89 record/sec
FastEstimator: Converting Train TFRecords 52.7%, Speed: 6.93 record/sec
FastEstimator: Converting Train TFRecords 57.4%, Speed: 6.89 record/s

In [39]:
a = np.array([[1,1,2,2],[2,2,3,3],[3,3,4,4],[4,4,5,5]])
b = np.array([2,2,2,2])
a.shape

(4, 4)

In [40]:
c = np.tile(a[:,2:]-a[:,:2],[1,2])
c.shape

(4, 4)

In [41]:
show_batch[0]['target_cls']

<tf.Tensor: id=475, shape=(8, 191970), dtype=float32, numpy=
array([[80., 80., 80., ..., 80., 80., 80.],
       [80., 80., 80., ..., 80., 80., 80.],
       [80., 80., 80., ..., 80., 80., 80.],
       ...,
       [80., 80., 80., ..., 80., 80., 80.],
       [80., 80., 80., ..., 80., 80., 80.],
       [80., 80., 80., ..., 80., 80., 80.]], dtype=float32)>

In [42]:
show_batch[0]['target_loc']

<tf.Tensor: id=476, shape=(8, 191970, 4), dtype=float32, numpy=
array([[[ 7.44260178e+01,  8.07973480e+01,  7.71599960e+01,
          7.94801941e+01],
        [ 5.95877190e+01,  6.46446457e+01,  6.07261810e+01,
          6.25677223e+01],
        [ 4.78105507e+01,  5.18242378e+01,  4.76826553e+01,
          4.91442833e+01],
        ...,
        [-5.24779129e+00, -7.70829201e-01, -9.28104401e+00,
         -5.65574026e+00],
        [-3.64942598e+00, -9.60588679e-02, -7.88211775e+00,
         -5.00471258e+00],
        [-2.38080263e+00,  4.39506710e-01, -6.77178955e+00,
         -4.48799181e+00]],

       [[ 6.07067528e+01,  1.09068619e+02,  9.42762146e+01,
          1.53462112e+02],
        [ 4.86987305e+01,  8.70835648e+01,  7.43113327e+01,
          1.21287209e+02],
        [ 3.91679573e+01,  6.96340256e+01,  5.84651947e+01,
          9.57499695e+01],
        ...,
        [-6.96269941e+00,  1.12648062e-01, -7.14151669e+00,
         -3.34380531e+00],
        [-5.01054955e+00,  6.05157495e

In [43]:
# print(len(show_batch))
# show_batch[0].keys()

# show_batch[0]['base_loc'][0]

# batch_idx=0
# high = len(show_batch[batch_idx]['image_id'])
# idx = np.random.randint(low=0,high=high)
# print('selected index', idx)
# image = show_batch[batch_idx]['image'][idx]
# target_cls = show_batch[batch_idx]['target_cls'][idx]
# target_loc = show_batch[batch_idx]['target_loc'][idx]
# base_loc = show_batch[batch_idx]['base_loc'][idx]

# bg_index = 80
# obj_idx = tf.where(tf.logical_and( tf.greater(target_cls, 0), tf.less(target_cls, bg_index)))
# # obj_bg_idx = tf.where(tf.greater_equal(target_cls, 0))
# target_cls_filt = tf.gather_nd(target_cls, obj_idx)
# target_loc_filt = tf.gather_nd(target_loc, obj_idx)

# print(target_loc_filt)
# print(target_cls_filt)
# print(base_loc.shape)

In [35]:
# prepare model
model = FEModel(model_def=lambda: RetinaNet(input_shape=(800, 1280, 3), num_classes=80+1),
                model_name="retinanet",
                optimizer=tf.optimizers.Adam(learning_rate=0.00001))
network = Network(ops=[
    ModelOp(inputs="image", model=model, outputs=["pred_cls", "pred_loc"]),
    PredictBox(80+1, inputs=["pred_cls","pred_loc","base_loc"], outputs=("abs_loc","selected_indices_padded", "valid_outputs"), mode="eval"),
    RetinaLoss(inputs=("target_cls", "target_loc", "pred_cls", "pred_loc"), outputs="loss"),
])
# prepare estimator
model_dir = '/home/ubuntu/coco/bestmodel'
traces = [MeanAveragePrecision('selected_indices_padded','valid_outputs','image_id','pred_cls', 'abs_loc',
                                coco_path='/home/ubuntu/coco', val_csv='val_coco.csv'), 
            LRController(model_name="retinanet", reduce_on_eval=True, reduce_patience=2, min_lr=1e-09,  reduce_factor=0.33),
            ModelSaver(model_name="retinanet", save_dir=model_dir, save_best=True)
         ]
estimator = Estimator(network=network, pipeline=pipeline, epochs=100, log_steps=10, traces=traces)

loading annotations into memory...
Done (t=7.86s)
creating index...
index created!


In [None]:
estimator.fit()

    ______           __  ______     __  _                 __            
   / ____/___ ______/ /_/ ____/____/ /_(_)___ ___  ____ _/ /_____  _____
  / /_  / __ `/ ___/ __/ __/ / ___/ __/ / __ `__ \/ __ `/ __/ __ \/ ___/
 / __/ / /_/ (__  ) /_/ /___(__  ) /_/ / / / / / / /_/ / /_/ /_/ / /    
/_/    \__,_/____/\__/_____/____/\__/_/_/ /_/ /_/\__,_/\__/\____/_/     
                                                                        

FastEstimator: Reading non-empty directory: /home/ubuntu/coco/tf_records
FastEstimator: Found 6000 examples for train in /home/ubuntu/coco/tf_records/train_summary0.json
FastEstimator: Found 3000 examples for eval in /home/ubuntu/coco/tf_records/eval_summary0.json


W1021 20:46:05.785572 140684334917440 mirrored_strategy.py:659] Using MirroredStrategy eagerly has significant overhead currently. We will be working on improving this in the future, but for now please wrap `call_for_each_replica` or `experimental_run` or `experimental_run_v2` inside a tf.function to get the best performance.
W1021 20:46:10.868411 140684334917440 mirrored_strategy.py:659] Using MirroredStrategy eagerly has significant overhead currently. We will be working on improving this in the future, but for now please wrap `call_for_each_replica` or `experimental_run` or `experimental_run_v2` inside a tf.function to get the best performance.


FastEstimator-Start: step: 0; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 0; loss: 6232.865; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 10; loss: 3099.5024; examples/sec: 19.77; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 20; loss: 2249.901; examples/sec: 5.29; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 30; loss: 2990.9507; examples/sec: 4.36; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 40; loss: 1003.57153; examples/sec: 4.56; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 50; loss: 422.76025; examples/sec: 4.71; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 60; loss: 213.85661; examples/sec: 5.03; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 70; loss: 96.674774; examples/sec: 5.03; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 80; loss: 24.450075; examples/sec: 5.31; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 90; loss: 25.320282; examples/sec: 5.12; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 100; loss: 20.067902; examples/sec: 5.38; 

FastEstimator-Train: step: 880; loss: 7.021303; examples/sec: 6.42; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 890; loss: 7.0262327; examples/sec: 6.26; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 900; loss: 8.055449; examples/sec: 6.0; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 910; loss: 6.1037555; examples/sec: 6.53; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 920; loss: 7.324296; examples/sec: 6.06; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 930; loss: 7.1842594; examples/sec: 4.98; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 940; loss: 6.5925837; examples/sec: 4.92; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 950; loss: 7.0213814; examples/sec: 5.05; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 960; loss: 6.0299788; examples/sec: 4.59; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 970; loss: 6.3313217; examples/sec: 4.66; retinanet_lr: 1e-05; 
FastEstimator-Train: step: 980; loss: 7.0540633; examples/sec: 4.61; retinanet_lr: 1e-05; 
Fas