In [1]:
import numpy as np
import os
from glob import glob
import cv2
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.eager as tfe

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
# Switch TensorFlow into eager (imperative) mode for the rest of the notebook.
# NOTE(review): DEVICE_PLACEMENT_SILENT presumably allows tensors to be copied
# between CPU and GPU without raising -- confirm against the tfe documentation.
tf.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

In [3]:
# --- dataset locations -------------------------------------------------------
DIR_DATA = 'data_VOC2012/VOC2012'
DIR_INPUT = os.path.join(DIR_DATA, 'JPEGImages')    # images, read as <name>.jpg
DIR_OUTPUT = os.path.join(DIR_DATA, 'Annotations')  # annotations, read as <name>.xml

# --- classes -----------------------------------------------------------------
# class name -> (class index, super-category)
OBJECT_LABELS = {
    'tvmonitor': (0, 'Indoor'),
    'aeroplane': (1, 'Vehicle'),
    'bicycle': (2, 'Vehicle'),
    'bird': (3, 'Animal'),
    'boat': (4, 'Vehicle'),
    'bottle': (5, 'Indoor'),
    'bus': (6, 'Vehicle'),
    'car': (7, 'Vehicle'),
    'cat': (8, 'Animal'),
    'chair': (9, 'Indoor'),
    'cow': (10, 'Animal'),
    'diningtable': (11, 'Indoor'),
    'dog': (12, 'Animal'),
    'horse': (13, 'Animal'),
    'motorbike': (14, 'Vehicle'),
    'person': (15, 'Person'),
    'pottedplant': (16, 'Indoor'),
    'sheep': (17, 'Animal'),
    'sofa': (18, 'Indoor'),
    'train': (19, 'Vehicle')
}
# Class names ordered by class index. Derived from OBJECT_LABELS so that the
# index -> name lookup can never drift from the name -> index mapping.
OBJECTS = sorted(OBJECT_LABELS, key=lambda name: OBJECT_LABELS[name][0])

NUM_OBJECTS = len(OBJECTS)
MAX_DETECTIONS_PER_IMAGE = 10

# --- geometry ----------------------------------------------------------------
IMG_OUT_H, IMG_OUT_W = 416, 416   # size every image is resized to
GRID_H, GRID_W = 13, 13           # size of the prediction grid
GRID_SIZE = IMG_OUT_H // GRID_H   # pixels per grid cell (derived, was a literal 416)

# Anchor box sizes as fractions of the image, one row per anchor.
ANCHORS_NORMALIZED = np.array(
    [
        [0.05210654, 0.04405615],
        [0.15865615, 0.14418923],
        [0.42110308, 0.25680231],
        [0.27136769, 0.60637077],
        [0.70525231, 0.75157846]
    ]
)
# Same anchors expressed in grid-cell units (columns scaled by GRID_H, GRID_W).
ANCHORS = ANCHORS_NORMALIZED * np.array([GRID_H, GRID_W])
NUM_ANCHORS = ANCHORS.shape[0]

# --- post-processing thresholds ----------------------------------------------
THRESHOLD_OUT_PROB = 0.6   # minimum (objectness * class prob) for a box to be kept
THRESHOLD_IOU_NMS = 0.5    # IoU threshold passed to non-max suppression

CHECKPOINT_DIR = 'model'

# Pick the device string used for model creation and for moving tensors back
# to the GPU after CPU-only ops; report the choice.
DEVICE = '/gpu:0' if tfe.num_gpus() > 0 else '/cpu:0'
print('Using GPU' if DEVICE == '/gpu:0' else 'Using CPU')

Using GPU


In [4]:
def apply_transformations(predictions):
    """Apply the output activations to the raw network predictions.

    The last axis of `predictions` is sliced as:
    [0:2] box centre offsets, [2:4] box sizes, [4:5] objectness, [5:] classes.

    Returns:
        (yx, hw, prob_obj, prob_class): sigmoid of the centre offsets, exp of
        the sizes, sigmoid of the objectness and softmax over the class scores.
    """
    raw_yx = predictions[..., 0:2]
    raw_hw = predictions[..., 2:4]
    raw_obj = predictions[..., 4:5]
    raw_class = predictions[..., 5:]

    return (
        tf.sigmoid(raw_yx),
        tf.exp(raw_hw),
        tf.sigmoid(raw_obj),
        tf.nn.softmax(raw_class),
    )

def get_coordinates(h, w):
    """Return a float32 tensor of (y, x) grid-cell indices, shaped [1, h, w, 1, 2]."""
    # 'ij' indexing gives the row (y) and column (x) index maps directly,
    # each of shape [h, w].
    rows, cols = tf.meshgrid(tf.range(h), tf.range(w), indexing='ij')
    grid = tf.stack([rows, cols], axis=-1)
    grid = tf.reshape(grid, [1, h, w, 1, 2])

    return tf.cast(grid, tf.float32)

def grid2normalized(predictions_yx, predictions_hw):
    """Map per-cell box predictions to coordinates normalised by the grid size.

    Centres get the index of their grid cell added and sizes are multiplied by
    the anchor sizes (both then live in grid units, i.e. [0, GRID_H] x
    [0, GRID_W]); dividing by the grid dimensions brings them to [0, 1] space.

    Returns:
        (predictions_yx, predictions_hw) in [0, 1] space.
    """
    # broadcastable helpers, all shaped [1, ..., 2]
    grid_dims = tf.cast(tf.reshape([GRID_H, GRID_W], [1, 1, 1, 1, 2]), tf.float32)
    cell_offsets = get_coordinates(GRID_H, GRID_W)
    anchor_sizes = tf.cast(tf.reshape(ANCHORS, [1, 1, 1, ANCHORS.shape[0], 2]), dtype=tf.float32)

    # grid units
    centers_grid = predictions_yx + cell_offsets
    sizes_grid = predictions_hw * anchor_sizes

    # grid units -> [0, 1]
    return centers_grid / grid_dims, sizes_grid / grid_dims

def center2corner(predictions_yx, predictions_hw):
    """Convert (centre, size) boxes to corners (y_min, x_min, y_max, x_max).

    Both inputs carry (y, x) / (h, w) on their last axis; the result has the
    four corner coordinates concatenated on the last axis.
    """
    half_extent = predictions_hw/2.
    top_left = predictions_yx - half_extent
    bottom_right = predictions_yx + half_extent

    return tf.concat([top_left[..., :2], bottom_right[..., :2]], axis=-1)

def get_filtered_predictions(predictions_corner, predictions_prob_obj, predictions_prob_class):
    """Keep only the boxes whose best class score reaches THRESHOLD_OUT_PROB.

    The score of a box is objectness * class probability; each box is assigned
    the class with the highest score.

    Returns:
        (bbox_filtered, prob_filtered, idx_class_filtered): corners, best score
        and best class index of every surviving box. The boolean mask flattens
        all leading axes, so the boxes of a whole batch end up in one list.
    """
    on_gpu = DEVICE == '/gpu:0'

    # per-class score for every anchor in every grid cell
    class_scores = predictions_prob_obj * predictions_prob_class
    best_class = tf.argmax(class_scores, axis=-1)
    best_score = tf.reduce_max(class_scores, axis=-1)

    keep = best_score >= THRESHOLD_OUT_PROB

    bbox_filtered = tf.boolean_mask(predictions_corner, keep)
    prob_filtered = tf.boolean_mask(best_score, keep)
    # the class indices are masked on the CPU and moved back afterwards
    # (presumably because this op is not available on the GPU for them)
    with tf.device('/cpu:0'):
        idx_class_filtered = tf.boolean_mask(best_class, keep)
    if on_gpu:
        idx_class_filtered = idx_class_filtered.gpu()

    return bbox_filtered, prob_filtered, idx_class_filtered


def predictions2outputs(predictions):
    """Turn raw network predictions into final detections.

    Pipeline: output activations -> [0, 1] coordinates -> corner boxes ->
    score threshold -> scale to image pixels -> non-max suppression.

    Returns:
        Tensor shaped [1, num_kept, 6] with columns
        (y_min, x_min, y_max, x_max, score, class index); num_kept is at most
        MAX_DETECTIONS_PER_IMAGE.

    Filtering flattens the batch axis, so the result is only meaningful for a
    batch of one image. NMS is class-agnostic.
    """
    on_gpu = DEVICE == '/gpu:0'

    yx, hw, prob_obj, prob_class = apply_transformations(predictions)
    yx, hw = grid2normalized(yx, hw)          # [0, 1] space
    corners = center2corner(yx, hw)

    # threshold on (prob_obj * prob_class); boxes comes back as [NUM_FILTERED, 4]
    boxes, scores, class_ids = get_filtered_predictions(corners, prob_obj, prob_class)

    # TODO: perform nms for each class separately
    # scale boxes from [0, 1] to image pixels
    img_dims = tf.cast(tf.stack([IMG_OUT_H, IMG_OUT_W, IMG_OUT_H, IMG_OUT_W]), tf.float32)
    img_space = tf.reshape(img_dims, [1, 1, 4])
    boxes = tf.reshape(boxes*img_space, [-1, 4])  # tf nms takes [num_boxes, 4] (no batch support)

    # non-max suppression runs on the CPU; indices are moved back afterwards
    with tf.device('/cpu:0'):
        keep = tf.image.non_max_suppression(
            boxes, tf.reshape(scores, [-1]), MAX_DETECTIONS_PER_IMAGE, THRESHOLD_IOU_NMS)
    if on_gpu:
        keep = keep.gpu()

    boxes_kept = tf.gather(boxes, keep)                            # [num_kept, 4]
    scores_kept = tf.expand_dims(tf.gather(scores, keep), axis=-1)  # [num_kept, 1]
    with tf.device('/cpu:0'):
        classes_kept = tf.expand_dims(tf.cast(tf.gather(class_ids, keep), tf.float32), axis=-1)
    if on_gpu:
        classes_kept = classes_kept.gpu()

    detections = tf.concat([boxes_kept, scores_kept, classes_kept], axis=-1)

    return tf.expand_dims(detections, axis=0)

In [5]:
class Model(tf.keras.Model):
    """Fully convolutional single-shot detector.

    A stack of conv -> batch-norm -> leaky-ReLU blocks with five 2x max-pools
    reduces the input to the GRID_H x GRID_W prediction grid. The output of
    block 13 (taken before its pooling layer) is routed through a 1x1 conv and
    `space_to_depth` and concatenated with the deep features before the last
    feature block. A final 1x1 conv emits NUM_ANCHORS * (4 + 1 + NUM_OBJECTS)
    values per grid cell.

    Layers are stored as `conv<i>` / `norm<i>` / `pool<i>` attributes; these
    names (and their creation order) are kept stable so that existing
    checkpoints still restore.
    """

    # (block index, conv filters, kernel size, followed by a 2x max-pool)
    _TRUNK_BLOCKS = (
        (1, 32, 3, True),
        (2, 64, 3, True),
        (3, 128, 3, False),
        (4, 64, 1, False),
        (5, 128, 3, True),
        (6, 256, 3, False),
        (7, 128, 1, False),
        (8, 256, 3, True),
        (9, 512, 3, False),
        (10, 256, 1, False),
        (11, 512, 3, False),
        (12, 256, 1, False),
        (13, 512, 3, True),   # skip connection branches off after this block
        (14, 1024, 3, False),
        (15, 512, 1, False),
        (16, 1024, 3, False),
        (17, 512, 1, False),
        (18, 1024, 3, False),
        (19, 1024, 3, False),
        (20, 1024, 3, False),
    )
    _SKIP_BLOCK = 13     # block whose (un-pooled) output feeds the skip branch
    _LEAKY_ALPHA = 0.1   # negative slope of every leaky ReLU

    def __init__(self):
        super(Model, self).__init__()
        self.optimizer = tf.train.AdamOptimizer()

        # trunk: conv<i>, norm<i> and (optionally) pool<i>, created in index order
        for idx, filters, kernel_size, has_pool in self._TRUNK_BLOCKS:
            setattr(self, 'conv%d' % idx,
                    tf.keras.layers.Conv2D(filters, kernel_size, padding='same', use_bias=False))
            setattr(self, 'norm%d' % idx, tf.keras.layers.BatchNormalization())
            if has_pool:
                setattr(self, 'pool%d' % idx, tf.keras.layers.MaxPool2D())

        # applied on the skip connection
        self.conv21 = tf.keras.layers.Conv2D(64, 1, padding='same', use_bias=False)
        self.norm21 = tf.keras.layers.BatchNormalization()

        # applied on the concatenated (skip + deep) features
        self.conv22 = tf.keras.layers.Conv2D(1024, 3, padding='same', use_bias=False)
        self.norm22 = tf.keras.layers.BatchNormalization()
        # Feature Extractor Ends Here!

        # Detector Layer!
        self.conv23 = tf.keras.layers.Conv2D(NUM_ANCHORS*(4+1+NUM_OBJECTS), 1, padding='same')

    def _conv_block(self, x, idx):
        """Run block `idx`: conv<idx> -> norm<idx> -> leaky ReLU."""
        x = getattr(self, 'conv%d' % idx)(x)
        x = getattr(self, 'norm%d' % idx)(x)
        return tf.nn.leaky_relu(x, alpha=self._LEAKY_ALPHA)

    def forward(self, imgs):
        """Compute raw predictions for a batch of images.

        Args:
            imgs: images shaped [B, IMG_H, IMG_W, 3]; they are resized to
                [IMG_OUT_H, IMG_OUT_W] before entering the network.

        Returns:
            Tensor shaped [B, GRID_H, GRID_W, NUM_ANCHORS, 4 + 1 + NUM_OBJECTS]
            holding un-activated box, objectness and class values.
        """
        # network input size comes from the config constants (was a literal 416)
        x = tf.image.resize_images(imgs, [IMG_OUT_H, IMG_OUT_W])

        x_skip = None
        for idx, _filters, _kernel_size, has_pool in self._TRUNK_BLOCKS:
            x = self._conv_block(x, idx)
            if idx == self._SKIP_BLOCK:
                # keep the higher-resolution features before they are pooled
                x_skip = tf.identity(x)
            if has_pool:
                x = getattr(self, 'pool%d' % idx)(x)

        x_skip = self._conv_block(x_skip, 21)
        x_skip = tf.space_to_depth(x_skip, block_size=2)  # lossless shrinkage of feature map

        x = tf.concat([x_skip, x], axis=-1)  # low_level features concatenated with high_level features

        x = self._conv_block(x, 22)
        # Feature Extractor ends here!

        # Detector layer
        x = self.conv23(x)

        # split the channel axis into (anchor, values-per-anchor)
        pred = tf.reshape(x, [-1, GRID_H, GRID_W, NUM_ANCHORS, 4+1+NUM_OBJECTS])

        return pred

    def predict(self, imgs):
        """Predict bounding boxes for `imgs` (nothing is drawn here).

        Args:
            imgs: images shaped [B, IMG_H, IMG_W, 3].

        Returns:
            The tensor produced by `predictions2outputs`: shape
            [1, num_detections, 6] with columns
            (y_min, x_min, y_max, x_max, score, class index).

        CAUTION: post-processing currently only works with BATCH_SIZE=1.
        """
        # forward pass
        predictions = self.forward(imgs)

        # swap the first two value pairs on the last axis ([0, 1] -> [1, 0] and
        # [2, 3] -> [3, 2]) so they arrive in the (y, x) / (h, w) order that the
        # post-processing expects
        predictions = tf.concat([predictions[...,1::-1], predictions[...,3:1:-1], predictions[...,4:]], axis=-1)

        # post-process to get bounding boxes
        outputs = predictions2outputs(predictions)
        # TODO: use batch multi-class nms (currently works with BATCH_SIZE=1)
        # reference: https://github.com/tensorflow/models/blob/master/research/object_detection/core/post_processing.py

        return outputs

In [22]:
def read_data(filename):
    """Load one image and its ground-truth boxes.

    Args:
        filename: base name (no extension) shared by `<filename>.jpg` in
            DIR_INPUT and `<filename>.xml` in DIR_OUTPUT.

    Returns:
        (img, targets):
        img is float32, RGB, scaled to [0, 1], shaped [1, IMG_OUT_H, IMG_OUT_W, 3].
        targets is float32 shaped [num_objects, 6] with columns
        (y_min, x_min, y_max, x_max, 1, class index); the box coordinates are
        expressed in the resized image. The shape is [0, 6] when the
        annotation lists no objects.

    Raises:
        IOError: if the image cannot be read.
        KeyError: if an annotated class name is not in OBJECT_LABELS.
    """
    # read and process image
    img_name = os.path.join(DIR_INPUT, filename + '.jpg')
    img = cv2.imread(img_name)
    if img is None:
        # cv2.imread signals a missing/corrupt file by returning None
        raise IOError('Could not read image: {}'.format(img_name))
    img_in_h, img_in_w = img.shape[:2]
    img = cv2.resize(img, (IMG_OUT_W, IMG_OUT_H))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = (img / 255.).astype(np.float32)
    img = np.expand_dims(img, 0)

    # read annotation
    annotation_name = os.path.join(DIR_OUTPUT, filename + '.xml')
    root = ET.parse(annotation_name).getroot()
    targets = []
    for obj in root.findall('object'):
        # read class label
        label = int(OBJECT_LABELS[obj.find('name').text][0])

        # read bbox (pixel coordinates in the original image)
        bbox = obj.find('bndbox')
        y_min, x_min, y_max, x_max = (
            float(bbox.find(tag).text) for tag in ('ymin', 'xmin', 'ymax', 'xmax')
        )

        # normalise to [0, 1] by the original size (works for arbitrary image
        # sizes), then scale to the resized image
        y_min = y_min / img_in_h * IMG_OUT_H
        x_min = x_min / img_in_w * IMG_OUT_W
        y_max = y_max / img_in_h * IMG_OUT_H
        x_max = x_max / img_in_w * IMG_OUT_W

        targets.append((y_min, x_min, y_max, x_max, 1, label))

    # reshape keeps the array 2-D even when there are no objects
    return img, np.array(targets, dtype=np.float32).reshape(-1, 6)


In [23]:
def process_img(img):
    """Prepare a BGR image (as loaded by OpenCV) for the network.

    Converts to RGB, resizes to (IMG_OUT_H, IMG_OUT_W), scales to [0, 1]
    float32 and adds a leading batch axis.

    Returns:
        Array shaped [1, IMG_OUT_H, IMG_OUT_W, 3].
    """
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # IMG_W / IMG_H were never defined in this notebook; the configured output
    # size is IMG_OUT_W / IMG_OUT_H (same call as in read_data)
    img = cv2.resize(img, (IMG_OUT_W, IMG_OUT_H))
    img = (img / 255.).astype(np.float32)
    img = np.expand_dims(img, 0)

    return img

def draw_output(img, output):
    """Draw a rectangle for every row of `output` on a copy of `img`.

    NOTE: a later cell of this notebook defines `draw_output` again (adding
    class/confidence labels); once that cell has run, this version is shadowed.

    Args:
        img: image with values in [0, 1].
        output: array whose rows start with (y_min, x_min, y_max, x_max) in pixels.

    Returns:
        uint8 image with the boxes drawn.
    """
    # unnormalize image
    canvas = (img * 255).astype(np.uint8)

    boxes = output.astype(np.int32)
    for bbox in boxes:
        top_left = (bbox[1], bbox[0])        # OpenCV points are (x, y)
        bottom_right = (bbox[3], bbox[2])
        canvas = cv2.rectangle(canvas, top_left, bottom_right, color=(255, 0, 0), thickness=3)

    return canvas

In [24]:
# read filenames: keep only the '.jpg' entries and strip the extension
# (read_data() re-appends '.jpg' / '.xml' to the base name, so any other
# directory entry would only produce a broken path)
filenames = [
    os.path.splitext(entry)[0]
    for entry in sorted(os.listdir(DIR_INPUT))
    if entry.endswith('.jpg')
]

# load model
with tf.device(DEVICE):
    model = Model()
    checkpoint = tfe.Checkpoint(model=model, optimizer_step=tf.train.get_or_create_global_step())
    checkpoint_path = tf.train.latest_checkpoint(CHECKPOINT_DIR)
    if checkpoint_path is None:
        # without this check the model would silently keep its random initial weights
        raise IOError('No checkpoint found in {!r}'.format(CHECKPOINT_DIR))
    checkpoint.restore(checkpoint_path)

In [25]:
def draw_output(img, output):
    """Draw labelled boxes on a copy of `img`.

    Args:
        img: image with values in [0, 1].
        output: array with one row per box:
            (y_min, x_min, y_max, x_max, confidence, class index), box in pixels.

    Returns:
        uint8 image with every box outlined and captioned 'class(confidence)'.
    """
    # unnormalize image
    canvas = (img * 255).astype(np.uint8)
    font = cv2.FONT_HERSHEY_SIMPLEX

    for row in output:
        conf = row[4]                    # read before the integer cast truncates it
        bbox = row.astype(np.int32)
        obj_class = OBJECTS[bbox[5]]
        top_left = (bbox[1], bbox[0])    # OpenCV points are (x, y)
        bottom_right = (bbox[3], bbox[2])

        canvas = cv2.rectangle(canvas, top_left, bottom_right, color=(255, 0, 0), thickness=3)
        caption = '{}({:.2f})'.format(obj_class, conf)
        canvas = cv2.putText(canvas, caption, top_left, font, .5, (0,255,255), 2, cv2.LINE_AA)

    return canvas

In [52]:
# Run the detector on every image and dump ground truth and predictions to
# text files: one line per image, all values of the [rows, 6] array flattened
# and separated by spaces.
with open(DIR_DATA+'/gt_file.txt', 'w') as gt_file, open(DIR_DATA+'/pred_file.txt', 'w') as pred_file :
    for filename in filenames:
        # image batch of one + ground truth (gt.shape = [num_objects, 6])
        img, gt = read_data(filename)

        # detections for that image (pred.shape = [num_detections, 6],
        # with at most MAX_DETECTIONS_PER_IMAGE rows)
        output = model.predict(img)
        pred = output[0].numpy()

        gt_file.write(' '.join(map(str, gt.flatten())) + '\n')
        pred_file.write(' '.join(map(str, pred.flatten())) + '\n')

    #     # draw output
    #     img_out = draw_output(img[0], output[0].numpy())

    #     # draw gt
    #     img_gt = draw_output(img[0], gt)