In [None]:
from __future__ import print_function
import tensorflow as tf
from load_pvoc_data import load_data, TRAIN_LENGTH
import pickle

In [None]:
# Number of examples per batch in the train, validation and test datasets.
BATCH_SIZE = 32
# Number of passes over the training portion; used to derive max_steps.
EPOCHS = 4
# Fraction of the "train" examples held out for validation.
VALIDATION_SPLIT = 0.3
# Loss weights used by yolo_loss: L_COORD scales the box-coordinate term and
# L_NOOBJ scales the confidence penalty on cells without an object.
L_COORD = 5
L_NOOBJ = 0.5
# Checkpoint location the estimator is warm-started from.
# NOTE(review): absolute, machine-specific path.
WARM_START_PATH = "/mnt/Data/tmp/darknet/standard"
# Directory passed to the estimator as model_dir.
# NOTE(review): absolute, machine-specific path.
MODEL_PATH = "/mnt/Data/tmp/yolo/standard"

In [None]:
def train_input_fn():
    """Build the training and validation datasets from the "train" split.

    Every image is resized to 416x416 and its label matrix is zero-padded to
    60 rows so that examples can be batched.

    The dataset is shuffled once with a fixed seed and without reshuffling
    between iterations before it is split. This function is called separately
    for training and for validation (and the training part is re-iterated by
    ``repeat()``), so an unseeded, reshuffling shuffle would produce a
    different split on every call/epoch and leak validation examples into
    training.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple. Both are batched with
        ``BATCH_SIZE`` (remainder dropped); the training dataset repeats
        indefinitely.
    """
    # Any fixed value works; it only has to be identical across calls.
    # Assumes load_data("train") yields examples in a stable order -- TODO confirm.
    split_seed = 0

    train_dataset = tf.data.Dataset.from_generator(
        lambda: load_data("train"),
        (tf.uint8, tf.int32),
        (tf.TensorShape([None, None, 3]), tf.TensorShape([None, 5]))
    )
    train_dataset = train_dataset.map(lambda img, lbl: (
        tf.image.resize_images(img, (416, 416)),
        # Pad the variable number of label rows up to 60 with zeros.
        tf.pad(lbl, [[0, 60 - tf.shape(lbl)[0]], [0, 0]])
    ))
    train_dataset = train_dataset.apply(tf.contrib.data.assert_element_shape((
        [416, 416, 3],
        [60, 5]
    )))
    # Deterministic shuffle: the same order on every call and every epoch, so
    # take()/skip() below always select disjoint, stable subsets.
    # NOTE(review): training order is therefore no longer reshuffled per
    # epoch; add a second shuffle on the training branch if that is needed
    # (mind the memory cost of a second buffer of resized images).
    train_dataset = train_dataset.shuffle(
        10000, seed=split_seed, reshuffle_each_iteration=False)

    # NOTE(review): the factor 8 presumably matches the number of examples
    # load_data yields per source image -- TODO confirm.
    val_length = int(VALIDATION_SPLIT * TRAIN_LENGTH * 8)
    val_dataset = train_dataset.take(val_length).apply(
        tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))
    train_dataset = train_dataset.skip(val_length).apply(
        tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)).repeat()

    return train_dataset, val_dataset

In [None]:
def test_input_fn():
    """Build the batched evaluation dataset from the "test" split.

    Images are resized to 416x416 and label matrices are zero-padded to 60
    rows; the result is batched with ``BATCH_SIZE`` and any incomplete final
    batch is dropped.
    """
    def _resize_and_pad(image, boxes):
        # Fixed-size image plus a label matrix padded with zero rows up to 60.
        resized = tf.image.resize_images(image, (416, 416))
        missing_rows = 60 - tf.shape(boxes)[0]
        padded = tf.pad(boxes, [[0, missing_rows], [0, 0]])
        return resized, padded

    element_types = (tf.uint8, tf.int32)
    element_shapes = (tf.TensorShape([None, None, 3]), tf.TensorShape([None, 5]))
    raw = tf.data.Dataset.from_generator(
        lambda: load_data("test"), element_types, element_shapes)

    prepared = raw.map(_resize_and_pad)
    checked = prepared.apply(
        tf.contrib.data.assert_element_shape(([416, 416, 3], [60, 5])))
    return checked.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))

In [None]:
def conv_layer(inputs, filters=32, kernel_size=3, strides=1, activation=tf.nn.leaky_relu, batch_normalize=True,
               trainable=True):
    """Apply a 'same'-padded 2-D convolution, optionally followed by batch
    normalization and an activation.

    Args:
        inputs: input tensor passed to ``tf.layers.conv2d``.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        strides: convolution stride.
        activation: callable applied last, or ``None`` to skip it.
        batch_normalize: whether to apply ``tf.layers.batch_normalization``.
        trainable: forwarded to both the convolution and the batch norm.

    Returns:
        The resulting tensor.

    NOTE(review): batch normalization is called without a ``training``
    argument -- confirm that inference-mode statistics are intended.
    """
    net = tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='same',
        trainable=trainable,
    )
    net = tf.layers.batch_normalization(net, trainable=trainable) if batch_normalize else net
    return net if activation is None else activation(net)

In [None]:
def residual_block(inputs, filters, trainable=False):
    """Residual block: 1x1 bottleneck conv, 3x3 conv, then a skip connection.

    Args:
        inputs: input tensor; must have ``filters * 2`` channels so that it
            can be added to the block output.
        filters: number of filters of the 1x1 bottleneck convolution; the
            3x3 convolution uses ``filters * 2``.
        trainable: whether the layer variables are trainable.

    Returns:
        ``inputs`` plus the output of the two stacked convolutions.
    """
    x = conv_layer(inputs=inputs, filters=filters, kernel_size=1, trainable=trainable)
    # Feed the bottleneck output (not the block input) into the 3x3 conv;
    # otherwise the 1x1 convolution above is computed and thrown away.
    x = conv_layer(inputs=x, filters=(filters * 2), trainable=trainable)
    return x + inputs

In [None]:
def darknet_block(inputs, filters, repetitions, trainable=False):
    """Stride-2 convolution followed by ``repetitions`` residual blocks.

    Args:
        inputs: input tensor.
        filters: number of filters of the stride-2 convolution (and thus of
            the block output).
        repetitions: number of residual blocks to stack.
        trainable: whether the layer variables are trainable.

    Returns:
        The output tensor of the last residual block.
    """
    x = conv_layer(inputs=inputs, filters=filters, strides=2, trainable=trainable)
    for _ in range(repetitions):
        # Integer division: `/` yields a float filter count under Python 3.
        x = residual_block(x, filters // 2, trainable=trainable)
    return x

In [None]:
def yolo_layer(inputs, anchors):
    """Decode raw detection-head outputs into box predictions.

    ``inputs`` is indexed as ``[batch, row, col, channel]`` with 25 channels
    per anchor: x offset, y offset, width, height, confidence and 20 further
    values that are passed through unchanged.

    Args:
        inputs: raw output tensor of a detection head.
        anchors: sequence of ``(width, height)`` pairs, one per anchor.

    Returns:
        A tensor with the same channel layout as ``inputs`` where, per
        anchor, channels 0-1 hold the box centre divided by the grid size,
        channels 2-3 the anchor-scaled box size divided by the grid size,
        channel 4 the sigmoid confidence, and channels 5-24 the unchanged
        input values.
    """
    grid_w = tf.shape(inputs)[2]
    grid_h = tf.shape(inputs)[1]
    x_indices, y_indices = tf.meshgrid(tf.range(grid_w), tf.range(grid_h))
    x_indices = tf.cast(x_indices, dtype=tf.float32)
    y_indices = tf.cast(y_indices, dtype=tf.float32)
    grid_w = tf.cast(grid_w, dtype=tf.float32)
    grid_h = tf.cast(grid_h, dtype=tf.float32)

    stack = []
    for i, anchor in enumerate(anchors):
        base = 25 * i
        stack.append((tf.sigmoid(inputs[:,:,:,base + 0]) + x_indices) / grid_w)    # bx
        # by must use the y-offset channel (base + 1), not the x-offset one.
        stack.append((tf.sigmoid(inputs[:,:,:,base + 1]) + y_indices) / grid_h)    # by
        # NOTE(review): anchors are divided by the grid size here; if they are
        # given in input-image pixels they should probably be divided by the
        # image size instead -- confirm.
        stack.append((tf.exp(inputs[:,:,:,base + 2]) * anchor[0]) / grid_w)    # bw
        stack.append((tf.exp(inputs[:,:,:,base + 3]) * anchor[1]) / grid_h)    # bh
        stack.append(tf.sigmoid(inputs[:,:,:,base + 4]))    # confidence
        stack.extend(inputs[:,:,:,base + j] for j in range(5, 25))

    return tf.stack(stack, axis=-1)

In [None]:
def non_max_suppr(*args):
    """Run per-image non-maximum suppression over all detection heads.

    Each argument is a decoded head output indexed ``[batch, row, col,
    channel]`` with 25 channels per anchor (centre x, centre y, width,
    height, confidence, 20 further values). Boxes are converted to
    ``[y_min, x_min, y_max, x_max]`` corners, all cells/anchors/heads are
    flattened into one list per image, and at most 6 boxes with a score above
    0.5 are kept.

    Tensors are immutable, so the conversion builds new tensors instead of
    assigning into slices; this also avoids reading coordinates that were
    already overwritten.

    Returns:
        A ``[batch, 6, 25]`` tensor. Each row holds the corner coordinates in
        columns 0-3 followed by the remaining 21 channels. Images with fewer
        than 6 surviving boxes are padded with all-zero rows so that the
        per-image results can be stacked.
    """
    max_boxes = 6
    candidates = []
    for arg in args:
        depth = arg.get_shape().as_list()[-1]
        flattened = tf.reshape(
            arg,
            (-1, tf.shape(arg)[1] * tf.shape(arg)[2], depth)
        )
        for j in range(depth // 25):
            pred = flattened[:,:,(25 * j):(25 * (j + 1))]
            center_x, center_y = pred[:,:,0], pred[:,:,1]
            half_w, half_h = pred[:,:,2] / 2, pred[:,:,3] / 2
            corners = tf.stack([
                center_y - half_h,    # y_min
                center_x - half_w,    # x_min
                center_y + half_h,    # y_max
                center_x + half_w,    # x_max
            ], axis=-1)
            candidates.append(tf.concat([corners, pred[:,:,4:]], axis=-1))
    all_boxes = tf.concat(candidates, axis=1)

    def _suppress(boxes):
        keep = tf.image.non_max_suppression(
            boxes[:,:4],
            boxes[:,4],
            max_boxes,
            score_threshold=0.5
        )
        kept = tf.gather(boxes, keep)
        # Pad to a fixed number of rows; map_fn cannot stack per-image
        # results of different lengths.
        padded = tf.pad(kept, [[0, max_boxes - tf.shape(kept)[0]], [0, 0]])
        return tf.reshape(padded, [max_boxes, 25])

    return tf.map_fn(_suppress, all_boxes)

In [None]:
def get_masks(output, target):
    """Build the masks used by yolo_loss for one detection head.

    Args:
        output: decoded head output; its static shape is read and its
            channels are strided in groups of 25.
        target: batch of per-image label matrices; each row is read as five
            values ``bbox[0]..bbox[4]``.

    Returns:
        A tuple ``(obj_mask, noobj_mask, box_mask)``:
        - ``obj_mask``: per image, a ``[shape[1], shape[2], 6]`` tensor
          scattered from the label rows with values
          ``[1.0, bbox[0], bbox[1], bbox[2], bbox[3], bbox[4]]``;
        - ``noobj_mask``: ``1 - obj_mask[..., 0:1]``;
        - ``box_mask``: one-hot (along the last axis) of the anchor whose
          confidence channel (every 25th channel starting at 4) is largest.
    """
    shape = output.get_shape().as_list()
    def make_mask(bboxes):
        # `idx` maps every row's last column to its position among the unique
        # values; the entry for the final row is used as the slice end.
        # NOTE(review): presumably meant to count the non-padding rows --
        # this only holds under specific label layouts; TODO confirm.
        _, idx = tf.unique(bboxes[:,-1])
        end = idx[-1]
        # NOTE(review): the first index is bbox[0] scaled by shape[2] and the
        # second is bbox[1] scaled by shape[1], while the scatter target is
        # [shape[1], shape[2], 6] -- verify the axis order.
        # NOTE(review): `indices` and `updates` are lists of tensors returned
        # by map_fn; verify they have the layout tf.scatter_nd expects.
        indices = tf.map_fn(lambda bbox: [
            tf.cast(bbox[0] * shape[2], dtype=tf.int32),
            tf.cast(bbox[1] * shape[1], dtype=tf.int32),
        ], bboxes[:end], dtype=[tf.int32] * 2)
        updates = tf.map_fn(
            lambda bbox: [1.0, bbox[0], bbox[1], bbox[2], bbox[3], bbox[4]],
            bboxes[:end],
            dtype=[tf.float32] * 6
        )
        return tf.scatter_nd(indices, updates, [shape[1], shape[2], 6])
    obj_mask = tf.map_fn(make_mask, target, dtype=tf.float32)
    
    # One-hot selection of the anchor with the highest confidence per cell.
    box_mask = tf.one_hot(tf.argmax(output[:, :, :, 4::25], axis=-1),
                          depth=tf.cast(tf.shape(output)[-1] / 25, dtype=tf.int32),
                          axis=-1)
    
    return obj_mask, 1 - obj_mask[:, :, :, 0:1], box_mask

In [None]:
def iou(output, mask):
    """Intersection over union between predicted and target boxes.

    Args:
        output: decoded head output; per anchor, channels 0-3 (strided by 25)
            are read as centre x, centre y, width and height.
        mask: object mask whose channels 1-4 are read as centre x, centre y,
            width and height of the target box.

    Returns:
        A tensor with one IoU value per cell and anchor.
    """
    pred_cx, pred_cy = output[:,:,:,0::25], output[:,:,:,1::25]
    pred_w, pred_h = output[:,:,:,2::25], output[:,:,:,3::25]
    true_cx, true_cy = mask[:,:,:,1:2], mask[:,:,:,2:3]
    true_w, true_h = mask[:,:,:,3:4], mask[:,:,:,4:5]

    # Edges of the overlap rectangle.
    left = tf.maximum(pred_cx - pred_w / 2, true_cx - true_w / 2)
    right = tf.minimum(pred_cx + pred_w / 2, true_cx + true_w / 2)
    top = tf.maximum(pred_cy - pred_h / 2, true_cy - true_h / 2)
    bottom = tf.minimum(pred_cy + pred_h / 2, true_cy + true_h / 2)

    # Clamp at zero so disjoint boxes contribute no overlap.
    overlap = tf.maximum(right - left, 0) * tf.maximum(bottom - top, 0)
    union = pred_w * pred_h + true_w * true_h - overlap
    return overlap / union

In [None]:
def classification_loss(output, mask):
    """Mean softmax cross-entropy of the class scores over all anchors.

    Args:
        output: decoded head output with 25 channels per anchor; channels
            5-24 of each anchor are used as the 20 class logits.
        mask: object mask whose last channel is cast to an integer class id.

    Returns:
        Scalar mean of the per-cell, per-anchor cross-entropy.
    """
    labels = tf.one_hot(tf.cast(mask[:,:,:,-1], dtype=tf.int32), depth=20)
    num_anchors = tf.shape(output)[-1] // 25
    # Class logits start at offset 5; offset 4 is the confidence score, and
    # including it would yield 21 logits against 20-way one-hot labels.
    loss = tf.map_fn(lambda i: tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=labels,
            logits=output[:,:,:,(25 * i + 5):(25 * (i + 1))]
        ), tf.range(num_anchors),
                    dtype=tf.float32)
    return tf.reduce_mean(loss)

In [None]:
def yolo_loss(outputs, target):
    """Sum the detection loss over every head output.

    For each head the loss adds (a) the mean of the object-masked,
    anchor-selected coordinate and confidence errors plus the classification
    loss, and (b) the ``L_NOOBJ``-weighted mean squared confidence on cells
    without an object.

    Args:
        outputs: iterable of decoded head outputs.
        target: batch of label matrices forwarded to ``get_masks``.

    Returns:
        The accumulated scalar loss.
    """
    total = 0
    for prediction in outputs:
        obj_mask, noobj_mask, box_mask = get_masks(prediction, target)
        has_object = obj_mask[:,:,:,0:1]
        confidence = prediction[:,:,:,4::25]

        # Squared errors on the centre and on the square-rooted box size.
        coord_error = (
            tf.squared_difference(prediction[:,:,:,0::25], obj_mask[:,:,:,1:2]) +
            tf.squared_difference(prediction[:,:,:,1::25], obj_mask[:,:,:,2:3]) +
            tf.squared_difference(
                tf.sqrt(prediction[:,:,:,2::25]), tf.sqrt(obj_mask[:,:,:,3:4])
            ) +
            tf.squared_difference(
                tf.sqrt(prediction[:,:,:,3::25]), tf.sqrt(obj_mask[:,:,:,4:5])
            )
        )
        # The confidence target is the IoU with the target box.
        confidence_error = tf.squared_difference(confidence, iou(prediction, obj_mask))
        selected = box_mask * (L_COORD * coord_error + confidence_error)

        total += tf.reduce_mean(
            has_object * selected + classification_loss(prediction, obj_mask)
        )
        total += L_NOOBJ * tf.reduce_mean(noobj_mask * tf.square(confidence))
    return total

In [None]:
def darknet_model(features, labels, mode):
    """Estimator ``model_fn``: Darknet-53 feature extractor with three YOLO heads.

    Args:
        features: batch of images; cast to float32 and standardized per image.
        labels: batch of label matrices forwarded to ``yolo_loss`` (unused in
            PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.

    Returns:
        A ``tf.estimator.EstimatorSpec``:
        - PREDICT: predictions with the input images annotated with the
          boxes kept by non-maximum suppression ('images') and the arg-max
          over the class channels of each kept box ('labels');
        - TRAIN: the loss and an Adam (learning rate 1e-4) train op;
        - otherwise: the loss only.
    """
    features = tf.cast(features, dtype=tf.float32)
    normalized = tf.map_fn(tf.image.per_image_standardization, features)

    # Feature extractor: Darknet53 (frozen except for the last block)
    x = conv_layer(inputs=normalized, filters=32, trainable=False)
    x = darknet_block(x, 64, 1)
    x = darknet_block(x, 128, 2)
    l_36 = darknet_block(x, 256, 8)
    l_61 = darknet_block(l_36, 512, 8)
    x = darknet_block(l_61, 1024, 4, trainable=True)

    # YOLO model
    x = conv_layer(x, filters=512, kernel_size=1)
    x = conv_layer(x, filters=1024)
    x = conv_layer(x, filters=512, kernel_size=1)
    x = conv_layer(x, filters=1024)
    l_79 = conv_layer(x, filters=512, kernel_size=1)

    # Head 1: coarsest grid, largest anchors
    x = conv_layer(l_79, filters=1024)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_1 = yolo_layer(x, anchors=[(116, 90), (156, 198), (373, 326)])

    # Upsample by 2 and merge with the l_61 feature map
    x = conv_layer(l_79, filters=256, kernel_size=1)
    x = tf.image.resize_images(x, (tf.shape(x)[1] * 2, tf.shape(x)[2] * 2))
    x = tf.concat([x, l_61], axis=-1)
    x = conv_layer(x, filters=256, kernel_size=1)
    x = conv_layer(x, filters=512)
    x = conv_layer(x, filters=256, kernel_size=1)
    x = conv_layer(x, filters=512)
    l_91 = conv_layer(x, filters=256, kernel_size=1)

    # Head 2: medium grid. It branches off l_91, mirroring how head 1
    # branches off l_79; feeding `x` here would bypass the last 1x1 conv.
    x = conv_layer(l_91, filters=512)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_2 = yolo_layer(x, anchors=[(30, 61), (62, 45), (59, 119)])

    # Upsample by 2 and merge with the l_36 feature map
    x = conv_layer(l_91, filters=128, kernel_size=1)
    x = tf.image.resize_images(x, (tf.shape(x)[1] * 2, tf.shape(x)[2] * 2))
    x = tf.concat([x, l_36], axis=-1)
    x = conv_layer(x, filters=128, kernel_size=1)
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=128, kernel_size=1)
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=128, kernel_size=1)

    # Head 3: finest grid, smallest anchors
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_3 = yolo_layer(x, anchors=[(10, 13), (16, 30), (33, 23)])

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Non-maximum suppression to remove overlapping boxes
        output = non_max_suppr(o_1, o_2, o_3)
        predictions = {
            'images': tf.image.draw_bounding_boxes(features, output[:,:,:4]),
            'labels': tf.argmax(output[:,:,5:], axis=-1)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = yolo_loss([o_1, o_2, o_3], labels)
    tf.summary.scalar('loss', loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # TowerOptimizer wraps the optimizer for use with replicate_model_fn.
        optimizer = tf.contrib.estimator.TowerOptimizer(tf.train.AdamOptimizer(1e-4))
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss)

In [None]:
# Warm-start settings: initialize from the checkpoint at WARM_START_PATH,
# restoring the variables selected by the object unpickled from
# 'darknet_variables.pkl' (passed as vars_to_warm_start).
# NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
with open('darknet_variables.pkl', 'rb') as vars_file:
    warm_start = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=WARM_START_PATH,
        vars_to_warm_start=pickle.load(vars_file)
    )

In [None]:
# Estimator around darknet_model, wrapped with replicate_model_fn, storing its
# state in MODEL_PATH and warm-started with the settings built above.
# Checkpoints are saved every 150 steps; summaries and the step rate are
# written every 10 steps.
model = tf.estimator.Estimator(
    model_fn=tf.contrib.estimator.replicate_model_fn(darknet_model),
    model_dir=MODEL_PATH,
    warm_start_from=warm_start, config=tf.estimator.RunConfig(
        save_checkpoints_steps=150, save_summary_steps=10, log_step_count_steps=10
    )
)

In [None]:
# Wrap a ValidationMonitor as a session hook for model.train: it reads the
# validation dataset (second element returned by train_input_fn) with
# every_n_steps=100 and early_stopping_rounds=10.
validation_hook = tf.contrib.learn.monitors.replace_monitors_with_hooks(
    [tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=lambda:train_input_fn()[1], every_n_steps=100, early_stopping_rounds=10
    )],
    model
)[0]

In [None]:
# Total training steps: batches per pass over the non-validation part of the
# data, times EPOCHS.
# NOTE(review): the factor 8 mirrors the one in train_input_fn; presumably the
# number of examples generated per source image -- TODO confirm.
max_steps = int(((1 - VALIDATION_SPLIT) * TRAIN_LENGTH * 8 / BATCH_SIZE) * EPOCHS)
# Train on the first dataset returned by train_input_fn, validating via the hook.
model.train(input_fn=lambda:train_input_fn()[0], hooks=[validation_hook],
            max_steps=max_steps)

In [None]:
# Evaluate the trained model on the "test" split and print the returned metrics.
print(model.evaluate(input_fn=test_input_fn))