In [None]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from load_pvoc_data import load_data, TRAIN_LENGTH

In [None]:
# Examples per batch; the input pipelines below drop any incomplete final batch.
BATCH_SIZE = 32
# Number of passes over the training portion; only used to derive max_steps for model.train.
EPOCHS = 4
# Fraction of the training data that train_input_fn holds out for validation.
VALIDATION_SPLIT = 0.3

In [None]:
def preprocessing(img, lbl):
    """Turn one raw (image, label ids) example into network-ready tensors.

    Args:
        img: image tensor; the input functions in this notebook feed it as
            uint8 with shape [height, width, 3].
        lbl: 1-D int32 tensor of class ids in [0, 20) for the image.

    Returns:
        A tuple (image, labels):
          * image: float tensor of shape [256, 256, 3], resized and then
            standardized per image.
          * labels: float32 multi-hot vector of shape [20]; entry k is 1.0 when
            class k occurs at least once in `lbl`, otherwise 0.0.
    """
    # The former central_crop(img, 1) result was never used, so the image is
    # resized directly.
    resized = tf.image.resize_images(img, (256, 256))
    norm_img = tf.image.per_image_standardization(resized)

    # tf.one_hot on a [num_ids] vector yields [num_ids, 20]; collapse the id
    # axis so the label matches the [20] element shape asserted by the input
    # pipelines. Summing (rather than taking a max) keeps an empty id list
    # well-defined: it becomes an all-zero vector.
    one_hot = tf.one_hot(lbl, 20)
    hits_per_class = tf.reduce_sum(one_hot, axis=-2)
    multi_hot = tf.cast(hits_per_class > 0, tf.float32)
    return norm_img, multi_hot

In [None]:
def train_input_fn():
    """Build the training and validation datasets from the "train" split.

    The first `val_length` preprocessed examples form the validation set and
    the remainder forms the training set. Both are batched with BATCH_SIZE and
    drop an incomplete final batch.

    Returns:
        A tuple (train_dataset, val_dataset). `train_dataset` is shuffled and
        repeats indefinitely; `val_dataset` is a single unshuffled pass.
    """
    dataset = tf.data.Dataset.from_generator(
        lambda: load_data("train"),
        (tf.uint8, tf.int32),
        (tf.TensorShape([None, None, 3]), tf.TensorShape([None]))
    )
    dataset = dataset.map(preprocessing).apply(tf.contrib.data.assert_element_shape((
        [256, 256, 3],
        [20]
    )))

    val_length = int(VALIDATION_SPLIT * TRAIN_LENGTH * 8)

    # Split BEFORE shuffling. Shuffling first (unseeded, reshuffled on every
    # iteration) made take()/skip() select a different partition each time the
    # pipeline was built or repeated, so validation examples leaked into
    # training.
    # NOTE(review): this relies on load_data("train") yielding examples in a
    # stable order across calls -- confirm.
    val_dataset = dataset.take(val_length).apply(
        tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))
    train_dataset = dataset.skip(val_length).shuffle(10000).apply(
        tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)).repeat()

    return train_dataset, val_dataset

In [None]:
def test_input_fn():
    """Build the batched evaluation dataset from the "test" split.

    Examples come from load_data("test"), are run through `preprocessing`,
    checked against the expected element shapes and grouped into batches of
    BATCH_SIZE; an incomplete final batch is dropped.

    Returns:
        The batched tf.data.Dataset of (image, label) pairs.
    """
    element_types = (tf.uint8, tf.int32)
    element_shapes = (tf.TensorShape([None, None, 3]), tf.TensorShape([None]))
    raw_examples = tf.data.Dataset.from_generator(
        lambda: load_data("test"), element_types, element_shapes)

    # Fail fast if preprocessing ever stops producing the expected shapes.
    check_shapes = tf.contrib.data.assert_element_shape(([256, 256, 3], [20]))
    prepared = raw_examples.map(preprocessing).apply(check_shapes)

    make_batches = tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)
    return prepared.apply(make_batches)

In [None]:
def conv_layer(inputs, filters=32, kernel_size=3, strides=1, activation=tf.nn.leaky_relu, batch_normalize=True,
               trainable=True):
    """Apply a 'same'-padded 2-D convolution, then optional batch norm and activation.

    Args:
        inputs: input tensor handed to tf.layers.conv2d.
        filters: number of output channels.
        kernel_size: convolution kernel size.
        strides: convolution stride.
        activation: callable applied to the result, or None to skip it.
        batch_normalize: whether to add tf.layers.batch_normalization after
            the convolution.
        trainable: forwarded to both the convolution and the batch-norm layer.

    Returns:
        The output tensor of the last stage that was applied.
    """
    # Fix: the signature was missing the comma after `batch_normalize=True`,
    # which made this cell a SyntaxError.
    x = tf.layers.conv2d(inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides, padding='same',
                         trainable=trainable)
    if batch_normalize:
        # NOTE(review): batch_normalization is called without `training`, so it
        # runs with the library default for that flag in every mode -- confirm
        # this is intended.
        x = tf.layers.batch_normalization(x, trainable=trainable)
    if activation is not None:
        x = activation(x)
    return x

In [None]:
def residual_block(inputs, filters, trainable=False):
    """Residual unit: 1x1 conv to `filters` channels, 3x3 conv to `2 * filters`, plus skip.

    Args:
        inputs: input tensor; it must have `2 * filters` channels so that the
            skip connection can be added to the block output.
        filters: channel count of the 1x1 bottleneck convolution.
        trainable: forwarded to both conv layers.

    Returns:
        Tensor with the same shape as `inputs`.
    """
    x = conv_layer(inputs=inputs, filters=filters, kernel_size=1, trainable=trainable)
    # Fix: the 3x3 conv must consume the bottleneck output `x`. It was fed
    # `inputs`, which silently discarded the 1x1 conv above.
    x = conv_layer(inputs=x, filters=(filters * 2), trainable=trainable)
    return x + inputs

In [None]:
def darknet_block(inputs, filters, repetitions, trainable=False):
    """Stride-2 convolution to `filters` channels followed by residual blocks.

    Args:
        inputs: input tensor.
        filters: number of output channels of the block.
        repetitions: how many residual blocks follow the stride-2 convolution.
        trainable: forwarded to every layer in the block.

    Returns:
        Output tensor with `filters` channels; the stride-2 'same' convolution
        halves the spatial size (rounding up).
    """
    x = conv_layer(inputs=inputs, filters=filters, strides=2, trainable=trainable)
    # Fix: use floor division so the bottleneck width stays an int under
    # Python 3, where `filters / 2` would hand a float filter count to conv2d.
    bottleneck_filters = filters // 2
    for _ in range(repetitions):
        x = residual_block(x, bottleneck_filters, trainable=trainable)
    return x

In [None]:
def yolo_layer(inputs, anchors):
    """Decode one raw detection feature map into per-anchor box predictions.

    Args:
        inputs: float tensor of shape [batch, grid_h, grid_w, 25 * len(anchors)]
            with statically known grid_h and grid_w. Each group of 25 channels
            belongs to one anchor: channels 0-3 are raw box terms, channel 4 is
            a raw confidence score and the remaining 20 are passed through
            unchanged (presumably class scores -- confirm).
        anchors: sequence of (width, height) anchor sizes.

    Returns:
        Tensor of shape [batch, len(anchors) * grid_h * grid_w, 25]. Along
        axis 1 all cells of the first anchor come first, then the second, and
        so on. Per row the first five values are y_min, x_min, y_max, x_max and
        the sigmoid confidence.
    """
    values_per_anchor = 25
    grid_h = int(inputs.shape[1])
    grid_w = int(inputs.shape[2])

    # meshgrid returns [grid_h, grid_w] index maps. Cast them so they can be
    # added to the float predictions (int32 + float32 is a type error).
    x_indices, y_indices = tf.meshgrid(tf.range(grid_w), tf.range(grid_h))
    x_indices = tf.cast(x_indices, inputs.dtype)
    y_indices = tf.cast(y_indices, inputs.dtype)

    decoded_per_anchor = []
    for i, anchor in enumerate(anchors):
        first = values_per_anchor * i
        pred = inputs[:, :, :, first:first + values_per_anchor]

        b_x = tf.sigmoid(pred[:, :, :, 0]) + x_indices
        # Fix: the y offset lives in channel 1; it used to re-read channel 0.
        b_y = tf.sigmoid(pred[:, :, :, 1]) + y_indices
        b_w = tf.exp(pred[:, :, :, 2]) * anchor[0]
        b_h = tf.exp(pred[:, :, :, 3]) * anchor[1]

        # NOTE(review): b_x/b_y are in grid cells, while the anchors passed by
        # darknet_model look like input-pixel sizes. Both are divided by the
        # grid size below, exactly as before -- confirm the intended units.
        # Tensors do not support item assignment, so the decoded channels are
        # assembled into a new tensor instead of being written in place.
        box = tf.stack([
            (b_y - b_h / 2) / grid_h,   # y_min
            (b_x - b_w / 2) / grid_w,   # x_min
            (b_y + b_h / 2) / grid_h,   # y_max
            (b_x + b_w / 2) / grid_w,   # x_max
            tf.sigmoid(pred[:, :, :, 4]),
        ], axis=-1)
        decoded = tf.concat([box, pred[:, :, :, 5:]], axis=-1)
        decoded_per_anchor.append(
            tf.reshape(decoded, (-1, grid_h * grid_w, values_per_anchor)))

    # tf.concat requires an explicit axis; anchors are stacked along the box axis.
    return tf.concat(decoded_per_anchor, axis=1)

In [None]:
def darknet_model(features, labels, mode):
    """Estimator model_fn: Darknet-53 feature extractor with three YOLO heads.

    Args:
        features: batch of images; cast to float32 before use.
        labels: batch of label tensors supplied by the input function.
        mode: a tf.estimator.ModeKeys value.

    Returns:
        A tf.estimator.EstimatorSpec for TRAIN or for evaluation.

    NOTE(review): the loss/metric section is still the TODO below. `loss` and
    `classes` are not defined yet, so the TRAIN and EVAL branches at the end
    raise NameError, and PREDICT mode has no dedicated branch.
    """
    features = tf.cast(features, dtype=tf.float32)

    # Feature extractor: Darknet53
    x = conv_layer(inputs=features, filters=32, trainable=False)
    x = darknet_block(x, 64, 1)
    x = darknet_block(x, 128, 2)
    l_36 = darknet_block(x, 256, 8)
    l_61 = darknet_block(l_36, 512, 8)
    x = darknet_block(l_61, 1024, 4, trainable=True)

    # YOLO model: coarsest scale
    x = conv_layer(x, filters=512, kernel_size=1)
    x = conv_layer(x, filters=1024)
    x = conv_layer(x, filters=512, kernel_size=1)
    x = conv_layer(x, filters=1024)
    l_79 = conv_layer(x, filters=512, kernel_size=1)

    x = conv_layer(l_79, filters=1024)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_1 = yolo_layer(x, anchors=[(116, 90), (156, 198), (373, 326)])

    # Medium scale: upsample l_79 and merge with the l_61 feature map.
    x = conv_layer(l_79, filters=256, kernel_size=1)
    x = tf.image.resize_images(x, (int(x.shape[1]) * 2, int(x.shape[2]) * 2))
    x = tf.concat([x, l_61], axis=-1)
    x = conv_layer(x, filters=256, kernel_size=1)
    x = conv_layer(x, filters=512)
    x = conv_layer(x, filters=256, kernel_size=1)
    x = conv_layer(x, filters=512)
    l_91 = conv_layer(x, filters=256, kernel_size=1)

    # Fix: the detection head must branch from l_91 (as the first head does
    # from l_79); it previously consumed the tensor from before l_91.
    x = conv_layer(l_91, filters=512)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_2 = yolo_layer(x, anchors=[(30, 61), (62, 45), (59, 119)])

    # Fine scale: upsample l_91 and merge with the l_36 feature map.
    x = conv_layer(l_91, filters=128, kernel_size=1)
    x = tf.image.resize_images(x, (int(x.shape[1]) * 2, int(x.shape[2]) * 2))
    x = tf.concat([x, l_36], axis=-1)
    x = conv_layer(x, filters=128, kernel_size=1)
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=128, kernel_size=1)
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=128, kernel_size=1)
    x = conv_layer(x, filters=256)
    x = conv_layer(x, filters=75, kernel_size=1, activation=None, batch_normalize=False)
    o_3 = yolo_layer(x, anchors=[(10, 13), (16, 30), (33, 23)])

    # tf.concat requires an explicit axis; join the three scales along the box axis.
    output = tf.concat([o_1, o_2, o_3], axis=1)

    # Per image, keep at most 6 boxes that survive non-max suppression.
    # NOTE(review): tf.map_fn stacks the per-image results, which only works
    # when every image keeps the same number of boxes -- confirm, or pad the
    # selections to a fixed size.
    bboxes = tf.map_fn(
        lambda boxes: tf.gather(boxes, tf.image.non_max_suppression(boxes[:,:4], boxes[:,4], 6, score_threshold=0.5)),
        output,
        infer_shape=False
    )
    # infer_shape=False drops the static shape; restore the known rank and
    # channel count so the slicing below is well-defined.
    bboxes.set_shape([None, None, output.shape[-1]])

    # Convert corner boxes (y_min, x_min, y_max, x_max) to centre/size form.
    # Tensors do not support item assignment, so a new tensor is assembled.
    y_min = bboxes[:, :, 0]
    x_min = bboxes[:, :, 1]
    y_max = bboxes[:, :, 2]
    x_max = bboxes[:, :, 3]
    centre_size = tf.stack([
        (x_min + x_max) / 2,   # x-center
        (y_min + y_max) / 2,   # y-center
        x_max - x_min,         # width  (fix: was halved)
        y_max - y_min,         # height (fix: was halved)
    ], axis=-1)
    output = tf.concat([centre_size, bboxes[:, :, 4:]], axis=-1)

    # TODO
    """
    classes = tf.where(tf.sigmoid(x) >= 0.5, tf.ones_like(x, dtype=tf.float32), tf.zeros_like(x, dtype=tf.float32))
    correct_prediction = tf.equal(classes, labels)
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    acc = tf.identity(acc, name='accuracy_tensor')
    
    predictions = {'classes': classes, 'accuracy': acc}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=x)
    loss = tf.reduce_mean(loss)
    
    tf.summary.scalar('accuracy', acc)
    tf.summary.scalar('loss', loss)
    """

    # NOTE(review): `loss` and `classes` used below are only defined inside the
    # TODO string above, which is never executed.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.contrib.estimator.TowerOptimizer(tf.train.AdamOptimizer(1e-4))
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    eval_metric_ops = {'accuracy': tf.metrics.accuracy(labels=labels, predictions=classes)}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      eval_metric_ops=eval_metric_ops)


In [None]:
# Log the graph tensor named 'accuracy_tensor' under the tag 'accuracy' every 10 iterations.
# NOTE(review): the only code that names a tensor 'accuracy_tensor' sits inside the
# triple-quoted TODO string in darknet_model and is never executed -- confirm the hook
# can resolve this tensor before training.
tensors_to_log = {'accuracy': 'accuracy_tensor'}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)

In [None]:
# Variables to warm-start from the checkpoint; passed as vars_to_warm_start below.
# NOTE(review): the list is empty -- confirm how WarmStartSettings treats an empty list.
vars_warm = []

In [None]:
# Warm-start settings: initialize the variables listed in vars_warm from the checkpoint at '/tmp/tmpdark/'.
warm_start = tf.estimator.WarmStartSettings(ckpt_to_initialize_from='/tmp/tmpdark/', vars_to_warm_start=vars_warm)

In [None]:
# Estimator around darknet_model, wrapped with replicate_model_fn (darknet_model uses the
# matching TowerOptimizer). Checkpoints are saved every 150 steps; summaries are saved and
# the step count is logged every 10 steps. Model files go to '/tmp/tmpdarkyolo'.
model = tf.estimator.Estimator(
    model_fn=tf.contrib.estimator.replicate_model_fn(darknet_model), model_dir='/tmp/tmpdarkyolo',
    warm_start_from=warm_start, config=tf.estimator.RunConfig(
        save_checkpoints_steps=150, save_summary_steps=10, log_step_count_steps=10
    )
)

In [None]:
# Wrap a ValidationMonitor as a session-run hook for `model`. The monitor is configured to
# evaluate on the validation dataset (element [1] of train_input_fn's result) every 100
# steps, with early_stopping_rounds=10.
# replace_monitors_with_hooks returns a list; [0] picks the single converted hook.
validation_hook = tf.contrib.learn.monitors.replace_monitors_with_hooks(
    [tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=lambda:train_input_fn()[1], every_n_steps=100, early_stopping_rounds=10
    )],
    model
)[0]

In [None]:
# Total training steps: batches in the training portion of the data, times EPOCHS.
# NOTE(review): the `* 8` factor mirrors the one in train_input_fn; its meaning is not
# visible here -- confirm against load_pvoc_data.
max_steps = int(((1 - VALIDATION_SPLIT) * TRAIN_LENGTH * 8 / BATCH_SIZE) * EPOCHS)
# Train on the training dataset (element [0] of train_input_fn's result) with both hooks attached.
model.train(input_fn=lambda:train_input_fn()[0], hooks=[logging_hook, validation_hook],
            max_steps=max_steps)

In [None]:
# Evaluate the trained model on the test dataset and print the returned metrics.
print(model.evaluate(input_fn=test_input_fn))