In [1]:
import os.path
import warnings
import tensorflow as tf
import glob
import project_tests as ptests
import helper

In [2]:
# Filesystem layout used by the notebook (all paths are relative to the
# working directory the kernel was started in).
data_dir = './data'
runs_dir = './runs'

# Sub-directories below data_dir
training_dir = '/'.join([data_dir, 'data_road', 'training'])
vgg_dir = '/'.join([data_dir, 'vgg'])

# Number of entries matching "*.*" in the training "calib" folder
# (0 when the folder does not exist or is empty)
training_size = len(glob.glob('/'.join([training_dir, 'calib', '*.*'])))

In [3]:
# Require TensorFlow 1.0 or newer before going any further
from distutils.version import LooseVersion

tf_version = tf.__version__
assert LooseVersion(tf_version) >= LooseVersion('1.0'), \
    'Assertion Failed. Tensorflow > 1.0 current version is {}'.format(tf_version)

# Only reached when the version requirement is satisfied
print('Assertion Successful: TF Version: {}'.format(tf_version))

Assertion Successful: TF Version: 1.0.0


In [4]:
# Report the default GPU, or warn when TensorFlow cannot see one
from tensorflow import test as tft

gpu_name = tft.gpu_device_name()
if gpu_name:
    print('Default GPU : {}'.format(gpu_name))
else:
    warnings.warn('GPU not found... Please reconsider working with a GPU for training')

Default GPU : /gpu:0


In [5]:
# ---- Model / data constants ----
num_classes_ = 2            # number of output classes per pixel
img_shape_ = (160, 576)     # image shape; used as the two middle dims of label_ph

# ---- Training schedule ----
epochs_ = 20
batch_size_ = 1

# ---- Optimisation hyper-parameters ----
learning_rate_ = 1e-4
# NOTE: despite its name, this value is fed as the *keep* probability
# during training (see train_nn).
dropout_ = 0.75

In [6]:
# Graph placeholders (suffix _ph)

# Label images: [batch, img_shape_[0], img_shape_[1], num_classes_]
label_ph = tf.placeholder(tf.float32,
                          [None] + list(img_shape_) + [num_classes_])

# Scalar-like placeholder for the optimizer learning rate
learning_rate_ph = tf.placeholder(tf.float32)

# Placeholder for the dropout keep probability
keep_prob_ph = tf.placeholder(tf.float32)

In [7]:
# Per-epoch mean training losses; train_nn appends one entry per epoch
all_training_losses = list()

In [8]:
def load_vgg(sess, vgg_dir):
    """
    Load the pretrained VGG SavedModel into `sess` and look up its key tensors.

    @param sess:  Tf Session the model is loaded into
    @param vgg_dir: Directory containing vgg "variables/" and "saved_model.pb"
    return: tuple (image_input, keep_prob, layer3_out, layer4_out, layer7_out)
            of tensors fetched by name from the default graph
    """
    # Restore the graph definition and weights tagged 'vgg16'
    tf.saved_model.loader.load(sess, ['vgg16'], vgg_dir)

    # Names of the tensors to hand back, in return order:
    # input image, keep probability, then the three layer outputs
    wanted = ('image_input:0',
              'keep_prob:0',
              'layer3_out:0',
              'layer4_out:0',
              'layer7_out:0')

    default_graph = tf.get_default_graph()
    return tuple(default_graph.get_tensor_by_name(tensor_name)
                 for tensor_name in wanted)

In [9]:
def conv_1x1(layer, layer_name):
    """
    Apply a 1x1, stride-1 convolution to `layer`.

    The number of output filters is the module-level `num_classes_`.

    layer: input tensor
    layer_name: name given to the created conv layer
    return: output tensor of tf.layers.conv2d
    """
    unit = (1, 1)  # used for both the kernel size and the strides
    return tf.layers.conv2d(inputs=layer,
                            filters=num_classes_,
                            kernel_size=unit,
                            strides=unit,
                            name=layer_name)

In [10]:
def deconvolve(layer, k, s, layer_name):
    """
    Apply a transposed convolution ("deconvolution") to `layer`.

    The number of output filters is the module-level `num_classes_` and
    padding is always 'same'.

    layer: input tensor
    k: kernel edge length; the kernel is (k, k)
    s: stride along both spatial axes; strides are (s, s)
    layer_name: name given to the created layer
    return: output tensor of tf.layers.conv2d_transpose
    """
    kernel = (k, k)
    stride = (s, s)
    return tf.layers.conv2d_transpose(inputs=layer,
                                      filters=num_classes_,
                                      kernel_size=kernel,
                                      strides=stride,
                                      padding='same',
                                      name=layer_name)

In [11]:
def layers(vgg_layer_3_out, vgg_layer_4_out, vgg_layer_7_out, num_classes = num_classes_):
    """
    Create the decoder layers of the FCN on top of the VGG outputs.

    vgg_layer_n_out: TF Tensor for VGG Layer n output
    num_classes: Number of classes to classify.
                 NOTE: currently not forwarded -- conv_1x1 and deconvolve
                 always use the module-level num_classes_ for their filters.
    return: The Tensor for the last layer of output
    """
    # Apply a 1x1 convolution to all argument layers
    layer_3x = conv_1x1(layer = vgg_layer_3_out, layer_name = "layer3conv1x1")
    layer_4x = conv_1x1(layer = vgg_layer_4_out, layer_name = "layer4conv1x1")
    layer_7x = conv_1x1(layer = vgg_layer_7_out, layer_name = "layer7conv1x1")

    # Add decoder layers to the network with skip connections
    # Deconvolve (stride 2) the 1x1-convolved layer 7
    decoder_layer_1 = deconvolve(layer = layer_7x, k = 4, s = 2, layer_name = "decoderlayer1")

    # Sum (skip connection) with the 1x1-convolved layer 4
    decoder_layer_2 = tf.add(decoder_layer_1, layer_4x, name = "decoderlayer2")

    # Deconvolve (stride 2)
    decoder_layer_3 = deconvolve(layer = decoder_layer_2, k = 4, s = 2, layer_name = "decoderlayer3")

    # Sum (skip connection) with the 1x1-convolved layer 3
    decoder_layer_4 = tf.add(decoder_layer_3, layer_3x, name = "decoderlayer4")

    # Final deconvolution (stride 8).
    # Fixed: the input was previously the undefined name `decoderlayer4`.
    decoderlayer_output = deconvolve(layer = decoder_layer_4, k = 16, s = 8, layer_name = "decoderlayer_output")

    return decoderlayer_output

In [12]:
def layers_verbose(vgg_layer_3_out, vgg_layer_4_out, vgg_layer7_out, num_classes = num_classes_):
    """
    Create the decoder layers of the FCN and return every intermediate tensor.

    Builds the same layer sequence (and layer names) as `layers`, but returns
    all tensors so their shapes can be inspected.

    vgg_layer_3_out, vgg_layer_4_out, vgg_layer7_out: TF Tensors for the VGG
        layer 3 / 4 / 7 outputs
    num_classes: Number of classes to classify.
                 NOTE: currently not forwarded -- conv_1x1 and deconvolve
                 always use the module-level num_classes_ for their filters.
    return: tuple of 11 tensors, in order:
            the three VGG inputs, the three 1x1 convolutions,
            decoder layers 1-4 and the final decoder output
    """
    # Apply a 1x1 convolution to encoder layers
    layer3x = conv_1x1(layer = vgg_layer_3_out, layer_name = "layer3conv1x1")
    layer4x = conv_1x1(layer = vgg_layer_4_out, layer_name = "layer4conv1x1")
    layer7x = conv_1x1(layer = vgg_layer7_out, layer_name = "layer7conv1x1")

    # Add decoder layers to the network with skip connections
    # Deconvolve (fixed: was the undefined name `layer_7x`)
    decoder_layer_1 = deconvolve(layer = layer7x, k = 4, s = 2, layer_name = "decoderlayer1")

    # Sum (skip connection) (fixed: was the undefined name `layer_4x`)
    decoder_layer_2 = tf.add(decoder_layer_1, layer4x, name = "decoderlayer2")

    # Deconvolve
    decoder_layer_3 = deconvolve(layer = decoder_layer_2, k = 4, s = 2, layer_name = "decoderlayer3")

    # Sum (skip connection)
    decoder_layer_4 = tf.add(decoder_layer_3, layer3x, name = "decoderlayer4")

    # Deconvolve (fixed: was the undefined name `decoderlayer4`)
    decoderlayer_output = deconvolve(layer = decoder_layer_4, k = 16, s = 8, layer_name = "decoderlayer_output")

    # Return all the layers for a more detailed output
    # (fixed: third element was the undefined name `vgg_layer_7_out`)
    return vgg_layer_3_out, vgg_layer_4_out, vgg_layer7_out, layer3x, layer4x, layer7x, \
         decoder_layer_1, decoder_layer_2, decoder_layer_3, decoder_layer_4, decoderlayer_output

In [13]:
def optimize(nn_last_layer, correct_label, learning_rate, num_classes = num_classes_):
    """
    Build the loss and the training operation.

    nn_last_layer: last layer tensor of the network
    correct_label: label image placeholder
    learning_rate: learning rate placeholder
    num_classes: Number of classes to classify
    return: tuple (logits, train_op, cross_entropy_loss)
    """
    # Both the network output and the labels are flattened to
    # 2D (rows, num_classes) so each row holds the scores for one position
    flat_shape = (-1, num_classes)
    logits = tf.reshape(nn_last_layer, flat_shape)
    flat_labels = tf.reshape(correct_label, flat_shape)

    # Softmax cross entropy per row, averaged into a single scalar loss
    per_row_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                           labels=flat_labels)
    cross_entropy_loss = tf.reduce_mean(per_row_loss)

    # Adam minimises the averaged loss
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.minimize(cross_entropy_loss)

    return logits, train_op, cross_entropy_loss

In [14]:
def train_nn(sess, epochs, batch_size, get_batches_fn, train_op,
             cross_entropy_loss, input_image,
             correct_label, keep_prob, learning_rate):
    """
    Train the neural network and provide debug prints during training.

    Side effect: appends the mean loss of every epoch that produced at least
    one batch to the module-level list `all_training_losses`.

    Arguments:
    sess: TF Session
    epochs: Number of epochs
    batch_size: Batch size
    get_batches_fn: Function to get batches of training data; called as
                    get_batches_fn(batch_size) and iterated for
                    (images, labels) pairs
    train_op: training operation
    cross_entropy_loss: Loss Tensor
    input_image: TF Placeholder for input images
    correct_label: TF Placeholder for label images
    keep_prob: TF Placeholder for dropout keep probability
    learning_rate: TF Placeholder for learning rate

    The values fed for keep_prob and learning_rate are the module-level
    constants `dropout_` and `learning_rate_`.
    """
    # Fixed: the loop bounds were the undefined names EPOCHS / BATCH_SIZE;
    # the function's own arguments are used instead.
    for epoch in range(epochs):
        losses = []

        # One training step per batch; i is the 1-based batch counter
        for i, (images, labels) in enumerate(get_batches_fn(batch_size), start = 1):

            # Fixed: feed the placeholders that were passed in rather than the
            # module-level label_ph / keep_prob_ph / learning_rate_ph, so the
            # tensors actually wired into train_op receive the values.
            feed = {input_image: images,
                    correct_label: labels,
                    keep_prob: dropout_,
                    learning_rate: learning_rate_}

            # Run the training op with the created feed
            _, partial_loss = sess.run([train_op, cross_entropy_loss], feed_dict = feed)

            # display output
            print("- - - - - >Iteration: ", i, "----->Partial loss:", partial_loss)

            # Add to list of losses
            losses.append(partial_loss)

        # Guard against an empty batch generator (e.g. missing training data),
        # which previously raised ZeroDivisionError below.
        if not losses:
            warnings.warn('No training batches were produced in epoch {}; '
                          'skipping loss computation'.format(epoch + 1))
            continue

        # After each epoch compute the average loss over its batches
        training_loss = sum(losses) / len(losses)

        # Add to list of global training losses
        all_training_losses.append(training_loss)

        # Print Training loss at end of each Epoch
        print("***************")
        print("Epoch: ", epoch + 1, " of ", epochs, "training loss: ", training_loss)
        print("***************")


In [15]:
def run_tests():
    """
    Run the project tests against the functions defined in this notebook.

    Uses the `project_tests` module, imported at the top as `ptests`
    (the previous code referenced an undefined name `tests`).
    """
    ptests.test_layers(layers)
    ptests.test_optimize(optimize)
    ptests.test_for_kitti_dataset(data_dir)
    ptests.test_train_nn(train_nn)

In [16]:
def run():
    """
    Build the network on top of the pretrained VGG model, train it and
    save inference samples.

    Reads the module-level paths, constants and placeholders
    (data_dir, runs_dir, training_dir, vgg_dir, img_shape_, num_classes_,
    epochs_, batch_size_, label_ph, learning_rate_ph).
    """
    print("Training data size", training_size)

    # download vgg model if it doesnt exist
    helper.maybe_download_pretrained_vgg(data_dir)

    # use the get batches function from the helper.py provided
    get_batches_fn = helper.gen_batch_function(training_dir, img_shape_)

    # Using the default session
    with tf.Session() as session:

        # Returns the input, keep-probability and output layers from vgg
        image_input, keep_prob, layer_3, layer_4, layer_7 = load_vgg(session, vgg_dir)

        # Create the layers and get the output
        model_output = layers(layer_3, layer_4, layer_7, num_classes_)

        # Get the logits, training op and the loss
        logits, train_op, cross_entropy_loss = optimize(model_output, label_ph, learning_rate_ph, num_classes_)

        # Initialize all variables
        session.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

        # Run the training step.
        # Fixed: the last argument was the undefined name `learning_rate`;
        # the optimizer was built on learning_rate_ph, so that is what is fed.
        train_nn(session, epochs_, batch_size_, get_batches_fn,
                 train_op, cross_entropy_loss, image_input,
                 label_ph, keep_prob, learning_rate_ph)

        # Save inference data.
        # Fixed: pass the VGG `keep_prob` tensor returned by load_vgg instead
        # of the module-level keep_prob_ph, which no op in this graph consumes.
        helper.save_inference_samples(runs_dir, data_dir, session, img_shape_, logits, keep_prob, image_input)

In [17]:
import numpy as np 

def network_shapes():
    """
    Print the output shape of every layer of the network.

    Loads VGG, builds the verbose decoder, pushes one random
    (1, 160, 576, 3) input through the graph and prints the shape of each
    evaluated tensor.
    """
    with tf.Session() as sess:
        # Create a random 3 channel input
        x = np.random.randn(1, 160, 576, 3)

        # Create inputs, keep probability and vgg out layers
        image_input, keep_prob, layer_3, layer_4, layer_7 = load_vgg(sess, vgg_dir)

        # Create verbose layers
        op = layers_verbose(layer_3, layer_4, layer_7, num_classes_)

        # initialize the variables
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

        # Evaluate every layer tensor for the random input.
        # Fixed: feed the VGG `keep_prob` tensor returned by load_vgg rather
        # than the module-level keep_prob_ph, which is not part of the model.
        l3, l4, l7, l3x, l4x, l7x, d1, s2, d3, s4, d5 = sess.run(op, feed_dict = {image_input: x, keep_prob: 1.0})

        print("------------------")
        print("shapes of layers:")
        print("------------------")

        # (label, evaluated array) pairs in network order
        report = [
            ("layer3 -->", l3),
            ("layer4 -->", l4),
            ("layer7 -->", l7),
            ("layer3 conv1x1 -->", l3x),
            ("layer4 conv1x1 -->", l4x),
            ("layer7 conv1x1-->", l7x),
            ("decoderlayer1 transpose: layer7 k = 4 s = 2 -->", d1),
            ("decoderlayer2 skip: decoderlayer1 and layer4conv1x1 -->", s2),
            ("decoderlayer3 transpose: decoderlayer2 k = 4 s = 2 -->", d3),
            ("decoderlayer4 skip: decoderlayer3 and layer3conv1x1 -->", s4),
            ("decoderlayer5 transpose: decoderlayer4 k = 16 s = 8 -->", d5),
        ]
        for label, value in report:
            print(label, value.shape)

In [19]:
run()

Training data size 0


ResourceExhaustedError: OOM when allocating tensor with shape[7,7,512,4096]
	 [[Node: save/Assign_27 = Assign[T=DT_FLOAT, _class=["loc:@fc6/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](fc6/weights, save/RestoreV2_27/_7)]]

Caused by op 'save/Assign_27', defined at:
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2827, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-157c9bda2cd6>", line 1, in <module>
    run()
  File "<ipython-input-16-c8bbae680200>", line 15, in run
    image_input, keep_prob, layer_3, layer_4, layer_7 = load_vgg(session, vgg_dir)
  File "<ipython-input-8-ea30449bfaaf>", line 9, in load_vgg
    model = tf.saved_model.loader.load(sess, ['vgg16'], vgg_dir)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/saved_model/loader_impl.py", line 212, in load
    saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1577, in import_meta_graph
    **kwargs)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/framework/meta_graph.py", line 498, in import_scoped_meta_graph
    producer_op_list=producer_op_list)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/framework/importer.py", line 287, in import_graph_def
    op_def=op_def)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/sahilmalhotra/anaconda/envs/tf-lab/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[7,7,512,4096]
	 [[Node: save/Assign_27 = Assign[T=DT_FLOAT, _class=["loc:@fc6/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](fc6/weights, save/RestoreV2_27/_7)]]
