In [None]:
def colorize_fun(args, ctx):
    """TensorFlowOnSpark map function that trains an image-colorization network.

    Every executor runs this function once, either as a parameter server
    ("ps") or as a worker. A worker reads LAB images from TFRecords, feeds the
    L channel through a VGG-style convolutional stack followed by transposed
    convolutions, and minimises the squared difference between the predicted
    and the ground-truth LAB image.

    Args:
        args: Parsed command-line namespace. This function reads ``rdma``,
            ``vgg``, ``readers``, ``mode``, ``images``, ``output`` and
            ``steps``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``cluster_spec``.
    """
    # Imports are local because this function is shipped to and executed on
    # the Spark executors.
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import os
    import tensorflow as tf
    import time
    # NOTE(review): `hdfs` is not referenced below; the import is kept in case
    # it has initialisation side effects -- confirm and drop if it does not.
    from hops import hdfs

    # Used to get the TensorBoard logdir that shows up in HopsWorks.
    from hops import tensorboard

    def print_log(worker_num, arg):
        """Print ``arg`` preceded by a line identifying the worker."""
        print("%d: " % worker_num)
        print(arg)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict).
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    batch_size = 10
    num_epochs = 10000

    # Get TF cluster and server instances.
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def weight(width, height, input_channels, output_channels, variable_name):
        """Create a 4-D variable initialised from a truncated normal.

        The variable has shape
        ``[width, height, input_channels, output_channels]`` and the initial
        values use a standard deviation of 0.02.
        """
        initial = tf.truncated_normal(
            [width, height, input_channels, output_channels], stddev=0.02)
        return tf.get_variable(variable_name, initializer=initial)

    def bias(output_channels, variable_name):
        """Create a zero-initialised 1-D variable of length ``output_channels``."""
        initial = tf.constant(0.0, shape=[output_channels])
        return tf.get_variable(variable_name, initializer=initial)

    def maxPool(X):
        """Apply 2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool(
            X, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    def conv2dTranspose(X, W, B, output_shape, stride=2):
        """Apply a transposed convolution to ``X`` and add the bias ``B``."""
        # Read this: http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf
        upsampled = tf.nn.conv2d_transpose(
            X, W, output_shape, strides=[1, stride, stride, 1], padding="SAME")
        return tf.nn.bias_add(upsampled, B)

    def generateVGGLayers(input_layer):
        """Build the VGG-style stack on top of ``input_layer``.

        Layer parameters are read from ``args.vgg[0][()]``, indexed by layer
        name and then by ``'kernels'`` / ``'bias'``.

        Returns:
            dict mapping each layer name (``'conv1_2'`` ... ``'relu5_4'``) to
            that layer's output tensor.
        """
        vgg_model = args.vgg[0][()]

        def vggVariable(values, name):
            """Create a float32 variable populated with ``values``."""
            return tf.get_variable(
                name=name,
                initializer=tf.constant_initializer(values, dtype=tf.float32),
                shape=values.shape
            )

        def mat2tf(kernels):
            """Swap the first two axes of ``kernels``."""
            # matconvnet: [width, height, in_channels, out_channels]
            # tensorflow: [height, width, in_channels, out_channels]
            return numpy.transpose(kernels, (1, 0, 2, 3))

        def vggConv(name, X):
            """Stride-1 SAME convolution using the stored parameters of ``name``."""
            kernel_values = mat2tf(vgg_model[name]['kernels'])
            bias_values = vgg_model[name]['bias'].reshape(-1)  # flatten

            kernel_var = vggVariable(kernel_values, "vgg/" + name + "/weight")
            bias_var = vggVariable(bias_values, "vgg/" + name + "/bias")

            conv = tf.nn.conv2d(
                X, kernel_var, strides=[1, 1, 1, 1], padding="SAME")
            return tf.nn.bias_add(conv, bias_var)

        def vggRelu(name, X):
            """ReLU activation named after the layer."""
            return tf.nn.relu(X, name="vgg/" + name + "/relu")

        def vggPool(X):
            """2x2 average pooling with stride 2 and SAME padding."""
            return tf.nn.avg_pool(
                X, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

        # The stack is strictly sequential: every layer consumes the output of
        # the layer listed immediately before it (the first one consumes
        # `input_layer`). The order below is the graph construction order.
        layer_names = (
            # Hidden layer 1
            'conv1_2', 'relu1_2', 'pool1',
            # Hidden layer 2
            'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
            # Hidden layer 3
            'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2',
            'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3',
            # Hidden layer 4
            'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2',
            'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4',
            # Hidden layer 5
            'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2',
            'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4',
        )

        vgg_network = {}
        current = input_layer
        for layer_name in layer_names:
            if layer_name.startswith('conv'):
                current = vggConv(layer_name, current)
            elif layer_name.startswith('relu'):
                current = vggRelu(layer_name, current)
            else:
                current = vggPool(current)
            vgg_network[layer_name] = current

        return vgg_network

    def createNetwork(L):
        """Build the colorization graph.

        Args:
            L: batch of single-channel images (the first convolution expects
                exactly one input channel).

        Returns:
            Tensor named ``colorized_image``: ``L`` concatenated along axis 3
            with the two predicted channels.
        """
        # Input layer
        W = weight(width=3, height=3, input_channels=1, output_channels=64,
                   variable_name="input_layer/weight")
        B = bias(output_channels=64, variable_name="input_layer/bias")
        input_layer = tf.nn.bias_add(
            tf.nn.conv2d(L, W, strides=[1, 1, 1, 1], padding="SAME"), B)
        # The op name used to be misspelled "input_later/relu".
        input_layer = tf.nn.relu(input_layer, name="input_layer/relu")

        # VGG layers
        vgg_network = generateVGGLayers(input_layer)
        # NOTE(review): pool5 is built from relu5_3, so conv5_4/relu5_4 are
        # created but do not feed this path -- confirm that is intended.
        vgg_pool5 = maxPool(vgg_network["relu5_3"])

        # conv2d_transpose filters are laid out as
        # [height, width, output_channels, in_channels], which is why the
        # channel count of the *target* tensor is passed in the third slot of
        # `weight` in the three upsampling steps below.

        # Hidden layer 1 (scale up(vgg_pool5) + vgg_pool4)
        vgg_pool4 = vgg_network["pool4"]
        W1 = weight(4, 4, vgg_pool4.shape[3].value, vgg_pool5.shape[3].value,
                    "hidden_layer/1/weight")
        B1 = bias(vgg_pool4.shape[3].value, "hidden_layer/1/bias")
        conv_trans1 = conv2dTranspose(
            vgg_pool5, W1, B1, output_shape=tf.shape(vgg_pool4))
        hypercolumns = tf.add(conv_trans1, vgg_pool4, name="hidden_layer/1/fuse")

        # Hidden layer 2 (scale up(hidden layer 1) + vgg_pool3)
        vgg_pool3 = vgg_network["pool3"]
        W2 = weight(4, 4, vgg_pool3.shape[3].value, vgg_pool4.shape[3].value,
                    "hidden_layer/2/weight")
        B2 = bias(vgg_pool3.shape[3].value, "hidden_layer/2/bias")
        conv_trans2 = conv2dTranspose(
            hypercolumns, W2, B2, output_shape=tf.shape(vgg_pool3))
        hypercolumns = tf.add(conv_trans2, vgg_pool3, name="hidden_layer/2/fuse")

        # Output layer: scale hidden layer 2 up to the picture size with a
        # stride of 8 (three 2x2 poolings precede pool3) and two channels.
        input_shape = tf.shape(L)
        output_shape = tf.stack(
            [input_shape[0], input_shape[1], input_shape[2], 2])
        W3 = weight(16, 16, 2, vgg_pool3.shape[3].value, "output_layer/weight")
        B3 = bias(2, "output_layer/bias")
        AB = conv2dTranspose(
            hypercolumns, W3, B3, output_shape=output_shape, stride=8)

        # Output LAB values
        return tf.concat([L, AB], 3, name="colorized_image")  # [?, pic_width, pic_height, 3]

    def readTFRecords(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
        """Create a batched input pipeline over the ``part-*`` TFRecords in ``path``.

        Args:
            path: directory containing the ``part-*`` TFRecord files.
            batch_size: number of examples per batch.
            num_epochs: passed to the filename queue (``None`` = unlimited).
            task_index, num_workers: when both are given, the files are
                striped so this task only reads every ``num_workers``-th file
                starting at ``task_index``.

        Returns:
            The result of ``tf.train.batch`` over ``[L, LAB]``, where ``L`` is
            reshaped to ``[256, 256, 1]`` and ``LAB`` to ``[256, 256, 3]``.
        """
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of TFRecord filenames
        tf_record_pattern = os.path.join(path, 'part-*')
        files = tf.gfile.Glob(tf_record_pattern)
        queue_name = "file_queue"

        # Split input files across workers, if specified
        if task_index is not None and num_workers is not None:
            num_files = len(files)
            files = files[task_index:num_files:num_workers]
            queue_name = "file_queue_{0}".format(task_index)

        print_log(worker_num, "files: {0}".format(files))
        file_queue = tf.train.string_input_producer(
            files, shuffle=False, capacity=1000, num_epochs=num_epochs,
            name=queue_name)

        # Setup reader for examples; each channel is stored flattened
        # (65536 = 256 * 256 values).
        reader = tf.TFRecordReader(name="reader")
        _, serialized = reader.read(file_queue)
        feature_def = {
            'L': tf.FixedLenFeature([65536], tf.float32),
            'A': tf.FixedLenFeature([65536], tf.float32),
            'B': tf.FixedLenFeature([65536], tf.float32)
        }

        features = tf.parse_single_example(serialized, feature_def)
        L = tf.reshape(tf.to_float(features['L']), [256, 256, 1])
        print_log(worker_num, "L: {0}".format(L))

        # The A and B channels are multiplied by 128; L is used as stored.
        A = tf.multiply(tf.reshape(tf.to_float(features['A']), [256, 256, 1]), 128)
        print_log(worker_num, "A: {0}".format(A))
        B = tf.multiply(tf.reshape(tf.to_float(features['B']), [256, 256, 1]), 128)
        print_log(worker_num, "B: {0}".format(B))
        LAB = tf.concat([L, A, B], 2)

        # Return a batch of examples
        return tf.train.batch(
            [L, LAB], batch_size, num_threads=args.readers, name="batch")

    def convert2RGB(lab):
        """Map ``lab2rgb`` over a batch of LAB images.

        NOTE(review): currently unused. ``lab2rgb`` is not in scope (its
        ``skimage.color`` import is disabled), so calling this raises
        ``NameError`` until that import is restored.
        """
        return map(lambda x: lab2rgb(x.astype(numpy.float32)), lab)

    def extractImage(LAB):
        """Return the image to display; currently the LAB tensor unchanged."""
        # RGB conversion is disabled: tf.py_func(convert2RGB, [LAB], tf.float32)
        return LAB

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Only the chief writes summaries; other workers keep this as None.
        summary_writer = None

        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
            # Input files are only split across workers for inference.
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            images = TFNode.hdfs_path(ctx, args.images)
            X, Y_ = readTFRecords(images, batch_size, num_epochs, index, workers)

            Y = createNetwork(X)

            # Display pics
            tf.summary.image("bw_img", X)
            tf.summary.image("color_img", extractImage(Y_))
            tf.summary.image("colorized_img", extractImage(Y))

            # Define the loss function. `loss` is averaged over axis 1 only
            # and is therefore not a scalar; the summary logs its mean.
            loss = tf.reduce_mean(tf.squared_difference(Y, Y_), 1)
            tf.summary.scalar("loss", tf.reduce_mean(loss))

            # The step counter is bookkeeping, not a model parameter.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            # Define an optimizer
            train_op = tf.train.AdamOptimizer(0.0001, beta1=0.9).minimize(
                loss, global_step=global_step)

            # Test trained model
            # NOTE(review): argmax over axis 1 of image tensors looks like a
            # leftover from a classification example -- confirm this
            # "accuracy" is meaningful for the colorization task.
            label = tf.argmax(Y_, 1, name="label")
            prediction = tf.argmax(Y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32), name="accuracy")
            tf.summary.scalar("accuracy", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

            # Create a "supervisor", which oversees the training process and
            # stores model state into HDFS.
            logdir = tensorboard.logdir()
            print("tensorflow model path: {0}".format(logdir))

            if task_index == 0:
                summary_writer = tf.summary.FileWriter(
                    logdir, graph=tf.get_default_graph())

            # Summaries are written by hand in the training loop, so the
            # supervisor's own summary op is disabled in both modes.
            supervisor_kwargs = dict(
                is_chief=(task_index == 0),
                logdir=logdir,
                summary_op=None,
                saver=saver,
                global_step=global_step,
                stop_grace_secs=300,
            )
            if args.mode == "train":
                sv = tf.train.Supervisor(init_op=init_op,
                                         summary_writer=None,
                                         save_model_secs=10,
                                         **supervisor_kwargs)
            else:
                sv = tf.train.Supervisor(save_model_secs=0,
                                         **supervisor_kwargs)

            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open(
                "{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

        try:
            # The supervisor takes care of session initialization, restoring
            # from a checkpoint, and closing when done or an error occurs.
            with sv.managed_session(server.target) as sess:
                print("{0} session ready".format(datetime.now().isoformat()))

                step = 0
                if args.mode == "train":
                    # Loop until the supervisor shuts down or `args.steps`
                    # global steps have completed. Training steps run
                    # asynchronously; see `tf.train.SyncReplicasOptimizer` for
                    # details on how to perform *synchronous* training.
                    while not sv.should_stop() and step < args.steps:
                        # using QueueRunners/Readers
                        _, summary, step = sess.run(
                            [train_op, summary_op, global_step])
                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                else:
                    # No per-step work exists for other modes. Looping here
                    # would spin forever because `step` never advances.
                    print("{0} mode '{1}' has no step loop implemented; skipping".format(
                        datetime.now().isoformat(), args.mode))

                # Presumably gives the other workers time to finish before
                # the chief shuts down -- TODO confirm.
                if task_index == 0:
                    time.sleep(60)

                # Ask for all the services to stop.
                print("{0} stopping supervisor".format(datetime.now().isoformat()))
                sv.stop()
        finally:
            # Release the handles that were previously left open; closing the
            # writer also flushes any buffered summary events.
            output_file.close()
            if summary_writer is not None:
                summary_writer.close()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6501,application_1513605045578_4017,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [None]:
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import os
import numpy
import sys
import tensorflow as tf
import threading
from datetime import datetime
from hops import util
from hops import hdfs
import io

from tensorflowonspark import TFCluster

# Driver cell: loads the pretrained VGG parameters, parses the job options and
# launches the TensorFlowOnSpark cluster that runs `colorize_fun` on every
# executor.
sc = spark.sparkContext
num_executors = util.num_executors(spark)
num_ps = util.num_param_servers(spark)


def _str2bool(value):
    """Interpret a command-line value as a boolean.

    Without a converter argparse keeps the raw string, so ``-c False`` used to
    evaluate as true. Explicit "false" spellings are now recognised; any other
    value remains true, as before.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


# Read the .npy file from HDFS as raw bytes and deserialise it with NumPy.
# `take(1)` returns a one-element list, which is what `colorize_fun` indexes
# via `args.vgg[0]`.
# SECURITY: `allow_pickle=True` unpickles the file contents, which can execute
# arbitrary code -- only load model files from a trusted location. The flag is
# required on NumPy >= 1.16.3 for object arrays such as this one.
model_location = "hdfs:///Projects/colorizeML2/imagenet_vgg/vgg_model.npy"
model = (
    sc.binaryFiles(model_location)
    .map(lambda path_and_bytes: numpy.load(io.BytesIO(path_and_bytes[1]), allow_pickle=True))
    .take(1)
)

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="csv")
parser.add_argument("-i", "--images", help="HDFS path to the preprocessed images (TFRecord part-* files)", default='/Projects/' + hdfs.project_name() + '/imbd_face_dataset/processed')
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format", default = '/Projects/' + hdfs.project_name() + '/mnist/train/labels')
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=4000000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", type=_str2bool, default=False)
# The loaded model travels to the executors as part of `args`.
parser.add_argument("-v", "--vgg", default= model)
args = parser.parse_args()
print("args:",args)

print("{0} ===== Start".format(datetime.now().isoformat()))

cluster = TFCluster.run(sc, colorize_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))