In [1]:
% pylab inline
from numpy import linalg as LA
import glob
from tqdm import tqdm
import os
import sklearn.preprocessing as prep
import pickle
import joblib
import tensorflow as tf
import pandas as pd
import os
#os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
#os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# Image geometry; only used to derive INPUT_DIM (the flattened per-image size).
IMAGE_WIDTH = 256
IMAGE_HEIGHT = 256
IMAGE_CHANNELS = 3
# Number of examples per minibatch (used by make_dataset and passed to VAE).
BATCH_SIZE = 8

# Optimiser step size, number of passes over the training set, and the size
# of the VAE latent vector.
LEARNING_RATE = 1e-4
N_EPOCHS = 100
N_LATENT = 100
# Directory that receives the loss-history pickles and model checkpoints.
# NOTE(review): hard-coded absolute path; it is created on import of this cell.
CHECKPOINT_DIR = '/Z/personal-folders/interns/saket/vae_patches_train_valid_nlatent100'    
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# Length of one flattened image vector: channels * width * height.
INPUT_DIM = IMAGE_CHANNELS*IMAGE_WIDTH*IMAGE_HEIGHT


def min_max_scale(X):
    """Return ``X`` rescaled by a ``MinMaxScaler`` fitted on ``X`` itself.

    A new scaler is created on every call, fitted to the given data and then
    applied to that same data; the fitted scaler is not returned.
    """
    scaler = prep.MinMaxScaler()
    scaler.fit(X)
    return scaler.transform(X)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# TensorFlow session configuration; VAE.__init__ reads this module-level name
# when it creates its session.
# Alternative kept for reference (sets the GPU device count to 0):
#config = tf.ConfigProto(
#    device_count = {'GPU': 0}
#)
config = tf.ConfigProto()
# Limit this process to a 0.3 fraction of GPU memory.
# NOTE(review): the OOM recorded in this notebook's output happened under
# this cap while creating the optimiser slots -- confirm the budget is enough.
config.gpu_options.per_process_gpu_memory_fraction = 0.3
# Make only device '1' visible to TensorFlow; inside the process it then shows
# up as GPU:0 (as seen in the traceback recorded below).
config.gpu_options.visible_device_list = '1' 

class VAE(object):
    """Fully connected variational autoencoder (TensorFlow 1.x graph API).

    The encoder is three 256-unit ELU layers followed by a linear layer that
    emits ``2 * n_latent`` values: the first half is the latent mean ``mu``,
    the second half is passed through softplus to give a positive standard
    deviation ``sigma``. The decoder mirrors the encoder and ends in a
    sigmoid layer of width ``input_dim``.

    The reconstruction term is a Bernoulli (binary cross-entropy)
    log-likelihood, so inputs fed to ``self.x`` are expected to lie in
    ``[0, 1]``.

    Notes:
        * The graph is built in the current default graph and the session is
          created with the module-level ``config`` object. Constructing a
          second ``VAE`` in the same graph adds a second set of variables.
        * ``batch_size`` is stored on the instance but is not used when
          building the graph (the placeholder's batch axis is ``None``).

    Args:
        input_dim: Length of each flattened input vector.
        learning_rate: Step size for the Adam optimiser.
        n_latent: Dimensionality of the latent vector ``z``.
        batch_size: Stored as ``self.batch_size``; informational only.
    """

    def __init__(self,
                 input_dim,
                 learning_rate=0.01,
                 n_latent=8,
                 batch_size=50):
        self.learning_rate = learning_rate
        self.n_latent = n_latent
        self.batch_size = batch_size
        self.input_dim = input_dim

        self._build_network()
        self._create_loss_optimizer()

        # The initialiser must be created after the optimiser so that Adam's
        # slot variables are included.
        init = tf.global_variables_initializer()
        # Launch the session
        self.session = tf.Session(config=config)
        self.session.run(init)
        self.saver = tf.train.Saver(tf.global_variables())

    def close_session(self):
        """Close the TensorFlow session owned by this model."""
        self.session.close()

    def _build_network(self):
        """Create the encoder, the reparameterised sample and the decoder.

        Sets ``self.x`` (input placeholder), ``self.mu``, ``self.sigma``,
        ``self.z`` and ``self.reconstructed``.
        """
        self.x = tf.placeholder(tf.float32,
                                [None, self.input_dim])

        # ---- encoder -----------------------------------------------------
        dense1 = tf.layers.dense(activation=tf.nn.elu,
                                 inputs=self.x,
                                 units=256)
        dense2 = tf.layers.dense(activation=tf.nn.elu,
                                 inputs=dense1,
                                 units=256)
        dense3 = tf.layers.dense(activation=tf.nn.elu,
                                 inputs=dense2,
                                 units=256)
        # Linear head producing [mu | pre-softplus sigma].
        dense4 = tf.layers.dense(activation=None,
                                 inputs=dense3,
                                 units=self.n_latent * 2)
        self.mu = dense4[:, :self.n_latent]
        # softplus keeps the standard deviation strictly positive.
        self.sigma = tf.nn.softplus(dense4[:, self.n_latent:])

        # ---- reparameterisation: z = mu + sigma * eps, eps ~ N(0, I) -----
        eps = tf.random_normal(shape=tf.shape(self.sigma),
                               mean=0,
                               stddev=1,
                               dtype=tf.float32)
        self.z = self.mu + self.sigma * eps

        # ---- decoder -----------------------------------------------------
        ddense1 = tf.layers.dense(activation=tf.nn.elu,
                                  inputs=self.z,
                                  units=256)
        ddense2 = tf.layers.dense(activation=tf.nn.elu,
                                  inputs=ddense1,
                                  units=256)
        ddense3 = tf.layers.dense(activation=tf.nn.elu,
                                  inputs=ddense2,
                                  units=256)

        self.reconstructed = tf.layers.dense(activation=tf.nn.sigmoid,
                                             inputs=ddense3,
                                             units=self.input_dim)

    def _create_loss_optimizer(self):
        """Define the reconstruction and KL losses and the Adam train op.

        Sets ``self.reconstruction_loss``, ``self.latent_loss``,
        ``self.cost`` and ``self.optimizer``; each loss is a scalar averaged
        over the batch.
        """
        # Guards the logarithms against log(0).
        epsilon = 1e-10

        # Bernoulli negative log-likelihood, summed over input dimensions.
        reconstruction_loss = -tf.reduce_sum(
            self.x * tf.log(epsilon + self.reconstructed)
            + (1 - self.x) * tf.log(epsilon + 1 - self.reconstructed),
            axis=1
        )
        self.reconstruction_loss = tf.reduce_mean(reconstruction_loss)

        # KL(N(mu, sigma^2) || N(0, I))
        #   = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2).
        # self.sigma is a standard deviation, so the log term must use
        # sigma squared; using log(sigma) halves that term and yields a
        # quantity that is not the KL divergence.
        latent_loss = -0.5 * tf.reduce_sum(
            1 + tf.log(epsilon + tf.square(self.sigma))
            - tf.square(self.mu) - tf.square(self.sigma),
            axis=1)
        self.latent_loss = tf.reduce_mean(latent_loss)

        # Both terms are already scalars, so the total is a plain sum.
        self.cost = self.reconstruction_loss + self.latent_loss
        # ADAM optimizer
        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

    def fit_minibatch(self, batch):
        """Run one optimiser step on ``batch``.

        Args:
            batch: Array fed to the ``[None, input_dim]`` input placeholder.

        Returns:
            Tuple ``(cost, reconstruction_loss, latent_loss)`` evaluated in
            the same ``session.run`` call as the update.
        """
        _, cost, reconstruction_loss, latent_loss = self.session.run(
            [self.optimizer,
             self.cost,
             self.reconstruction_loss,
             self.latent_loss],
            feed_dict={self.x: batch})
        return cost, reconstruction_loss, latent_loss

    def reconstruct(self, x):
        """Encode and decode ``x``; returns a one-element list with the output."""
        return self.session.run([self.reconstructed], feed_dict={self.x: x})

    def decoder(self, z):
        """Decode latent vectors ``z``; returns a one-element list.

        ``z`` is fed directly into the ``self.z`` tensor, bypassing the
        encoder.
        """
        return self.session.run([self.reconstructed], feed_dict={self.z: z})

    def encoder(self, x):
        """Return a one-element list holding a sampled latent ``z`` for ``x``.

        The value is ``mu + sigma * eps`` with fresh noise on every call, so
        repeated calls on the same input give different results.
        """
        return self.session.run([self.z], feed_dict={self.x: x})

    def save_model(self, checkpoint_path, epoch):
        """Write a checkpoint to ``checkpoint_path`` tagged with ``epoch``."""
        self.saver.save(self.session, checkpoint_path, global_step=epoch)

    def load_model(self, checkpoint_dir):
        """Restore the latest checkpoint listed in ``checkpoint_dir``.

        Raises:
            ValueError: If ``checkpoint_dir`` contains no checkpoint state.
        """
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=checkpoint_dir,
                                             latest_filename='checkpoint')
        # get_checkpoint_state returns None when no checkpoint file exists;
        # fail with a clear message instead of an AttributeError on None.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise ValueError('no checkpoint found in {}'.format(checkpoint_dir))
        print('loading model: {}'.format(ckpt.model_checkpoint_path))
        self.saver.restore(self.session, ckpt.model_checkpoint_path)

In [3]:
# Tab-separated tables listing the training and validation examples.
# NOTE(review): hard-coded absolute paths.
train_df_file = '/Z/personal-folders/interns/saket/github/pywsi/data/patch_df/train_df_with_mask.tsv'
valid_df_file = '/Z/personal-folders/interns/saket/github/pywsi/data/patch_df/validate_df_with_mask.tsv'
# Load the training table with pandas; in the cells visible here it is only
# used to look at the column names.
train_df = pd.read_table(train_df_file)
train_df.columns  # show the column names; make_dataset selects columns by position

Index(['is_tissue', 'is_tumor', 'json_filepath', 'slide_path', 'slide_type',
       'tile_loc', 'uid', 'img_path', 'mask_path'],
      dtype='object')

In [4]:
# Re-declares the same two table paths, with identical values, as the
# previous cell.
train_df_file = '/Z/personal-folders/interns/saket/github/pywsi/data/patch_df/train_df_with_mask.tsv'
valid_df_file = '/Z/personal-folders/interns/saket/github/pywsi/data/patch_df/validate_df_with_mask.tsv'


def preprocess(image):
    """Scale pixel intensities by 1/255 so that 0..255 maps onto ``[0, 1]``.

    The VAE in this notebook has a sigmoid output layer and a Bernoulli
    (binary cross-entropy) reconstruction loss. That loss is only a valid
    objective when its targets lie in ``[0, 1]``; the previous
    ``image/255.0 - 0.5`` scaling produced targets in ``[-0.5, 0.5]``, for
    which the loss is unbounded below and the sigmoid output can never match
    the target.

    Args:
        image: Numeric array (or scalar) of pixel values.
            Assumes values are in 0..255 -- TODO confirm against the stored
            patch files.

    Returns:
        ``image / 255.0`` (floating point).
    """
    return image / 255.0

def _read_py_function(label, filename):
    """Load one stored image and parse its label (runs inside ``tf.py_func``).

    Args:
        label: Text of the label field, as ``bytes`` or ``str``. It must be a
            Python literal such as ``True``/``False`` or ``0``/``1``.
        filename: Path of a file readable by ``joblib.load``.

    Returns:
        Tuple ``(np.int32 label, float32 image array)``; the image has been
        passed through ``preprocess``.

    Raises:
        ValueError / SyntaxError: If the label text is not a valid literal.
    """
    # Local import: only this function needs it.
    import ast

    # SECURITY NOTE: joblib.load is pickle-based and can execute arbitrary
    # code; only point this at trusted files.
    image_decoded = joblib.load(filename)
    image_decoded = preprocess(image_decoded)

    # Values handed over by tf.py_func arrive as bytes; literal_eval needs str.
    if isinstance(label, bytes):
        label = label.decode('utf-8')
    # literal_eval accepts only Python literals, unlike eval(), which would
    # execute any expression found in the data file.
    label_value = ast.literal_eval(label.strip())
    return np.int32(label_value), image_decoded.astype(np.float32)

def _resize_function(label, image_decoded):
    """Flatten a decoded image into rows of ``INPUT_DIM`` float32 values.

    The image is reshaped to ``(-1, INPUT_DIM)``, so an input holding exactly
    ``INPUT_DIM`` elements comes out with shape ``(1, INPUT_DIM)``. The label
    is cast to int32.

    Returns:
        Tuple ``(int32 label tensor, float32 image tensor)``.
    """
    flat_image = tf.cast(tf.reshape(image_decoded, (-1, INPUT_DIM)), tf.float32)
    int_label = tf.cast(label, tf.int32)
    return int_label, flat_image


def make_dataset(df):
    """Build a shuffled, batched ``tf.data`` pipeline from a patch table file.

    Args:
        df: Path to a tab-separated file with a header row. Despite the
            parameter name this is a filename, not a DataFrame; it is passed
            straight to ``CsvDataset``.

    Returns:
        A dataset yielding ``(label, image)`` batches of ``BATCH_SIZE``
        elements, where each element is produced by ``_read_py_function``
        followed by ``_resize_function``.
    """
    # Both selected fields are read as strings.
    record_defaults = [[''], ['']]
    # Columns are chosen by position: 1 and 7 are `is_tumor` and `img_path`
    # in the column listing printed for the training table in this notebook.
    select_cols = [1, 7]
    dataset = tf.contrib.data.CsvDataset(df,
                                         record_defaults,
                                         header=True,
                                         field_delim='\t',
                                         select_cols=select_cols)
    # Shuffle the lightweight (label, path) rows *before* loading images.
    # Shuffling after the map would keep up to 10000 decoded float32 images
    # in the buffer (about 7.9 GB if each holds INPUT_DIM = 256*256*3
    # values) and would have to fill that buffer before the first batch.
    dataset = dataset.shuffle(buffer_size=10000)
    # tf.py_func returns a list; tuple() turns it into the (label, image)
    # structure the next map expects.
    dataset = dataset.map(lambda is_tumor, img_path: tuple(tf.py_func(_read_py_function,
                                                                      [is_tumor, img_path],
                                                                      [np.int32, np.float32])))
    dataset = dataset.map(_resize_function)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset

In [5]:
# Build one input pipeline per split from the table files.
training_dataset = make_dataset(train_df_file)
# NOTE(review): validation_dataset is only referenced by the commented-out
# initialiser below.
validation_dataset = make_dataset(valid_df_file)
# NOTE(review): this one-shot iterator is not referenced by any other cell
# visible in this notebook.
training_iterator = training_dataset.make_one_shot_iterator()

# Reinitialisable iterator defined by the dataset's types and shapes; the
# training loop fetches batches from it.
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)
# Running this op points `iterator` at the training dataset; the training
# loop runs it at the start of every epoch.
training_init_op = iterator.make_initializer(training_dataset)
# Counterpart for the validation split, currently disabled:
#validation_init_op = iterator.make_initializer(validation_dataset)



In [6]:
# Train the VAE on the training pipeline, recording per-minibatch losses and
# checkpointing periodically.
model = VAE(input_dim=INPUT_DIM,
            learning_rate=LEARNING_RATE,
            n_latent=N_LATENT,
            batch_size=BATCH_SIZE)
# One entry per minibatch, accumulated across all epochs.
total_losses = []
reconstruction_losses = []
latent_losses = []
# The input pipeline lives in the same graph as the model, so the model's
# session is used to drive the iterator as well.
sess = model.session
training_next_batch = iterator.get_next()

# Output locations are loop-invariant, so compute them once.
total_losses_path = os.path.join(CHECKPOINT_DIR, 'total_losses.pickle')
latent_losses_path = os.path.join(CHECKPOINT_DIR, 'latent_losses.pickle')
reconstruction_losses_path = os.path.join(CHECKPOINT_DIR, 'reconstruction_losses.pickle')
checkpoint_path = os.path.join(CHECKPOINT_DIR, 'model.ckpt')

for epoch in range(N_EPOCHS):
    # Rewind the iterator to the start of the training data.
    sess.run(training_init_op)
    n_batches = 0
    while True:
        try:
            training_label_batch, training_image_batch = sess.run(training_next_batch)
        except tf.errors.OutOfRangeError:
            # Dataset exhausted: this epoch is finished.
            break
        # The pipeline reshapes each image to (-1, INPUT_DIM) before
        # batching, which can leave an extra axis on the batch; the model's
        # placeholder is 2-D, so flatten to (batch, INPUT_DIM) here.
        input_batch = training_image_batch.reshape(-1, INPUT_DIM)
        total_loss, reconstruction_loss, latent_loss = model.fit_minibatch(input_batch)
        latent_losses.append(latent_loss)
        reconstruction_losses.append(reconstruction_loss)
        total_losses.append(total_loss)
        n_batches += 1

    if n_batches == 0:
        # Without this the logging below would fail with a NameError on the
        # undefined loss variables.
        raise RuntimeError('training dataset produced no batches')

    # Persist the loss history once per epoch. Re-pickling the full, growing
    # lists after every minibatch made the I/O cost quadratic in the number
    # of training steps.
    joblib.dump(total_losses, total_losses_path)
    joblib.dump(latent_losses, latent_losses_path)
    joblib.dump(reconstruction_losses, reconstruction_losses_path)

    # Checkpoint every fifth epoch and also after the final one, so the last
    # epochs of training are not lost when N_EPOCHS - 1 is not a multiple of 5.
    is_last_epoch = epoch == N_EPOCHS - 1
    if epoch % 5 == 0 or is_last_epoch:
        # The reported losses are those of the last minibatch of the epoch.
        print('[Epoch {}] Loss: {}, Recon loss: {}, Latent loss: {}'.format(
            epoch, total_loss, reconstruction_loss, latent_loss))
        model.save_model(checkpoint_path, epoch)
        print ("model saved to {}".format(checkpoint_path))

print('Done!')

ResourceExhaustedError: OOM when allocating tensor with shape[256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense_1/bias/Adam/Assign = Assign[T=DT_FLOAT, _class=["loc:@dense_1/bias/Assign"], _grappler_relax_allocator_constraints=true, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/bias/Adam, dense_1/bias/Adam/Initializer/zeros)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'dense_1/bias/Adam/Assign', defined at:
  File "/home/saket/anaconda3/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/saket/anaconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/saket/anaconda3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/home/saket/anaconda3/lib/python3.5/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/home/saket/anaconda3/lib/python3.5/asyncio/base_events.py", line 1425, in _run_once
    handle._run()
  File "/home/saket/anaconda3/lib/python3.5/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 759, in _run_callback
    ret = callback()
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/saket/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/saket/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/saket/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-0a70863614fd>", line 4, in <module>
    batch_size=BATCH_SIZE)
  File "<ipython-input-2-eea8d8b17704>", line 20, in __init__
    self._create_loss_optimizer()
  File "<ipython-input-2-eea8d8b17704>", line 83, in _create_loss_optimizer
    self.optimizer =             tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 409, in minimize
    name=name)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 585, in apply_gradients
    self._create_slots(var_list)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/adam.py", line 127, in _create_slots
    self._zeros_slot(v, "m", self._name)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 1130, in _zeros_slot
    new_slot_variable = slot_creator.create_zeros_slot(var, op_name)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 181, in create_zeros_slot
    colocate_with_primary=colocate_with_primary)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 155, in create_slot_with_initializer
    dtype)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 65, in _create_slot_var
    validate_shape=validate_shape)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 1328, in get_variable
    constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 1090, in get_variable
    constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 435, in get_variable
    constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 404, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 796, in _get_single_variable
    use_resource=use_resource)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 2234, in variable
    use_resource=use_resource)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 2224, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 2207, in default_variable_creator
    constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 387, in _init_from_args
    validate_shape=validate_shape).op
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/state_ops.py", line 219, in assign
    validate_shape=validate_shape)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
    use_locking=use_locking, name=name)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3417, in create_op
    op_def=op_def)
  File "/home/saket/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1743, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense_1/bias/Adam/Assign = Assign[T=DT_FLOAT, _class=["loc:@dense_1/bias/Assign"], _grappler_relax_allocator_constraints=true, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/bias/Adam, dense_1/bias/Adam/Initializer/zeros)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
# Display the image batch most recently fetched by the training loop (the
# name is only defined once that loop has fetched at least one batch).
training_image_batch