In [1]:
%pylab inline
import pandas as pd
import tensorflow as tf
import glob
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
from tqdm import tqdm_notebook
from multiprocessing import Pool
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

Populating the interactive namespace from numpy and matplotlib


In [2]:

# Parameters
num_steps = 500 # Iteration count used by the training loops
batch_size = 1024 # The number of samples per batch
num_classes = 2 # Two label values (0 and 1)
num_features = 46 # Number of feature columns per row
num_trees = 100
max_nodes = 1000


In [3]:
# Placeholders fed by the training loop: one row of `num_features` floats per sample
X = tf.placeholder(tf.float32, shape=[None, num_features])
# For random forest, labels must be integers (the class id)
Y = tf.placeholder(tf.int32, shape=[None])

# Random Forest Parameters (taken from the parameters cell)
hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                      num_features=num_features,
                                      num_trees=num_trees,
                                      max_nodes=max_nodes).fill()

# Build the Random Forest
forest_graph = tensor_forest.RandomForestGraphs(hparams)
# Get training graph and loss
train_op = forest_graph.training_graph(X, Y)
loss_op = forest_graph.training_loss(X, Y)

# Measure the accuracy: the arg-max over axis 1 of the inference output is compared with Y
infer_op, _, _ = forest_graph.inference_graph(X)
correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initialize the variables (i.e. assign their default value) and forest resources
init_vars = tf.group(tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))

# Start TensorFlow session
# NOTE(review): this session is never closed in the notebook; later cells rebind `sess`.
sess = tf.Session()

# Run the initializer
sess.run(init_vars)

INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_classes': 2, 'num_splits_to_consider': 10, 'bagged_features': None, 'base_random_seed': 0, 'max_fertile_nodes': 0, 'split_name': 'less_or_equal', 'checkpoint_stats': False, 'regression': False, 'inference_tree_paths': False, 'finish_type': 0, 'split_after_samples': 250, 'pruning_type': 0, 'max_nodes': 1000, 'split_finish_name': 'basic', 'num_output_columns': 3, 'stats_model_type': 0, 'num_outputs': 1, 'split_type': 0, 'bagged_num_features': 46, 'num_trees': 10, 'num_features': 46, 'early_finish_check_every_samples': 0, 'dominate_method': 'bootstrap', 'use_running_stats_method': False, 'param_file': None, 'bagging_fraction': 1.0, 'valid_leaf_threshold': 1, 'initialize_average_splits': False, 'collate_examples': False, 'leaf_model_type': 0, 'prune_every_samples': 0, 'feature_bagging_fraction': 1.0, 'model_name': 'all_dense', 'dominate_fraction': 0.99, 'split_pruning_name': 'none'}


In [4]:
# Load the feature table; its column names are kept as FEATURE_KEYS and
# `_parse_csv` uses their count as the number of feature columns.
df = pd.read_table('normal_tumor_segmented_df.tsv')
FEATURE_KEYS = df.columns
FEATURE_KEYS

Index(['area', 'bbox_area', 'compactness', 'convex_area', 'eccentricity',
       'equivalent_diameter', 'extent', 'fractal_dimension',
       'inertia_tensor_eigvals_1', 'inertia_tensor_eigvals_2',
       'major_axis_length', 'max_intensity', 'mean_intensity',
       'mean_intensity_entire_image', 'minor_axis_length', 'moments_central_1',
       'moments_central_10', 'moments_central_11', 'moments_central_12',
       'moments_central_13', 'moments_central_14', 'moments_central_15',
       'moments_central_16', 'moments_central_2', 'moments_central_3',
       'moments_central_4', 'moments_central_5', 'moments_central_6',
       'moments_central_7', 'moments_central_8', 'moments_central_9',
       'moments_hu_1', 'moments_hu_2', 'moments_hu_3', 'moments_hu_4',
       'moments_hu_5', 'moments_hu_6', 'moments_hu_7', 'nuclei',
       'nuclei_intensity_over_entire_image', 'orientation', 'perimeter',
       'solidity', 'texture', 'total_nuclei_area', 'total_nuclei_area_ratio'],
      dtype='o

In [None]:
# Rebinds `df` to the labelled CSV (FEATURE_KEYS is not updated by this).
# NOTE(review): this file is only written by a `to_csv` cell further down the notebook.
df = pd.read_csv('normal_tumor_segmented_df_with_label.csv')

In [None]:
# Distinct values present in the `label` column
df.label.unique()

In [5]:
def _parse_csv(rows_string_tensor):
    """Decode comma-separated text into a (features, label) pair.

    Args:
        rows_string_tensor: string tensor holding CSV text; `input_fn` maps
            this function over the individual lines of the input file.

    Returns:
        Tuple `(features, label)`: `features` stacks every decoded column
        except the last (float defaults), `label` is the last column
        (integer default).
    """
    # One float default per feature column plus an integer default for the
    # trailing label column; the column count comes from the global FEATURE_KEYS.
    num_features = len(FEATURE_KEYS)
    columns = tf.decode_csv(rows_string_tensor,
                            record_defaults=[[0.0]] * num_features + [[0]],
                            field_delim=',')
    return tf.stack(columns[:-1]), columns[-1]

def input_fn(file_names, batch_size):
    """Build a re-initialisable, batched input pipeline over CSV text files.

    Args:
        file_names: file path(s) handed to `tf.data.TextLineDataset`.
        batch_size: number of parsed rows per batch.

    Returns:
        Tuple `(init_op, next_batch)`: run `init_op` to (re)start iteration,
        then evaluate `next_batch` repeatedly to get parsed batches.
    """
    # Drop the first line read (the header), parse each remaining line, then batch.
    batched = (tf.data.TextLineDataset(file_names)
               .skip(1)
               .map(_parse_csv)
               .batch(batch_size))
    # A structure-based iterator can be re-initialised on the dataset at will.
    reinit_iterator = tf.data.Iterator.from_structure(batched.output_types,
                                                      batched.output_shapes)
    batch_tensors = reinit_iterator.get_next()
    return reinit_iterator.make_initializer(batched), batch_tensors

In [6]:
# Re-initialisable pipeline over the labelled CSV. The batch size comes from the
# parameters cell instead of repeating the literal 1024 here.
training_init_op, training_next_batch = input_fn(['normal_tumor_segmented_df_with_label.csv'],
                                                 batch_size)


In [8]:
# Train for `num_steps` full passes over the input file.
for epoch in range(num_steps):
    # Rewind the input pipeline to the start of the data for this pass.
    sess.run(training_init_op)
    l = None  # stays None if this pass yields no batch at all
    while True:
        try:
            training_features_batch, training_label_batch = sess.run(training_next_batch)
        except tf.errors.OutOfRangeError:
            # Input exhausted: this pass is complete.
            break
        _, l = sess.run([train_op, loss_op],
                        feed_dict={X: training_features_batch,
                                   Y: training_label_batch})
    if l is None:
        # Without this guard the accuracy/print lines below would fail with a
        # NameError on the batch variables, hiding the real cause.
        raise ValueError('Training input produced no batches; '
                         'check the file passed to input_fn.')
    # Accuracy is measured on the LAST batch of the pass only, so it reflects
    # whatever rows happen to sit at the end of the file.
    acc = sess.run(accuracy_op,
                   feed_dict={X: training_features_batch,
                              Y: training_label_batch})
    print('Step %i, Loss: %f, Acc: %f' % (epoch, l, acc))

Step 0, Loss: -3.000000, Acc: 0.000000


KeyboardInterrupt: 

In [1]:
# Directories holding the per-patch segmentation feature tables (.tsv).
# NOTE(review): the two "training" directories below are the same strings as the
# test_* directories (all four point at *_patches_test_segmented), so the train
# and test globs list the same files — confirm the intended training paths.
normal_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test_segmented/level_0/'
tumor_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/tumor_patches_test_segmented/level_0/'
test_tumor_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/tumor_patches_test_segmented/level_0/'
test_normal_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test_segmented/level_0/'

# Every .tsv file directly inside each directory
normal_segmented_tsv = glob.glob(normal_segmented_tsv_dir+'/*.tsv')
tumor_segmented_tsv = glob.glob(tumor_segmented_tsv_dir+'/*.tsv')


test_normal_segmented_tsv = glob.glob(test_normal_segmented_tsv_dir+'/*.tsv')
test_tumor_segmented_tsv = glob.glob(test_tumor_segmented_tsv_dir+'/*.tsv')


NameError: name 'glob' is not defined

In [None]:
def _parse_csv(rows_string_tensor):
    """Decode tab-separated text into a stacked float feature tensor.

    Unlike the comma-separated variant defined earlier in the notebook, this
    version expects no label column: every field is a feature.

    Args:
        rows_string_tensor: string tensor holding tab-separated text;
            `input_fn` maps this function over individual lines.

    Returns:
        The decoded columns stacked into a single tensor, one entry per
        name in the global FEATURE_KEYS.
    """
    num_columns = len(FEATURE_KEYS)
    columns = tf.decode_csv(rows_string_tensor,
                            record_defaults=[[0.0]] * num_columns,
                            field_delim='\t')
    return tf.stack(columns)

def input_fn(file_names, batch_size):
    """Build a one-shot batched pipeline of (features, label) over feature files.

    Each file's label is 0 when its path contains 'normal' and 1 otherwise;
    that label is attached to every data row of the file.

    Args:
        file_names: list of paths to text files, each starting with a header line.
        batch_size: number of rows per batch.

    Returns:
        Tuple `(features, labels)` of tensors produced by a one-shot iterator;
        evaluating them yields successive batches until the data is exhausted.
    """
    file_labels = [0 if 'normal' in x else 1 for x in file_names]

    def _labelled_rows(file_name, file_label):
        # skip(1) is applied per file, so every file's header line is dropped
        # (skipping on the concatenated dataset only removes the first header).
        rows = tf.data.TextLineDataset(file_name).skip(1).map(_parse_csv)
        # zip stops at the shorter input, so the endlessly repeated label is
        # cut to exactly one label per row of this file.
        return tf.data.Dataset.zip(
            (rows, tf.data.Dataset.from_tensors(file_label).repeat()))

    # Explicit dtypes keep an empty `file_names` list from defaulting to float32.
    files = tf.data.Dataset.from_tensor_slices(
        (tf.constant(file_names, dtype=tf.string),
         tf.constant(file_labels, dtype=tf.int32)))
    dataset = files.flat_map(_labelled_rows).batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels
    



In [None]:
# The `_parse_csv` in effect here is tab-delimited and has no label column, so
# the pipeline is fed the per-patch .tsv files rather than the comma-separated
# labelled CSV (which that parser cannot decode).
features, labels = input_fn(normal_segmented_tsv + tumor_segmented_tsv,
                            1024)

In [None]:
# Inspect the iterator initializer op bound by an earlier cell
training_init_op

In [None]:
# NOTE(review): this cell repeats the forest construction from an earlier cell and
# rebinds X, Y, the ops and `sess`; the previously bound session is not closed here.
# Placeholders for a batch of feature rows and their integer class ids
X = tf.placeholder(tf.float32, shape=[None, num_features])
Y = tf.placeholder(tf.int32, shape=[None])


# Forest hyper-parameters taken from the parameters cell
hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                      num_features=num_features,
                                      num_trees=num_trees,
                                      max_nodes=max_nodes).fill()


# Build the forest, then its training op and loss
forest_graph = tensor_forest.RandomForestGraphs(hparams)

train_op = forest_graph.training_graph(X, Y)
loss_op = forest_graph.training_loss(X, Y)


# Accuracy: arg-max over axis 1 of the inference output compared with Y
infer_op, _, _ = forest_graph.inference_graph(X)
correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


# Single op that initialises global variables and the shared forest resources
init_vars = tf.group(tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))

# Start TensorFlow session
sess = tf.Session()

# Run the initializer
sess.run(init_vars)

In [None]:
def load_df(path):
    """Read one tab-separated feature file.

    Args:
        path: path of the file to read with `pd.read_table`.

    Returns:
        The loaded DataFrame, or None when the file contains no data rows.
    """
    temp_df = pd.read_table(path)
    if len(temp_df.index):
        return temp_df
    return None


# Load every normal-patch file in parallel. Frames are collected in a list and
# concatenated once, instead of re-copying a growing DataFrame per file.
normal_frames = []
with tqdm_notebook(total=len(normal_segmented_tsv)) as pbar:
    with Pool(processes=32) as p:
        for temp_df in p.imap_unordered(load_df, normal_segmented_tsv):
            if temp_df is not None:
                normal_frames.append(temp_df)
            pbar.update()

df = pd.concat(normal_frames) if normal_frames else pd.DataFrame()
# One label per ROW (not per file), so `y` always has the same length as `df`
# even when a file contributes more than one row.
y = [0] * len(df.index)

    

In [None]:
# Load every tumor-patch file in parallel and append its rows to `df`.
# NOTE: re-running this cell appends the tumor rows again (it extends `df` and `y`).
tumor_frames = []
with tqdm_notebook(total=len(tumor_segmented_tsv)) as pbar:
    with Pool(processes=32) as p:
        for temp_df in p.imap_unordered(load_df, tumor_segmented_tsv):
            if temp_df is not None:
                tumor_frames.append(temp_df)
            pbar.update()

# Single concatenation instead of one copy of the growing frame per file.
df = pd.concat([df] + tumor_frames)
# One label per ROW (not per file), so `y` stays aligned with `df`.
y.extend([1] * sum(len(frame.index) for frame in tumor_frames))


In [None]:
# Total number of rows loaded so far
len(df.index)

In [None]:
# Attach the labels in `y` as a new `label` column on a copy of `df`, then
# discard every row that contains a missing value.
df_with_label = df.assign(label=y).dropna()

In [None]:
# Preview the first rows of the labelled table
df_with_label.head()

In [None]:
# Row count after rows with missing values were dropped
len(df_with_label.index)

In [None]:
# Persist the labelled table (header row, no index column); other cells read this file back by name.
df_with_label.to_csv('normal_tumor_segmented_df_with_label.csv', header=True, index=False)

In [None]:
# Dataset of (file path, label) pairs: label 0 when the path contains 'normal', else 1.
# NOTE: this rebinds the global `batch_size` for every later cell that reads it.
batch_size = 32
filenames = normal_segmented_tsv + tumor_segmented_tsv
# Named `file_labels` so the `labels` tensor bound by the earlier input_fn call
# (and fetched by the following cell) is not overwritten with a Python list.
file_labels = [0 if 'normal' in x else 1 for x in filenames]
training_dataset = tf.data.Dataset.from_tensor_slices((tf.constant(filenames), tf.constant(file_labels)))
training_dataset = training_dataset.batch(batch_size)

# Re-initialisable iterator: run `training_init_op` to restart iteration over the dataset.
training_iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                                    training_dataset.output_shapes)
training_next_batch = training_iterator.get_next()
training_init_op = training_iterator.make_initializer(training_dataset)

In [None]:
# Preview a few batches from the one-shot `features` / `labels` tensors.
# The original cell read `nof_examples` and `next_element` without any visible
# definition; the limit is now set here and the stray fetch is removed.
nof_examples = 5
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    while nof_examples > 0:
        nof_examples -= 1
        try:
            data_features, data_labels = sess.run([features, labels])
        except tf.errors.OutOfRangeError:
            # Input exhausted: every further fetch would fail the same way.
            break
        print(data_features)

In [None]:
def read_my_csv(filename_queue, num_columns=46):
    """Read one tab-separated record from a filename queue as a stacked tensor.

    Args:
        filename_queue: queue of file names consumed by a `tf.TextLineReader`
            configured to skip one header line per file.
        num_columns: number of float columns expected in each record
            (defaults to 46, the value previously hard-coded here).

    Returns:
        Tensor stacking the decoded columns of the record that was read.
    """
    reader = tf.TextLineReader(skip_header_lines=1)
    # The reader also returns a key for the record; only the line text is used.
    _, value = reader.read(filename_queue)
    record_defaults = [[1.0]] * num_columns
    decoded = tf.decode_csv(value, record_defaults=record_defaults, field_delim='\t')
    return tf.stack(decoded)

def input_pipeline(filenames, batch_size, num_epochs=None):
    """Build queue-based batches of (example, label) from tab-separated files.

    Args:
        filenames: list of file paths; a path containing 'normal' gets label 0,
            any other path gets label 1.
        batch_size: number of examples per batch.
        num_epochs: forwarded to `tf.train.string_input_producer`.

    Returns:
        Tuple `(example_batch, label_batch)` produced by `tf.train.batch`.
    """
    filename_queue = tf.train.string_input_producer(
      filenames, num_epochs=num_epochs, shuffle=True)
    # One label per FILE, in the order of `filenames`.
    # NOTE(review): the filename queue is shuffled while this list is not, and
    # examples are read per line, so labels may not correspond to examples — confirm.
    labels = [0 if 'normal' in x else 1 for x in filenames]
    #labels_queue = tf.train.string_input_producer(
    #    labels, num_epochs=num_epochs, shuffle=True)
    label_fifo = tf.FIFOQueue(len(filenames),tf.int32, shapes=[[]])
    lv = tf.constant(labels)

    # NOTE(review): this enqueue op is created but never run or registered in this
    # function, so the `dequeue()` below presumably has nothing to read — confirm.
    label_enqueue = label_fifo.enqueue_many([lv])
    
    example = read_my_csv(filename_queue)
    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    # Here min_after_dequeue only contributes to `capacity`; it is not passed to tf.train.batch.
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.batch(
      [example, label_fifo.dequeue()], batch_size=batch_size, capacity=capacity)
      #min_after_dequeue=min_after_dequeue)

    

    return example_batch, label_batch

    

In [None]:
# Queue-based batches over all normal and tumor files; the third argument is num_epochs
dataset_batch, dataset_label = input_pipeline(normal_segmented_tsv+tumor_segmented_tsv, batch_size, 10)

In [None]:
# Inspect what kind of object the pipeline returned for the example batch
type(dataset_batch)

In [None]:
# Disabled draft: the whole cell is a string literal, so nothing below is defined.
# NOTE(review): the draft refers to `COLUMNS`, which is not defined in the visible notebook.
"""
def read_row(csv_row):
    record_defaults = [[0.0]]*len(COLUMNS)
    row = tf.decode_csv(csv_row, record_defaults=record_defaults)
    return row

def input_pipeline(filenames, batch_size):
    # Define a `tf.contrib.data.Dataset` for iterating over one epoch of the data.
    dataset = (tf.contrib.data.TextLineDataset(filenames)
               .skip(1)
               .map(lambda line: read_row(line))
               .shuffle(buffer_size=10)  # Equivalent to min_after_dequeue=10.
               .batch(batch_size))
    return dataset
"""

In [None]:
# NOTE(review): the active `input_pipeline` returns an (example_batch, label_batch)
# tuple, not a Dataset, so `.make_initializable_iterator()` is not available on it;
# the Dataset-returning variant only exists inside a string literal — confirm intent.
dataset = input_pipeline(normal_segmented_tsv+tumor_segmented_tsv, batch_size)
iterator = dataset.make_initializable_iterator()
features, labels = iterator.get_next()

In [None]:
num_steps = 500 # Iteration count used by the training loops
batch_size = 1024 # The number of samples per batch
num_classes = 2 # Two label values (0 and 1)
num_features = 46 # Number of feature columns per row
num_trees = 10
max_nodes = 1000

In [None]:
# NOTE(review): third copy of the forest construction in this notebook; it rebinds
# X, Y, the ops and `sess` using the parameter values from the cell just above.
X = tf.placeholder(tf.float32, shape=[None, num_features])
# For random forest, labels must be integers (the class id)
Y = tf.placeholder(tf.int32, shape=[None])

# Random Forest Parameters
hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                      num_features=num_features,
                                      num_trees=num_trees,
                                      max_nodes=max_nodes).fill()

# Build the Random Forest
forest_graph = tensor_forest.RandomForestGraphs(hparams)
# Get training graph and loss
train_op = forest_graph.training_graph(X, Y)
loss_op = forest_graph.training_loss(X, Y)

# Measure the accuracy: arg-max over axis 1 of the inference output compared with Y
infer_op, _, _ = forest_graph.inference_graph(X)
correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initialize the variables (i.e. assign their default value) and forest resources
init_vars = tf.group(tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))

# Start TensorFlow session
sess = tf.Session()

# Run the initializer
sess.run(init_vars)

In [None]:
# Re-initialisable iterator over `training_dataset`; this repeats the construction
# already done in the cell that defines `training_dataset` and rebinds the same names.
training_iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                                    training_dataset.output_shapes)
training_next_batch = training_iterator.get_next()

training_init_op = training_iterator.make_initializer(training_dataset)

In [None]:
# Training
for i in range(1, num_steps + 1):
    # Prepare Data
    # Get the next batch of MNIST data (only images are needed, not labels)
    batch_x, batch_y = mnist.train.next_batch(batch_size)
    _, l = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
    if i % 50 == 0 or i == 1:
        acc = sess.run(accuracy_op, feed_dict={X: batch_x, Y: batch_y})
        print('Step %i, Loss: %f, Acc: %f' % (i, l, acc))

# Test Model
test_x, test_y = mnist.test.images, mnist.test.labels
print("Test Accuracy:", sess.run(accuracy_op, feed_dict={X: test_x, Y: test_y}))

In [None]:
# Queue of all normal + tumor file names, read line by line (one header line skipped per file)
filename_queue = tf.train.string_input_producer(normal_segmented_tsv+tumor_segmented_tsv)

reader = tf.TextLineReader(skip_header_lines=1)
key, value = reader.read(filename_queue)
# Default values, in case of empty columns. Also specifies the type of the
# decoded result.
record_defaults = [[1.0]]*46

# One label per FILE: 0 for each normal file, then 1 for each tumor file
labels = [0]* len(normal_segmented_tsv) + [1] * len(tumor_segmented_tsv)

# Decode one tab-separated record into its columns and stack them
decoded = tf.decode_csv(value, record_defaults = record_defaults, field_delim='\t')  
stacked_cols = tf.stack(decoded) 

# The queue runners are started and then asked to stop straight away; no tensor is
# evaluated in between (the only fetch is commented out).
with tf.Session() as session:
    coordinator = tf.train.Coordinator()
    tf.train.start_queue_runners(session, coord=coordinator)
    #print (session.run(name))
    coordinator.request_stop()
    coordinator.join()


In [None]:
# NOTE(review): `labels` holds one entry per file while `decoded` comes from a single
# decoded record — confirm these two can be sliced together along a common first dimension.
dataset1 = tf.data.Dataset.from_tensor_slices((decoded, labels))


In [None]:
# One float default per entry of COLUMNS (COLUMNS must be defined before this cell runs)
FIELD_DEFAULTS = [[0.0]]*len(COLUMNS)
def _parse_line(line):
    """Split one CSV line into a feature mapping and its label.

    Args:
        line: string tensor holding one CSV record.

    Returns:
        Tuple `(features, label)`: `features` maps each name in COLUMNS
        except 'label' to its decoded field; `label` is the 'label' field.
    """
    decoded = tf.decode_csv(line, FIELD_DEFAULTS)
    # Pair every column name with its decoded field, then pull the label out
    by_name = dict(zip(COLUMNS, decoded))
    target = by_name.pop('label')
    return by_name, target