# 3D CNN for Lung Cancer Detection

First import the libraries:

In [56]:
import tensorflow as tf
import numpy as np
import os
from random import shuffle

## Splitting the data into training and validation sets
### One-hot encoding
Perform one-hot encoding of the labels (a 1D array of integer class indices):

In [63]:
def one_hot_encode(labels, n_classes=2):
    """Convert integer class labels into a one-hot encoded matrix.

    Args:
        labels: sequence (list or array) of class indices, each in
            ``[0, n_classes)``. Nested single-element entries such as an
            ``(n, 1)`` array are flattened, so every label yields one row.
        n_classes: number of classes, i.e. the width of the result.
            Defaults to 2, which matches the previous hard-coded behavior.

    Returns:
        Float array of shape ``(number of labels, n_classes)`` holding a
        single 1 per row, in the column given by that row's label.

    Raises:
        ValueError: if the labels are not integer-valued.
        IndexError: if a label lies outside ``[0, n_classes)``.
    """
    # Flatten first: a 2D index array would broadcast against the row index
    # and silently set several columns in every row.
    indices = np.asarray(labels).reshape(-1)

    kind = indices.dtype.kind
    if kind == 'f':
        # Accept floats only when they are exact integers (e.g. 1.0).
        if not (np.isfinite(indices).all() and (indices == np.floor(indices)).all()):
            raise ValueError('labels must be integer-valued')
    elif kind not in 'iub':
        raise ValueError('labels must be integer-valued')
    indices = indices.astype(np.intp)

    # Negative indices would otherwise wrap around to the last columns.
    if indices.size and (indices.min() < 0 or indices.max() >= n_classes):
        raise IndexError('labels must lie in [0, {:d})'.format(n_classes))

    one_hot = np.zeros((indices.size, n_classes))
    one_hot[np.arange(indices.size), indices] = 1

    return one_hot

# Quick sanity check of one_hot_encode: the labels 1, 0, 1 should print as
# the rows [0, 1], [1, 0], [0, 1].
sample_labels = [1, 0, 1]
one_hot_sample_labels = one_hot_encode(sample_labels)
print(one_hot_sample_labels)

[[ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


### Saving and Loading Data

In [None]:
# Split the preprocessed patient archives into a training and a validation
# set and store each set on disk in batches of `batch_size` patients.
# Every archive is expected to provide the keys 'set', 'data' and 'label'.
preprocessed_data_folder = './sample_images_preprocessed/'
train_valid_data_folder = './data/'

train_size = 0.8 # relative size of train set
batch_size = 5 # divide set into batches to prevent machine from running out of memory
data_shape = [250, 350, 350] # not used in this cell; presumably the volume shape used later - TODO confirm


def _is_test_patient(patient_path):
    """Return True when the archive at `patient_path` is marked as test data."""
    # The context manager closes the archive's file handle again.
    with np.load(patient_path) as patient_data:
        return bool(patient_data['set'] == 'test')


def _save_batch(name_prefix, batch_id, features, labels):
    """Save one batch as a compressed archive with `features` and one-hot `labels`."""
    np.savez_compressed(train_valid_data_folder + name_prefix + str(batch_id),
                        features = np.array(features),
                        labels = one_hot_encode(labels))


# the batch files cannot be written unless the output folder exists
if not os.path.isdir(train_valid_data_folder):
    os.makedirs(train_valid_data_folder)

patients = os.listdir(preprocessed_data_folder)
shuffle(patients) # randomize the order of data

# Drop hidden files and test data BEFORE computing the split point, so that
# `train_size` is the share of the patients that are actually used.
usable_patients = [patient for patient in patients
                   if not patient.startswith('.')
                   and not _is_test_patient(preprocessed_data_folder + patient)]
n_train_target = int(len(usable_patients) * train_size)

# initialization
train_features_batch = []; train_labels_batch = []
valid_features_batch = []; valid_labels_batch = []

n_train = 0; n_train_batch = 0; n_valid = 0; n_valid_batch = 0

train_batch_id = 0
valid_batch_id = 0
for i, patient in enumerate(usable_patients):
    # read the arrays inside the `with` block so the archive is closed right away
    with np.load(preprocessed_data_folder + patient) as patient_data:
        patient_features = patient_data['data']
        patient_label = patient_data['label']

    if i < n_train_target: # train set
        train_features_batch.append(patient_features)
        train_labels_batch.append(patient_label)
        n_train += 1
        n_train_batch += 1

        # if current train batch is full, save it and start a new one
        if n_train_batch == batch_size:
            _save_batch('train_batch_', train_batch_id,
                        train_features_batch, train_labels_batch)
            train_features_batch = []; train_labels_batch = []
            train_batch_id += 1
            n_train_batch = 0

    else: # validation set
        valid_features_batch.append(patient_features)
        valid_labels_batch.append(patient_label)
        n_valid += 1
        n_valid_batch += 1

        # if current validation batch is full, save it and start a new one
        if n_valid_batch == batch_size:
            _save_batch('valid_batch_', valid_batch_id,
                        valid_features_batch, valid_labels_batch)
            valid_features_batch = []; valid_labels_batch = []
            valid_batch_id += 1
            n_valid_batch = 0

# Save the remaining (partially filled) batches. Afterwards each *_batch_id
# holds the id of the last batch written (-1 if none), so the number of
# batches is *_batch_id + 1.
if n_train_batch != 0:
    _save_batch('train_batch_', train_batch_id,
                train_features_batch, train_labels_batch)
else:
    train_batch_id -= 1

if n_valid_batch != 0:
    _save_batch('valid_batch_', valid_batch_id,
                valid_features_batch, valid_labels_batch)
else:
    valid_batch_id -= 1

print('Total data size: {:d}, Training data size: {:d}, Validation data size: {:d} '.format(
    n_train + n_valid, n_train, n_valid))
print('Number of train batches: {:d}, Number of validation batches: {:d}'.format(
    train_batch_id+1, valid_batch_id+1))