### preset

In [25]:
conf = dict()
# Change this variable to 0 in case you want to use full dataset
conf['use_sample_only'] = 1
# Save weights
conf['save_weights'] = 0
# How many patients will be in train and validation set during training. Range: (0; 1)
conf['train_valid_fraction'] = 0.5
# Batch size for CNN [Depends on GPU and memory available]
conf['batch_size'] = 200
# Number of epochs for CNN training
conf['nb_epoch'] = 40
# Early stopping. Stop training after epochs without improving on validation
conf['patience'] = 3
# Shape of image for CNN (Larger the better, but you need to increase CNN as well)
conf['image_shape'] = (64, 64)
# Learning rate for CNN. Lower better accuracy, larger runtime.
conf['learning_rate'] = 1e-2
# Number of random samples to use during training per epoch 
conf['samples_train_per_epoch'] = 10000
# Number of random samples to use during validation per epoch
conf['samples_valid_per_epoch'] = 1000
# Some variables to control CNN structure
conf['level_1_filters'] = 4
conf['level_2_filters'] = 8
conf['dense_layer_size'] = 32
conf['dropout_value'] = 0.5


import dicom 
import os
import cv2
import numpy as np
import pandas as pd
import glob
import random
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint
np.random.seed(2016)
random.seed(2016)

from utils import *

def load_and_normalize_dicom(path, x, y):
    dicom1 = dicom.read_file(path)
    dicom_img = dicom1.pixel_array.astype(np.float64)
    mn = dicom_img.min()
    mx = dicom_img.max()
    if (mx - mn) != 0:
        dicom_img = (dicom_img - mn)/(mx - mn)
    else:
        dicom_img[:, :] = 0
    if dicom_img.shape != (x, y):
        dicom_img = cv2.resize(dicom_img, (x, y), interpolation=cv2.INTER_CUBIC)
    return dicom_img


def batch_generator_train(files, train_csv_table, batch_size):
    number_of_batches = np.ceil(len(files)/batch_size)
    counter = 0
    random.shuffle(files)
    while True:
        batch_files = files[batch_size*counter:batch_size*(counter+1)]
        image_list = []
        mask_list = []
        for f in batch_files:
            image = load_and_normalize_dicom(f, conf['image_shape'][0], conf['image_shape'][1])
            patient_id = os.path.basename(os.path.dirname(f))
            is_cancer = train_csv_table.loc[train_csv_table['id'] == patient_id]['cancer'].values[0]
            if is_cancer == 0:
                mask = [1, 0]
            else:
                mask = [0, 1]
            image_list.append([image])
            mask_list.append(mask)
        counter += 1
        image_list = np.array(image_list)
        mask_list = np.array(mask_list)
        # print(image_list.shape)
        # print(mask_list.shape)
        yield image_list, mask_list
        if counter == number_of_batches:
            random.shuffle(files)
            counter = 0


In [7]:
train_csv_table = pd.read_csv('/datadrive/kaggle_ds_bowl_17/stage1_labels.csv')
ids = train_csv_table['id'].values
print ids
#train_patients, valid_patients = get_train_single_fold(train_csv_table, conf['train_valid_fraction'])

['0015ceb851d7251b8f399e39779d1e7d' '0030a160d58723ff36d73f41b170ec21'
 '003f41c78e6acfa92430a057ac0b306e' ..., 'fe5c37e82b412833b8ad0abb57978377'
 'ff5d8e90500cf324e7b04a2f07cf0399' 'ffe02fe7d2223743f7fb455dfaff3842']


In [8]:
get_dir = 'stage1'
train_files = []
for p in ids:
    train_files += glob.glob("/datadrive/kaggle_ds_bowl_17/sample/{}/*.dcm".format(p))
print('Number of train files: {}'.format(len(train_files)))

Number of train files: 3408


In [9]:
#resize image(s) to 64X64
images=np.zeros([1,64,64])
for f in train_files:
    image = load_and_normalize_dicom(f, conf['image_shape'][0], conf['image_shape'][1])
    images=np.concatenate((images,np.expand_dims(image,axis=0)),axis=0)

In [10]:
images.shape

(3409, 64, 64)

In [44]:
train_batch=batch_generator_train(train_files,train_csv_table, 4)

In [19]:
model = Sequential([
        BatchNormalization(axis=1, input_shape=(3,224,224)),
        Flatten(),
        Dense(10, activation='softmax')
    ])

In [27]:
#define simple model
model = Sequential([
        BatchNormalization(axis=1, input_shape=(3,224,224)),
        Flatten(),
        Dense(100, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])
model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
#model.fit_generator(train_batch, 64, nb_epoch=2)

#model.optimizer.lr = 0.01

In [25]:
train_batch

<generator object batch_generator_train at 0x7f94b0fc5f00>

In [29]:
path='/datadrive/state_farm/'
batches = get_batches(path+'/train', batch_size=64)

Found 20017 images belonging to 10 classes.


In [62]:
i[0].shape

(64, 3, 224, 224)

In [53]:
for i in batches:
    print i

(array([[[[  57.,   63.,   65., ...,  167.,   15.,   54.],
         [  65.,   72.,   78., ...,  158.,   12.,   58.],
         [  78.,   82.,   90., ...,  153.,   39.,  174.],
         ..., 
         [   6.,    6.,    6., ...,   17.,   17.,   14.],
         [   6.,    6.,    6., ...,   14.,   18.,   16.],
         [   6.,    6.,    6., ...,   12.,   12.,   16.]],

        [[  70.,   76.,   78., ...,  224.,   47.,   63.],
         [  77.,   84.,   90., ...,  232.,   61.,   81.],
         [  88.,   92.,  100., ...,  241.,  100.,  212.],
         ..., 
         [   7.,    7.,    7., ...,   15.,   15.,   12.],
         [   7.,    7.,    7., ...,   12.,   16.,   14.],
         [   7.,    7.,    7., ...,   10.,   10.,   14.]],

        [[  50.,   56.,   58., ...,  241.,   60.,   72.],
         [  53.,   60.,   66., ...,  243.,   68.,   87.],
         [  63.,   67.,   75., ...,  245.,  105.,  215.],
         ..., 
         [   2.,    2.,    2., ...,   16.,   16.,   13.],
         [   2.,    2.

KeyboardInterrupt: 

In [33]:
batches1 = get_batches('/datadrive/kaggle_ds_bowl_17/sample/', batch_size=64)

Found 0 images belonging to 20 classes.


In [11]:
# Some constants 
INPUT_FOLDER = '/datadrive/kaggle_ds_bowl_17/sample/'
full_INPUT_FOLDER= '/datadrive/kaggle_ds_bowl_17/stage1/'
patients = os.listdir(INPUT_FOLDER)
all_patients = os.listdir(full_INPUT_FOLDER)
patients.sort()
all_patients.sort()

In [15]:
# Load the scans in given folder path
def load_scan(path):
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        
    for s in slices:
        s.SliceThickness = slice_thickness
        
    return slices

def get_pixels_hu(slices):
    image = np.stack([s.pixel_array for s in slices])
    # Convert to int16 (from sometimes int16), 
    # should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)

    # Set outside-of-scan pixels to 0
    # The intercept is usually -1024, so air is approximately 0
    image[image == -2000] = 0
    
    # Convert to Hounsfield units (HU)
    for slice_number in range(len(slices)):
        
        intercept = slices[slice_number].RescaleIntercept
        slope = slices[slice_number].RescaleSlope
        
        if slope != 1:
            image[slice_number] = slope * image[slice_number].astype(np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)
            
        image[slice_number] += np.int16(intercept)
    
    return np.array(image, dtype=np.int16)

In [22]:
y=np.array(train_csv_table[train_csv_table.id.isin(patients)].cancer)

In [45]:
patient_data_pixels=np.zeros([134,512,512])
for patient in patients:
    patient_data = load_scan(INPUT_FOLDER + patient)
    #patient_data_pixels.append(get_pixels_hu(patient_data))
    data=get_pixels_hu(patient_data)
    print (data.shape,patient_data_pixels.shape)
    patient_data_pixels=np.concatenate((patient_data_pixels,data),axis=0)

((134, 512, 512), (134, 512, 512))
((128, 512, 512), (268, 512, 512))
((133, 512, 512), (396, 512, 512))
((110, 512, 512), (529, 512, 512))
((203, 512, 512), (639, 512, 512))
((196, 512, 512), (842, 512, 512))
((280, 512, 512), (1038, 512, 512))
((123, 512, 512), (1318, 512, 512))
((164, 512, 512), (1441, 512, 512))
((244, 512, 512), (1605, 512, 512))
((136, 512, 512), (1849, 512, 512))
((180, 512, 512), (1985, 512, 512))
((221, 512, 512), (2165, 512, 512))
((147, 512, 512), (2386, 512, 512))


KeyboardInterrupt: 

In [23]:
len(patient_data_pixels)

20

In [33]:
(patient_data_pixels[0]).shape

(134, 512, 512)

In [28]:
model.fit(patient_data_pixels,y)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 arrays but instead got the following list of 20 arrays: [array([[[-1024, -1024, -1024, ..., -1024, -1024, -1024],
        [-1024, -1024, -1024, ..., -1024, -1024, -1024],
        [-1024, -1024, -1024, ..., -1024, -1024, -1024],
        ..., 
        [-1024...