In [8]:
import os

os.listdir('.')

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'CAMELYON dataset.pdf',
 'features.csv',
 'histopathologic-cancer-detection',
 'histopathologic-cancer-detection.zip',
 'solution.ipynb']

# Split up dataset

A key finding in this project has been the size of the dataset that I have to navigate every time I want to create a batch. Since most of the processing is done using a backend written in C, it appears to be quick. However, the time taken to iterate over a list of imaged and read them into arrays may be what is making this network take so long!

Therefore, I will be using the Keras `ImageDataGenerator` class! 

It needs the dataset to be split up by classes.

In [7]:
import pandas as pd
from shutil import copyfile

df = pd.read_csv('histopathologic-cancer-detection/train_labels.csv')
cancer = df[df['label'] == 1]
noncancer = df[df['label'] == 0]

for filename in cancer['id']:
    copyfile(
        'histopathologic-cancer-detection/train/{0}'.format(filename + '.tif'),
        'histopathologic-cancer-detection/train/cancer/{0}'.format(filename + '.tif')
    )

for filename in noncancer['id']:
    copyfile(
        'histopathologic-cancer-detection/train/{0}'.format(filename + '.tif'),
        'histopathologic-cancer-detection/train/noncancer/{0}'.format(filename + '.tif')
    )

# Write a batch generator

As per the keras documentation, the `flow_from_directory` function returns a tuple of (x, y) arrays. This is consistent with the format that can be used in the model!

In [1]:
import keras

image_generator = keras.preprocessing.image.ImageDataGenerator()

batch_generator = image_generator.flow_from_directory(
    'histopathologic-cancer-detection/split_dataset/',
    target_size=(96, 96),
    class_mode='binary',
    batch_size=64
)

Using TensorFlow backend.


Found 220025 images belonging to 2 classes.


# Define a model

I will attempt to define a model from scratch and train it on my GPU. It seems to perform quite well against the GPU benchmarks written in the keras examples so I have confidence. This message would not exist for long if I was not successful.

In [2]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import BatchNormalization, Conv2D, MaxPooling2D
from keras import backend as K
from keras.backend import var, sum, min, max

import imageio
import numpy as np
import os
import pandas as pd

model = Sequential()
# Convolution layer 1
model.add(
    Conv2D(
        64,
        kernel_size=(5, 5),
        input_shape=(96, 96, 3),
        padding='same',
        activation='relu'
    )
)
model.add(BatchNormalization())

# Convolution layer 2
model.add(Conv2D(64, (5, 5), padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2))) # 48

# Convolution layer 3
model.add(Conv2D(64, (5, 5), padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2))) # 24

# Convolution layer 4
model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2))) # 12

# Convolution layer 5
model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2))) # 6



# FC
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy']
)

model.fit_generator(
    batch_generator,
    steps_per_epoch=1000, 
    epochs=7
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1a97ed4d0f0>

# Testing the dataset

In [7]:
import imageio
import numpy as np
import os
from tqdm import tqdm

image_files = np.asarray(
    os.listdir('histopathologic-cancer-detection/test/')
)

dataset = []
for image_file in tqdm(image_files):
    hash_id = image_file.split('.')[0]
    prediction = model.predict(
        imageio.imread('histopathologic-cancer-detection/test/{0}'.format(image_file)).reshape(1, 96, 96, 3)
    )[0][0]
    dataset.append([hash_id, prediction])

100%|██████████| 57458/57458 [06:30<00:00, 147.27it/s]


In [8]:
df = pd.DataFrame(dataset, columns=['id', 'label'])
df.to_csv('solution.csv', index=False)