# Histopathologic Cancer Detection : Image classification

**Description:** In this competition, you must create an algorithm to identify metastatic cancer in small image patches taken from larger digital pathology scans. The data for this competition is a slightly modified version of the PatchCamelyon (PCam) benchmark dataset (the original PCam dataset contains duplicate images due to its probabilistic sampling, however, the version presented on Kaggle does not contain duplicates).

## Setting up the Environment

In [None]:
# Libraries
import pandas as pd                     # data processing
import numpy as np                      # linear algebra; asarray, save, load
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.image import imread
from tqdm import tqdm_notebook

import os, warnings, random, time, multiprocessing, pickle


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import BatchNormalization

# Expose only CUDA device 0 to TensorFlow; use the commented lines instead for CPU runs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # Set for GPU use
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"          # Set for CPU use
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"     # Set for CPU use

device_name = tf.test.gpu_device_name()

# Report which hardware is available: GPU details when the first GPU is found,
# otherwise the number of CPU cores.
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
    physical_devices = tf.config.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(physical_devices))
#    tf.config.experimental.set_memory_growth(physical_devices[0], True)
#    tf.debugging.set_log_device_placement(False)
else:
    print('GPU device not found')
    workers = multiprocessing.cpu_count()
    print('You have %d Cores' % workers)

In [None]:
# Setting Variables
# Notebook-wide display settings; note that the first line hides every Python warning.
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 30                     # cap the number of rows pandas prints
pd.options.display.float_format = "{:.2f}".format    # print floats with two decimals
%matplotlib inline
mpl.style.use('ggplot')

In [None]:
# Dataset root plus the train/test image folders derived from it.
DB_PATH = r'../input/histopathologic-cancer-detection/'
DIR = ['train/', 'test/']
TRAIN_DIR, TEST_DIR = (DB_PATH + sub_dir for sub_dir in DIR)

## Load Dataframes

In [None]:
# Read every column as a string and turn each id into its image file name.
train_df = pd.read_csv(DB_PATH + 'train_labels.csv', dtype=str)
train_df['id'] = train_df['id'] + '.tif'

In [None]:
# Size of the label table and its first rows
print('Train Shape: ' , train_df.shape)
train_df.head()

In [None]:
# Share of each label value (value counts divided by the row count), shown as a single row
(train_df.label.value_counts() / len(train_df)).to_frame().sort_index().T

In [None]:
# Column dtypes of the label table
train_df.dtypes

## Preprocessing and Generate the Dataset

### Preprocessing

In [None]:
# Check for image string
# Peek at the first buffered bytes of one training image. The context manager
# closes the file handle again (it was previously left open).

img = TRAIN_DIR + train_df.id[5]
with open(img, "rb") as fobj:
    header = fobj.peek(10)
header

In [None]:
# ## Checking for Corrupted images

# num_skipped = 0
# for folder_name in DIR:
#     folder_path = os.path.join(DB_PATH, folder_name)
#     for fname in os.listdir(folder_path):
#         fpath = os.path.join(folder_path, fname)
#         try:
#             fobj = open(fpath, "rb")
#             is_jfif = tf.compat.as_bytes('tif') in fobj.peek(10)   # check for 'tif' string
#         finally:
#             fobj.close()

#         if not is_jfif:
#             num_skipped += 1
#             # Delete corrupted image
#             print (fpath)
#             # os.remove(fpath)

# print("%d Corrupted images" % num_skipped)

In [None]:
# # Check for any completely black or white images

# dark_th = 10 / 255                             # If no pixel reaches this threshold, image is considered too dark
# bright_th = 245 / 255                          # If no pixel is under this threshold, image is considered too bright
# too_dark_idx = []
# too_bright_idx = []

# x_tot = np.zeros(3)
# x2_tot = np.zeros(3)
# counted_ones = 0

# for i, idx in tqdm_notebook(enumerate(train_df['id']), 'Computing...(220.025 total files)'):
#     path = os.path.join(TRAIN_DIR, idx)
#     imagearray = imread(path).reshape(-1,3)
    
#     if((imagearray.max() / 255) < dark_th):            # is this too dark
#         too_dark_idx.append(idx)
#         continue                                       # do not include in statistics
    
#     if((imagearray.min() / 255) > bright_th):          # is this too bright
#         too_bright_idx.append(idx)
#         continue                                       # do not include in statistics

# print('There was {0} extremely dark image'.format(len(too_dark_idx)))
# print('and {0} extremely bright images'.format(len(too_bright_idx)))
# print('Dark one:')
# print(too_dark_idx)
# print('Bright ones:')
# print(too_bright_idx)

In [None]:
# unusable = too_dark_idx + too_bright_idx

# Hard-coded list of image files to exclude (presumably the result of the
# commented-out dark/bright scan; verify if the dataset changes).
unusable = ['9369c7278ec8bcc6c880d99194de09fc2bd4efbe.tif', '9071b424ec2e84deeb59b54d2450a6d0172cf701.tif',
            'f6f1d771d14f7129a6c3ac2c220d90992c30c10b.tif', '5f30d325d895d873d3e72a82ffc0101c45cba4a8.tif',
            '54df3640d17119486e5c5f98019d2a92736feabc.tif', '5a268c0241b8510465cb002c4452d63fec71028a.tif',
            'c448cd6574108cf14514ad5bc27c0b2c97fc1a83.tif']

# Show the excluded images side by side.
plt.figure(figsize=(10,10))
for i, n in enumerate(unusable):
    img = imread(TRAIN_DIR + n)
    plt.subplot(6,6,i+1)
    plt.imshow(img)
    plt.axis('off')
# Lay the grid out once, after all subplots exist, instead of on every iteration.
plt.tight_layout()
plt.show()

In [None]:
### Remove corrupted or unusable

# Drop all listed files with one membership test over the id column instead of
# one full-column scan and in-place drop per file.
train_df.drop(train_df[train_df['id'].isin(unusable)].index, inplace=True)
for n in unusable:
    print ('Deleting image ', n)

In [None]:
# plot few Train images

# Draw 18 random rows and show each image with its class written above it.
sample = train_df.sample(n=18).reset_index()
plt.figure(figsize=(10,10))
plt.suptitle('Histopathologic scans of lymph node sections',fontsize=16)
for position, (file_name, label) in enumerate(zip(sample['id'], sample['label']), start=1):
    img = imread(TRAIN_DIR + file_name)

    plt.subplot(6,6,position)
    plt.imshow(img)
    plt.text(0, -5, f'Class {label}', color='k')
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(2,5, figsize=(20,8))
fig.suptitle('Histopathologic scans of lymph node sections',fontsize=20)

# One grid row per class: (row index, label value, y-axis caption).
class_rows = ((0, '0', 'Negative samples'),
              (1, '1', 'Positive samples'))

for grid_row, label_value, caption in class_rows:
    # First five file names carrying this label
    first_ids = train_df[train_df['label'] == label_value]['id'][:5]
    for column, file_name in enumerate(first_ids):
        img = imread(os.path.join(TRAIN_DIR, file_name))
        ax[grid_row, column].imshow(img)
    ax[grid_row, 0].set_ylabel(caption, size='large')

In [None]:
# First rows of the label table that will feed the data generators
train_df.head()

### Data Generators

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Input-pipeline constants
SEED = 1337            # seed handed to the data generators
SPLIT_SIZE = 0.2       # fraction of the training table held out for validation
BATCH_SIZE = 32        # images per batch
IMAGE_SIZE = (96, 96)  # target (height, width) the generators resize images to

In [None]:
# A single generator rescales pixels by 1/255 and reserves SPLIT_SIZE of the
# rows for validation.
datagen = ImageDataGenerator(rescale=1./255., validation_split=SPLIT_SIZE)

# Settings shared by the training and validation iterators; only `subset` differs.
flow_options = dict(
    dataframe   = train_df,
    directory   = TRAIN_DIR,
    color_mode  = 'rgb',
    x_col       = 'id',
    y_col       = 'label',
    batch_size  = BATCH_SIZE,
    seed        = SEED,
    shuffle     = True,
    class_mode  = 'binary',
    target_size = IMAGE_SIZE)

train_gen = datagen.flow_from_dataframe(subset='training', **flow_options)

valid_gen = datagen.flow_from_dataframe(subset='validation', **flow_options)

In [None]:
# len() of a Keras data iterator is already the number of batches per epoch.
# Dividing it by BATCH_SIZE a second time would make each epoch cover only
# about 1/BATCH_SIZE of the data, and would turn the step count into a float.
train_steps = len(train_gen)
val_steps = len(valid_gen)

print('Steps:')
print('Train: %d | Validation: %d ' %(train_steps, val_steps))

## Building a Model

In [None]:
# Model Libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dropout, Flatten, Conv2D, MaxPooling2D, Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Variables

# Convolution / pooling geometry and the input shape used by all models
kernel_size     = (3,3)
pool_size       = (2,2)
input_shape     = (96,96,3)

# Filter counts per convolutional stage
first_filters   = 32
second_filters  = 64
third_filters   = 128
fouth_filters   = 256                                        # name kept as spelled for compatibility

# Dropout rates after the convolutional stages and after the dense layer
dropout_conv = 0.3
dropout_dense = 0.3

# The models are compiled with the metric name 'accuracy', so the validation
# value is logged as 'val_accuracy'. A monitor called 'val_acc' is never logged
# and would leave both callbacks with nothing to act on.
early_stopping  = EarlyStopping(
        monitor = 'val_accuracy',
      min_delta = 0.001,
       patience = 5,
        verbose = 1,
           mode = 'max')

reduce_lr    = ReduceLROnPlateau(
    monitor  ='val_accuracy',
    factor   = 0.5,
    patience = 2,
    verbose  = 1,
    mode     = 'max',
    min_lr   = 0.00001)

# NOTE: the list only takes effect when it is passed as fit(..., callbacks=callbacks).
callbacks       = [early_stopping, reduce_lr]

optimizer       = Adam(learning_rate=0.0001)                 # SGD(lr=0.001, momentum=0.9), Adam, RMSprop
loss            = 'binary_crossentropy'                      # 'categorical_crossentropy', 'binary_crossentropy'
metric          = 'accuracy'
activation      = 'sigmoid'                                  # 'sigmoid'; 'softmax'
epochs          = 10
val_split       = 0.2

In [None]:
#####################  Plot Loss Curves  #####################

def Plot_Train(hlist, start=1):
    """Plot the per-epoch curves of one or more consecutive training runs.

    Parameters
    ----------
    hlist : list
        History objects (each exposing a ``.history`` dict of per-epoch lists).
        The runs are concatenated in order; the keys of the first entry decide
        what is plotted.
    start : int, default 1
        1-based epoch at which the curves begin.

    One panel is drawn per training metric, i.e. per key without the ``val_``
    prefix. The validation curve is added only when the matching ``val_<key>``
    entry exists, so histories recorded without validation data, or with extra
    keys that have no validation twin, plot without a KeyError.
    """
    if not hlist:
        return  # nothing to plot

    # Concatenate the per-epoch lists of all runs, key by key.
    history = {
        k: [value for h in hlist for value in h.history[k]]
        for k in hlist[0].history.keys()
    }

    epoch_range = range(start, len(history['loss']) + 1)
    s           = slice(start-1, None)
    metrics     = [k for k in history if not k.startswith('val_')]

    plt.figure(figsize=[14,4])
    for i, k in enumerate(metrics):
        plt.subplot(1, len(metrics), i+1)
        plt.plot(epoch_range, history[k][s], label='Training')
        if 'val_' + k in history:
            plt.plot(epoch_range, history['val_' + k][s], label='Validation')
        plt.xlabel('Epoch'); plt.ylabel(k); plt.title(k)
        plt.grid()
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
### Building the Base Model

def model_vgg16(input_shape, activation):
    """Return a frozen ImageNet VGG16 backbone topped with a one-unit classifier.

    input_shape : shape of one input image, passed on to VGG16.
    activation  : activation of the final single-unit Dense layer.

    The backbone layers are marked non-trainable; the global-average-pooling
    and Dense head added here stay trainable.
    """
    backbone = tf.keras.applications.vgg16.VGG16(
        weights     = 'imagenet',
        include_top = False,
        input_shape = input_shape)
    backbone.trainable = False

    # Pool the last backbone feature map and map it to a single output unit.
    features   = backbone.layers[-1].output
    pooled     = layers.GlobalAveragePooling2D()(features)
    prediction = layers.Dense(1, activation=activation)(pooled)

    return keras.Model(inputs=backbone.input, outputs=prediction)

In [None]:
def model_mobile(input_shape, activation):
    """Return a frozen ImageNet MobileNetV3Large backbone with a one-unit classifier.

    input_shape : shape of one input image, passed on to MobileNetV3Large.
    activation  : activation of the final single-unit Dense layer.

    NOTE(review): the backbone is built with include_preprocessing=True while
    the notebook's generators already rescale pixels by 1/255 -- presumably the
    built-in preprocessing expects raw 0-255 pixels; confirm against the Keras
    documentation.
    """
    backbone = tf.keras.applications.MobileNetV3Large(
        input_shape           = input_shape,
        # input_tensor=input_shape,
        weights               = 'imagenet',
        include_top           = False,
        include_preprocessing = True,
        alpha                 = 1.0,
        minimalistic          = False,
        pooling               = None,
        classes               = 1000,
        dropout_rate          = 0.2,
        classifier_activation = 'softmax')
    backbone.trainable = False

    # Pool the last backbone feature map and map it to a single output unit.
    features   = backbone.layers[-1].output
    pooled     = layers.GlobalAveragePooling2D()(features)
    prediction = layers.Dense(1, activation=activation)(pooled)

    return keras.Model(inputs=backbone.input, outputs=prediction)

In [None]:
def model_custom(input_shape, activation):
    """Build a plain CNN: three stages of three ReLU convolutions plus a dense head.

    input_shape : shape of one input image (declared on the first convolution).
    activation  : activation of the final single-unit Dense layer.

    Each stage ends with max pooling and dropout. Filter counts, kernel size,
    pool size and dropout rates come from the module-level settings.
    """
    stack = []
    for stage_filters in (first_filters, second_filters, third_filters):
        for _ in range(3):
            if stack:
                stack.append(Conv2D(stage_filters, kernel_size, activation='relu'))
            else:
                # Only the very first layer declares the input shape.
                stack.append(Conv2D(stage_filters, kernel_size, activation='relu',
                                    input_shape=input_shape))
        stack.append(MaxPooling2D(pool_size=pool_size))
        stack.append(Dropout(dropout_conv))

    # Classifier head
    stack.append(Flatten())
    stack.append(Dense(256, activation="relu"))
    stack.append(Dropout(dropout_dense))
    stack.append(Dense(1, activation=activation))

    return Sequential(stack)

In [None]:
def model_custom2(input_shape, activation):
    """Build a CNN with batch normalisation between convolution and ReLU.

    input_shape : shape of one input image (declared on the first convolution).
    activation  : activation of the final single-unit Dense layer. It was
                  previously ignored in favour of a hard-coded "sigmoid";
                  callers passing 'sigmoid' get the same model as before.

    Layout: one plain conv+ReLU layer, then three stages of
    Conv -> BatchNorm -> ReLU units (1, 2 and 2 units), each stage closed by
    max pooling and dropout, followed by a 256-unit dense block and the output.
    The convolutions and the dense layer that feed a BatchNormalization layer
    are created without a bias term.
    """
    model = Sequential()
    model.add(Conv2D(first_filters, kernel_size, activation = 'relu', input_shape = input_shape))

    # (filters, number of Conv-BN-ReLU units) per stage
    stages = ((first_filters, 1), (second_filters, 2), (third_filters, 2))
    for filters, n_units in stages:
        for _ in range(n_units):
            model.add(Conv2D(filters, kernel_size, use_bias=False))
            model.add(BatchNormalization())
            model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size = pool_size))
        model.add(Dropout(dropout_conv))

    # Classifier head
    model.add(Flatten())
    model.add(Dense(256, use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(dropout_dense))
    model.add(Dense(1, activation = activation))

    return model

In [None]:
model_vgg = model_vgg16(input_shape, activation)
# Compile with a fresh optimizer that has the same settings as `optimizer`.
# A single optimizer instance must not be shared between models: its internal
# state belongs to the variables of the model it trains.
model_vgg.compile(loss=loss,
                  optimizer=type(optimizer).from_config(optimizer.get_config()),
                  metrics=[metric, tf.keras.metrics.AUC()])

# model.save(DB_PATH + 'HCDmVGG16.h5',
#         overwrite=True,
#         include_optimizer=True,
#         save_format=None,
#         signatures=None,
#         options=None,
#         save_traces=True)

model_vgg.summary()

### Training the Model

In [None]:
# model.fit(
#     x                     = None,               
#     y                     = None,               
#     batch_size            = None,                # Number of samples per gradient update
#     epochs                = 1,                   # Number of iterations over the entire x and y
#     verbose               = 'auto',              # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.
#     callbacks             = None,                # List of keras.callbacks.Callback instances
#     validation_split      = 0.0,                 # Fraction of the training data to be used as validation data
#     validation_data       = None,                # Data on which to evaluate the loss and any model metrics
#     shuffle               = True,                # Boolean (to shuffle training data before each epoch) or str (for 'batch')
#     class_weight          = None,                # 
#     sample_weight         = None,                # 
#     initial_epoch         = 0,                   # 
#     steps_per_epoch       = None,                # 
#     validation_steps      = None,                # 
#     validation_batch_size = None,                # 
#     validation_freq       = 1,                   # 
#     max_queue_size        = 10,                  # 
#     workers               = 4,                   # 
#     use_multiprocessing   = False)               # 

In [None]:
# Train the VGG16-based model and report the wall-clock duration of the fit.
# with tf.device('/CPU:0'):              # Running on CPU
fit_start = time.time()
h1 = model_vgg.fit(
    train_gen,
    epochs           = epochs,
    steps_per_epoch  = train_steps,
    validation_data  = valid_gen,
    validation_steps = val_steps)

print('Fitting model in %.2f secs' % (time.time() - fit_start))

# Optional: persist the training history
# with open('HCDmVGG16.pkl', 'wb') as file_pkl:
#     pickle.dump(h1.history, file_pkl)


In [None]:
# mod1 = tf.keras.models.load_model(
#     DB_PATH + 'HCDmVGG16.h5',
#     custom_objects=None,
#     compile=True,
#     options=None)

In [None]:
# Training vs. validation curves of the VGG16-based run
Plot_Train([h1])

#### Model 2

In [None]:
model_mob = model_mobile(input_shape, activation)
# Compile with a fresh optimizer that has the same settings as `optimizer`.
# A single optimizer instance must not be shared between models: its internal
# state belongs to the variables of the model it trains.
model_mob.compile(loss=loss,
                  optimizer=type(optimizer).from_config(optimizer.get_config()),
                  metrics=[metric, tf.keras.metrics.AUC()])

# model.save(DB_PATH + 'HCDmMNv3LG.h5',
#            overwrite=True,
#            include_optimizer=True,
#            save_format=None,
#            signatures=None,
#            options=None,
#            save_traces=True)

model_mob.summary()

In [None]:
# Train the MobileNetV3-based model and report the wall-clock duration of the fit.
# with tf.device('/CPU:0'):              # Running on CPU
fit_start = time.time()
h2 = model_mob.fit(
    train_gen,
    epochs           = epochs,
    steps_per_epoch  = train_steps,
    validation_data  = valid_gen,
    validation_steps = val_steps)

print('Fitting model in %.2f secs' % (time.time() - fit_start))

# Optional: persist the training history
# with open('HCDmMNv3LG.pkl', 'wb') as file_pkl:
#     pickle.dump(h2.history, file_pkl)


In [None]:
# mod2 = tf.keras.models.load_model(
#     DB_PATH + 'HCDmMNv3LG.h5',
#     custom_objects=None,
#     compile=True,
#     options=None)

In [None]:
# Training vs. validation curves of the MobileNetV3-based run
Plot_Train([h2])

#### Model 3

In [None]:
model_c1 = model_custom(input_shape, activation)
# Compile with a fresh optimizer that has the same settings as `optimizer`.
# A single optimizer instance must not be shared between models: its internal
# state belongs to the variables of the model it trains.
model_c1.compile(loss=loss,
                 optimizer=type(optimizer).from_config(optimizer.get_config()),
                 metrics=[metric, tf.keras.metrics.AUC()])

# model.save(DB_PATH + 'HCDmCustom.h5',
#         overwrite=True,
#         include_optimizer=True,
#         save_format=None,
#         signatures=None,
#         options=None,
#         save_traces=True)

model_c1.summary()

In [None]:
# Train the first custom CNN and report the wall-clock duration of the fit.
# with tf.device('/CPU:0'):              # Running on CPU
fit_start = time.time()
h3 = model_c1.fit(
    train_gen,
    epochs           = epochs,
    steps_per_epoch  = train_steps,
    validation_data  = valid_gen,
    validation_steps = val_steps)

print('Fitting model in %.2f secs' % (time.time() - fit_start))

# Optional: persist the training history
# with open('HCDmCustom.pkl', 'wb') as file_pkl:
#     pickle.dump(h3.history, file_pkl)

In [None]:
# mod3 = tf.keras.models.load_model(
#     DB_PATH + 'HCDmCustom.h5',
#     custom_objects=None,
#     compile=True,
#     options=None)

In [None]:
# Training vs. validation curves of the first custom CNN run
Plot_Train([h3])

#### Model 4

In [None]:
model_c2 = model_custom2(input_shape, activation)
# Compile with a fresh optimizer that has the same settings as `optimizer`.
# A single optimizer instance must not be shared between models: its internal
# state belongs to the variables of the model it trains.
model_c2.compile(loss=loss,
                 optimizer=type(optimizer).from_config(optimizer.get_config()),
                 metrics=[metric, tf.keras.metrics.AUC()])

# model.save(DB_PATH + 'HCDmCustom2.h5',
#         overwrite=True,
#         include_optimizer=True,
#         save_format=None,
#         signatures=None,
#         options=None,
#         save_traces=True)

model_c2.summary()

In [None]:
# Train the batch-normalised custom CNN and report the wall-clock duration of the fit.
fit_start = time.time()
h4 = model_c2.fit(
    train_gen,
    epochs           = epochs,
    steps_per_epoch  = train_steps,
    validation_data  = valid_gen,
    validation_steps = val_steps)

print('Fitting model in %.2f secs' % (time.time() - fit_start))

In [None]:
# Training vs. validation curves of the batch-normalised custom CNN run
Plot_Train([h4])

### Validating the Model

In [None]:
### Evaluate the model
### Returns the loss value & metrics values for the model in test mode.

# model.evaluate(x              = valid_gen,
#                y              = None,
#                steps          = test_steps
#                batch_size     = None,
#                verbose        = 'auto',
#                sample_weight  = None,
#                callbacks      = None,
#                max_queue_size = 10,
#                workers        = 1,
#                use_multiprocessing = False,
#                return_dict    = False,

### Predicting with the Model

In [None]:
# Use the sample submission as the list of test ids and derive each image file name.
test_df = pd.read_csv(DB_PATH + 'sample_submission.csv')
test_df['filename'] = test_df['id'] + '.tif'

In [None]:
print('Test Images:', len(os.listdir(TEST_DIR)))

# Same 1/255 rescaling as the training pipeline, without a validation split.
datagen_test = ImageDataGenerator(rescale=1./255.)

# shuffle=False keeps the batches in dataframe order; class_mode=None yields
# images only. The batch size reuses BATCH_SIZE instead of a second literal 32.
test_gen = datagen_test.flow_from_dataframe(
    dataframe  = test_df,
    directory  = TEST_DIR,
    color_mode = 'rgb',
    x_col      = 'filename',
    batch_size = BATCH_SIZE,
    seed       = SEED,
    shuffle    = False,
    class_mode = None,
    target_size = IMAGE_SIZE)

In [None]:
# len() of a Keras data iterator is already the number of batches, so it must
# not be divided by BATCH_SIZE again.
test_steps = len(test_gen)
test_images_path = len(os.listdir(TEST_DIR))

print('Test Images in path:', test_images_path)
print('Test Dataframe Size:', len(test_df))
print('Steps: ', test_steps)

In [None]:
# Score every test image with the batch-normalised custom CNN (model_c2);
# with `steps` left unset the whole generator is consumed.
predictions = model_c2.predict(
    test_gen,
#    steps=test_steps,
    verbose=1)

predictions.shape

In [None]:
submission = pd.read_csv(DB_PATH + 'sample_submission.csv', index_col='id')
# Flatten the predictions to 1-D and assign by item, the documented way to set
# a column. This relies on the prediction order matching the row order of
# sample_submission.csv (the test generator is unshuffled and built from it).
submission['label'] = predictions.ravel()

In [None]:
# Write the submission file; the 'id' index is saved as the first column
submission.to_csv('./submission.csv')