# Histopathologic Cancer Detection
## Identify metastatic tissue in histopathologic scans of lymph node sections

# About the images

#### There are 220,025 training images and 57,456 test images.
#### The images are 96x96 pixels and are full color.

# Import Packages

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from IPython.lib.display import Audio

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K

import zipfile 

# Helper Functions

In [None]:
def merge_history(hlist):
    """Concatenate the per-epoch metrics of several training runs.

    Args:
        hlist: Sequence of objects exposing a ``history`` dict that maps a
            metric name to a per-epoch sequence of values (e.g. the objects
            returned by ``model.fit``).

    Returns:
        A new dict with one entry per metric found in the *first* element of
        ``hlist``; each value is a list holding that metric's values from
        every run, in the order the runs were given. An empty ``hlist``
        yields an empty dict.

    Raises:
        KeyError: If a later run lacks a metric present in the first run.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from itertools import chain

    if not hlist:
        return {}

    # chain.from_iterable joins the runs in linear time; sum(lists, [])
    # re-copies the accumulated list on every addition.
    return {
        k: list(chain.from_iterable(h.history[k] for h in hlist))
        for k in hlist[0].history
    }

In [None]:
def vis_training(h, start=1):
    """Plot training (and, when present, validation) curves per metric.

    One subplot is drawn for every key of ``h`` that does not start with
    ``'val_'``. If ``h`` also holds a ``'val_' + key`` series, it is drawn
    on the same axes as the validation curve.

    Args:
        h: Dict mapping metric names to per-epoch value lists, such as the
            dict returned by ``merge_history``. Must contain ``'loss'``,
            which determines the number of epochs.
        start: 1-based epoch at which the plotted range begins; earlier
            epochs are omitted from every curve.
    """
    epoch_range = range(start, len(h['loss'])+1)
    s = slice(start-1, None)

    # Pair metrics by name instead of assuming the first half of the keys
    # are training metrics and the second half their validation twins.
    metrics = [k for k in h if not k.startswith('val_')]
    n = len(metrics)

    plt.figure(figsize=[14,4])

    for i, k in enumerate(metrics):
        plt.subplot(1,n,i+1)
        plt.plot(epoch_range, h[k][s], label='Training')
        if 'val_' + k in h:
            plt.plot(epoch_range, h['val_' + k][s], label='Validation')
        plt.xlabel('Epoch'); plt.ylabel(k); plt.title(k)
        plt.grid()
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
def play_alarm():
    """Build an autoplaying ``Audio`` object carrying a six-second alarm tone.

    The signal is the sum of a 300 Hz and a 240 Hz sine wave, multiplied by
    a 1 Hz sine so the loudness rises and falls once per second.

    Returns:
        The ``Audio`` object created with ``autoplay=True``.
    """
    sample_rate = 4410
    duration_seconds = 6

    t = np.linspace(0, duration_seconds, sample_rate*duration_seconds)
    two_tone = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
    envelope = np.sin(2*np.pi*t)
    return Audio(two_tone * envelope, rate=sample_rate, autoplay=True)

# Load and Prepare Data

# Dataframes

In [None]:
# Read both competition CSVs with every column as text (dtype=str).
train_full = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv', dtype=str)
test = pd.read_csv('../input/histopathologic-cancer-detection/sample_submission.csv', dtype=str)

# Report (rows, columns) for the training table first, then the test table.
for frame in (train_full, test):
    print(frame.shape)

In [None]:
# Preview the first five rows of the training labels table.
train_full.head(n=5)

In [None]:
# Preview the first five rows of the sample-submission table.
test.head(n=5)

In [None]:
# Keep the extension-free test ids in their own Series. Stripping a trailing
# '.tif' makes this correct even if the cell is executed more than once.
test_id = test['id'].str.replace(r'\.tif$', '', regex=True)

# Turn ids into file names by appending '.tif'. Rows that already carry the
# extension are left alone, so re-running the cell cannot produce '.tif.tif'.
for frame in (train_full, test):
    has_ext = frame['id'].str.endswith('.tif', na=False)
    frame['id'] = frame['id'].where(has_ext, frame['id'] + '.tif')

print(train_full.head())
print(test.head())

# Label Distribution

In [None]:
y_train = train_full['label']

# Share of each class: per-label counts divided by the total number of rows.
label_counts = train_full['label'].value_counts()
(label_counts / len(train_full)).to_frame()

In [None]:
train_dir = '../input/histopathologic-cancer-detection/train/'
print('Training Images:', len(os.listdir(train_dir)))

# Load the images listed under index labels 0..9 and report their array shapes.
first_ids = [train_full.id[i] for i in range(10)]
for image_id in first_ids:
    img = plt.imread(train_dir + image_id)
    print('Images shape', img.shape)

# Number of images in the train and test folders

In [None]:
# Count the directory entries of each image folder.
for split in ('train', 'test'):
    folder = '../input/histopathologic-cancer-detection/' + split
    print(f'Number of images in {split} set', len(os.listdir(folder)))

# View Sample of Images

In [None]:
# Draw 16 random training rows and show them in a 4x4 grid, each captioned
# with its class label.
sample = train_full.sample(n=16).reset_index()

plt.figure(figsize=(8,8))

for cell, (image_id, label) in enumerate(zip(sample['id'], sample['label']), start=1):
    img = mpimg.imread('../input/histopathologic-cancer-detection/train/' + image_id)

    plt.subplot(4, 4, cell)
    plt.imshow(img)
    # Caption sits just above the top-left corner of the image.
    plt.text(0, -5, f'Class {label}', color='k')
    plt.axis('off')

plt.tight_layout()
plt.show()

# Training and Validation Sets

In [None]:
# Hold out 20% of the labelled rows for validation, stratified on the label
# column; random_state fixes the split.
train, valid = train_test_split(
    train_full,
    test_size=0.2,
    stratify=train_full['label'],
    random_state=1,
)

for subset in (train, valid):
    print(subset.shape)

# Data generators

In [None]:
# Two independent generators; both multiply pixel values by 1/255.
train_datagen, valid_datagen = (
    ImageDataGenerator(rescale=1/255),
    ImageDataGenerator(rescale=1/255),
)

In [None]:
BATCH_SIZE = 64

# Both loaders read image files from the competition's train folder; they
# differ only in which dataframe (train vs. valid) supplies the file names
# and labels.
train_loader = train_datagen.flow_from_dataframe(
    dataframe=train,
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=1,
)

valid_loader = valid_datagen.flow_from_dataframe(
    dataframe=valid,
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=1,
)

In [None]:
# Number of batches each loader yields per pass over its dataframe.
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

for step_count in (TR_STEPS, VA_STEPS):
    print(step_count)

# Build network

In [None]:
np.random.seed(1)
tf.random.set_seed(1)


def _conv_block(filters, **first_conv_kwargs):
    """Return one convolutional stage as a list of layers.

    The stage is two 3x3 'same'-padded ReLU convolutions, each followed by
    batch normalization, then 2x2 max pooling and 30% dropout. Extra keyword
    arguments are forwarded to the first convolution only.
    """
    return [
        Conv2D(filters, (3,3), activation='relu', padding='same', **first_conv_kwargs),
        BatchNormalization(),
        Conv2D(filters, (3,3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D(2,2),
        Dropout(0.3),
    ]


cnn = Sequential([
    # Three convolutional stages with 32, 64 and 128 filters.
    *_conv_block(32, input_shape=(96,96,3)),
    *_conv_block(64),
    *_conv_block(128),

    Flatten(),

    # Classifier head ending in a 2-way softmax.
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

cnn.summary()

# Train network

In [None]:
# Adam at learning rate 0.001; track accuracy and AUC alongside the loss.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
cnn.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

In [None]:
%%time 

h1 = cnn.fit(
    train_loader, 
    steps_per_epoch=TR_STEPS, 
    validation_data=valid_loader, 
    validation_steps=VA_STEPS, 
    epochs = 20,
    verbose=1, 
    use_multiprocessing=True, 
    workers=8
)

In [None]:
# Flatten the first run into a plain dict and plot every epoch.
history = merge_history(hlist=[h1])
vis_training(h=history, start=1)

In [None]:
# Play the alarm tone (play_alarm() builds it with autoplay=True);
# presumably an audible cue that the long-running cells above have finished.
play_alarm()

In [None]:
# Set the optimizer's learning rate to 1e-4 before the next fit call.
K.set_value(cnn.optimizer.learning_rate, 1e-4)

In [None]:
%%time 

h2 = cnn.fit(
    train_loader, 
    steps_per_epoch=TR_STEPS, 
    validation_data=valid_loader, 
    validation_steps=VA_STEPS, 
    epochs = 20,
    verbose=1, 
    use_multiprocessing=True, 
    workers=8
)

In [None]:
# Join the first two runs end to end and plot every epoch.
history = merge_history(hlist=[h1, h2])
vis_training(h=history, start=1)

In [None]:
# Play the alarm tone (play_alarm() builds it with autoplay=True);
# presumably an audible cue that the long-running cells above have finished.
play_alarm()

In [None]:
# Set the optimizer's learning rate to 1e-5 before the next fit call.
K.set_value(cnn.optimizer.learning_rate, 1e-5)

In [None]:
%%time 

h3 = cnn.fit(
    train_loader, 
    steps_per_epoch=TR_STEPS, 
    validation_data=valid_loader, 
    validation_steps=VA_STEPS, 
    epochs = 20,
    verbose=1, 
    use_multiprocessing=True, 
    workers=8
)

In [None]:
# Join all three runs end to end and plot every epoch.
history = merge_history(hlist=[h1, h2, h3])
vis_training(h=history, start=1)

In [None]:
# Play the alarm tone (play_alarm() builds it with autoplay=True);
# presumably an audible cue that the long-running cells above have finished.
play_alarm()

In [None]:
# Persist the trained model and the merged training history.
cnn.save('cancer_model_v01.h5')

# The context manager guarantees the pickle file is flushed and closed,
# even if dumping fails part-way through.
with open('cancer_history_v01.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)