# Histopathologic Cancer Detection (HCD)
### Taylor Kern

# Import Statements

The following cell contains the import statements used throughout the notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
import os

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras.layers import * 

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import backend as k

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Helper Functions

In [None]:
def merge_history(hlist):
    """Concatenate the per-epoch metrics of several training runs.

    Args:
        hlist: Sequence of objects exposing a ``history`` dict that maps a
            metric name to its list of per-epoch values.

    Returns:
        A dict keyed by the metric names of the first entry of ``hlist``;
        each value is a new list holding that metric's values from every
        entry, in order. An empty ``hlist`` yields an empty dict.

    Raises:
        KeyError: If a later entry lacks a metric that the first entry has.
    """
    if not hlist:
        return {}
    # Flatten in a single pass per metric; sum(lists, []) would re-copy the
    # accumulated list once for every run.
    return {
        name: [value for run in hlist for value in run.history[name]]
        for name in hlist[0].history
    }

def vis_training(h, start=1):
    """Plot training and validation curves for each metric, side by side.

    Args:
        h: Dict mapping metric names to per-epoch value lists. The first
            half of its keys are treated as training metrics, and each one
            is paired with the entry stored under ``'val_' + name``.
        start: First epoch (1-based) to include in the plots.
    """
    epochs = range(start, len(h['loss']) + 1)
    window = slice(start - 1, None)

    plt.figure(figsize=[14,4])

    metric_names = list(h.keys())
    n = int(len(metric_names) / 2)

    # One subplot per training metric, numbered from 1 as subplot expects.
    for position, metric in enumerate(metric_names[:n], start=1):
        plt.subplot(1, n, position)
        plt.plot(epochs, h[metric][window], label='Training')
        plt.plot(epochs, h['val_' + metric][window], label='Validation')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.title(metric)
        plt.grid()
        plt.legend()

    plt.tight_layout()
    plt.show()

# Load Training DataFrame

We load the training labels into a dataframe and preview it. We then append the `.tif` file extension to each id so that it matches the image filenames.

In [None]:
# Read the training label file; dtype=str loads every column as strings.
train = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv', dtype=str)
print(train.shape)

In [None]:
# Preview the first rows of the label table.
train.head()

In [None]:
# Append the image extension so each id can be used directly as a filename.
train['id'] = train['id'] + '.tif'

In [None]:
# Preview the table again to confirm the ids now end in '.tif'.
train.head()

# Label Distribution

We now compute the proportion of each label: the count of each unique value divided by the total number of rows. We then sort the result by label and transpose it for display.

In [None]:
# Share of each label: counts divided by the number of rows, sorted by label
# and transposed so that the labels become the columns.
(train.label.value_counts() / len(train)).to_frame().sort_index().T

# Extract Images

Here we define the path to the training images. We draw a random sample of 16 rows, read the image for each row with mpimg, and display the images in a 4 x 4 grid together with their class labels.

In [None]:
train_path = "../input/histopathologic-cancer-detection/train"

# Draw 16 random rows; reset_index gives them positions 0..15, which are used
# below as subplot slots.
sample = train.sample(n=16).reset_index()

plt.figure(figsize=(6,6))

for i, row in sample.iterrows():

    # Build the image location from train_path instead of repeating the
    # directory literal, so the path is defined in one place only.
    img = mpimg.imread(f'{train_path}/{row.id}')
    label = row.label

    plt.subplot(4,4,i+1)
    plt.imshow(img)
    # Write the class just above the top-left corner of the image.
    plt.text(0, -5, f'Class {label}', color='k')

    plt.axis('off')

plt.tight_layout()
plt.show()

# Training and Validation Sets

Now we split the dataset into train_df and valid_df, holding out 20% of the rows for validation and stratifying by label.

In [None]:
# Hold out 20% of the rows for validation. stratify keeps the label
# proportions equal in both parts; random_state makes the split repeatable.
train_df, valid_df = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    stratify=train.label,
)

# Data Generators

Now we create two data generators, train_datagen and validation_datagen. We then create the train_loader and valid_loader from these generators.

Each loader is given a dataframe and reads its images from the train_path directory. As we saw when we first displayed the dataset, its columns are id and label, so we set x_col to 'id' and y_col to 'label'. This is a classification problem, so we set class_mode to 'categorical'. Since the images are 96 x 96 pixels, we set target_size to (96, 96).

Finally, we find the length of each loader and print it.

In [None]:
# Both generators only multiply pixel values by 1/255; no augmentation is configured.
train_datagen = ImageDataGenerator(rescale=1/255)
validation_datagen = ImageDataGenerator(rescale=1/255)

In [None]:
BATCH_SIZE = 64

# Training batches are drawn from the training split only, so the model is
# never fitted on the rows that are held out for validation.
train_loader = train_datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = train_path,
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    shuffle = True,
    class_mode = 'categorical',
    target_size = (96,96)
)

# Validation batches come from the held-out split and go through the
# validation generator, keeping the two pipelines separate.
valid_loader = validation_datagen.flow_from_dataframe(
    dataframe = valid_df,
    directory = train_path,
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    shuffle = True,
    class_mode = 'categorical',
    target_size = (96,96)
)

In [None]:
# Number of batches each loader yields in one pass over its data.
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

print(TR_STEPS, VA_STEPS, sep='\n')

# Load Base Model

Here we create our base model using the ResNet50 function from tensorflow.keras. We specify the input size of the images and load the weights pretrained on ImageNet.

We set base_model.trainable to False so that the pretrained weights stay frozen during the first round of training; this gives the best results.

Finally, we print out a summary of the base model.

In [None]:
# ResNet50 with ImageNet weights; include_top=False leaves out the network's
# own classification layers so that a custom head can be stacked on top.
base_model = tf.keras.applications.ResNet50(
    input_shape=(96,96,3), 
    include_top=False, 
    weights='imagenet'
)

# Freeze the base model for the first training round.
base_model.trainable = False

base_model.summary()

# Build and Train

Now we create the cnn, which stacks the base model with Flatten, Dense, BatchNormalization and Dropout layers.

In [None]:
# Seed NumPy and TensorFlow so that runs are repeatable.
np.random.seed(1)
tf.random.set_seed(1)

# Classification head stacked on top of the base model.
cnn = Sequential([
    base_model,
    BatchNormalization(),

    # Flatten the base model's feature maps into one vector per image.
    Flatten(),
    
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    # Two softmax outputs, matching the loaders' class_mode='categorical'.
    Dense(2, activation='softmax')
])

cnn.summary()

Here is where we set the learning rate. For this first round of training the learning rate is relatively high (0.001), which means the results of this round will not be very accurate yet.

Since we are measuring the loss, accuracy and AUC, we include those as well when compiling the model.

In [None]:
# Adam optimizer for the first training round; accuracy and AUC are tracked
# alongside the loss.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
cnn.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

Now we train the model on train_loader for 40 epochs with a learning rate of 0.001.

In [None]:
%%time 

# First training round; the object returned by fit is kept in h1 so its
# history can be merged and plotted later.
h1 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = TR_STEPS, 
    epochs = 40,
    validation_data = valid_loader, 
    validation_steps = VA_STEPS, 
    verbose = 1
)

In [None]:
# Convert the first run into a plain metric dict and plot every epoch.
history = merge_history([h1])
vis_training(history)

# Fine Tuning

Here is where we set the learning rate for the second round of training. The learning rate is now lower (0.00001), which means the results of this round will be substantially more accurate.

Again, we include the loss, accuracy and AUC 

In [None]:
# Unfreeze the base model so that its weights are trained as well.
base_model.trainable = True
# Lower the learning rate on the optimizer that is already attached to the model.
k.set_value(cnn.optimizer.learning_rate, 0.00001)
# Recompile with the same optimizer object. A new AUC metric instance is
# created here; the later cells read this metric from the history under the
# key 'auc_1' rather than 'auc'.
cnn.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy', tf.keras.metrics.AUC()])

We print a summary of the model.

In [None]:
# Show the model summary again after the base model was made trainable.
cnn.summary()

# Train 2

Now we train the model on train_loader for another 30 epochs with a learning rate of 0.00001.

In [None]:
%%time 

# Second training round (fine-tuning); its history is kept in h2.
h2 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = TR_STEPS, 
    epochs = 30,
    validation_data = valid_loader, 
    validation_steps = VA_STEPS, 
    verbose = 1
)

In [None]:
# This run's history stores AUC under 'auc_1' / 'val_auc_1'. Expose the same
# lists under the names the first run uses, so merge_history can find them.
h2.history.update(
    auc=h2.history['auc_1'],
    val_auc=h2.history['val_auc_1'],
)

In [None]:
# Merge both runs and plot from epoch 10 onwards.
history = merge_history([h1, h2])
vis_training(history, start=10)

# Training 3

We now train the model on train_loader for another 20 epochs with a learning rate of 0.00001. This is the last round of training.

In [None]:
%%time 

# Third and final training round; its history is kept in h3.
h3 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = TR_STEPS, 
    epochs = 20,
    validation_data = valid_loader, 
    validation_steps = VA_STEPS, 
    verbose = 1
)

In [None]:
# This run's history stores AUC under 'auc_1' / 'val_auc_1'. Expose the same
# lists under the names the first run uses, so merge_history can find them.
h3.history.update(
    auc=h3.history['auc_1'],
    val_auc=h3.history['val_auc_1'],
)

In [None]:
# Merge all three runs and plot from epoch 10 onwards.
history = merge_history([h1, h2, h3])
vis_training(history, start=10)

In [None]:
# Persist the trained model.
cnn.save('HCDv01.h5')
# Persist the merged training history. The context manager makes sure the
# file is flushed and closed as soon as the dump has finished.
with open('HCDv01.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

# Submission

In [None]:
# The sample submission file is used here as the table of test image ids.
test = pd.read_csv('../input/histopathologic-cancer-detection/sample_submission.csv')

print('Test Set Size:', test.shape)

In [None]:
# Build the image filename for every test id.
test['filename'] = test['id'] + '.tif'

In [None]:
# Preview the test table with the new filename column.
test.head()

In [None]:
# Directory holding the test images; print how many entries it contains.
test_path = "../input/histopathologic-cancer-detection/test"
print('Test Images:', len(os.listdir(test_path)))

In [None]:
BATCH_SIZE = 64

# Same 1/255 rescaling as the training and validation generators.
test_datagen = ImageDataGenerator(rescale=1/255)

# shuffle=False keeps the batches in the row order of `test`, and
# class_mode=None makes the loader yield images only, without labels.
test_loader = test_datagen.flow_from_dataframe(
    dataframe = test,
    directory = test_path,
    x_col = 'filename',
    batch_size = BATCH_SIZE,
    shuffle = False,
    class_mode = None,
    target_size = (96,96)
)

In [None]:
# Run the model over the test loader and print the shape of the predictions.
test_probs = cnn.predict(test_loader)
print(test_probs.shape)

In [None]:
# Number of batches in the test loader.
print(len(test_loader))

In [None]:
# First ten prediction rows, rounded to two decimals.
print(test_probs[:10,].round(2))

In [None]:
# Index of the highest-scoring output for every test image.
test_pred = np.argmax(test_probs, axis=1)
print(test_pred[:10])

# Prepare Submission

In [None]:
# Reload the sample submission to use as the template for the output file.
submission = pd.read_csv('../input/histopathologic-cancer-detection/sample_submission.csv')
submission.head()

In [None]:
# Use the second output column as the submitted score.
# NOTE(review): presumably column 1 is the probability of label '1' and the
# prediction rows are in the same order as the submission rows - confirm
# against train_loader.class_indices and the number of images the test loader found.
submission.label = test_probs[:,1]
submission.head()

In [None]:
# Write the submission file with a header row and without the index column.
submission.to_csv('submission.csv', header=True, index=False)