In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import pickle
import gc

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import zipfile 

In [None]:
# Run a full garbage-collection pass; as the last expression of the cell,
# the value returned by gc.collect() is shown as the cell output.
gc.collect()

## Importing Training Labels

In [None]:
# Load the label table with every column read as a string, then report
# its (rows, columns) shape.
LABELS_CSV = "../input/histopathologic-cancer-detection/train_labels.csv"
train = pd.read_csv(LABELS_CSV, dtype=str)
print(train.shape)

In [None]:
# Preview the first ten rows of the label table.
train.head(10)

## Seeing the Distribution of Labels

In [None]:
# Keep the label column under its own name, then show each label's share
# of all rows as a one-row table.
y_train = train['label']

label_share = train['label'].value_counts() / len(train)
label_share.to_frame().T

## Sampling a Few Images

In [None]:
# Display the training images whose row labels are 0-15 in a 4x4 grid,
# each annotated with its class label.

GRID_SIDE = 4

plt.figure(figsize=(10, 10))  # overall figure size

for cell in range(GRID_SIDE * GRID_SIDE):
    plt.subplot(GRID_SIDE, GRID_SIDE, cell + 1)  # subplot positions are 1-based
    tile = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{train["id"][cell]}.tif')
    plt.imshow(tile)
    plt.text(0, -5, f'Label {train["label"][cell]}')  # caption placed at data coordinates (0, -5)
    plt.axis('off')

plt.tight_layout()
plt.show()

## Taking an Equal Number of Negative and Positive Labels

In [None]:
# Build a class-balanced subset: draw 10,000 rows per label with a fixed seed,
# stack them, and renumber the index from 0.
train_neg = train[train['label']=='0'].sample(10000,random_state=1)
train_pos = train[train['label']=='1'].sample(10000,random_state=1)

train_data = pd.concat([train_neg, train_pos], axis=0).reset_index(drop=True)

# Seed the shuffle too. Without random_state the row order changes on every
# run, so the later train/validation split would not be reproducible even
# though that split passes its own random_state.
train = shuffle(train_data, random_state=1)

In [None]:
# Count the rows per label in the resampled table.
train['label'].value_counts()

In [None]:
def append_ext(fn: str) -> str:
    """Return ``fn`` with a ``.tif`` extension.

    The function is idempotent: a name that already ends in ``.tif`` is
    returned unchanged, so re-running the cell that applies it to the id
    column does not produce ``.tif.tif`` file names.

    Args:
        fn: Image id or file name.

    Returns:
        The name ending in ``.tif``.
    """
    if fn.endswith(".tif"):
        return fn
    return fn + ".tif"


# Turn every image id into a file name via append_ext, then preview the table.
train['id'] = train['id'].map(append_ext)
train.head()

## Splitting the Data

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# label column so both parts keep the same label proportions, and it is
# seeded with random_state=1.
train_df, valid_df = train_test_split(
    train,
    test_size=0.2,
    stratify=train['label'],
    random_state=1,
)

for part in (train_df, valid_df):
    print(part.shape)

## Creating Data Generators

In [None]:
# One generator per split; both only rescale pixel values by 1/255.
PIXEL_SCALE = 1/255

train_datagen = ImageDataGenerator(rescale=PIXEL_SCALE)
valid_datagen = ImageDataGenerator(rescale=PIXEL_SCALE)

In [None]:
# Build the batch loaders that read images from disk using the file names
# and labels in train_df / valid_df.

BATCH_SIZE = 64

# Settings shared by both loaders: same image directory, columns, batch size,
# seed, two-column one-hot labels ('categorical') and 32x32 resizing.
common_flow_args = dict(
    directory = '../input/histopathologic-cancer-detection/train/',
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    class_mode = 'categorical',
    target_size = (32,32)
)

train_loader = train_datagen.flow_from_dataframe(
    dataframe = train_df,
    shuffle = True,
    **common_flow_args
)

# The validation loader is built from valid_datagen (it was previously built
# from train_datagen), so any augmentation later added to the training
# generator cannot leak into validation. Shuffling is turned off: it is not
# needed for evaluation, and a fixed order keeps batches aligned with valid_df.
valid_loader = valid_datagen.flow_from_dataframe(
    dataframe = valid_df,
    shuffle = False,
    **common_flow_args
)

In [None]:
# Number of batches each loader yields per pass; used as the step counts
# when fitting the model.
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

print(TR_STEPS, VA_STEPS, sep='\n')

## Building the CNN

In [None]:
# Convolutional network for 32x32 RGB inputs: three convolutional stages with
# 32, 64 and 128 filters, followed by a small dense head. The final layer has
# two softmax units because there are two classes.

np.random.seed(1)
tf.random.set_seed(1)


def _conv_stage(filters, drop_rate, **first_conv_kwargs):
    """Return the layers of one stage: Conv-BN-Conv-MaxPool-Dropout-BN.

    Extra keyword arguments are forwarded to the first Conv2D only; this is
    how the input shape is passed to the very first layer of the network.
    """
    return [
        Conv2D(filters, (3,3), activation = 'relu', padding = 'same', **first_conv_kwargs),
        BatchNormalization(),
        Conv2D(filters, (3,3), activation = 'relu', padding = 'same'),
        MaxPooling2D(2,2),
        Dropout(drop_rate),
        BatchNormalization(),
    ]


# Layers are created left to right, in the same order as a flat layer list.
cnn1 = Sequential(
    _conv_stage(32, 0.2, input_shape=(32,32,3))
    + _conv_stage(64, 0.4)
    + _conv_stage(128, 0.5)
    + [
        Flatten(),

        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(16, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),
        # two output units, one per class
        Dense(2, activation='softmax'),
    ]
)

cnn1.summary()

In [None]:
# Compile with Adam (learning rate 0.001) and categorical cross-entropy,
# tracking accuracy and AUC.
opt = tf.keras.optimizers.Adam(0.001)

# Name the AUC metric explicitly. An unnamed AUC() gets an auto-generated name
# that gains a numeric suffix whenever another AUC is created in the same
# session (e.g. when this cell is re-run), which would break the
# history['auc'] / history['val_auc'] lookups used when plotting.
cnn1.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

## Fitting the CNN

In [None]:
%%time 

# First training run: 20 epochs, each covering every training batch and
# validating on every validation batch.
h1 = cnn1.fit(
    train_loader,
    epochs=20,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    verbose=1,
)

In [None]:
# Copy the per-epoch metric lists of the first run instead of aliasing
# h1.history: later cells extend `history` in place, and without a copy that
# would silently modify the History object returned by fit().
history = {metric: list(values) for metric, values in h1.history.items()}
print(history.keys())

## Graphing the Results

In [None]:
# Plot training vs. validation curves for loss, accuracy and AUC, one panel each.

epoch_range = range(1, len(history['loss'])+1)

plt.figure(figsize=[14,4])

panels = [('loss', 'Loss'), ('accuracy', 'Accuracy'), ('auc', 'AUC')]
for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(epoch_range, history[metric], label='Training')
    plt.plot(epoch_range, history['val_' + metric], label='Validation')
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()

## Another Training Run to Smooth Out the Validation Curves

In [None]:
# Set the optimizer's learning rate to 0.0001 (the optimizer was created with
# 0.001) before the next training run.
tf.keras.backend.set_value(cnn1.optimizer.learning_rate, 0.0001)

In [None]:
%%time 

# Second training run: continue training the same model for 20 more epochs,
# using every training and validation batch per epoch.
h2 = cnn1.fit(
    train_loader,
    epochs=20,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    verbose=1,
)

In [None]:
# Join the metrics of the first two runs into one continuous series per metric
# and plot them.
#
# New lists are built from h1 and h2 rather than extending `history` in place,
# so h1.history is left untouched and re-running this cell gives the same
# result instead of appending the second run again.
history = {metric: h1.history[metric] + h2.history[metric] for metric in h1.history}

epoch_range = range(1, len(history['loss'])+1)

plt.figure(figsize=[14,4])

panels = [('loss', 'Loss'), ('accuracy', 'Accuracy'), ('auc', 'AUC')]
for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(epoch_range, history[metric], label='Training')
    plt.plot(epoch_range, history['val_' + metric], label='Validation')
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()

## One Last Training Run

In [None]:
# Set the optimizer's learning rate to 0.0001 before the final training run.
# NOTE(review): the rate was already set to 0.0001 before the previous run, so
# this leaves it unchanged; confirm whether a smaller value was intended.
tf.keras.backend.set_value(cnn1.optimizer.learning_rate, 0.0001)

In [None]:
%%time 

# Third training run: 20 more epochs on the same model, using every training
# and validation batch per epoch.
h3 = cnn1.fit(
    train_loader,
    epochs=20,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    verbose=1,
)

In [None]:
# Append the metrics of the third run to the accumulated history and plot
# all runs together.
#
# The metrics appended here must come from h3. Appending h2 again (as this
# cell did before) repeats the second run's curves and leaves the third run
# out of both the plot and the saved history.
# Note: this extends `history` in place, so run the cell only once per
# training run.
for k in history.keys():
    history[k] += h3.history[k]

epoch_range = range(1, len(history['loss'])+1)

plt.figure(figsize=[14,4])

panels = [('loss', 'Loss'), ('accuracy', 'Accuracy'), ('auc', 'AUC')]
for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(epoch_range, history[metric], label='Training')
    plt.plot(epoch_range, history['val_' + metric], label='Validation')
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()

## Saving the Model

In [None]:
# Persist the trained network and the accumulated metric history.
cnn1.save('cancer_model15.h5')

# Write the pickle through a context manager so the file is flushed and closed
# as soon as the dump finishes, instead of leaving an open handle behind.
with open('cancer_history15.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)