# Import namespaces

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.image as mpimg

from sklearn.model_selection import train_test_split

import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import os
from tensorflow.keras import layers

# Helper Functions

In [None]:
def merge_history(hlist):
    """Concatenate the per-epoch metrics of several training runs.

    Parameters
    ----------
    hlist : sequence
        Objects exposing a ``history`` dict that maps a metric name to a
        list of per-epoch values.

    Returns
    -------
    dict
        Maps every metric name found in the first element of ``hlist`` to
        one flat list holding that metric's values from all runs, in the
        order the runs appear in ``hlist``. An empty ``hlist`` yields ``{}``.

    Raises
    ------
    KeyError
        If a later run lacks a metric that the first run recorded.
    """
    # Nothing to merge: return an empty history instead of failing on hlist[0].
    if not hlist:
        return {}

    # Flatten run by run in a single pass (avoids repeated list copying).
    return {
        k: [value for h in hlist for value in h.history[k]]
        for k in hlist[0].history
    }

def vis_training(h, start=1):
    """Plot training and validation curves for every paired metric in ``h``.

    Parameters
    ----------
    h : dict
        Maps metric names to per-epoch value lists. Must contain ``'loss'``.
        A metric ``k`` is drawn only when ``'val_' + k`` is present as well.
    start : int, default 1
        First (1-based) epoch to show; earlier epochs are left out.
    """
    epoch_range = range(start, len(h['loss']) + 1)
    s = slice(start - 1, None)

    # Pair metrics by name rather than assuming the first half of the keys
    # are the training metrics; unpaired keys are skipped instead of raising
    # a KeyError on the missing 'val_' entry.
    metrics = [k for k in h if not k.startswith('val_') and 'val_' + k in h]
    n = len(metrics)

    plt.figure(figsize=[14, 4])

    for i, k in enumerate(metrics):
        plt.subplot(1, n, i + 1)
        plt.plot(epoch_range, h[k][s], label='Training')
        plt.plot(epoch_range, h['val_' + k][s], label='Validation')
        plt.xlabel('Epoch')
        plt.ylabel(k)
        plt.title(k)
        plt.grid()
        plt.legend()

    plt.tight_layout()
    plt.show()

# Load dataset

In [None]:
# Read the label table into `train`; every column is parsed as a string.
# Only the training labels are needed in this notebook (no test data).
train = pd.read_csv(
    '../input/histopathologic-cancer-detection/train_labels.csv',
    dtype=str,
)
print('Training Set Size:', train.shape)

# Preview the first rows.
train.head()

In [None]:
# # Let's try 1% of the data first to check that all of the code works.
# # Comment this block out again when running on the full dataset.
# ignore, train = train_test_split(train, test_size=0.01, random_state=1, stratify=train.label)
# print('Training Set Size:', train.shape)

Let's update the dataset so that each image ID includes its filename extension.

In [None]:
# Append '.tif' to each image ID so it matches the file name on disk.
# IDs that already end in '.tif' are kept as they are, so re-running this
# cell does not produce 'name.tif.tif'.
train['id'] = train['id'].astype(str).map(
    lambda name: name if name.endswith('.tif') else f'{name}.tif'
)
train.head()

# Label Distribution

In [None]:
# Fraction of rows carrying each label, sorted by label and shown as one row.
train['label'].value_counts().div(len(train)).sort_index().to_frame().T

# View Sample of Images

In [None]:
train_path = "../input/histopathologic-cancer-detection/train"
print('Training Images:', len(os.listdir(train_path)))

# Fixed random_state so the same 16 images are drawn on every run.
sample = train.sample(n=16, random_state=1).reset_index()

# 4x4 grid, one axes object per sampled image.
fig, axes = plt.subplots(4, 4, figsize=(8, 8))

for ax, (_, row) in zip(axes.flat, sample.iterrows()):
    # Build the path from train_path instead of repeating the directory literal.
    img = mpimg.imread(os.path.join(train_path, row['id']))

    ax.imshow(img)
    # Class label is written just above the top-left corner of the image.
    ax.text(0, -5, f"Class {row['label']}", color='k')
    ax.axis('off')

fig.tight_layout()
plt.show()

# Data Generators

In [None]:
# Hold out 20% of the labelled rows for validation; the split is stratified
# on `label` and seeded so it is the same on every run.
train_df, valid_df = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    stratify=train['label'],
)

print(train_df.shape, valid_df.shape, sep='\n')

In [None]:
# One generator per split. Neither augments; each only multiplies the
# pixel values by 1/255.
train_datagen = ImageDataGenerator(rescale=1.0 / 255)
valid_datagen = ImageDataGenerator(rescale=1.0 / 255)

In [None]:
# Batch size shared by the training and validation loaders.
BATCH_SIZE = 64


def _make_loader(datagen, frame, shuffle):
    """Return a batch iterator over the images listed in ``frame``.

    Parameters
    ----------
    datagen : ImageDataGenerator
        Generator that reads and preprocesses the images.
    frame : pandas.DataFrame
        Must hold the file names in ``'id'`` and the class in ``'label'``.
    shuffle : bool
        Whether the rows are visited in random order.
    """
    return datagen.flow_from_dataframe(
        dataframe=frame,
        directory=train_path,
        x_col='id',
        y_col='label',
        batch_size=BATCH_SIZE,
        seed=1,
        shuffle=shuffle,
        class_mode='categorical',
        target_size=(96, 96),
    )


train_loader = _make_loader(train_datagen, train_df, shuffle=True)

# The validation split goes through its own generator, so augmentation added
# to train_datagen later cannot leak into validation. Shuffling is off because
# the validation metrics are computed over all batches and a fixed order keeps
# predictions aligned with valid_df.
valid_loader = _make_loader(valid_datagen, valid_df, shuffle=False)

In [None]:
# Loader lengths, passed to fit() as steps_per_epoch / validation_steps.
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

print(TR_STEPS, VA_STEPS, sep='\n')

# Build Network

In [None]:
# ImageNet-pretrained InceptionResNetV2 without its classification head.
# input_shape is stated explicitly (matching the 96x96 RGB images produced by
# the loaders) so the backbone is built with concrete spatial dimensions and
# summary() reports real output shapes instead of None.
# NOTE(review): the loaders rescale pixels by 1/255, whereas Keras'
# inception_resnet_v2.preprocess_input scales to [-1, 1] - confirm that the
# current scaling is intended for these pretrained weights.
base_model = tf.keras.applications.InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(96, 96, 3),
)

# Freeze the pretrained weights; only layers added on top will be trained.
base_model.trainable = False
base_model.summary()

In [None]:
SEED = 1

# Random augmentation pipeline for 96x96 RGB inputs: flips, rotation, zoom,
# contrast and translation, each layer seeded with SEED.
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip(mode="horizontal_and_vertical", seed=SEED, input_shape=(96, 96, 3)),
    layers.RandomRotation(factor=0.5, seed=SEED),
    layers.RandomZoom(height_factor=0.3, width_factor=0.3, seed=SEED),
    layers.RandomContrast(factor=0.3, seed=SEED),
    layers.RandomTranslation(height_factor=0.3, width_factor=0.3, seed=SEED),
])

# Seed NumPy and TensorFlow before the classifier is assembled.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Augmentation -> frozen backbone -> small dense head with a 2-way softmax.
cnn = Sequential([
    data_augmentation,
    base_model,
    layers.Flatten(),
    layers.Dense(units=32, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(units=16, activation='relu'),
    layers.Dropout(rate=0.25),
    layers.BatchNormalization(),
    layers.Dense(units=2, activation='softmax'),
])

cnn.summary()

# Train Network

In [None]:
# Adam optimizer with a learning rate of 0.001 for the first training run.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# The AUC metric is named explicitly: an unnamed metric gets an auto-numbered
# name ('auc_1', 'auc_2', ...) when this cell is re-run in the same kernel,
# which would change the keys of the training history.
cnn.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

In [None]:
%%time 

# Training run 1: 25 epochs over the training loader, validating on the
# validation loader after each epoch. Batches are prepared by 8 workers
# with multiprocessing enabled.
h1 = cnn.fit(
    train_loader,
    epochs=25,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    workers=8,
    use_multiprocessing=True,
    verbose=1,
)

In [None]:
# Gather the metrics of the first run and plot them from epoch 1 onward.
history = merge_history(hlist=[h1])
vis_training(h=history)

# Training Run 2

In [None]:
# Lower the learning rate of the already-compiled optimizer to 1e-4 for the
# second training run.
tf.keras.backend.set_value(
    cnn.optimizer.learning_rate,
    1e-4,
)

In [None]:
%%time 

# Training run 2: another 25 epochs on the same loaders, continuing from the
# weights reached in the previous run.
h2 = cnn.fit(
    train_loader,
    epochs=25,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    workers=8,
    use_multiprocessing=True,
    verbose=1,
)

In [None]:
# Join the metrics of both runs and plot them starting at epoch 15.
history = merge_history(hlist=[h1, h2])
vis_training(h=history, start=15)

# Save Model and History

In [None]:
# Save the trained model to an HDF5 file.
cnn.save('cancer_model_v02.h5')

# Pickle the merged history dict. The context manager makes sure the file is
# flushed and closed even if pickling fails.
with open('cancer_history_v02.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)