# **500 No Disease and 1000 Disease**

**Vgg16** - Using Keras

In [None]:
#necessary Libraries
import numpy as np
import pandas as pd
import os
from glob import glob 
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import shutil

import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from keras.applications.vgg16 import VGG16, preprocess_input

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten

from keras.optimizers import Adam, SGD
from keras import optimizers

In [None]:
# Locations of the Kaggle "Histopathologic Cancer Detection" inputs
path = "../input/histopathologic-cancer-detection/"
train_path = f"{path}train/"
test_path = f"{path}test/"

# Label table for the training images (merged on "id" further down)
labels = pd.read_csv(f"{path}train_labels.csv")

**Create a dataframe which contains every training examples path, id and label**

In [None]:
# Build one row per training image: file path, image id and label.
df = pd.DataFrame({'path': glob(os.path.join(train_path, '*.tif'))})
# The id is the file name without its extension. Deriving it with os.path
# (rather than splitting the whole path on the substring "in") keeps this
# correct no matter where the dataset is mounted.
df['id'] = df['path'].map(lambda p: os.path.splitext(os.path.basename(p))[0])
# Attach the label of each image; rows without a matching id are dropped
# by the default inner merge.
df = df.merge(labels, on="id")
df.head(3)

In [None]:
# Number of images drawn per class:
#   SAMPLE_SIZE  -> label 0 (also reused later to sample the test set)
#   SAMPLE_SIZE1 -> label 1
SAMPLE_SIZE, SAMPLE_SIZE1 = 500, 1000

In [None]:
# Draw a fixed-seed random subset from each class.
is_negative = df['label'] == 0
is_positive = df['label'] == 1

# class 0
df_0 = df.loc[is_negative].sample(n=SAMPLE_SIZE, random_state=101)
# class 1
df_1 = df.loc[is_positive].sample(n=SAMPLE_SIZE1, random_state=101)

In [None]:
# Stack both class samples into a single frame with a fresh 0..n-1 index
df_data = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
# Shuffle with a fixed seed so the row order (and everything derived from
# it) is reproducible, like the seeded sampling that produced df_0 / df_1
df_data = shuffle(df_data, random_state=101)

# Sanity check: number of rows per label
df_data['label'].value_counts()

**Going to split 20% of the training set into a validation set**

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so train and validation keep the same class ratio as
# df_data (500 label-0 : 1000 label-1, i.e. 1:2 -- not 1:1). The fixed
# random_state makes the 80/20 split reproducible across runs.
df_train, df_val = train_test_split(
    df_data,
    test_size=0.2,
    stratify=df_data['label'],
    random_state=101,
)

# **Moving images to directory**

In [None]:
# Delete directory
import shutil
# Start from a clean slate: remove any previous 'main' tree (a missing
# tree is not an error because of ignore_errors=True)
shutil.rmtree('main', ignore_errors=True)

# Recreate main/{train,val}/{true_positive,true_negative}, parents first
os.mkdir('main')
for split in ('train', 'val'):
    os.mkdir(os.path.join('main', split))
for split in ('train', 'val'):
    for class_dir in ('true_positive', 'true_negative'):
        os.mkdir(os.path.join('main', split, class_dir))

In [None]:
# Build the list of image file names ("<id>.tif") for every split/class
# combination; these lists drive the copy into the directory structure.

# train, label 1 (true positive)
train_df_1 = [name + ".tif" for name in df_train.loc[df_train["label"] == 1, 'id'].tolist()]

# train, label 0 (true negative)
train_df_0 = [name + ".tif" for name in df_train.loc[df_train["label"] == 0, 'id'].tolist()]

# validation, label 1 (true positive)
val_df_1 = [name + ".tif" for name in df_val.loc[df_val["label"] == 1, 'id'].tolist()]

# validation, label 0 (true negative)
val_df_0 = [name + ".tif" for name in df_val.loc[df_val["label"] == 0, 'id'].tolist()]

In [None]:
# Move images to directory structure
import shutil
import os
from tqdm import tqdm

def transfer(source,destination,files):
    """Copy each file named in `files` from `source` into `destination`.

    Args:
        source: directory that currently holds the files.
        destination: directory that receives the copies (same file names).
        files: iterable of file names relative to `source`.

    Progress is reported through a tqdm bar; the originals are left in place.
    """
    for file_name in tqdm(files):
        shutil.copyfile(os.path.join(source, file_name),
                        os.path.join(destination, file_name))
        
# Copy every file-name list into its split/class folder under 'main'
for target_dir, file_names in (
    ('main/train/true_positive', train_df_1),
    ('main/train/true_negative', train_df_0),
    ('main/val/true_positive', val_df_1),
    ('main/val/true_negative', val_df_0),
):
    transfer('../input/histopathologic-cancer-detection/train', target_dir, file_names)

**Increasing the size of the image results in a much higher performance**

In [None]:
# Generate batches of tensor image data with real-time data augmentation. 
import numpy as np
# Batch sizes and the number of batches needed to see every image once.
num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 32
val_batch_size = 32

# Round up so the last, partially filled batch is included. np.ceil returns
# a float, but step counts must be whole numbers, so convert to int.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

print(train_steps)
print(val_steps)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training images: scale pixels to [0, 1] and apply random flips,
# rotations of up to 90 degrees and a light shear.
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=90,
    shear_range=0.05,
)

# Validation/test images: same pixel scaling, no augmentation.
test_datagen = ImageDataGenerator(rescale=1. / 255)

**Resizing images and creating the train, validation and test generators (the test generator re-reads the validation images, unshuffled)**

In [None]:
# Reads the class subfolders of 'main/train' and yields augmented batches
# indefinitely; labels are one-hot encoded (class_mode='categorical').
train_generator = train_datagen.flow_from_directory(
    directory='main/train',
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=train_batch_size,
)

In [None]:
# Validation batches from 'main/val': rescaled only, one-hot labels.
val_generator = test_datagen.flow_from_directory(
    directory='main/val',
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=val_batch_size,
)

In [None]:
# Evaluation pass over the validation folder. batch_size=1 and
# shuffle=False are essential here: predictions then come back one image
# at a time, in the same order as test_generator.classes.
test_generator = test_datagen.flow_from_directory(
    directory='main/val',
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=1,
    shuffle=False,
)

# **Creating Model**

In [None]:
# Import VGG16 model, with weights pre-trained on ImageNet.
from keras.applications.vgg16 import VGG16, preprocess_input

# Convolutional base only: include_top=False drops VGG16's dense classifier.
# (Offline alternative to 'imagenet': pass the path of a local "notop" weights
#  file, e.g. '../input/VGG16weights/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'.)
vgg16_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(96, 96, 3),
)
    


In [None]:
from keras.models import Sequential
from keras.layers import Dense,Flatten,Dropout

# Classifier head stacked on the VGG16 base: flatten the feature maps,
# two dropout-regularised dense layers, then a 2-way softmax output.
model = Sequential([
    vgg16_model,
    Flatten(),
    Dense(1024, activation="relu"),
    Dropout(0.5),
    Dense(512, activation="relu"),
    Dropout(0.5),
    Dense(2, activation="softmax"),
])

In [None]:
# Print the layer-by-layer summary of the assembled model
model.summary()

**Because we're using VGG16 pre-trained on ImageNet, we freeze its early layers and fine-tune only the last 12 layers.**

In [None]:
# Freeze every VGG16 layer except the final 12, which stay trainable
frozen_layers = vgg16_model.layers[:-12]
for frozen in frozen_layers:
    frozen.trainable = False

# Report each layer alongside its trainable flag
for layer in vgg16_model.layers:
    print(layer, layer.trainable)

In [None]:
#for last few layers
from keras.optimizers import Adam, SGD
from keras import optimizers


# The head is a 2-unit softmax and the generators emit one-hot labels
# (class_mode='categorical'), so categorical cross-entropy is the matching
# loss. `learning_rate` is the current name of the deprecated `lr` argument.
# The very small learning rate keeps the fine-tuning updates gentle.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.SGD(learning_rate=0.00001, momentum=0.95),
    metrics=['accuracy'],
)

In [None]:
import time

# starting time
start = time.time()

# Model.fit accepts generators directly (fit_generator is deprecated).
# Step counts must be integers, so cast in case they were computed as floats.
history = model.fit(
    train_generator,
    steps_per_epoch=int(train_steps),
    validation_data=val_generator,
    validation_steps=int(val_steps),
    epochs=30,
    verbose=1)
# end time
end = time.time()

# total time taken (seconds, as returned by time.time())
print(f"Total Train Time is {end - start}")

# **EVALUATION**

In [None]:
# Plot validation and accuracies over epochs
import matplotlib.pyplot as plt
# Plot loss and accuracy curves against one shared, 1-based epoch axis so
# that both figures line up (epoch 1 is the first completed epoch).
epochs = range(1, len(history.history['loss']) + 1)

plt.plot(epochs, history.history['loss'], color='blue', label="training_loss")
plt.plot(epochs, history.history['val_loss'], color='red', label="validation_loss")
plt.legend(loc='best')
plt.title('Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(epochs, train_acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()

plt.show()

In [None]:
# Validation accuracy of the final epoch (shown as a one-element list)
print(f"Validation Accuracy: {history.history['val_accuracy'][-1:]}")

In [None]:
# Training accuracy of the final epoch (shown as a one-element list)
print(f"Training Accuracy: {history.history['accuracy'][-1:]}")

In [None]:
# Validation loss of the final epoch (shown as a one-element list)
print(f"Validation Loss: {history.history['val_loss'][-1:]}")

In [None]:
# Training loss of the final epoch (shown as a one-element list)
print(f"Training Loss: {history.history['loss'][-1:]}")

In [None]:
# Model.predict accepts generators directly (predict_generator is deprecated).
# test_generator yields one unshuffled image per step, so len(df_val) steps
# cover the validation set once, in the same order as test_generator.classes.
# reset() makes sure the pass starts from the first image.
test_generator.reset()
val_predict = model.predict(test_generator, steps=len(df_val), verbose=1)

# **ROC CURVE**

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC needs a continuous score per sample. Column 1 of the softmax output is
# the predicted probability of class index 1, the positive class in
# test_generator.classes. Using argmax (hard 0/1 labels) would collapse the
# curve to a single operating point and distort the AUC.
positive_scores = val_predict[:, 1]
fpr, tpr, thresholds = roc_curve(test_generator.classes, positive_scores)

# Compute ROC area once and reuse it for the printout and the legend
roc_auc = auc(fpr, tpr)
print("ROC area is: " + str(roc_auc))

plt.figure()
plt.plot(fpr, tpr, color='darkred', label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()

# **PREDICTIONS**

**Using Test Time Augmentation. For each test image we will augment it 3 ways and average the prediction.**

In [None]:
# One row per test image: file path and image id.
testdf = pd.DataFrame({'path': glob(os.path.join(test_path, '*.tif'))})
# The id is the file name without its extension; os.path handles this
# independently of the directory names (the old split on "st/" only worked
# because the folder happens to be called "test/").
testdf['id'] = testdf['path'].map(lambda p: os.path.splitext(os.path.basename(p))[0])
testdf.head(3)

In [None]:
# Fixed-seed subset of the test images to run predictions on.
# NOTE: this rebinds `df`, which held the training frame in earlier cells.
df = testdf.sample(n=SAMPLE_SIZE, random_state=101)

In [None]:
# Test-time augmentation: the same pixel scaling as training plus a wider
# set of random geometric and colour perturbations.
tta_datagen = ImageDataGenerator(
    rescale=1. / 255,  # normalise
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=90,
    shear_range=0.05,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    channel_shift_range=0.1,
)

In [None]:
import cv2
from IPython.display import clear_output
# Test-time augmentation: predict each sampled test image `tta_steps` times
# under random augmentation and average the class probabilities.
tta_steps = 3
tta_records = []  # one {'id', 'label'} dict per image; framed once at the end
for image_path in df['path']:
    # File name without extension, independent of the directory layout
    image_id = os.path.splitext(os.path.basename(image_path))[0]

    # A one-row frame lets the augmenting generator load this single image;
    # every batch drawn from it is a freshly augmented copy.
    single_image = pd.DataFrame({'path': [image_path]})
    tta_flow = tta_datagen.flow_from_dataframe(dataframe=single_image,
                                               directory=None,
                                               x_col='path',
                                               target_size=(96, 96),
                                               class_mode=None,
                                               batch_size=1,
                                               shuffle=False)
    predictionsTTA = [model.predict(tta_flow, steps=1, verbose=0)
                      for _ in range(tta_steps)]
    # Hide the per-image "Found 1 validated image filenames." messages
    clear_output()

    # Average the softmax vectors over the augmentations (axis 0), keeping the
    # two class probabilities separate, then pick the most likely class.
    # Averaging over *all* values, as before, always gives 0.5 because each
    # softmax row sums to 1, so every image was labelled 0.
    mean_probs = np.mean(np.concatenate(predictionsTTA, axis=0), axis=0)
    tta_records.append({'id': image_id, 'label': int(np.argmax(mean_probs))})

# Build the result frame in one go instead of concatenating inside the loop
final = pd.DataFrame(tta_records, columns=['id', 'label'])

In [None]:
# set_index returns a new frame instead of modifying `final`, so its result
# must be used for the index to show up. Chain it into the preview; `final`
# itself is left unchanged, which keeps this cell safe to re-run.
final.set_index('id').head(10)