# Importing Modules

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

from __future__ import absolute_import, division, print_function, unicode_literals

import os
import glob
import shutil
import random

import cv2
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mimg

color = sns.color_palette()
%matplotlib inline
seed_number = 24

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
root = "../input"
print(f"Main directories\t: {os.listdir(root)}")
input_dir = os.path.join(root, "chest-xray-pneumonia", "chest_xray")
print(f"Dataset sub-directories\t: {os.listdir(input_dir)}")
print(f"Train set directory\t: {os.listdir(os.path.join(input_dir, 'train'))}")

# Any results you write to the current directory are saved as output.

In [None]:
# Import packages for data handling
import h5py
from PIL import Image
from skimage.io import imread
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from mlxtend.plotting import plot_confusion_matrix

In [None]:
# Import deep learning package (tensorflow)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.applications import vgg16, xception
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img, img_to_array
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, SeparableConv2D
from tensorflow.keras.layers import GlobalAveragePooling2D, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from tensorflow.keras.utils import to_categorical

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy, TensorFlow)
for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    set_seed(seed_number)

# Observing the Dataset
Collect basic information about the dataset: the number of images per class in each split, and one sample image per class for display.

In [None]:
# Build the path of every dataset split (train / val / test)
train_dir = os.path.join(input_dir, 'train')
val_dir = os.path.join(input_dir, 'val')
test_dir = os.path.join(input_dir, 'test')

dir_dict = {'train': train_dir, 'val': val_dir, 'test': test_dir}

# Class labels are the sub-folders of the train split. Keep directories only
# (os.listdir also returns stray files) and sort them so the order matches the
# alphanumeric class indices assigned by Keras' flow_from_directory.
label_name = sorted(entry for entry in os.listdir(train_dir)
                    if os.path.isdir(os.path.join(train_dir, entry)))

# case_count[split][label] -> number of images
# img_disp[split][label]   -> path of one randomly picked image (None if the folder is empty)
# set_length[split]        -> total number of images in the split
case_count, img_disp, set_length  = {}, {}, {}

for key, val in dir_dict.items():
    case_count[key] = {}
    img_disp[key] = {}
    set_count = 0

    for label in label_name:
        # glob returns files in arbitrary order; sorting makes the seeded pick reproducible
        label_list = sorted(glob.glob(os.path.join(val, label, "*.jpeg")))
        case_count[key][label] = len(label_list)
        set_count += len(label_list)

        # random.choice can return any image, including the first one, and works
        # with a single image (randint(1, n-1) skipped index 0 and raised for n <= 1)
        img_disp[key][label] = random.choice(label_list) if label_list else None

    set_length[key] = set_count

case_count_df = pd.DataFrame(case_count)
img_disp_df = pd.DataFrame(img_disp)
print(f"Dataset summary:\n\n{case_count_df}")

In [None]:
# Show one sample image per class and split:
# rows follow the class labels, columns follow the train / val / test splits
num_classes = len(label_name)
f, ax = plt.subplots(num_classes, 3, figsize=(30, 18))

for j in range(num_classes):   # row index -> class label
    for i in range(3):         # column index -> dataset split
        panel = ax[j, i]

        img = imread(img_disp_df.iloc[j, i])
        panel.imshow(img, cmap='gray')

        split_name = img_disp_df.columns[i].upper()
        class_name = img_disp_df.index[j].capitalize()
        panel.set_title(f"{split_name}: {class_name}", fontsize=32)

        panel.axis('off')
        panel.set_aspect('auto')
plt.show()

# Dataset Problem
Instantiate the data generators that feed the train, validation and test sets to the model.

In [None]:
# Random augmentations applied to the training images only
augmentation_options = dict(rotation_range = 5,
                            width_shift_range = 0.1,
                            height_shift_range = 0.05,
                            shear_range = 0.1,
                            zoom_range = 0.15,
                            horizontal_flip = True)

# Every generator multiplies the pixel values by 1/255
train_datagen = ImageDataGenerator(rescale = 1./255, **augmentation_options)

# Validation and test images are rescaled but never augmented
val_datagen = ImageDataGenerator(rescale = 1./255)
test_datagen = ImageDataGenerator(rescale = 1./255)

In [None]:
# Batch sizes and the image size fed to the network
train_batch_size = 32
val_batch_size = 32
img_width = 299
img_height = 299

# Options shared by the three directory iterators.
# NOTE(review): Keras documents target_size as (height, width); the order used here
# makes no difference while the image is square - confirm before using a non-square size.
shared_flow_options = dict(class_mode = 'binary',
                           target_size = (img_width, img_height),
                           seed = seed_number)

# Train and validation iterators (batches of 32 images)
train_gen = train_datagen.flow_from_directory(train_dir,
                                              batch_size = train_batch_size,
                                              **shared_flow_options)

val_gen = val_datagen.flow_from_directory(val_dir,
                                          batch_size = val_batch_size,
                                          **shared_flow_options)

# Test iterator: one image per batch, never shuffled
test_gen = test_datagen.flow_from_directory(test_dir,
                                            batch_size = 1,
                                            shuffle = False,
                                            **shared_flow_options)

In [None]:
# Draw one batch from each generator and report the shape of its image array
# (element 0 of the batch; element 1 holds the labels)
train_batch = next(train_gen)
print(f'Train set batch shape\t: {train_batch[0].shape}')

val_batch = next(val_gen)
print(f'Val set batch shape\t: {val_batch[0].shape}')

test_batch = next(test_gen)
print(f'Test set batch shape\t: {test_batch[0].shape}')

# Generate Model
Using a pre-trained Xception model, provided by tensorflow

In [None]:
# Don't forget to turn on the Internet to download the respective pre-trained weights!
# Xception backbone without its classification head, initialised with ImageNet weights.
pretrain_net = xception.Xception(input_shape = (img_width, img_height, 3),
                                    include_top = False,
                                    weights = 'imagenet')

# load_param_path = '../input/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'  # Offline alternative
# pretrain_net.load_weights(load_param_path)  # Manually load the weights from the input directory

# ------ Freezing layer(s) up to a specific layer ------
# Every layer located before `freeze_to` is frozen; `freeze_to` itself and all the
# layers after it stay trainable.
# Set it to None (the value, not the string 'None') to train all the layers instead!
freeze_to = 'block4_sepconv1_act'

if freeze_to:
    layer_names = [layer.name for layer in pretrain_net.layers]

    # An unknown name used to freeze the whole backbone silently; fail loudly instead
    if freeze_to not in layer_names:
        raise ValueError(f"Layer '{freeze_to}' does not exist in '{pretrain_net.name}'")

    for layer in pretrain_net.layers[:layer_names.index(freeze_to)]:
        layer.trainable = False


In [None]:
# Adding extra layers for our problem: global average pooling over the backbone
# feature maps, two fully-connected layers with dropout, and a single sigmoid unit
# that outputs the probability of the positive class.
x = pretrain_net.output
x = GlobalAveragePooling2D()(x)
x = Dense(units=1024, activation='relu', name='extra_fc1')(x)
x = Dropout(rate=0.3, name='extra_dropout1')(x)
x = Dense(units=512, activation='relu', name='extra_fc2')(x)
x = Dropout(rate=0.3, name='extra_dropout2')(x)
x = Dense(1, activation='sigmoid', name='classifier')(x)

model = Model(inputs=pretrain_net.input, outputs=x, name='xception_pneumonia')
# summary() prints the table itself and returns None, so it must not be wrapped in print()
model.summary()

# Perform Training
Define the training procedure

In [None]:
num_epochs = 30  # Set the number of epochs to train

# `learning_rate` is the supported argument name (`lr` is a deprecated alias)
model.compile(optimizer = Adam(learning_rate=0.0001),
              loss = 'binary_crossentropy',
              metrics = ['acc'])

# len(generator) is the number of batches in one pass over the data, the last
# partial batch included. Floor division (n_images // batch_size) yields 0 steps
# whenever a split holds fewer images than its batch size.
history = model.fit(train_gen,
                    steps_per_epoch = len(train_gen),
                    validation_data = val_gen,
                    validation_steps = len(val_gen),
                    epochs = num_epochs
                   )

In [None]:
# Plotting the train results: per-epoch accuracy and loss for train and validation
train_accuracy = history.history['acc']
val_accuracy = history.history['val_acc']
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# One x value per recorded epoch (the original referenced an undefined name `acc`)
epochs = range(len(train_accuracy))
plt.figure(figsize=(12,4))

# Plotting the accuracy
plt.subplot(1,2,1)
plt.plot(epochs, train_accuracy, 'b', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'r', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['train', 'val'], loc='lower right')

# Plotting the loss
plt.subplot(1,2,2)
plt.plot(epochs, train_loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['train', 'val'], loc='upper right')

plt.show()

# Results evaluation
Use this section to evaluate the model performance on the Test set.

In [None]:
# Test set accuracy and loss
# model.evaluate returns [loss, accuracy]; field {0} is the accuracy and {1} the loss
test_scores = model.evaluate(test_gen)
print("Test results Accuracy: {0:.2f}% and Loss: {1:.2f}".format(test_scores[1]*100, test_scores[0]))

# Rewind the (unshuffled) iterator so the predictions line up with test_gen.classes
test_gen.reset()
y_pred_value = model.predict(test_gen, steps=len(test_gen))

# The model has a single probability output, so argmax over axis 1 would always
# return 0: threshold the probability at 0.5 to get the predicted class instead
y_pred = (np.asarray(y_pred_value).ravel() > 0.5).astype(int)
y_true = test_gen.classes

In [None]:
# Confusion matrix result
# labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen or predicted
confusion_matrix_result = confusion_matrix(y_true, y_pred, labels=[0, 1])
plot_confusion_matrix(confusion_matrix_result, figsize=(12,8), hide_ticks=True, alpha=0.7, cmap=plt.cm.Blues)
plt.xticks(range(2), ['Normal', 'Pneumonia'], fontsize=16)
plt.yticks(range(2), ['Normal', 'Pneumonia'], fontsize=16)
plt.show()

# Precision and Recall metrics (class 1 is the positive class)
tn, fp, fn, tp = confusion_matrix_result.ravel()
# Each metric falls back to 0 when its denominator is 0
precision = tp / (tp+fp) if (tp+fp) else 0.0
recall = tp / (tp+fn) if (tp+fn) else 0.0
f1_score = 2 * precision * recall / (precision+recall) if (precision+recall) else 0.0
print("Precision\t: {:.2f}%".format(precision*100))
print("Recall\t: {:.2f}%".format(recall*100))
print("F1 Score\t: {:.2f}%".format(f1_score*100))

# Classification report
# Class names ordered by the index the test generator assigned to them, so every
# row of the report carries the right name
class_names = sorted(test_gen.class_indices, key=test_gen.class_indices.get)
print(classification_report(y_true, y_pred, labels=[0, 1], target_names=class_names))

In [None]:
# ROC Curve and AUC metrics
# y_true holds 1-D class indices and the model outputs one probability per image
# (the probability of class 1), so the curves are built from these scores:
# class 1 uses p, class 0 uses 1 - p.
positive_score = np.asarray(y_pred_value).ravel()
class_scores = {0: 1.0 - positive_score, 1: positive_score}

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in sorted(class_scores):
    # One-vs-rest: images of class i are the positives for curve i
    fpr[i], tpr[i], _ = roc_curve(np.asarray(y_true) == i, class_scores[i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(7, 5))

for i in sorted(class_scores):
    plt.plot(fpr[i], tpr[i], lw=2, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

# Diagonal reference line: a classifier that guesses at random
plt.plot([0, 1], [0, 1], 'k-', label = 'random guessing')

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()