In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras import regularizers
from sklearn.metrics import roc_curve, auc
from sklearn import model_selection
import random
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import model_from_json
import csv
from keras.callbacks import History
from shutil import copyfile
import os
%matplotlib inline

In [None]:
# Metadata: one row per image.
# data_df columns: Unnamed: 0, X_ray_image_name, Label, Dataset_type, Label_2_Virus_category, Label_1_Virus_category
data_df = pd.read_csv('../input/coronahack-chest-xraydataset/Chest_xray_Corona_Metadata.csv')
# meta_df columns: Unnamed: 0, Label, Label_1_Virus_category, Label_2_Virus_category, Image_Count
meta_df = pd.read_csv('../input/coronahack-chest-xraydataset/Chest_xray_Corona_dataset_Summary.csv')

test_dir ='../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/test'
train_dir = '../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train'

# Split the metadata by the dataset's own TRAIN/TEST flag.
# .copy() gives independent frames: these are modified further down the notebook, and
# modifying a boolean-mask slice of data_df in place triggers SettingWithCopyWarning.
train_data = data_df[data_df['Dataset_type']=='TRAIN'].copy()
test_data = data_df[data_df['Dataset_type']=='TEST'].copy()

In [None]:
# Keep only the rows labelled 'Pnemonia' (the dataset's own spelling) or 'Normal',
# then shuffle them.
# Rebinding replaces fillna(inplace=True): train_data is derived from a slice of data_df,
# and mutating such a slice in place raises pandas' SettingWithCopyWarning.
train_data = train_data.fillna('NA')
train_dff = train_data[train_data['Label'].isin(['Pnemonia', 'Normal'])]
train_dff = train_dff.sample(frac=1, random_state=8)  # frac=1 shuffles all rows; fixed seed keeps the order reproducible
print(len(train_dff))
train_dff

In [None]:
# Same label filter as for the training rows; the test rows are not shuffled.
# Rebinding replaces fillna(inplace=True) so no slice of data_df is mutated in place.
test_data = test_data.fillna('NA')
test_df = test_data[test_data['Label'].isin(['Pnemonia', 'Normal'])]
print(len(test_df))

In [None]:
# Hold out 20% of the shuffled training rows for validation (80/20 split).
# random_state=8 keeps the split reproducible across runs.
# Note: X_train / X_validation are metadata DataFrames (file names and labels), not pixel arrays.
X_train, X_validation = model_selection.train_test_split(train_dff, train_size=0.80, test_size=0.20, random_state=8)
X_train

In [None]:
# Materialise the train/validation split as two folders under /kaggle/working.
# The source path is spelled out here rather than taken from `train_dir`, because
# `train_dir` is rebound to the working folder in a later cell.
source_dir = '../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train/'

for target_dir, split_df in (('/kaggle/working/train', X_train),
                            ('/kaggle/working/validation', X_validation)):
    # exist_ok=True: os.mkdir raised FileExistsError whenever this cell was run a second time.
    os.makedirs(target_dir, exist_ok=True)
    for X_ray in split_df.X_ray_image_name:
        copyfile(source_dir + X_ray, target_dir + '/' + X_ray)

In [None]:
# From here on the generators read from the folders created by the copy step above.
# NOTE: this rebinds `train_dir`, which earlier pointed at the dataset's original train folder.
train_dir = '/kaggle/working/train'
validation_dir = '/kaggle/working/validation'

In [None]:
# Settings common to every generator: pixel values are rescaled by 1/255,
# the remaining entries are passed through exactly as before.
shared_generator_options = dict(
    rescale=1./255,
    fill_mode="nearest",
    cval=0.0,
    brightness_range=None,
    channel_shift_range=0.0,
    preprocessing_function=None,
    dtype=None,
)

# Random augmentation applied to training images only.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.15,
    height_shift_range=0.15,
    zoom_range=0.10,
    horizontal_flip=True,
)

# Training generator: augmentation + shared settings.
image_gen_train = ImageDataGenerator(**augmentation_options, **shared_generator_options)

# Validation/test generator: shared settings only, no augmentation.
image_gen_valtest = ImageDataGenerator(**shared_generator_options)

In [None]:
# Build ten iterators over the training split: nine augmented ones (seeds 0..8)
# followed by one that only rescales the images.
train_flow_options = dict(
    dataframe=X_train,
    directory=train_dir,
    x_col="X_ray_image_name",
    y_col="Label",
    classes=['Normal', 'Pnemonia'],
    target_size=(256, 256),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=32,
    shuffle=True,
)

training_generators = []
for seed in range(9):
    train_datagen = image_gen_train.flow_from_dataframe(seed=seed, **train_flow_options)
    training_generators.append(train_datagen)

# `seed` still holds the last loop value (8) here, exactly as in the original cell.
training_generators.append(
    image_gen_valtest.flow_from_dataframe(seed=seed, **train_flow_options)
)

In [None]:
# Validation iterator: rescaled images only, not shuffled, so that predictions
# can be paired with `valid_datagen.labels` by position (as plotRoc does).
validation_flow_options = dict(
    x_col="X_ray_image_name",
    y_col="Label",
    classes=['Normal', 'Pnemonia'],
    target_size=(256, 256),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=32,
    seed=25,
    shuffle=False,
)
valid_datagen = image_gen_valtest.flow_from_dataframe(
    dataframe=X_validation,
    directory=validation_dir,
    **validation_flow_options
)

In [None]:
# Test iterator: same settings as the validation iterator, reading the
# filtered test rows from the dataset's test folder.
test_flow_options = dict(
    x_col="X_ray_image_name",
    y_col="Label",
    classes=['Normal', 'Pnemonia'],
    target_size=(256, 256),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=32,
    seed=25,
    shuffle=False,
)
test_datagen = image_gen_valtest.flow_from_dataframe(
    dataframe=test_df,
    directory=test_dir,
    **test_flow_options
)

In [None]:
# Preview the first six images of one batch from the generator at index 8
# (the last of the nine augmented generators), each titled with its one-hot label.
preview_batch = next(iter(training_generators[8]))
preview_images, preview_labels = preview_batch[0], preview_batch[1]

fig, ax = plt.subplots(3, 2, figsize=(16, 16))
for k, axis in enumerate(ax.flat):
    axis.imshow(preview_images[k])
    # Title the axes that shows image k; previously every title was written to ax[0, 0],
    # so only the first panel was labelled (with the label of the last image).
    axis.set_title(str(preview_labels[k]))

In [None]:
def generateHyperparameters(rng=None):
    """Draw one random hyperparameter configuration for buildModel/trainModel.

    Parameters
    ----------
    rng : optional
        Object providing ``choice(sequence)``, e.g. ``random.Random(seed)``.
        Defaults to the module-level ``random`` generator, as before; pass a
        seeded instance to make a search reproducible.

    Returns
    -------
    dict
        Keys: 'no_layers', 'batch_norm', 'learning_rate', 'kernel_size',
        'dropout', 'dropout_rate', 'pool_size', 'should_channels',
        'weight_decay' and 'channels' (one filter count per layer).
        'pool_size' never exceeds 'kernel_size'.
    """
    if rng is None:
        rng = random

    # Search space. 'kernel_size' used to appear twice in this literal; only the
    # later entry ([3, 4, 5, 6]) ever took effect, so the dead first one is gone.
    param_grid = {
        'no_layers': list(range(1, 13)),
        'batch_norm': [True, False],
        'learning_rate': list(np.logspace(np.log10(0.001), np.log10(0.01), base=10, num=100)),
        'dropout': [False, True],
        'dropout_rate': [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
        'pool_size': [2, 3, 4],
        'should_channels': [True, False],
        'kernel_size': [3, 4, 5, 6],
        'weight_decay': list(np.logspace(np.log10(0.00001), np.log10(0.0005), base=10, num=10)),
    }

    no_layers = rng.choice(param_grid['no_layers'])
    batch_norm = rng.choice(param_grid['batch_norm'])
    # float(): store plain Python floats rather than numpy scalars in the result.
    learning_rate = float(round(rng.choice(param_grid['learning_rate']), 5))
    kernel_size = rng.choice(param_grid['kernel_size'])  # drawn once (was drawn twice)
    dropout = rng.choice(param_grid['dropout'])
    dropout_rate = rng.choice(param_grid['dropout_rate'])
    should_channels = rng.choice(param_grid['should_channels'])
    weight_decay = float(round(rng.choice(param_grid['weight_decay']), 5))

    # Choose uniformly among pool sizes not larger than the kernel. This yields the
    # same distribution as re-drawing until pool_size <= kernel_size, without a loop.
    pool_size = rng.choice([p for p in param_grid['pool_size'] if p <= kernel_size])

    # Either a constant 32 filters per layer, or doubling: 32, 64, 128, ...
    if should_channels:
        channels = [2 ** (i + 5) for i in range(no_layers)]
    else:
        channels = [32] * no_layers

    hyper_grid = {
        'no_layers': no_layers,
        'batch_norm': batch_norm,
        'learning_rate': learning_rate,
        'kernel_size': kernel_size,
        'dropout': dropout,
        'dropout_rate': dropout_rate,
        'pool_size': pool_size,
        'should_channels': should_channels,
        'weight_decay': weight_decay,
        'channels': channels
    }

    print(hyper_grid)

    return hyper_grid

In [None]:
def _add_conv_block(model, filters, hyper_grid, **conv_kwargs):
    """Append Conv2D -> [BatchNormalization] -> ReLU -> [Dropout] -> MaxPooling2D to `model`.

    `conv_kwargs` are forwarded to Conv2D (used for `input_shape` on the first block).
    """
    kernel = hyper_grid['kernel_size']
    pool = hyper_grid['pool_size']

    model.add(Conv2D(
        filters,
        (kernel, kernel),
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=regularizers.l2(hyper_grid['weight_decay']),
        **conv_kwargs
    ))
    if hyper_grid['batch_norm']:
        model.add(BatchNormalization())
    model.add(Activation('relu'))
    if hyper_grid['dropout']:
        model.add(Dropout(hyper_grid['dropout_rate']))
    model.add(MaxPooling2D(pool_size=(pool, pool)))


def buildModel(hyper_grid):
    """Build (but do not compile) the CNN described by `hyper_grid`.

    Reads the keys 'kernel_size', 'pool_size', 'weight_decay', 'batch_norm',
    'dropout', 'dropout_rate', 'no_layers' and 'channels'. The network takes
    256x256x3 inputs and ends in a 2-unit softmax layer. The model summary is
    printed as a side effect.

    Returns the Sequential model.
    """
    model = Sequential()

    # First block is always 32 filters and fixes the input shape.
    _add_conv_block(model, 32, hyper_grid, input_shape=[256,256,3])

    # Remaining blocks; a block is skipped once the feature map is no larger
    # than the pooling window.
    # NOTE(review): the i-th extra block uses channels[i], so channels[0] is applied to the
    # second conv layer and the last entry is never used — confirm this offset is intended.
    for i in range(hyper_grid['no_layers'] - 1):
        if model.output_shape[1] > hyper_grid['pool_size']:
            _add_conv_block(model, hyper_grid['channels'][i], hyper_grid)

    model.add(Flatten())
    model.add(Dense(2, activation='softmax'))
    model.summary()

    return model

In [None]:
def get_auc_score(val_labels, X_pred_keras):
    """Return the area under the ROC curve for the given labels and scores.

    `val_labels` and `X_pred_keras` are handed to sklearn's roc_curve unchanged;
    the resulting false/true positive rates are integrated with sklearn's auc.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(val_labels, X_pred_keras)
    return auc(false_pos_rate, true_pos_rate)

In [None]:
def writeToCsv(records, model, hyper_params=None):
    """Append the hyperparameters and results of one training run to mycsv.csv.

    Four rows are appended: hyperparameter names, hyperparameter values,
    metric names, metric values.

    Parameters
    ----------
    records : object with a `.history` dict containing 'accuracy' and 'val_accuracy'
        (as returned by a Keras fit call).
    model : trained model; scored on the module-level `valid_datagen` via plotRoc,
        which also saves the 'simrenModel' ROC figures.
    hyper_params : dict, optional
        Hyperparameters to log. When omitted, the module-level `hyper_grid` is
        used, which is what this function always did before; pass the dict
        explicitly to avoid logging a stale global.
    """
    grid = hyper_grid if hyper_params is None else hyper_params

    history_dict = records.history
    print(history_dict.keys())

    acc = history_dict['accuracy']
    val_acc = history_dict['val_accuracy']
    auc_score = plotRoc(model, valid_datagen, 'simrenModel')

    toPrint = {'Accuracy': acc, 'ValAccuracy': val_acc, 'AUC score': auc_score}
    print(toPrint)

    # Append mode: successive runs accumulate in the same file.
    with open ('mycsv.csv', 'a', newline='') as f:
        thewriter = csv.writer(f)
        thewriter.writerow(grid)            # iterating a dict yields its keys -> header row
        thewriter.writerow(grid.values())
        thewriter.writerow(toPrint)
        thewriter.writerow(toPrint.values())
        print("wrote to csv")

In [None]:
# Training code here
def trainModel(training_generators, valid_datagen,hyper_grid, epochs =1):
    """Build, compile and train a model, fitting it on each training generator in turn.

    Parameters
    ----------
    training_generators : non-empty sequence of batch generators; the model is
        fitted on each one, in order, for `epochs` epochs.
    valid_datagen : generator used as validation data for every fit call.
    hyper_grid : dict consumed by buildModel; 'learning_rate' is also read here.
    epochs : int, epochs per generator (default 1).

    Returns
    -------
    (model, records, save_records) : the trained model, the History of the last
        fit call, and the list of the History objects of all fit calls.

    Raises
    ------
    ValueError
        If `training_generators` is empty (previously an UnboundLocalError).
    """
    if len(training_generators) == 0:
        raise ValueError("training_generators must contain at least one generator")

    # Class indices follow classes=['Normal','Pnemonia'] used by the generators.
    class_weight = {0: 3,  # Normal
                    1: 1}  # Pneumonia

    model = buildModel(hyper_grid)

    optimizer = Adam(learning_rate=hyper_grid['learning_rate'])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    callback = EarlyStopping(monitor='val_loss', patience=3)

    save_records = []

    for train_datagen in training_generators:
        # model.fit accepts generators directly; fit_generator is deprecated. The rest of
        # this notebook already passes generators to model.predict / model.evaluate.
        records = model.fit(train_datagen, epochs=epochs, validation_data=valid_datagen, class_weight=class_weight, callbacks=[callback])
        save_records.append(records)

    # Only the History of the last generator is logged.
    # NOTE(review): writeToCsv is not given `hyper_grid`; it logs the module-level
    # `hyper_grid`, which may differ from the argument passed to this function.
    writeToCsv(records, model)
    print("wrote")

    return model, records, save_records

In [None]:
def plotRoc(model, datagen, title):
    """Plot and save the ROC curve of `model` on `datagen` and return the AUC.

    Two PNGs are written to /kaggle/working: '<title>.png' (full curve) and
    '<title>_zoomed.png' (x in [0, 0.3], y in [0.7, 1]).

    Parameters
    ----------
    model : object whose ``predict(datagen)`` returns a 2-D array; column 1 is
        used as the score.
    datagen : generator accepted by ``model.predict`` that exposes ``.labels``.
        Scores are paired with ``datagen.labels`` by position, so the generator
        must not shuffle (valid_datagen and test_datagen use shuffle=False).
    title : str, file-name stem for the saved figures.

    Returns
    -------
    Area under the ROC curve.
    """
    X_pred_keras = model.predict(datagen)[:, 1]
    labels = list(datagen.labels)
    fpr_keras, tpr_keras, _ = roc_curve(labels, X_pred_keras)
    # Computed once; a second, identical roc_curve/auc pass used to produce the return value.
    auc_keras = auc(fpr_keras, tpr_keras)
    curve_label = 'Simren Model (area = {:.3f})'.format(auc_keras)

    views = (
        ('', 'ROC curve', None),
        ('_zoomed', 'ROC curve (zoomed in at top left)', ((0, 0.3), (0.7, 1))),
    )
    for suffix, plot_title, limits in views:
        # A fresh figure per view and per call: the fixed plt.figure(1)/plt.figure(2)
        # handles were shared by every call made while those figures were still open,
        # so curves from different models/datasets could pile up on the same axes.
        fig, ax = plt.subplots()
        ax.plot([0, 1], [0, 1], 'k--')
        ax.plot(fpr_keras, tpr_keras, label=curve_label)
        if limits is not None:
            ax.set_xlim(*limits[0])
            ax.set_ylim(*limits[1])
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        ax.set_title(plot_title)
        ax.legend(loc='best')
        fig.savefig('/kaggle/working/{}{}.png'.format(title, suffix), dpi = 300)

    return auc_keras
    

In [None]:
#Accuracy Plots for train and validation over epochs

def accuracyPlot(records):
    """Plot accuracy and loss across all fit calls and save both figures.

    `records` is a sequence of History-like objects whose `.history` dict holds
    'accuracy', 'val_accuracy', 'loss' and 'val_loss'. Their per-epoch values are
    concatenated in order, so the x axis counts epochs over the whole sequence.
    Figures are saved as /kaggle/working/epochsAcc.png and epochsLoss.png.
    """
    def _concatenated(metric):
        # One flat list: every epoch of the first record, then the second, ...
        return [value for run in records for value in run.history[metric]]

    acc = _concatenated('accuracy')
    val_acc = _concatenated('val_accuracy')
    loss = _concatenated('loss')
    val_loss = _concatenated('val_loss')

    print(acc)

    epochs = range(1, len(acc) + 1)

    print("epochs: "+ str(epochs))

    # Accuracy: dots = training, line = validation.
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.savefig('/kaggle/working/epochsAcc.png', dpi = 300)

    # Loss goes on a new figure.
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.savefig('/kaggle/working/epochsLoss.png', dpi = 300)
    plt.show()
      

In [None]:
#Once hyperparameters are decided
dropout = False
batch_norm = False
dropout_rate = 0.2   # only applied when dropout is True
kernel_size = 3
learning_rate = 0.00392
no_layers = 5
pool_size = 3
should_channels = False
weight_decay = 0.00004

# Filters per layer: doubling from 32 when should_channels is set, otherwise a constant 32.
if should_channels:
    channels = [2 ** (i + 5) for i in range(no_layers)]
else:
    channels = [32] * no_layers

# 'kernel_size' used to be listed twice in this literal; the duplicate key is removed.
hyper_grid = {
    'no_layers': no_layers,
    'batch_norm': batch_norm,
    'learning_rate': learning_rate,
    'kernel_size': kernel_size,
    'dropout': dropout,
    'dropout_rate': dropout_rate,
    'pool_size': pool_size,
    'should_channels': should_channels,
    'weight_decay': weight_decay,
    'channels': channels
    }


# Fit for 3 epochs on each of the training generators in turn.
model, records, save_records = trainModel(training_generators, valid_datagen, hyper_grid, epochs = 3)

In [None]:
# save_records holds one History object per training generator (see trainModel).
# The print shows the list of History objects; the curves themselves come from accuracyPlot.
print(save_records)
accuracyPlot(save_records)

In [None]:
# Persist the trained network in two parts: the architecture as JSON text
# and the weights in a separate HDF5 file.
architecture_path = '/kaggle/working/model_CNN_Simren'
weights_path = '/kaggle/working/model_weights_CNN_Simren.h5'

model_json = model.to_json()
with open(architecture_path, 'w') as json_file:
    json_file.write(model_json)

model.save_weights(weights_path)

In [None]:
#Test set

# Evaluate on the full test generator.
# - `steps` is left out so the number of batches comes from the generator itself; the
#   previous steps=624/32 was the float 19.5 and hard-coded the number of test images.
# - `batch_size` is left out because the generator already batches (batch_size=32 is set
#   in flow_from_dataframe); Keras documents that it must not be given for generator input.
# - The remaining arguments that were spelled out were passed at their default values.
log = model.evaluate(test_datagen, verbose=1)

# Saves testROC.png / testROC_zoomed.png; the returned AUC is shown as the cell output.
plotRoc(model, test_datagen, "testROC")