In [6]:
#%pip install googledrivedownloader
# from google_drive_downloader import GoogleDriveDownloader as gdd

# gdd.download_file_from_google_drive(file_id='1d_93d9oFNRBK9Vg6BRxs9wvRbKtNTylY',
#                                     dest_path='content/pneumonia_dataset.zip',
#                                     unzip=True)

In [67]:
#!apt-get update
#!apt-get install ffmpeg libsm6 libxext6 -y
#%pip install pandas numpy opencv-python scikit-learn keras
import pandas as pd                                     # Data analysis and manipultion tool
import numpy as np                                      # Fundamental package for linear algebra and multidimensional arrays
import tensorflow as tf                                 # Deep Learning Tool
import os                                               # OS module in Python provides a way of using operating system dependent functionality
import cv2                                              # Library for image processing
from sklearn.model_selection import train_test_split    # For splitting the data into train and validation set
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras import callbacks
from keras import backend as K
from keras.models import Model
from keras.models import load_model
import gc
from keras.metrics import binary_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import RMSprop

In [58]:
data=[]
img_size = 100
def create_data():
        for item in ['normal','pneumonia']:
            path='./content/pneumonia_dataset/train/' + item+"/"
            
            for img in os.listdir(path):         # os.listdir gets you all the list of name of files located in the given path
                try:
                    img_array=cv2.imread(os.path.join(path,img),cv2.IMREAD_GRAYSCALE)    # converts the image to pixels and gray scales the images
                    norm_image = cv2.normalize(img_array, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
                    new_img_array=cv2.resize(norm_image,(img_size,img_size))
                    if item == 'normal':
                        data.append([new_img_array,0])
                    else:
                        data.append([new_img_array, 1]) # appending the list of image pixels and respective target value in data
                except Exception as e:
                    pass                                      # try and except is exception handling case in python, saves you from getting errors
                
            
create_data()

In [59]:
len(data)

2425

In [60]:
np.random.shuffle(data)
x = []
y = []
for image in data:
  x.append(image[0])
  y.append(image[1])

# converting x & y to numpy array as they are list
x = np.array(x)
y = np.array(y)
np.unique(y, return_counts=True)

(array([0, 1]), array([1280, 1145]))

In [61]:
## Convert into 4D Array
x =  x.reshape(-1, 100, 100, 1)

In [62]:
print(x.shape)
print(y.shape)

(2425, 100, 100, 1)
(2425,)


In [63]:
def data_aug(X_train,X_test,y_train,y_test,train_batch_size,test_batch_size):
    train_datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        rescale=1.0/255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')
    test_datagen = ImageDataGenerator(rescale=1.0/255)
    train_batch = train_datagen.flow(X_train,y_train,batch_size=train_batch_size)
    test_batch = test_datagen.flow(X_test,y_test,batch_size=test_batch_size)
    return (train_batch,test_batch)

In [77]:
def create_model(img_size=100,channels=1):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu', padding='same', input_shape=(img_size, img_size, 1)),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')      
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    #model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.1), metrics=['accuracy'])
    
    return model

def callback(tf_log_dir_name='./tf-log/',patience_lr=10):
    cb = []
    """
    Tensorboard log callback
    """
    tb = callbacks.TensorBoard(log_dir=tf_log_dir_name, histogram_freq=0)
    cb.append(tb)
        
    return cb

In [78]:
kfold = KFold(n_splits=2, shuffle=True, random_state=42)
cvscores=[]
Fold=1
for train, val in kfold.split(x,y):
    gc.collect()
    K.clear_session()
    print('Fold: {}'.format(Fold))
    
    X_train = x[train]
    X_val = x[val]
    y_train = y[train]
    y_val = y[val]
    
    # Data Augmentation and Normalization(OPTIONAL) UNCOMMENT THIS FOR AUGMENTATION !!
#     batch_size = 32
#     train_batch, val_batch = data_aug(X_train,X_val,y_train,y_val, batch_size, batch_size)
    
    cb=callback()
    
    model=create_model(100,1)
    
    # Fit generator for Data Augmentation
#     epochs = 10 
#     model.fit(train_batch, validation_data=val_batch, epochs=epochs, validation_steps= X_val.shape[0] // batch_size, 
#                        steps_per_epoch= X_train.shape[0] // batch_size, callbacks=cb, verbose=1)

# Fit the model for without Data Augmentation
    batch_size=16
    epochs=10
    model.fit(X_train, y_train, validation_data=(X_val,y_val), epochs=epochs, batch_size=batch_size, callbacks=cb, verbose=1)
    
    model_name = 'cnn_keras_aug_Fold_'+str(Fold)+'.h5'
    model.save(model_name)
    
    # evaluate the model
    scores = model.evaluate(X_val, y_val, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
    
#     # save the probability prediction of each fold in separate csv file
#     proba = model.predict(X_test_all,batch_size=None,steps=1)
#     labels=[np.argmax(pred) for pred in proba]
#     keys=[get_key(path) for path in paths_test_all ]
#     csv_name= 'submission_CNN_keras_aug_Fold'+str(Fold)+'.csv'
#     create_submission(predictions=labels,keys=keys,path=csv_name)
    
    
    Fold = Fold +1

print("%s: %.2f%%" % ("Mean Accuracy: ",np.mean(cvscores)))
print("%s: %.2f%%" % ("Standard Deviation: +/-", np.std(cvscores)))

Fold: 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 64.80%
Fold: 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 63.20%
Mean Accuracy: : 64.00%
Standard Deviation: +/-: 0.80%


In [21]:
def ensemble(models, model_input):
    
    Models_output=[model(model_input) for model in models]
    Avg = tf.keras.layers.average(Models_output)
    
    modelEnsemble = Model(inputs=model_input, outputs=Avg, name='ensemble')
    modelEnsemble.summary()
    modelEnsemble.compile(tf.keras.optimizers.Adam(lr=.0001), loss='binary_crossentropy', metrics=['accuracy'])
    return modelEnsemble

model_1 = create_model(img_size,1) 
model_2 = create_model(img_size,3) 
model_3 = create_model(img_size,3) 
model_4 = create_model(img_size,3) 

models = []

# Load weights 
model_1.load_weights('cnn_keras_aug_Fold_1.h5')
models.append(model_1)

model_2.load_weights('cnn_keras_aug_Fold_2.h5')
models.append(model_2)

model_3.load_weights('cnn_keras_aug_Fold_3.h5')
models.append(model_3)

model_4.load_weights('cnn_keras_aug_Fold_4.h5')
models.append(model_4)

model_input = tf.keras.layers.Input(shape=models[0].input_shape[1:])
ensemble_model = ensemble(models, model_input)

X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
scores = ensemble_model.evaluate(X_val, y_val, verbose=0)
print("%s: %.2f%%" % (ensemble_model.metrics_names[1], scores[1]*100))

Model: "ensemble"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 100, 1) 0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 1)            2531713     input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 1)            2531713     input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 1)            2531713     input_1[0][0]                    
___________________________________________________________________________________________

In [22]:
model_name = 'cnn_keras_ensebmle.h5'
ensemble_model.save(model_name)

In [23]:
# Loading the order of the image's name that has been provided
test_image_order = pd.read_csv("./content/pneumonia_dataset/test.csv")
test_image_order.head()
file_paths = [[fname, './content/pneumonia_dataset/test/' + fname] for fname in test_image_order['filename']]
# Confirm if number of images is same as number of labels given
if len(test_image_order) == len(file_paths):
    print('Number of image names i.e. ', len(test_image_order), 'matches the number of file paths i.e. ', len(file_paths))
else:
    print('Number of image names does not match the number of filepaths')


Number of image names i.e.  606 matches the number of file paths i.e.  606


In [24]:
test_images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
test_images.head()
test_pixel_data = []     # initialize an empty numpy array
for i in range(len(test_images)):
  
  img_array = cv2.imread(test_images['filepaths'][i], cv2.IMREAD_GRAYSCALE)   # converting the image to gray scale
  new_img_array=cv2.resize(img_array,(img_size,img_size))
  test_pixel_data.append(new_img_array)
test_pixel_data = np.asarray(test_pixel_data)
test_pixel_data =  test_pixel_data.reshape(-1, 100, 100, 1)

In [25]:
pred = ensemble_model.predict(test_pixel_data)

In [26]:
predictions = []
for item in pred:
  if item <= 0.5:
    predictions.append('normal')
  else:
    predictions.append('pneumonia')

In [27]:
res = pd.DataFrame({'filename': test_images['filename'], 'label': predictions})  # prediction is nothing but the final predictions of your model on input features of your new unseen test data
res.to_csv("submission.csv", index = False)      # the csv file will be saved locally on the same location where this notebook is located.