In [1]:
#%pip install googledrivedownloader
# from google_drive_downloader import GoogleDriveDownloader as gdd

# gdd.download_file_from_google_drive(file_id='1d_93d9oFNRBK9Vg6BRxs9wvRbKtNTylY',
#                                     dest_path='content/pneumonia_dataset.zip',
#                                     unzip=True)

In [33]:
#!apt-get update
#!apt-get install ffmpeg libsm6 libxext6 -y
#%pip install pandas numpy opencv-python scikit-learn keras
import pandas as pd                                     # Data analysis and manipulation tool
import numpy as np                                      # Fundamental package for linear algebra and multidimensional arrays
import tensorflow as tf                                 # Deep Learning Tool
import os                                               # OS module in Python provides a way of using operating system dependent functionality
import cv2                                              # Library for image processing
from sklearn.model_selection import train_test_split    # For splitting the data into train and validation set
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras import callbacks
from keras import backend as K
from keras.models import Model
from keras.models import load_model
import gc
from keras.metrics import binary_crossentropy

In [3]:
data=[]
img_size = 100
def create_data(base_dir='./content/pneumonia_dataset/train/'):
    """Load the training images into the module-level ``data`` list.

    For each class sub-directory ('normal', 'pneumonia') under ``base_dir``
    every file is read as a grayscale image, resized to
    ``img_size`` x ``img_size`` and appended to ``data`` as
    ``[pixel_array, label]`` with label 0 for 'normal' and 1 for 'pneumonia'.

    Files that OpenCV cannot decode are skipped (as before), but they are now
    collected and reported instead of being silently swallowed.

    Args:
        base_dir: Directory containing the class sub-directories; must end
            with a path separator because the class name is concatenated to it.

    Returns:
        None. The results are appended to the global ``data`` list.
    """
    skipped = []
    for item in ['normal','pneumonia']:
        path = base_dir + item + "/"
        label = 0 if item == 'normal' else 1

        # os.listdir returns names in arbitrary order; sorting makes the
        # content of `data` deterministic between runs.
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)

            # cv2.imread signals an unreadable / non-image file by returning
            # None rather than raising, so check for it explicitly.
            img_array = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img_array is None:
                skipped.append(img_path)
                continue

            try:
                new_img_array = cv2.resize(img_array, (img_size, img_size))
            except cv2.error:
                skipped.append(img_path)
                continue

            data.append([new_img_array, label])

    if skipped:
        print('Skipped {} unreadable file(s), e.g. {}'.format(len(skipped), skipped[:5]))


create_data()

In [4]:
# Number of [image, label] samples collected by create_data()
len(data)

7641

In [5]:
# Shuffle through a seeded permutation instead of np.random.shuffle(data):
# the order is reproducible, `data` itself is left untouched, and re-running
# this cell always yields the same x / y.
shuffle_rng = np.random.RandomState(42)
shuffled_order = shuffle_rng.permutation(len(data))

# data holds [pixel_array, label] pairs; split them into two numpy arrays
x = np.array([data[i][0] for i in shuffled_order])
y = np.array([data[i][1] for i in shuffled_order])

# class balance: (labels, counts)
np.unique(y, return_counts=True)

(array([0, 1]), array([2621, 5020]))

In [6]:
## Convert into 4D Array: (samples, height, width, channels)
# Use img_size rather than a literal 100 so the reshape cannot drift from the
# size the images were resized to.
x = x.reshape(-1, img_size, img_size, 1)

In [7]:
def create_model(img_size=100,channels=1):
    """Build and compile the binary-classification CNN.

    Architecture: four 3x3 'same'-padded ReLU convolutions (16, 32, 64, 128
    filters), each followed by 2x2 max pooling, with batch normalization
    after the 64- and 128-filter convolutions; then Flatten, three ReLU dense
    layers (512, 128, 64) and a single sigmoid output unit.

    Args:
        img_size: Height and width of the square input image.
        channels: Accepted for interface compatibility but not used; the
            input shape is always ``(img_size, img_size, 1)``.

    Returns:
        A ``tf.keras`` Sequential model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    L = tf.keras.layers

    # (filters, batch-norm after the convolution?)
    conv_specs = [(16, False), (32, False), (64, True), (128, True)]

    stack = []
    for position, (n_filters, with_bn) in enumerate(conv_specs):
        # only the very first layer declares the input shape
        first_layer_kwargs = {'input_shape': (img_size, img_size, 1)} if position == 0 else {}
        stack.append(L.Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu',
                              padding='same', **first_layer_kwargs))
        if with_bn:
            stack.append(L.BatchNormalization())
        stack.append(L.MaxPooling2D((2, 2)))

    stack.append(L.Flatten())
    for n_units in (512, 128, 64):
        stack.append(L.Dense(n_units, activation='relu'))
    stack.append(L.Dense(1, activation='sigmoid'))

    model = tf.keras.models.Sequential(stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def callback(tf_log_dir_name='./tf-log/',patience_lr=10):
    """Build the list of callbacks handed to ``model.fit``.

    Only a TensorBoard logging callback is enabled. It is taken from
    ``tf.keras.callbacks`` so that it comes from the same Keras implementation
    as the ``tf.keras`` models built by ``create_model``.

    Args:
        tf_log_dir_name: Directory the TensorBoard event files are written to.
        patience_lr: Currently unused. It was intended as the patience of a
            ReduceLROnPlateau callback, which is not enabled.

    Returns:
        A list containing the TensorBoard callback.
    """
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=tf_log_dir_name, histogram_freq=0)

    # Not enabled: ReduceLROnPlateau(monitor='loss', factor=0.1, patience=patience_lr)
    # and EarlyStopping(monitor='val_accuracy', patience=5). Append them to the
    # returned list if learning-rate scheduling / early stopping is wanted.
    return [tensorboard_cb]

In [8]:
# 5-fold cross-validation: train a fresh CNN per fold, save it to
# 'cnn_keras_aug_Fold_<n>.h5' and collect the validation accuracy of each fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
batch_size = 16
epochs = 10
cvscores = []

for Fold, (train, val) in enumerate(kfold.split(x, y), start=1):
    # release the previous fold's graph/model before building a new one
    gc.collect()
    K.clear_session()
    print('Fold: {}'.format(Fold))

    X_train, X_val = x[train], x[val]
    y_train, y_val = y[train], y[val]

    # NOTE: images are fed as raw pixel values (no /255 rescaling). The test
    # images are also fed unscaled at prediction time, so the two stay consistent.

    # One TensorBoard directory per fold; a shared directory would mix the
    # curves of all folds into a single run.
    cb = callback(tf_log_dir_name='./tf-log/fold_{}/'.format(Fold))

    model = create_model(img_size, 1)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs,
              batch_size=batch_size, callbacks=cb, verbose=2)

    model_name = 'cnn_keras_aug_Fold_'+str(Fold)+'.h5'
    model.save(model_name)

    # evaluate the model on this fold's held-out part; scores = [loss, accuracy]
    scores = model.evaluate(X_val, y_val, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

print("Mean Accuracy: %.2f%%" % np.mean(cvscores))
print("Standard Deviation: +/- %.2f%%" % np.std(cvscores))

Fold: 1
Epoch 1/10
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
382/382 - 57s - loss: 0.3976 - accuracy: 0.8130 - val_loss: 0.5837 - val_accuracy: 0.7260
Epoch 2/10
382/382 - 54s - loss: 0.2927 - accuracy: 0.8578 - val_loss: 0.2818 - val_accuracy: 0.8738
Epoch 3/10
382/382 - 52s - loss: 0.2703 - accuracy: 0.8647 - val_loss: 0.2892 - val_accuracy: 0.8607
Epoch 4/10
382/382 - 56s - loss: 0.2499 - accuracy: 0.8753 - val_loss: 0.4262 - val_accuracy: 0.7907
Epoch 5/10
382/382 - 52s - loss: 0.2452 - accuracy: 0.8793 - val_loss: 0.5341 - val_accuracy: 0.7534
Epoch 6/10
382/382 - 56s - loss: 0.2238 - accuracy: 0.8937 - val_loss: 0.2282 - val_accuracy: 0.8855
Epoch 7/10
382/382 - 53s - loss: 0.2047 - accuracy: 0.8987 - val_loss: 0.3017 - val_accuracy: 0.8443
Epoch 8/10
382/382 - 55s - loss: 0.2113 - accuracy: 0.8955 - val_loss: 0.2330 - val_accuracy: 0.8875
Epoch 9/10
382/382 - 56s - loss: 0.1966 - accuracy: 0.9067 - val_loss: 0.2792 - val_accuracy: 0.8803
Epoch 10/10

In [25]:
def ensemble(models, model_input):
    """Combine several models into one that averages their outputs.

    Every model in ``models`` is applied to the shared ``model_input`` tensor
    and the outputs are averaged element-wise. The resulting model's summary
    is printed and the model is compiled so it can be evaluated directly.

    Args:
        models: Non-empty list of Keras models accepting ``model_input``.
        model_input: Keras input tensor shared by all models.

    Returns:
        The compiled Keras model named 'ensemble'.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError('ensemble() requires a non-empty list of models')

    Models_output = [model(model_input) for model in models]
    Avg = tf.keras.layers.average(Models_output)

    # tf.keras.models.Model keeps the ensemble in the same Keras implementation
    # as the tf.keras layers and input tensor used here.
    modelEnsemble = tf.keras.models.Model(inputs=model_input, outputs=Avg, name='ensemble')
    modelEnsemble.summary()
    # `learning_rate` replaces the deprecated `lr` argument of Adam
    modelEnsemble.compile(tf.keras.optimizers.Adam(learning_rate=.0001), loss='binary_crossentropy', metrics=['accuracy'])
    return modelEnsemble

# Rebuild one network per saved fold checkpoint and load its weights.
# NOTE(review): the fold 5 checkpoint is not part of the ensemble, matching the
# original selection of folds 1-4; confirm whether that is intentional.
ensemble_folds = [1, 2, 3, 4]

models = []
for fold_id in ensemble_folds:
    # the checkpoints were trained on single-channel (grayscale) input
    fold_model = create_model(img_size, 1)
    fold_model.load_weights('cnn_keras_aug_Fold_{}.h5'.format(fold_id))
    models.append(fold_model)

model_input = tf.keras.layers.Input(shape=models[0].input_shape[1:])
ensemble_model = ensemble(models, model_input)

# NOTE(review): this split is drawn from the same x / y the fold models were
# trained on, so most of X_val was seen during training by several ensemble
# members. The accuracy printed below is therefore optimistic and is not a
# true hold-out estimate.
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
scores = ensemble_model.evaluate(X_val, y_val, verbose=0)
print("%s: %.2f%%" % (ensemble_model.metrics_names[1], scores[1]*100))

Model: "ensemble"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 100, 100, 1) 0                                            
__________________________________________________________________________________________________
sequential_14 (Sequential)      (None, 1)            2531713     input_5[0][0]                    
__________________________________________________________________________________________________
sequential_15 (Sequential)      (None, 1)            2531713     input_5[0][0]                    
__________________________________________________________________________________________________
sequential_16 (Sequential)      (None, 1)            2531713     input_5[0][0]                    
___________________________________________________________________________________________

In [26]:
# Persist the compiled ensemble model to an HDF5 file.
# NOTE(review): the file name spells "ensemble" as "ensebmle"; it is left
# unchanged here in case something else loads the file by this exact name.
model_name = 'cnn_keras_ensebmle.h5'
ensemble_model.save(model_name)

In [27]:
# Load the order of the test image names that has been provided
test_image_order = pd.read_csv("./content/pneumonia_dataset/test.csv")
file_paths = [[fname, './content/pneumonia_dataset/test/' + fname] for fname in test_image_order['filename']]

# Confirm that every listed image really exists on disk. Comparing
# len(test_image_order) with len(file_paths) cannot fail, because file_paths is
# built from test_image_order itself.
missing_files = [path for _, path in file_paths if not os.path.isfile(path)]
if not missing_files:
    print('Number of image names i.e. ', len(test_image_order), 'matches the number of file paths i.e. ', len(file_paths))
else:
    print(len(missing_files), 'of', len(file_paths), 'listed test images were not found, e.g.', missing_files[:5])


Number of image names i.e.  606 matches the number of file paths i.e.  606


In [28]:
test_images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])

# Read every test image as grayscale and resize it to the training resolution.
test_pixel_data = []     # plain Python list; stacked into one array below
for image_path in test_images['filepaths']:
    img_array = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img_array is None:
        # cv2.imread returns None instead of raising. A test image cannot be
        # skipped because the predictions must stay aligned with the filenames.
        raise FileNotFoundError('Could not read test image: {}'.format(image_path))
    test_pixel_data.append(cv2.resize(img_array, (img_size, img_size)))

# (samples, height, width, channels); img_size instead of a literal 100
test_pixel_data = np.asarray(test_pixel_data).reshape(-1, img_size, img_size, 1)

In [29]:
# Run the ensemble on the test images; each prediction is the average of the
# member models' sigmoid outputs.
pred = ensemble_model.predict(test_pixel_data)

In [30]:
# Map each averaged score to its class name: scores up to and including 0.5
# are 'normal', anything above is 'pneumonia'.
predictions = ['normal' if score <= 0.5 else 'pneumonia' for score in pred]

In [31]:
# Pair every test filename with the label predicted for it
submission_columns = {'filename': test_images['filename'], 'label': predictions}
res = pd.DataFrame(submission_columns)

# Written next to the notebook, without the index column
res.to_csv("submission.csv", index=False)