### Imports & declarations

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.image import img_to_array, load_img, array_to_img, ImageDataGenerator
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import *
import tensorflow as tf

# Dataset locations, one group per environment the notebook refers to.
# Local checkout (paths relative to this notebook):
LOCAL_PATH_MINI = '../data/raw/kaggle_mini_subsample'
LOCAL_PATH_TRAIN = '../data/raw/train'
# Google Colab (paths under the mounted Drive at /content/drive):
COLAB_PATH_TRAIN = '/content/drive/MyDrive/Data Science/Colab Notebooks/Module 4 Project/data/raw/train'
COLAB_PATH_MINI = '/content/drive/MyDrive/Data Science/Colab Notebooks/Module 4 Project/data/raw/mini_subsample'
# Kaggle input directory (train and test splits).
# NOTE: there is no COLAB_PATH_TEST or LOCAL_PATH_TEST declared in this cell.
KAGGLE_PATH_TRAIN = '/kaggle/input/chest-xray-pneumonia/chest_xray/chest_xray/train/'
KAGGLE_PATH_TEST = '/kaggle/input/chest-xray-pneumonia/chest_xray/chest_xray/test/'


# Seed handed to the directory loaders (passed as their `seed=` argument).
RANDOM_STATE = 2020

# #Comment when running locally, uncomment for Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

### Define basic functions

In [3]:
def make_flat(x):
    """Flatten every axis after the first into a single feature axis.

    Works for arrays of any rank >= 1 (e.g. RGB batches of shape
    (n, h, w, c), grayscale batches of shape (n, h, w), or data that is
    already flat), not only 4-D input.

    Parameters
    ----------
    x : array with a ``shape`` attribute and a ``reshape`` method
        The first axis is kept as-is; all remaining axes are merged.

    Returns
    -------
    x_flat : array of shape (x.shape[0], product of the remaining axes)
    shape : tuple
        One-element tuple ``(n_features,)``, suitable as the
        ``input_shape`` of a first Dense layer.
    """
    n_samples = x.shape[0]
    # Compute the product explicitly rather than using reshape(n, -1):
    # -1 cannot be inferred when the array has zero samples.
    n_features = int(np.prod(x.shape[1:]))
    x_flat = x.reshape(n_samples, n_features)
    shape = (x_flat.shape[1],)
    return x_flat, shape

In [4]:
def plot_results(history, metrics=('accuracy',), val=True):
    """Plot one figure per metric showing its value at each epoch.

    Parameters
    ----------
    history : mapping of metric name -> sequence of per-epoch values
        Indexed as ``history[metric]`` and, when ``val`` is truthy,
        ``history['val_' + metric]`` (callers in this notebook pass
        ``History.history`` from ``model.fit``).
    metrics : str or iterable of str
        Metric name(s) to plot. A single string is treated as one metric.
    val : bool
        When truthy, also draw the ``'val_' + metric`` curve.

    Raises
    ------
    KeyError
        If a requested metric (or its ``val_`` counterpart) is missing.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(metrics, str):
        metrics = [metrics]

    for metric in metrics:
        epochs = range(len(history[metric]))

        plt.figure()
        plt.plot(epochs, history[metric], label='Train')

        if val:
            plt.plot(epochs, history['val_'+metric], label='Validation')
        plt.title(metric)
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.legend()

## Mini Dataset

#### Import & Clean

In [None]:
# # TESTING TO GET LABEL ENCODING CORRECT
# path = KAGGLE_PATH_TRAIN
# data_tf = tf.keras.preprocessing.image_dataset_from_directory(path, 
#                                                 image_size=(32, 32),
#                                                 class_names=['NORMAL', 'PNEUMONIA'],
#                                                 seed=RANDOM_STATE)

# print('CLASS NAMES:', data_tf.class_names, 'CORRESPOND TO [0,1]')
# for i, item in enumerate(data_tf.as_numpy_iterator()):
#     if i == 0:
#         images = np.array(item[0])
#         labels = np.array(item[1])
#     else:
#         images = np.concatenate([images, item[0]], axis=0)
#         labels = np.concatenate([labels, item[1]], axis=0)
#         if i % 10 == 0:
#             print(int(i/(5232/BATCH_SIZE)*100), '%')

# images /= 256

# labels_inverted = np.array([0 if i == 1 else 1 for i in labels])
# labels = np.concatenate([labels_inverted.reshape(-1,1), labels.reshape(-1,1)], axis=1)


# # print('Data shape:', images.shape)
# # print('Label shape:', labels.shape)

In [None]:
# BATCH_SIZE = 250
# TARGET_SIZE = [128, 128]

# path = COLAB_PATH_MINI
# image_generator = ImageDataGenerator().flow_from_directory(path, batch_size=BATCH_SIZE, target_size=TARGET_SIZE);
# images, labels = next(image_generator)
# images_scaled = images / 255

# display('Example image:', array_to_img(images[0]))
# print('Indices:', image_generator.class_indices)
# print('Dataset shape:', images_scaled.shape)

#### Mini Dataset Model

In [None]:
# x, shape = make_flat(images_scaled)

# model = Sequential()
# model.add(Dense(32, activation='relu', input_shape = shape))
# model.add(Dense(2, activation='softmax'))
# model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

# history = model.fit(x=x, y=labels, epochs=50)

# plot_results(history.history, val=False)

# model.summary()

## Full Dataset

### Import and clean

In [5]:
# Load the entire training directory into memory as one batch.
# BATCH_SIZE only needs to be at least the number of images on disk; the
# loader prints how many it actually found.
BATCH_SIZE = 1349+3883
TARGET_SIZE = 128
# NOTE(review): no `rescale` is given, so pixel values are passed on as loaded
# (presumably 0-255) - confirm whether scaling is wanted before training.
generator = ImageDataGenerator()
gen_train = generator.flow_from_directory(directory=KAGGLE_PATH_TRAIN, 
                                          target_size=(TARGET_SIZE, TARGET_SIZE), 
                                          batch_size=BATCH_SIZE, 
                                          seed=RANDOM_STATE)
print('Generated')
train = next(gen_train)
print('Gathered "next"')
images = train[0]
labels = train[1]
# Fail loudly instead of silently training on a subset if the directory ever
# holds more images than BATCH_SIZE.
assert len(images) == gen_train.samples, (
    'Only %d of %d images were loaded; increase BATCH_SIZE'
    % (len(images), gen_train.samples))
del gen_train, train # reduce memory usage
print('Complete')

Found 5216 images belonging to 2 classes.
Generated
Gathered "next"
Complete


In [None]:
# path = KAGGLE_PATH_TRAIN
# BATCH_SIZE = 32
# data_tf = tf.keras.preprocessing.image_dataset_from_directory(path, 
#                                                 image_size=(256, 256), # Can work up to 256
#                                                 batch_size=BATCH_SIZE,
#                                                 class_names=['NORMAL', 'PNEUMONIA'],                                                
#                                                 seed=RANDOM_STATE)

# print('CLASS NAMES:', data_tf.class_names, 'CORRESPOND TO [0,1]')

# # for i, item in enumerate(data_tf.as_numpy_iterator()):
# #     if i == 0:
# #         images = np.array(item[0])
# #         labels = np.array(item[1])
# #     else:
# #         images = np.concatenate([images, item[0]], axis=0)
# #         labels = np.concatenate([labels, item[1]], axis=0)
# #         if i % 10 == 0:
# #             print(int(i/(5232/BATCH_SIZE)*100), '%')

            
            
# # TRY TO SPEED UP:
# # for i, item in enumerate(data_tf.as_numpy_iterator()):
# #     if i == 0:
# #         images = np.array(item[0])
# #         labels = np.array(item[1])
# #     else:
# #         images = np.concatenate([images, item[0]], axis=0)
# #         labels = np.concatenate([labels, item[1]], axis=0)
# #         if i % 10 == 0:
# #             print(int(i/(5232/BATCH_SIZE)*100), '%')
    
# images_temp = np.array(data_tf.as_numpy_iterator()[0][0])
# images_temp
# # images /= 256

# # labels_inverted = np.array([0 if i == 1 else 1 for i in labels])
# # labels = np.concatenate([labels_inverted.reshape(-1,1), labels.reshape(-1,1)], axis=1)

# # del data_tf # clear RAM
# # del labels_inverted # clear RAM
# # # data_tf = None # clear RAM
# # # labels_inverted = None # clear RAM
# # print('Data shape:', images.shape)
# # print('Label shape:', labels.shape)

In [None]:
# BATCH_SIZE_TRAIN = 1341+3875
# TARGET_SIZE = [256, 256]

# path = COLAB_PATH_TRAIN

# t0 = time.time()
# generator = ImageDataGenerator().flow_from_directory(path,
#                                                      #batch_size=BATCH_SIZE_TRAIN,
#                                                      target_size=TARGET_SIZE)

# images, labels = next(generator)
# images = images / 256
# t1 = time.time()

# print('Runtime:', t1-t0)
# print(images.shape, labels.shape)

**TODO:** Change the terminology to "first simple model" instead of "baseline" — the true baseline is a random guess.

Notes:
- Remove MSE

### Baseline Model

In [11]:
labels.shape

(5216, 2)

In [10]:
x.shape

(5216, 49152)

In [32]:
TARGET_SIZE = 128

# Smallest convolutional network: one 8x8 convolution over the RGB image,
# flattened straight into a 2-way softmax.
model = Sequential([
    Conv2D(filters=32,
           kernel_size=8,
           padding='same',
           activation='relu',
           input_shape=(TARGET_SIZE, TARGET_SIZE, 3)),
    Flatten(),
    Dense(units=2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='SGD',
              metrics=['accuracy'])

# The last 25% of the arrays is held out for validation.
model.fit(x=images, y=labels, validation_split=0.25, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8b583c2550>

In [9]:
# Flatten each image into one feature vector for the fully connected network.
x, shape = make_flat(images)

# One hidden layer of 16 units feeding a 2-way softmax.
model = Sequential()
model.add(Dense(16, activation='relu', input_shape = shape))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='SGD', metrics=['accuracy'], loss='categorical_crossentropy')


history = model.fit(x=x, y=labels, 
                    epochs=100, 
                    verbose=1,
                    validation_split=0.25)

# summary() prints the table itself and returns None, so it must not be
# wrapped in display() (that would render a stray "None").
model.summary()


plot_results(history.history, ['accuracy'])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: 

### Define Callbacks

In [6]:
# Training callbacks, both watching the validation loss:
#  - EarlyStopping is configured with patience=25 epochs.
#  - ModelCheckpoint writes to 'best_model.h5' with save_best_only=True.
# NOTE(review): the name `callbacks` and the single checkpoint file are used by
# several different models' fit() calls in this notebook. ModelCheckpoint appears
# to keep its best-so-far val_loss between fits - confirm, and consider fresh
# callback instances per experiment.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]


### Entire train dataset, validation split

##### Create and fit model

In [9]:
# Flatten each image into one feature vector for the fully connected network.
x, shape = make_flat(images)

# One hidden layer of 32 units feeding a 2-way softmax.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape = shape))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy', 'mse'])

# Use fresh callback instances for this experiment: a ModelCheckpoint remembers
# the best val_loss it has seen, so reusing one across models can leave an
# earlier experiment's model in best_model.h5.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x=x, y=labels, 
                    callbacks = callbacks,
                    epochs=500, 
                    validation_split=0.25, 
                    verbose=1)

#best_model = keras.models.load_model('best_model.h5')

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
  1/123 [..............................] - ETA: 0s - loss: 0.6280 - accuracy: 0.6875 - mse: 0.2176

KeyboardInterrupt: 

In [None]:
# summary() prints the table itself and returns None, so it must not be
# wrapped in display() (that would render a stray "None").
model.summary()

plot_results(history.history, ['accuracy', 'mse'])

### Create balanced dataset

### Create model with resampled dataset

### Add Regularization

In [8]:
LAMBDA = 0.005  # L2 penalty applied to every hidden layer's kernel
x, shape = make_flat(images)


# Three hidden layers (256 -> 64 -> 16), each L2-regularized, then a 2-way softmax.
model = Sequential()
model.add(Dense(256, activation='relu', input_shape = shape, kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='SGD', metrics=['accuracy', 'mse'], loss='categorical_crossentropy')

# Use fresh callback instances for this experiment: a ModelCheckpoint remembers
# the best val_loss it has seen, so reusing one across models can leave an
# earlier experiment's model in best_model.h5 and load_model() below would
# then return the wrong network.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x=x, y=labels, 
                    callbacks=callbacks, 
                    epochs=500, 
                    validation_split=0.25,
                    verbose=1)

plot_results(history.history, ['accuracy', 'mse'])

# Reload the checkpoint with the lowest validation loss from this run.
best_model = keras.models.load_model('best_model.h5')
best_model.summary()

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500

KeyboardInterrupt: 

### Add Dropout

In [None]:
x, shape = make_flat(images)


LAMBDA = 0.005   # L2 penalty applied to every hidden layer's kernel
DROPOUT = 0.3    # dropout rate used after each hidden layer

# Same 256 -> 64 -> 16 stack as the regularized model, with dropout after
# every hidden layer.
model = Sequential()
model.add(Dense(256, activation='relu', input_shape = shape, kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dropout(DROPOUT))

model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dropout(DROPOUT))

model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dropout(DROPOUT))

model.add(Dense(2, activation='softmax'))

model.compile(optimizer='SGD', metrics=['accuracy', 'mse'], loss='categorical_crossentropy')

# Use fresh callback instances for this experiment: a ModelCheckpoint remembers
# the best val_loss it has seen, so reusing one across models can leave an
# earlier experiment's model in best_model.h5 and load_model() below would
# then return the wrong network.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x=x, y=labels, 
                    callbacks=callbacks, 
                    epochs=500,
                    validation_split=0.25)

plot_results(history.history, ['accuracy', 'mse'])

# Reload the checkpoint with the lowest validation loss from this run.
best_model = keras.models.load_model('best_model.h5')
best_model.summary()

### Remove the layer of 256 nodes from previous model

In [None]:
# Flatten the image batch with the shared helper. The previous contents of this
# cell were a pasted fragment of make_flat's body (top-level `return`, undefined
# `s` and `x_flat`) and could not run.
images_flat, shape = make_flat(images)

In [7]:
#x, shape = make_flat(images)



# Flatten each image into one feature vector without going through make_flat.
shp = images.shape # temp variable
images_flat = images.reshape(shp[0], 
                             shp[1]*shp[2]*shp[3])
shape = (images_flat.shape[1],)


LAMBDA = 0.005   # L2 penalty applied to every hidden layer's kernel
DROPOUT = 0.3    # dropout rate used after each hidden layer

# 64 -> 16 hidden stack (the 256-unit layer is dropped), L2 + dropout on each.
model = Sequential()

model.add(Dense(64, activation='relu', input_shape = shape, kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dropout(DROPOUT))

model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(LAMBDA)))
model.add(Dropout(DROPOUT))

model.add(Dense(2, activation='softmax'))

model.compile(optimizer='SGD', metrics=['accuracy', 'mse'], loss='categorical_crossentropy')

# Use fresh callback instances for this experiment: a ModelCheckpoint remembers
# the best val_loss it has seen, so reusing one across models can leave an
# earlier experiment's model in best_model.h5 and load_model() below would
# then return the wrong network.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x=images_flat, y=labels, 
                    callbacks=callbacks, 
                    epochs=500,
                    validation_split=0.25)

plot_results(history.history, ['accuracy', 'mse'])

# Reload the checkpoint with the lowest validation loss from this run.
best_model = keras.models.load_model('best_model.h5')
best_model.summary()

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500

KeyboardInterrupt: 

### Use Conv2D() instead of Dense() for first two layers

In [None]:
images.shape

In [None]:
labels.shape

In [None]:
LAMBDA = 0.005
DROPOUT = 0.3

# Two stacked 5x5 convolutions, flattened into a 2-way softmax.
model = Sequential()

# input_shape describes ONE sample, so the leading sample axis of `images`
# must be dropped; passing images.shape would declare a 5-D input.
model.add(Conv2D(32, activation='relu', kernel_size=5, padding='same', input_shape=images.shape[1:]))
model.add(Conv2D(12, activation='relu', kernel_size=5, padding='same'))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='SGD', metrics=['accuracy', 'mse'], loss='categorical_crossentropy')

In [None]:
# Use fresh callback instances for this experiment: a ModelCheckpoint remembers
# the best val_loss it has seen, so reusing one across models can leave an
# earlier experiment's model in best_model.h5 and load_model() below would
# then return the wrong network.
callbacks = [EarlyStopping(monitor='val_loss', patience=25),
             ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x=images, y=labels, 
                    callbacks=callbacks, 
                    epochs=500,
                    validation_split=0.25)

plot_results(history.history, ['accuracy', 'mse'])

# Reload the checkpoint with the lowest validation loss from this run.
best_model = keras.models.load_model('best_model.h5')
best_model.summary()

### Run top performing model on Test data

#### Import test data

In [None]:
# Same string as the KAGGLE_PATH_TEST assigned in the declarations cell at the
# top of the notebook; re-assigning it here does not change its value.
KAGGLE_PATH_TEST = '/kaggle/input/chest-xray-pneumonia/chest_xray/chest_xray/test/'

In [None]:
# Load the test split into memory with the same geometry and value range as the
# training arrays.
path = KAGGLE_PATH_TEST  # COLAB_PATH_TEST is not defined anywhere in this notebook
BATCH_SIZE = 32
data_tf_test = tf.keras.preprocessing.image_dataset_from_directory(path, 
                                                # must match the size the training images were loaded at
                                                image_size=(TARGET_SIZE, TARGET_SIZE),
                                                batch_size=BATCH_SIZE,
                                                class_names=['NORMAL', 'PNEUMONIA'],                                                
                                                seed=RANDOM_STATE)

print('CLASS NAMES:', data_tf_test.class_names, 'CORRESPOND TO [0,1]')

# Collect the batches and concatenate once at the end; concatenating inside the
# loop re-copies everything loaded so far on every iteration.
image_batches = []
label_batches = []
for i, item in enumerate(data_tf_test.as_numpy_iterator()):
    image_batches.append(item[0])
    label_batches.append(item[1])
    if i > 0 and i % 10 == 0:
        print(i, 'batches loaded')

images_test = np.concatenate(image_batches, axis=0)
labels_int = np.concatenate(label_batches, axis=0)
del image_batches, label_batches # clear RAM

# No pixel scaling here: the training images are loaded with a plain
# ImageDataGenerator() and never divided, so the test images must stay in the
# same value range as the data the model was fitted on.

# Turn the integer labels (0 = NORMAL, 1 = PNEUMONIA) into two one-hot columns,
# [is_normal, is_pneumonia], to match the 2-unit softmax output.
labels_test_inverted = 1 - labels_int
labels_test = np.stack([labels_test_inverted, labels_int], axis=1)

data_tf_test = None # clear RAM
print('Data shape:', images_test.shape)
print('Label shape:', labels_test.shape)

In [None]:
# Evaluate the model loaded from best_model.h5 on the test arrays built above.
# NOTE(review): images_test is passed image-shaped (not flattened) - confirm that
# the model currently held in best_model takes image input rather than the
# output of make_flat.
best_model.evaluate(images_test, labels_test)