In [None]:
#Lots of code was used from Huseyinefe's kernel, to help me with reading/processing data
#Import the necessary packages
import tensorflow as tf
import keras
import numpy as np
import matplotlib
import sklearn
import cv2
import os
import glob
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, KFold
from keras import models
from keras.preprocessing.image import load_img,img_to_array,ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, BatchNormalization, Flatten, Dropout
from keras.optimizers import Adam, SGD
from PIL import Image
from keras.utils import to_categorical

#-----Define important functions that will be used to process data-----

def read_images(path, number_of_images):
    """Load the image files found in ``path`` into one pre-allocated array.

    Args:
        path: Directory to read; every entry in it is opened as an image.
        number_of_images: Number of rows to allocate in the result.

    Returns:
        A float64 array of shape ``(number_of_images, 224, 224, 3)``. Row ``i``
        holds the pixels of the i-th file in sorted filename order; rows with
        no corresponding file stay zero.

    Raises:
        IndexError: If ``path`` holds more entries than ``number_of_images``.
    """
    arr = np.zeros((number_of_images, 224, 224, 3))
    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # order (and therefore any seeded split made later) reproducible.
    for i, file_name in enumerate(sorted(os.listdir(path))):
        image_path = os.path.join(path, file_name)
        # The context manager releases the file handle as soon as the pixels
        # have been copied instead of waiting for garbage collection.
        with Image.open(image_path, mode='r') as image:
            # Assigning into arr[i] requires a 224x224x3 pixel grid.
            arr[i] = np.asarray(image, dtype='uint8')
    return arr

#Read the images in a path for each of the different categories
#0: No DR, 1: Mild, 2: Moderate, 3: Proliferate, 4: Severe
def read_images_in_path(category, base_dir=None):
    """Load every image belonging to one severity category.

    Args:
        category: Class id, 0-4 (0: No DR, 1: Mild, 2: Moderate,
            3: Proliferate, 4: Severe).
        base_dir: Directory containing one sub-folder per category. Defaults
            to the Kaggle dataset location used by this notebook.

    Returns:
        ``(num_in_path, images)`` where ``num_in_path`` is the number of
        entries matched by ``<category folder>/*`` and ``images`` is a uint8
        array with one row per counted entry.

    Raises:
        ValueError: If ``category`` is not one of the five known ids.
    """
    if base_dir is None:
        base_dir = r"/kaggle/input/diabetic-retinopathy-224x224-gaussian-filtered/gaussian_filtered_images/gaussian_filtered_images"
    # Position in this tuple is the class id.
    folders = ("No_DR", "Mild", "Moderate", "Proliferate_DR", "Severe")
    # Match with == so any value equal to a valid id is accepted.
    folder = next((name for index, name in enumerate(folders) if category == index), None)
    if folder is None:
        raise ValueError('Invalid category')
    path = base_dir + "/" + folder
    num_in_path = len(glob.glob(path + '/*'))
    images = read_images(path, num_in_path)
    # read_images fills a float array; store the pixels compactly as uint8.
    images = images.astype('uint8')
    return num_in_path, images

#Normalizes pixels for faster training
def normalize_pixels(images):
    """Return ``images`` cast to float32 and divided by 255.

    The input array is not modified; a new array is returned.
    """
    return images.astype('float32') / 255

#Decreases square image to res x res
def decrease_res(images, num_images, res):
    """Resize each image in ``images`` to ``res`` x ``res`` pixels.

    Args:
        images: Iterable of images accepted by ``cv2.resize``.
        num_images: Number of rows to allocate in the output.
        res: Target width and height in pixels.

    Returns:
        A float64 array of shape ``(num_images, res, res, 3)`` whose i-th row
        is the resized i-th image.
    """
    target_size = (res, res)
    resized = np.zeros((num_images, res, res, 3))
    for idx, img in enumerate(images):
        resized[idx] = cv2.resize(img, target_size)
    return resized

Now we get the number of images in each category, as well as the images.

In [None]:
# Load each severity class in id order (0-4); every call yields (count, uint8 images).
# Counts noted by the author: No_DR 1805, Mild 370, Moderate 999, Proliferate 295, Severe 193.
(
    (No_DR_num, No_DR_images),
    (Mild_num, Mild_images),
    (Mod_num, Mod_images),
    (Prolif_num, Prolif_images),
    (Severe_num, Severe_images),
) = (read_images_in_path(category) for category in range(5))

In [None]:
# Quick visual sanity check: show one Mild-class image (index 29) without axes.
preview_fig, preview_ax = pyplot.subplots()
preview_ax.imshow(Mild_images[29])
preview_ax.axis("off")
pyplot.show()

Now, we normalize the pixels so that their values lie between 0 and 1 instead of between 0 and 255. We do this to accelerate learning.

In [None]:
# Rescale each class's pixel array with normalize_pixels (float32, divided by 255).
# NOTE: every name is rebound to its scaled version, so re-running this cell on
# its own divides already-scaled data by 255 again; re-run from the loading
# cell instead.
No_DR_images  = normalize_pixels(No_DR_images)
Mild_images   = normalize_pixels(Mild_images)
Mod_images    = normalize_pixels(Mod_images)
Prolif_images = normalize_pixels(Prolif_images)
Severe_images = normalize_pixels(Severe_images)

We now generate a corresponding label vector that will match images to a category.

In [None]:
# One integer class id per image, grouped by class:
# 0 = No DR, 1 = Mild, 2 = Moderate, 3 = Proliferate, 4 = Severe.
no_DR  = np.zeros(No_DR_num)
mild   = np.ones(Mild_num)
mod    = np.full(Mod_num, 2)
prolif = np.full(Prolif_num, 3)
severe = np.full(Severe_num, 4)
labels = np.concatenate((no_DR, mild, mod, prolif, severe), axis=0)
# Fix the one-hot width at 5 so it always matches the 5-unit softmax output,
# rather than letting to_categorical infer it from the largest label present.
labels = to_categorical(labels, num_classes=5)

We encode data in a way that we can get a one-to-one correspondence with the labels.

In [None]:
# Stack the per-class image arrays along the sample axis, in class-id order,
# so that row i of x corresponds to row i of the label matrix.
x = np.concatenate(
    [No_DR_images, Mild_images, Mod_images, Prolif_images, Severe_images],
    axis=0,
)
y = labels

Now, let's split x into training, validation, and testing data. We will use 20% of the data as a held-out test set that will evaluate the model. Within the 80% remaining data, we will use 20% of that data as a validation set, so that we can evaluate how good our model is and update parameters before finally applying the model to our held-out test set.

In [None]:
# Hold out 20% of all samples as the test set, then 20% of the remainder as the
# validation set. The classes are heavily imbalanced, so both splits are
# stratified on the class id (argmax of the one-hot rows) to keep the class
# proportions the same in every subset.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y.argmax(axis=1)
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train.argmax(axis=1)
)

Now, we decrease the resolution of each image from 224x224 to 64x64. There are two reasons for this. Firstly, having fewer pixels will reduce training time significantly. Secondly, we are building a VGG8 architecture from scratch, which takes in inputs of size 64x64.

In [None]:
# Shrink every split to 64x64, the input size the network is built for.
x_train = decrease_res(x_train, len(x_train), res=64)
x_val = decrease_res(x_val, len(x_val), res=64)
x_test = decrease_res(x_test, len(x_test), res=64)

Let's print out the shapes to see the dimensions of the training, validation, and test sets.

In [None]:
# One shape per line: training, validation, test.
print(np.shape(x_train), np.shape(x_val), np.shape(x_test), sep="\n")

Let's get around to defining the VGG8 model, whose architecture is like a compact version of the VGG16 model. Documentation can be found [here](https://books.google.com/books?id=YfvHDwAAQBAJ&pg=PA347&lpg=PA347&dq=%22vgg+8%22+architecture&source=bl&ots=AEJ6W_yNaO&sig=ACfU3U3dXUXmucNmRTwKDqRDeArvPhO8UQ&hl=en&sa=X&ved=2ahUKEwjT_5WE3cLoAhWWAZ0JHT0SDtIQ6AEwC3oECAgQAQ#v=onepage&q=%22vgg%208%22%20architecture&f=false).

In [None]:
def define_VGG8():
    """Build and compile the compact VGG-style CNN used in this notebook.

    Layout: three convolutional stages (1x32, 2x64 and 3x128 filters, all 3x3
    with ReLU and He-uniform initialisation), each followed by 2x2 max pooling,
    then a 512-unit dense layer with 50% dropout and a 5-way softmax output.

    Returns:
        A compiled ``Sequential`` model taking 64x64x3 inputs, trained with SGD
        (learning rate 0.01, momentum 0.9) on categorical cross-entropy and
        reporting accuracy.
    """
    def conv(filters, **kwargs):
        # Every convolution in the network shares kernel size, activation and init.
        return Conv2D(filters, (3,3), activation='relu', kernel_initializer='he_uniform', **kwargs)

    model = Sequential([
        conv(32, input_shape=(64,64,3)),
        MaxPooling2D((2,2), strides=(2,2)),
        conv(64),
        conv(64),
        MaxPooling2D((2,2), strides=(2,2)),
        conv(128),
        conv(128),
        conv(128),
        MaxPooling2D((2,2), strides=(2,2)),
        Flatten(),
        Dense(512, activation='relu', kernel_initializer='he_uniform'),
        Dropout(0.5),
        Dense(5, activation='softmax'),
    ])
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss=keras.losses.categorical_crossentropy, metrics=['accuracy'])
    return model

These functions will fit the model to the training set, then will evaluate performance on the validation set.

In [None]:
def evaluate_model(trainX, trainY, valX, valY, model, batch_size, epochs):
    """Train ``model`` and score it on the training and validation data.

    Args:
        trainX, trainY: Training inputs and targets passed to ``model.fit``.
        valX, valY: Validation inputs and targets, monitored during fitting.
        model: Object exposing Keras-style ``fit`` and ``evaluate`` methods.
        batch_size: Mini-batch size forwarded to ``fit``.
        epochs: Number of epochs forwarded to ``fit``.

    Returns:
        ``(hist, train_score, val_score)``: the value returned by ``fit`` and
        the second entry of each ``evaluate`` result (the accuracy when the
        model is compiled with ``metrics=['accuracy']``).
    """
    history = model.fit(
        trainX,
        trainY,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(valX, valY),
    )
    _train_loss, train_metric = model.evaluate(trainX, trainY)
    _val_loss, val_metric = model.evaluate(valX, valY)
    return history, train_metric, val_metric

#Summarizing results of a particular model
def results_summary(hist):
    """Plot training and validation curves for one training run.

    Draws two stacked panels, loss on top and accuracy below, each with the
    training curve in blue and the validation curve in orange.

    Args:
        hist: Object whose ``history`` mapping holds the per-epoch series
            ``'loss'``, ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``.
    """
    fig, (loss_ax, acc_ax) = pyplot.subplots(2, 1)
    panels = (
        (loss_ax, 'Loss', 'loss', 'val_loss'),
        (acc_ax, 'Accuracy', 'accuracy', 'val_accuracy'),
    )
    for ax, title, train_key, val_key in panels:
        ax.set_title(title)
        ax.plot(hist.history[train_key], color='blue', label='Train')
        ax.plot(hist.history[val_key], color='orange', label='Validation')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(title)
        # Without a legend the curve labels above are never displayed.
        ax.legend()
    # Keep the lower panel's title from colliding with the upper panel's axis.
    fig.tight_layout()
    pyplot.show()

Finally, let's fit the model to the training data and evaluate performance on the validation set.

In [None]:
# Baseline run: a fresh network trained with batch size 32 for 75 epochs.
model = define_VGG8()
hist, train_score, val_score = evaluate_model(
    x_train,
    y_train,
    x_val,
    y_val,
    model,
    batch_size=32,
    epochs=75,
)

Let's look at the performance.

In [None]:
# Loss and accuracy curves of the baseline run.
results_summary(hist=hist)

Clearly, the model is overfitting, as the validation loss continues to go up. However, the validation accuracy is fairly stable. We should try changing hyperparameters, such as decreasing the batch size and the number of epochs.

In [None]:
# Three fresh, identically configured networks, one per hyperparameter setting.
model2, model3, model4 = define_VGG8(), define_VGG8(), define_VGG8()

# Candidate settings: (batch 32, 30 epochs), (batch 16, 30 epochs), (batch 8, 50 epochs).
hist2, train_score2, val_score2 = evaluate_model(
    x_train, y_train, x_val, y_val, model2, batch_size=32, epochs=30
)
hist3, train_score3, val_score3 = evaluate_model(
    x_train, y_train, x_val, y_val, model3, batch_size=16, epochs=30
)
hist4, train_score4, val_score4 = evaluate_model(
    x_train, y_train, x_val, y_val, model4, batch_size=8, epochs=50
)

Now, let's look at the performance of these new models

In [None]:
# Plot the curves of models 2, 3 and 4, in that order.
for candidate_hist in (hist2, hist3, hist4):
    results_summary(candidate_hist)

Out of the models, Model 4 seems to perform the best. It doesn't seem to overfit and achieves the best performance. Let's use these specifications to create the final model.

In [None]:
def fit_model(x_train, y_train, model, batch_size, epochs, save_path='Diabetic_Retinopathy_Model.h5'):
    """Train ``model``, score it on the training data and save it to disk.

    Args:
        x_train, y_train: Training inputs and targets passed to ``model.fit``.
        model: Object exposing Keras-style ``fit``, ``evaluate`` and ``save``.
        batch_size: Mini-batch size forwarded to ``fit``.
        epochs: Number of epochs forwarded to ``fit``.
        save_path: File the trained model is written to. Defaults to the
            filename this notebook reloads before the test evaluation.

    Returns:
        ``(history, train_score)``: the value returned by ``fit`` and the
        second entry of the ``evaluate`` result on the training data.
    """
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
    _, train_score = model.evaluate(x_train, y_train)
    model.save(save_path)
    return history, train_score

In [None]:
# Final model: same settings as model 4 (batch size 8, 50 epochs).
# Note: this rebinds `hist` and `train_score` from the baseline run.
final_model = define_VGG8()
hist, train_score = fit_model(
    x_train,
    y_train,
    final_model,
    batch_size=8,
    epochs=50,
)

Let's finally see how the model runs on the test set.

In [None]:
# Reload the saved model from disk and report its second evaluation metric on
# the held-out test set (the first entry, the loss, is discarded).
final_model = models.load_model('Diabetic_Retinopathy_Model.h5')
_, score = final_model.evaluate(x=x_test, y=y_test)
print(score)

Not bad!