# Uncomment to install all necessary libraries

In [None]:
#!pip install numpy pandas matplotlib torch scikit-learn Pillow seaborn

# Import all necessary libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os, zipfile, fnmatch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

from sklearn.utils import shuffle
from PIL import Image
from glob import glob
from random import sample

In [None]:
# Select the compute device once: the first CUDA GPU when one is visible,
# otherwise fall back to the CPU
gpu_available = torch.cuda.is_available()
print('Using GPU for this demo...' if gpu_available else 'Using CPU for this demo...')
device = torch.device("cuda:0") if gpu_available else torch.device("cpu")

# Unzip imagefiles (training data)

In [None]:
import zipfile, fnmatch

indir = 'train_images_resized_500'
pattern = '*.zip'

datadir = 'train_images_500' # where we will be storing all training images
# Recreate the data directory so every run starts from an empty folder
os.system(f'rm -r {datadir}')
os.system(f'mkdir {datadir}')

for root, dirs, files in os.walk(indir):
    for filename in fnmatch.filter(files, pattern):
        # Join with `root` (the folder currently being walked), not `indir`,
        # so archives found in sub-folders resolve to their real location
        filepath = os.path.join(root, filename)

        # Unzip files
        # NOTE(review): `unzip` extracts relative to the current working
        # directory; the move below assumes the archive's internal paths
        # reproduce "<zip path without .zip>/" - confirm against the archives
        print(f'Unzipping {filepath}')
        os.system(f'unzip -q {filepath}')

        # Move all imagefiles under nested directory to main data directory
        subdir = filepath.replace('.zip', '')
        os.system(f'mv {subdir}/* {datadir}')
        os.system(f'rm -r {subdir}')

# Get training data

In [None]:
def get_abspath(directory):
    """Collect the path of every file found anywhere below `directory`.

    Each path is the walked folder joined with the file name, so the results
    are absolute only when `directory` itself is given as an absolute path.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [None]:
def preprocess(filepath, dirpath):
    """Load the label CSV and attach the image file path of every row.

    Args:
        filepath: Path to a CSV file that contains an ``id_code`` column.
        dirpath: Directory searched recursively for image files.

    Returns:
        DataFrame with the CSV rows for which an image file was found, plus
        an ``image_path`` column. Rows without an image are dropped; the
        remaining rows keep their original index labels.
    """

    # Read data into pandas dataframe
    data = pd.read_csv(filepath)

    # Understanding data column types
    print(f'* Get type of each data column: \n {data.dtypes} \n')

    # Get all image files under directory
    imagefiles = get_abspath(dirpath)

    # Index images by file name without extension so the usual case
    # (file named "<id_code>.<ext>") is a dictionary lookup instead of a scan
    by_stem = {}
    for imagefile in imagefiles:
        stem = os.path.splitext(os.path.basename(imagefile))[0]
        by_stem.setdefault(stem, imagefile)

    # Exactly one entry per row: appending every match would make the column
    # longer than the frame as soon as an id matches more than one file
    image_paths = []
    for id_code in data['id_code']:
        id_code = str(id_code)  # ids parsed as numbers must still match text
        match = by_stem.get(id_code)
        if match is None:
            # Fall back to the substring match used for other naming schemes
            match = next((path for path in imagefiles if id_code in path), '')
        image_paths.append(match)

    # Update data with image paths column
    data['image_path'] = image_paths
    data = data[data['image_path'] != ''] # in case there are missing image paths
    print(f'* Updated data ({len(data)} rows)...')
    print(f'{data.head()} \n')

    # Use groupby and count to see if dataset is balanced; files without a
    # `diagnosis` column simply skip this summary
    if 'diagnosis' in data.columns:
        count = data.groupby(['diagnosis'])['diagnosis'].count()
        print(f'* Get count of each class label: \n {count}')

    return data

In [None]:
# Training data (which will be split for training and validation)
# `datadir` is the image directory filled by the unzip step earlier in the file
trainfile = 'train.csv'
# Only the CSV rows whose image file was found under `datadir` are kept
data = preprocess(trainfile, datadir)

# Display training data

In [None]:
# Preview at most 50 randomly chosen images; random.sample raises ValueError
# when asked for more items than the population contains
all_images = get_abspath(datadir)
sample_images = sample(all_images, min(50, len(all_images)))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Render every sampled image in its own figure, titled with its file path
for image in sample_images:
    plt.figure()
    img = mpimg.imread(image)  # decode the image file into an array
    imgplot = plt.imshow(img)
    plt.title(image)
    plt.show()

# Split dataset (training + validation)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation. No `random_state` is passed, so
# the split differs between runs.
# NOTE(review): no `stratify` argument either - class proportions in the two
# parts are left to chance; confirm that is intended.
train_set, validation_set = train_test_split(data, test_size=0.15)
print(f'Training: {len(train_set)} Validation: {len(validation_set)}')

In [None]:
class MyDataset(Dataset):
    """Grayscale image dataset built from a frame of image paths and grades.

    The input frame must provide an ``image_path`` column and a ``diagnosis``
    column; only rows whose diagnosis is 0, 1, 2, 3 or 4 are used. Internally
    the samples are stored in ``self.dataframe`` (columns ``image`` and
    ``labels``) ordered by class: all 0s first, then all 1s, and so on.

    Indexing accepts an integer, a slice or a list/array of positions and
    returns ``[images, labels]`` as tensors on the module-level ``device``:
    ``images`` has shape (N, 1, height, width) with N == 1 for an integer
    index; ``labels`` is a scalar tensor for an integer index and a 1-D
    tensor otherwise.
    """

    def __init__(self,data,reshape=True,height=128,width=128,autoencoder=False):
        """Store the image paths per class and build the sample table.

        Args:
            data: DataFrame with ``diagnosis`` and ``image_path`` columns.
            reshape: When true, every image is resized to (height, width).
            height: Target image height in pixels.
            width: Target image width in pixels.
            autoencoder: Stored on the instance; not used by this class.
        """
        self.no_class = data[data['diagnosis']==0]['image_path'].tolist()
        self.mild_class = data[data['diagnosis']==1]['image_path'].tolist()
        self.moderate_class = data[data['diagnosis']==2]['image_path'].tolist()
        self.severe_class = data[data['diagnosis']==3]['image_path'].tolist()
        self.proliferative_class = data[data['diagnosis']==4]['image_path'].tolist()

        self.height = height
        self.width = width
        self.reshape = reshape
        self.autoencoder = autoencoder

        # Position in this list is the class label (0..4)
        per_class = [
            self.no_class,
            self.mild_class,
            self.moderate_class,
            self.severe_class,
            self.proliferative_class,
        ]
        links = [path for paths in per_class for path in paths]
        labels = [label for label, paths in enumerate(per_class) for _ in paths]

        self.dataframe = pd.DataFrame({"image":links, "labels":labels})

    def __len__(self):
        """Return the total number of samples over all five classes."""
        return len(self.dataframe)

    def _load_image(self, path):
        """Read one image as a (1, H, W) grayscale array."""
        # The context manager closes the file once the pixels are converted
        with Image.open(path) as handle:
            image = handle.convert("L")

        if self.reshape:
            # PIL expects (width, height); the resulting array is (height, width)
            image = image.resize((self.width, self.height))

        # Add the channel axis without assuming a particular image size
        return np.asarray(image)[np.newaxis, :, :]

    def __getitem__(self,idx):
        """Return ``[images, labels]`` for an int, slice or list of positions."""
        single = isinstance(idx, (int, np.integer))
        rows = self.dataframe.iloc[[idx]] if single else self.dataframe.iloc[idx]

        image_array = [self._load_image(path) for path in rows["image"]]

        label_list = rows["labels"].to_numpy(dtype=np.int64)
        if single:
            # Integer index: keep the label a scalar so `.item()` works on it
            label_list = label_list[0]

        return [torch.tensor(np.array(image_array),device=device),torch.tensor(label_list,device=device)]

    def __repr__(self):
        """Return the per-class sample counts followed by the first five rows."""
        # Use groupby and count to see if dataset is balanced
        count = self.dataframe.groupby(['labels'])['labels'].count()
        return f'* Get count of each class label: \n{count}\n\n{self.dataframe.head(5)}'

In [None]:
# Wrap the training frame in a MyDataset. The name `train_set` is rebound, so
# running this statement a second time would pass a MyDataset, not a
# DataFrame, to the constructor.
train_set = MyDataset(train_set)
train_set

In [None]:
# Wrap the validation frame in a MyDataset. The name `validation_set` is
# rebound, so running this statement a second time would pass a MyDataset,
# not a DataFrame, to the constructor.
validation_set = MyDataset(validation_set)
validation_set

# Custom CNN Model

In [None]:
class Model(nn.Module):
    """Small CNN that maps a 1-channel image batch to 5 class scores.

    The size of the final layer (126976 = 32 * 62 * 64) fixes the expected
    input to 1 x 128 x 128 images; other sizes fail at `linear3`.
    """
    
    def __init__(self):      
        super().__init__()
        
        # Shape comments below are for a 1 x 128 x 128 input
        self.conv1 = nn.Sequential(
            nn.Conv2d(1,256,kernel_size=3), # 126*126*256
            nn.MaxPool2d(2,2), # 63*63*256
            nn.Conv2d(256,32,kernel_size=2) # 63-2+1 = 62*62*32
        )
        
        # Output size of a convolution: (n - f + 2p)/s + 1
        # n-f+2p/s +1 
        
        # nn.Linear acts on the last axis only, so linear1/linear2 transform
        # the width axis (62 -> 128 -> 64) of every channel and row
        self.linear1 = nn.Linear(62,128)
        self.linear2 = nn.Linear(128,64)
        self.flat = nn.Flatten(1)  # keep the batch axis, flatten the rest
        self.linear3 = nn.Linear(126976,5) # 5 b/c there are 5 classes/labels
        
    def forward(self,x):
        """Return raw class scores of shape (batch, 5) for input `x`."""
        x = F.relu(self.conv1(x))    # (B, 32, 62, 62)
        x = F.relu(self.linear1(x))  # (B, 32, 62, 128)
        x = self.linear2(x)          # (B, 32, 62, 64), no activation applied
        x = self.flat(x)             # (B, 126976)
        x = self.linear3(x)          # (B, 5)
        
        return x

In [None]:
# Build the network and move its parameters to the selected device
model = Model()
model.to(device)

In [None]:
# Cross-entropy over the raw class scores produced by the model
loss_fn = nn.CrossEntropyLoss()
# Adam with its default hyper-parameters (none are overridden here)
optimizer = optim.Adam(model.parameters())

In [None]:
epochs = 10
batch_size = 32
loss_list = []

# Only full batches are used, so at least `batch_size` samples are required
num_batches = len(train_set)//batch_size
if num_batches == 0:
    raise ValueError(f'Need at least {batch_size} training samples, got {len(train_set)}')

model.train()

for epoch in range(epochs):
    total_loss = 0.0

    # The dataset stores its samples grouped by class, so reading it in order
    # would give batches made of a single class; draw a fresh order per epoch
    order = np.random.permutation(len(train_set))

    for n in range(num_batches):
        batch_idx = order[n*batch_size : (n+1)*batch_size].tolist()
        # Named `images`/`targets` so the DataFrame called `data` is not rebound
        images, targets = train_set[batch_idx]

        ypred = model(images.float())
        loss = loss_fn(ypred,targets)

        # Detach so the running sum does not keep every batch's graph alive
        total_loss += loss.detach()

        optimizer.zero_grad() # clear the gradients
        loss.backward() # calculate the gradient
        optimizer.step() # Wn = Wo - lr * gradient

    # Mean loss per batch; kept as a tensor like the entries stored before
    epoch_loss = total_loss/num_batches
    loss_list.append(epoch_loss)

    print("Epochs {}  Training Loss {:.2f}".format(epoch+1,float(epoch_loss)))

# Save out trained model

In [None]:
# Only the parameters (state_dict) are written; loading them back requires
# constructing a Model instance first
modelname = 'ellie-trained-model-cpu-500.pth'
torch.save(model.state_dict(), modelname)

# Display convergence

In [None]:
fig = plt.figure(figsize=(10,10))
# One point per recorded epoch: `loss_list` can hold more entries than
# `epochs` once training has been continued. float() accepts both 0-dim
# tensors and plain numbers.
losses = [float(x) for x in loss_list]
plt.plot(list(range(len(losses))),losses)
plt.title("Loss vs Epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

# Model evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

def evaluate(model, validation_set, plot='no'):
    """Predict every validation sample, then show a confusion matrix and report.

    Args:
        model: Network called on one image batch at a time.
        validation_set: Indexable dataset whose items are ``(image, label)``
            tensor pairs with a single sample each.
        plot: Pass ``'yes'`` to display every image with its actual and
            predicted class.

    Returns:
        None. Progress, the confusion-matrix figure and the classification
        report are printed / displayed.
    """

    # Map labels to reference class
    mapping = {0:"No DR", 1:"Mild", 2:"Moderate", 3:"Severe", 4:"Proliferative DR"}
    class_names = list(mapping.values())

    # Ground truths vs. model predictions
    total = len(validation_set)

    actuals, predictions = [], []

    # Inference only: switch to eval mode and skip gradient bookkeeping,
    # restoring the previous mode afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(total):
                print(f'Predicting {i+1}/{total}...')

                data, target = validation_set[i]

                pred = torch.argmax(model(data.float()), dim=1)

                actual = mapping[target.item()]
                prediction = mapping[pred.item()]

                actuals.append(actual)
                predictions.append(prediction)

                # Plot results...
                if plot == 'yes':
                    plt.figure()
                    plt.imshow(data[0][0].cpu())
                    plt.title(f"Actual : {actual} | Prediction : {prediction}")
                    plt.show()
    finally:
        model.train(was_training)

    # Build confusion matrix. `labels` pins the row/column order to the
    # mapping order; without it the string labels are sorted alphabetically
    # and classes that never occur are dropped, so the ticks would not match.
    matrix = confusion_matrix(actuals, predictions, labels=class_names)
    plt.figure(figsize=(10,6))
    fx = sns.heatmap(matrix, annot=True, fmt="d", cmap="GnBu",
                     xticklabels=class_names, yticklabels=class_names)
    fx.set_title('Confusion Matrix \n')
    fx.set_xlabel('\n Predicted Values\n')
    fx.set_ylabel('Actual Values\n')
    plt.show()

    # Calculate metrics, listing the classes in the same order as the matrix
    print(classification_report(actuals, predictions, labels=class_names))
    

In [None]:
# Score the trained model on the held-out validation set (no per-image plots)
evaluate(model, validation_set, plot='no')

# (Optional) Continue training

In [None]:
def continue_training(model, train_set, loss_list, epochs=5, batch_size=32):
    """Train `model` for additional epochs and extend `loss_list`.

    Uses the module-level `loss_fn` and `optimizer`.

    Args:
        model: Network to keep training (updated in place).
        train_set: Dataset that can be indexed with a list of positions and
            returns ``(images, targets)`` tensors.
        loss_list: List receiving one mean batch loss (0-dim tensor) per
            epoch; modified in place.
        epochs: Number of additional epochs.
        batch_size: Samples per batch; a trailing partial batch is skipped.

    Returns:
        The same `model` and `loss_list` objects that were passed in.

    Raises:
        ValueError: If `train_set` holds fewer than `batch_size` samples.
    """
    if epochs <= 0:
        return model, loss_list

    num_batches = len(train_set)//batch_size
    if num_batches == 0:
        raise ValueError(f'Need at least {batch_size} training samples, got {len(train_set)}')

    model.train()

    for epoch in range(epochs):
        total_loss = 0.0

        # Draw a fresh sample order per epoch so batches mix the classes even
        # when the dataset stores its samples grouped by class
        order = np.random.permutation(len(train_set))

        for n in range(num_batches):
            batch_idx = order[n*batch_size : (n+1)*batch_size].tolist()
            data , target = train_set[batch_idx]

            ypred = model(data.float())
            loss = loss_fn(ypred,target)

            # Detach so the running sum does not keep every batch's graph alive
            total_loss += loss.detach()

            optimizer.zero_grad() # clear the gradients
            loss.backward() # calculate the gradient
            optimizer.step() # Wn = Wo - lr * gradient

        # Mean loss per batch, used for both the history and the log line
        epoch_loss = total_loss/num_batches
        loss_list.append(epoch_loss)

        print("Epochs {}  Training Loss {:.2f}".format(epoch+1,float(epoch_loss)))

    return model, loss_list

In [None]:
# Second pass
# continue_training updates `model` in place and appends to `loss_list`, so
# `updated_model` / `updated_loss_list` refer to those same objects
updated_model, updated_loss_list = continue_training(model, train_set, loss_list, epochs=5, batch_size=32)

# Save out updated model
modelname = 'ellie-trained-model-cpu-500-updated.pth'
torch.save(updated_model.state_dict(), modelname)

# Evaluate with updated model
evaluate(updated_model, validation_set, plot='no')

# Summary