## Preparing our data
This notebook contains the code to separate the images in the given dataset into different directories based on their labels so that we can make use of TensorFlow's image data generators.

trainLabels.csv contains the image name and level of diabetic retinopathy on a scale of 0 to 4, according to the following scale:
0 - No DR
1 - Mild
2 - Moderate
3 - Severe
4 - Proliferative DR

train/ contains the fundus images in the dataset

prepared_data/ contains the separated data

In [1]:
import os
import shutil
import pandas as pd
import numpy as np

In [2]:
# Load the label table (one row per image name with its DR level) and
# preview the first five rows.
labels_path = 'trainLabels.csv'
labels = pd.read_csv(labels_path)
labels.head(n=5)

Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1


In [3]:
# Index the table by image name so that a file's stem can be used to look up
# its DR level directly.
# Guarded so that re-running this cell is a no-op: once 'image' has become the
# index it is no longer a column, and an unguarded set_index('image') would
# raise KeyError on the second run.
if labels.index.name != 'image':
    labels = labels.set_index('image')

In [4]:
# Create one folder per DR class under both the train and the test split:
#   ./prepared_data/<split>/<class>/
# exist_ok=True lets this cell be re-run without failing on folders that
# already exist.
for split_dir in ('train_data', 'test_data'):
    for class_dir in ('no_dr', 'mild', 'moderate', 'severe', 'proliferative'):
        os.makedirs(os.path.join('.', 'prepared_data', split_dir, class_dir), exist_ok=True)

In [5]:
# Check the dtype of the 'level' column; the split cells below compare it
# against the integer codes 0-4.
labels.dtypes['level']

dtype('int64')

In [6]:
# Spot-check: with the image name as index, a name resolves straight to its row.
sample_image = '10_left'
labels.loc[sample_image]

level    0
Name: 10_left, dtype: int64

In [7]:
# Source folder holding the fundus images, and the number of entries in it.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because the split cells below read both `dir` and `n`.
dir = './train/'
train_entries = os.listdir(dir)
n = len(train_entries)
n

18327

## Splitting the data
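We perform a standard 80-20 train-test split: the first 80% of the files in train/ go to the training set and the remaining files go to the test set. Before saving an image, we use PIL to center-crop it to a square, resize it to 300x300 pixels, and convert it to grayscale.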

In [8]:
from PIL import Image, ImageFile, ImageOps

# Let PIL load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# DR level (as stored in the 'level' column) -> class sub-folder.
level_dirs = {
    0: 'no_dr/',
    1: 'mild/',
    2: 'moderate/',
    3: 'severe/',
    4: 'proliferative/',
}

dest = './prepared_data/train_data/'
train_count = int(0.8 * n)  # the first 80% of the files go to the training set
skipped = []                # files left untouched because no class folder could be resolved

for dirpath, _dirnames, filename in os.walk(dir):
    # Sort so the split is reproducible and does not depend on the order in
    # which the file system happens to list the folder.
    for f in sorted(filename)[:train_count]:
        src = os.path.join(dirpath, f)

        # Resolve the class folder BEFORE modifying the image, so a file with
        # a missing or unexpected label is never overwritten and then stranded.
        ind = f.split('.')[0]
        if ind not in labels.index:
            skipped.append(f)
            continue
        class_dir = level_dirs.get(int(labels.at[ind, 'level']))
        if class_dir is None:
            skipped.append(f)
            continue

        # Center-crop to a square, resize to 300x300 and convert to grayscale.
        # The context manager closes the source file handle deterministically.
        with Image.open(src) as original:
            width, height = original.size
            new_side = min(width, height)
            left = (width - new_side)/2
            top = (height - new_side)/2
            right = (width + new_side)/2
            bottom = (height + new_side)/2

            prepared = original.crop((left, top, right, bottom))
            prepared = prepared.resize((300, 300))
            prepared = ImageOps.grayscale(prepared)

        # Overwrite the source file with the prepared image, then move it into
        # its class folder; whatever remains in the source folder afterwards
        # is the test portion.
        prepared.save(src)
        shutil.move(src=src, dst=dest + class_dir)

if skipped:
    print('Skipped', len(skipped), 'file(s) without a usable label, e.g.', skipped[:5])

In [9]:
# Every file still present in the source folder belongs to the test set.
dest = './prepared_data/test_data/'

# DR level (as stored in the 'level' column) -> class sub-folder.
level_dirs = {
    0: 'no_dr/',
    1: 'mild/',
    2: 'moderate/',
    3: 'severe/',
    4: 'proliferative/',
}

skipped = []  # files left untouched because no class folder could be resolved

for dirpath, _dirnames, filename in os.walk(dir):
    for f in filename:
        src = os.path.join(dirpath, f)

        # Resolve the class folder BEFORE modifying the image, so a file with
        # a missing or unexpected label is never overwritten and then stranded.
        ind = f.split('.')[0]
        if ind not in labels.index:
            skipped.append(f)
            continue
        class_dir = level_dirs.get(int(labels.at[ind, 'level']))
        if class_dir is None:
            skipped.append(f)
            continue

        # Center-crop to a square, resize to 300x300 and convert to grayscale.
        # The context manager closes the source file handle deterministically.
        with Image.open(src) as original:
            width, height = original.size
            new_side = min(width, height)
            left = (width - new_side)/2
            top = (height - new_side)/2
            right = (width + new_side)/2
            bottom = (height + new_side)/2

            prepared = original.crop((left, top, right, bottom))
            prepared = prepared.resize((300, 300))
            prepared = ImageOps.grayscale(prepared)

        # Overwrite the source file with the prepared image, then move it into
        # its class folder.
        prepared.save(src)
        shutil.move(src=src, dst=dest + class_dir)

if skipped:
    print('Skipped', len(skipped), 'file(s) without a usable label, e.g.', skipped[:5])