## Preparing our data
This notebook contains the code to separate the images in the given dataset into different directories based on their labels, so that we can make use of TensorFlow's image data generators.

aptos_data/ contains the fundus images in the dataset and the csv files containing the labels.

train.csv, valid.csv and test.csv contain the image name (id_code) and the level of diabetic retinopathy (diagnosis), graded from 0 to 4 as follows:

0 - No DR
1 - Mild
2 - Moderate
3 - Severe
4 - Proliferative DR

prepared_data_aptos/ contains the prepared data ready to be used.

In [32]:
import os
import shutil
import pandas as pd
import numpy as np
import cv2

In [33]:
# Read the label table of each split, using the image id (`id_code`) as the
# index so that a label can be looked up directly from an image file name.
trainLabels = pd.read_csv('./aptos_data/train.csv', index_col='id_code')
valLabels = pd.read_csv('./aptos_data/valid.csv', index_col='id_code')
testLabels = pd.read_csv('./aptos_data/test.csv', index_col='id_code')

# Preview the first rows of the validation labels.
valLabels.head()

Unnamed: 0_level_0,diagnosis
id_code,Unnamed: 1_level_1
000c1434d8d7,2
001639a390f0,4
0024cdab0c1e,1
002c21358ce6,0
005b95c28852,0


In [34]:
# Print how many images fall into each diagnosis grade, one table per split
# (train, validation, test - in that order).
for split_labels in (trainLabels, valLabels, testLabels):
    print(split_labels['diagnosis'].value_counts())

diagnosis
0    1434
2     808
1     300
4     234
3     154
Name: count, dtype: int64
diagnosis
0    172
2    104
1     40
4     28
3     22
Name: count, dtype: int64
diagnosis
0    199
2     87
4     33
1     30
3     17
Name: count, dtype: int64


In [35]:
# Create one output directory per (split, severity class) pair.
# `exist_ok=True` makes the cell safe to re-run.
split_dirs = ('val_data', 'train_data', 'test_data')
class_dirs = (
    '0 - no_dr',
    '1 - mild',
    '2 - moderate',
    '3 - severe',
    '4 - proliferative',
)
for split_dir in split_dirs:
    for class_dir in class_dirs:
        os.makedirs('./prepared_data_aptos/' + split_dir + '/' + class_dir + '/', exist_ok=True)

In [36]:
# Check the dtype of the label column: the labels are later used as keys into
# a dict whose keys are the integers 0-4.
trainLabels.dtypes['diagnosis']

dtype('int64')

## Transforming the data
We resize every image to 224x224 so that it is compatible with the VGG16 model available in tensorflow.keras.applications.

In [37]:
# Resize every training image to 224x224 and save it into the output directory
# that corresponds to its diagnosis grade.
# (`os` and `cv2` are imported in the import cell at the top of the notebook.)
direc = './aptos_data/train_images/'  # fixed typo: was './atpos_data/...'
dest = './prepared_data_aptos/train_data/'

# Maps a diagnosis grade to the name of its class directory.
# Kept at module level because the test and validation cells reuse it.
severity = {
    0: '0 - no_dr',
    1: '1 - mild',
    2: '2 - moderate',
    3: '3 - severe',
    4: '4 - proliferative',
}

# os.walk() silently yields nothing for a missing directory, so fail loudly
# instead of "succeeding" without processing a single image.
if not os.path.isdir(direc):
    raise FileNotFoundError(f'Image directory not found: {direc}')

for dirpath, _dirnames, filenames in os.walk(direc):
    for f in filenames:
        # Join with `dirpath` (not `direc`) so files in subdirectories resolve.
        src = os.path.join(dirpath, f)

        # The image id is the file name without its extension.
        ind = os.path.splitext(f)[0]
        if ind not in trainLabels.index:
            print(f'Skipping {src}: no label for id {ind!r}')
            continue

        # cv2.imread() returns None (it does not raise) for unreadable files.
        im = cv2.imread(src)
        if im is None:
            print(f'Skipping {src}: could not be read as an image')
            continue
        im = cv2.resize(im, (224, 224))

        # Look the label up by column name rather than by column position.
        sev = trainLabels.loc[ind, 'diagnosis']
        out_path = os.path.join(dest, severity[sev], f)
        if not cv2.imwrite(out_path, im):
            print(f'Could not write {out_path}')

In [38]:
# Resize every test image to 224x224 and save it into the output directory
# that corresponds to its diagnosis grade. `severity` (grade -> directory
# name) is the dict defined in the training cell.
direc = './aptos_data/test_images/'  # fixed typo: was './atpos_data/...'
dest = './prepared_data_aptos/test_data/'

# os.walk() silently yields nothing for a missing directory, so fail loudly
# instead of "succeeding" without processing a single image.
if not os.path.isdir(direc):
    raise FileNotFoundError(f'Image directory not found: {direc}')

for dirpath, _dirnames, filenames in os.walk(direc):
    for f in filenames:
        # Join with `dirpath` (not `direc`) so files in subdirectories resolve.
        src = os.path.join(dirpath, f)

        # The image id is the file name without its extension.
        ind = os.path.splitext(f)[0]
        if ind not in testLabels.index:
            print(f'Skipping {src}: no label for id {ind!r}')
            continue

        # cv2.imread() returns None (it does not raise) for unreadable files.
        im = cv2.imread(src)
        if im is None:
            print(f'Skipping {src}: could not be read as an image')
            continue
        im = cv2.resize(im, (224, 224))

        # Look the label up by column name rather than by column position.
        sev = testLabels.loc[ind, 'diagnosis']
        out_path = os.path.join(dest, severity[sev], f)
        if not cv2.imwrite(out_path, im):
            print(f'Could not write {out_path}')

In [39]:
# Resize every validation image to 224x224 and save it into the output
# directory that corresponds to its diagnosis grade. `severity` (grade ->
# directory name) is the dict defined in the training cell.
# NOTE(review): the labels come from 'valid.csv' while the images are read
# from 'val_images/' - confirm the directory name against the dataset layout.
direc = './aptos_data/val_images/'  # fixed typo: was './atpos_data/...'
dest = './prepared_data_aptos/val_data/'

# os.walk() silently yields nothing for a missing directory, so fail loudly
# instead of "succeeding" without processing a single image.
if not os.path.isdir(direc):
    raise FileNotFoundError(f'Image directory not found: {direc}')

for dirpath, _dirnames, filenames in os.walk(direc):
    for f in filenames:
        # Join with `dirpath` (not `direc`) so files in subdirectories resolve.
        src = os.path.join(dirpath, f)

        # The image id is the file name without its extension.
        ind = os.path.splitext(f)[0]
        if ind not in valLabels.index:
            print(f'Skipping {src}: no label for id {ind!r}')
            continue

        # cv2.imread() returns None (it does not raise) for unreadable files.
        im = cv2.imread(src)
        if im is None:
            print(f'Skipping {src}: could not be read as an image')
            continue
        im = cv2.resize(im, (224, 224))

        # Look the label up by column name rather than by column position.
        sev = valLabels.loc[ind, 'diagnosis']
        out_path = os.path.join(dest, severity[sev], f)
        if not cv2.imwrite(out_path, im):
            print(f'Could not write {out_path}')