# Ocular Disease Recognition - Preprocessing

In [None]:
%%capture
# Presumably required by pd.read_excel to open the .xlsx annotation file.
!pip install openpyxl

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Excel sheet with the annotations (loaded into main_df).
DATA_PATH = (
    '/kaggle/input/ocular-disease-recognition-odir5k'
    '/ODIR-5K/ODIR-5K/data.xlsx'
)
# Folder containing the <ID>_left.jpg / <ID>_right.jpg eye images.
# The trailing slash is part of the value.
IMG_DIR = (
    '/kaggle/input/ocular-disease-recognition-odir5k'
    '/ODIR-5K/ODIR-5K/Training Images/'
)

In [None]:
# Load the annotation spreadsheet into a DataFrame.
main_df = pd.read_excel(DATA_PATH)
# Sanity check: (rows, columns), followed by a preview of the first rows.
print(main_df.shape)
main_df.head()

I intend to feed the network two images at a time: the left and the right eye together. This makes more sense from a medical point of view, and it also lets me use the label columns directly (they do not specify which eye was affected).<br>
If I wanted to use one image at a time, I would have to search the "Left-Diagnostic Keywords" and "Right-Diagnostic Keywords" columns for specific keywords. In some cases this wouldn't be a problem (e.g. a cataract is always a "cataract"), but complex diseases like diabetes can manifest differently, different terms can be used to describe the same thing, and even the same term can be spelled differently by different doctors, e.g. "nonproliferative" and "non proliferative".<br><br>

In this dataset the image sizes vary widely (height: 188 - 3456, width: 250 - 5184), with roughly 150 images under 1000px in size and 10 images under 200px. Most of the images are horizontal or square, but there are also 114 vertical images.<br>
(These numbers come from my notebook on file structure exploration: https://www.kaggle.com/annaszal/ocular-files)<br><br>

Since small images make up only a small fraction of the whole set, I am going to choose a size that better preserves information; the small images will get upscaled. I'm also going to remove black borders and crop all images to a square, then concatenate the left and right images into one. I suppose I could stack the two images along the channel dimension instead of dealing with non-square input, or create a new dimension, but I can still do that after I load the concatenated images, so it is more of a personal preference at this point.

In [None]:
# Side length (px) each eye image is resized to. The left/right pair is then
# concatenated horizontally, so saved images are IMG_SIZE high, 2*IMG_SIZE wide.
IMG_SIZE = 512

In [None]:
def crop(image):
    """Trim black left/right borders and center-crop the image to a square.

    Args:
        image: array indexed as (row, column, channel). It must already be
            normalized so that border pixels are exactly 0.

    Returns:
        A slice of ``image`` with equal height and width. The result may
        share memory with the input array.
    """
    # A column counts as black when all of its pixels, over all channels,
    # sum to 0.
    column_sums = image.sum(axis=(0, 2))
    lit_columns = np.flatnonzero(column_sums)

    # Trim only the outer margins: keep everything between the first and the
    # last non-black column. Dropping black columns that lie inside the
    # picture would shift its content sideways and distort it.
    # A completely black image has no border that can be told apart from the
    # content, so it is left untouched rather than reduced to zero width.
    if lit_columns.size:
        image = image[:, lit_columns[0]:lit_columns[-1] + 1]

    # Center-crop the longer side down to the length of the shorter one.
    h, w = image.shape[:2]
    if h < w:
        x = (w - h) // 2
        image = image[:, x:x + h, :]
    elif h > w:
        x = (h - w) // 2
        image = image[x:x + w, :, :]

    return image

In [None]:
def preprocess_image(file_name):
    """Load one eye image and return it as an IMG_SIZE x IMG_SIZE square.

    The file is read from ``IMG_DIR``, min-max normalized to the 0-255 range,
    passed through ``crop`` and finally resized.

    Args:
        file_name: name of the image file inside ``IMG_DIR``.

    Returns:
        The preprocessed image array as produced by OpenCV.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (missing or
            unreadable).
    """
    image_path = os.path.join(IMG_DIR, file_name)
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with a later AttributeError.
    if image is None:
        raise FileNotFoundError(
            f'Image is missing or unreadable: {image_path}'
        )

    # Stretch intensities so the darkest value maps to 0; crop() relies on
    # black pixels being exactly 0. Passing None as dst lets OpenCV allocate
    # the output itself, avoiding a throw-away float64 buffer the size of the
    # full-resolution image.
    norm_img = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)

    image = crop(norm_img)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

    return image

def preprocess_patient(patient_id):
    """Build the side-by-side (left | right) eye image for one patient.

    The files ``<patient_id>_left.jpg`` and ``<patient_id>_right.jpg`` are
    each run through ``preprocess_image`` and the two results are joined
    horizontally, left eye first.
    """
    left_half = preprocess_image(f'{patient_id}_left.jpg')
    right_half = preprocess_image(f'{patient_id}_right.jpg')
    return cv2.hconcat([left_half, right_half])

In [None]:
# example
# Preview the combined image of the patient stored at row position 7.
patient_id = main_df.iloc[7]['ID']
image = preprocess_patient(patient_id)
# Swap the channel order from BGR to RGB before handing it to matplotlib.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image)

# Create image files

In [None]:
# Preprocess every patient listed in main_df and keep the combined images in
# memory, keyed by patient ID, so they can be written to disk afterwards.
images = {}
for patient_id in tqdm(main_df['ID']):
    images[patient_id] = preprocess_patient(patient_id)

In [None]:
# Output layout: <output_dir>/images/ receives the combined JPEG files.
output_dir = "ocular"
img_dir = os.path.join(output_dir, 'images')
# makedirs also creates the parent folder (output_dir); exist_ok=True makes
# the cell safe to re-run instead of raising FileExistsError.
os.makedirs(img_dir, exist_ok=True)

In [None]:
# Show the Kaggle working directory (presumably the notebook's cwd, in which
# case the new output folder should be listed here).
os.listdir('/kaggle/working')

In [None]:
# Confirm that the images sub-folder exists inside output_dir.
os.listdir(output_dir)

In [None]:
# Write every combined image to <img_dir>/<patient_id>.jpg.
for patient_id, patient_image in tqdm(images.items()):
    out_file_path = os.path.join(img_dir, f'{patient_id}.jpg')
    # cv2.imwrite reports failure by returning False instead of raising;
    # surface it so an incomplete export does not go unnoticed.
    if not cv2.imwrite(out_file_path, patient_image):
        raise OSError(f'Failed to write image: {out_file_path}')

In [None]:
# Count all files below img_dir (recursively) to verify the export.
total_files = sum(len(file_names) for _, _, file_names in os.walk(img_dir))

total_files

In [None]:
# example
# Read one exported file back from disk to check the result visually.
# NOTE(review): assumes a patient with ID 0 was exported; cv2.imread returns
# None for a missing file and the cvtColor call below would then fail.
patient_id = 0
image = cv2.imread(os.path.join(img_dir, str(patient_id)+'.jpg'))
# Swap the channel order from BGR to RGB before handing it to matplotlib.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image)

# Create CSV

In [None]:
# fix the comma-like symbol
# The keyword columns may contain the full-width comma "，" as a separator;
# replace it with an ordinary comma followed by a space.
# The replacement is vectorized per column. This is independent of the
# DataFrame's index labels (no mixing of positional and label-based access)
# and leaves missing values untouched instead of failing on them.
for column in ('Left-Diagnostic Keywords', 'Right-Diagnostic Keywords'):
    # regex=False: treat the symbol as a literal string, not as a pattern.
    main_df[column] = main_df[column].str.replace('，', ', ', regex=False)

main_df.head()

In [None]:
# Save the annotations as CSV inside output_dir; index=False keeps the row
# index out of the file.
main_df.to_csv(os.path.join(output_dir, 'data.csv'), index=False)

In [None]:
# Check that data.csv now sits next to the images folder.
os.listdir(output_dir)

In [None]:
# Reload the CSV to verify it can be read back, and inspect the last rows.
df = pd.read_csv(os.path.join(output_dir, 'data.csv'))
df.tail()

In [None]:
# Bundle the output folder into a single archive. The "512x1024" in the name
# is hard-coded (IMG_SIZE = 512, two eyes side by side) and does not follow
# IMG_SIZE automatically.
!zip -r ocular_512x1024.zip ./ocular

In [None]:
# Delete the unpacked output folder once it has been archived.
!rm -R ocular