In [1]:
import glob
import os
import shutil

import pandas as pd

In [2]:
# user defined functions

def process_files(df, origin, destination):
    '''
    Copy the images of every eye diagnosed as "normal fundus" into per-sex directories.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with the columns 'sex', 'left_fundus', 'left_diagnostic',
        'right_fundus' and 'right_diagnostic'. The fundus columns hold image file names.
    origin : str
        Directory that contains the source images.
    destination : str
        Name of the sub-directory of ../dat/ to write to (e.g. 'train'); images go to
        ../dat/<destination>/male/ and ../dat/<destination>/female/.

    Returns
    -------
    None

    Notes
    -----
    Rows are matched on the exact strings 'normal fundus', 'male' and 'female'.
    The destination directories are created when they do not exist. Copying is
    best-effort: an image that cannot be copied (for example because the source
    file is missing) is skipped, and a single warning line is printed at the end.
    '''
    eye_columns = (
        ('left_fundus', 'left_diagnostic'),
        ('right_fundus', 'right_diagnostic'),
    )

    print('Copying images ...')

    failed = []
    for sex in ('male', 'female'):
        target_dir = os.path.join('..', 'dat', destination, sex)
        # Without this the very first run fails, because nothing else creates the tree.
        os.makedirs(target_dir, exist_ok=True)

        for fundus_col, diagnostic_col in eye_columns:
            is_selected = (df[diagnostic_col] == 'normal fundus') & (df['sex'] == sex)
            # dropna(): a missing file name cannot be turned into a path.
            for img in df.loc[is_selected, fundus_col].dropna():
                source_path = os.path.join(origin, str(img))
                try:
                    # shutil.copy avoids the shell entirely, so file names containing
                    # spaces or shell metacharacters are handled correctly.
                    shutil.copy(source_path, target_dir)
                except OSError:
                    failed.append(source_path)

    if failed:
        print(f'Warning: could not copy {len(failed)} image(s), e.g. {failed[0]}')

# Data loading

In [3]:
# Annotation sheet of the training set, parsed with the openpyxl engine.
df_train = pd.read_excel(
    '../OIA_ODIR/Training_Set/Annotation/training_annotation.xlsx',
    engine='openpyxl',
)

In [4]:
# Annotation sheet of the off-site test set; this notebook keeps it under the name df_val.
df_val = pd.read_excel(
    '../OIA_ODIR/Off-site_TestSet/Annotation/off-site_test_annotation.xlsx',
    engine='openpyxl',
)

In [5]:
# Annotation sheet of the on-site test set, parsed with the openpyxl engine.
df_test = pd.read_excel(
    '../OIA_ODIR/On-site_TestSet/Annotation/on-site_test_annotation.xlsx',
    engine='openpyxl',
)

In [6]:
# (rows, columns) of the train, test and val annotation tables, in that order
tuple(frame.shape for frame in (df_train, df_test, df_val))

((3500, 15), (1000, 15), (500, 15))

# Data preprocessing

In [7]:
# Give the annotation columns short names, then lowercase the values of `sex`.
new_column_names = [
    'id', 'age', 'sex',
    'left_fundus', 'right_fundus',
    'left_diagnostic', 'right_diagnostic',
    'n', 'd', 'g', 'c', 'a', 'h', 'm', 'o',
]

# The old names are taken from the training sheet and paired with the new ones by
# position; the same mapping is then applied to all three tables.
column_names = list(df_train.columns)
columns_dict = {old: new_column_names[pos] for pos, old in enumerate(column_names)}

annotation_frames = (df_train, df_test, df_val)

for frame in annotation_frames:
    frame.rename(columns=columns_dict, inplace=True)

for frame in annotation_frames:
    frame['sex'] = frame['sex'].str.lower()

In [8]:
# drop the single-letter columns listed in disease_columns (process_files does not read them)
disease_columns = ['n', 'd', 'g', 'c', 'a', 'h', 'm', 'o']

# errors='ignore' keeps this cell re-runnable: the drop is in place, so a second
# execution would otherwise raise KeyError because the columns are already gone.
for frame in (df_train, df_test, df_val):
    frame.drop(columns=disease_columns, inplace=True, errors='ignore')

In [9]:
# copying images: one (annotations, image directory, destination name) entry per data set
image_sources = (
    (df_train, '../OIA_ODIR/Training_Set/Images/', 'train'),
    (df_test, '../OIA_ODIR/On-site_TestSet/Images/', 'test'),
    (df_val, '../OIA_ODIR/Off-site_TestSet/Images/', 'val'),
)

for annotations, image_dir, split in image_sources:
    process_files(annotations, image_dir, split)

Copying images ...
Copying images ...
Copying images ...


In [10]:
# Count the .jpg files found in each ../dat/<split>/<sex>/ directory.
train_male, train_female = (
    len(glob.glob(pattern))
    for pattern in ("../dat/train/male/*.jpg", "../dat/train/female/*.jpg")
)

test_male, test_female = (
    len(glob.glob(pattern))
    for pattern in ("../dat/test/male/*.jpg", "../dat/test/female/*.jpg")
)

val_male, val_female = (
    len(glob.glob(pattern))
    for pattern in ("../dat/val/male/*.jpg", "../dat/val/female/*.jpg")
)

# One summary line per split: male count, female count and their sum.
print(f'train: male = {train_male}, female = {train_female}, total = {(train_male + train_female)}')
print(f'test:  male = {test_male},  female = {test_female},  total = {(test_male + test_female)}')
print(f'val:   male = {val_male},  female = {val_female},  total = {(val_male + val_female)}')

train: male = 1545, female = 1271, total = 2816
test:  male = 426,  female = 338,  total = 764
val:   male = 230,  female = 182,  total = 412
