In [1]:
from pathlib import Path
import os
import random
from PIL import Image
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Locations of the ODIR-5K training images folder and the annotations spreadsheet,
# both resolved relative to the Kaggle input directory.
data_path = Path('/kaggle/input')
image_path = Path(data_path, 'ocular-disease-recognition-odir5k', 'ODIR-5K', 'ODIR-5K', 'Training Images')
# The spreadsheet sits next to the images folder, i.e. in the same parent directory.
annotations_path = image_path.parent / 'data.xlsx'

In [None]:
# TODO: download the data here if it has not been downloaded yet?
# For now this only reports when the images folder is missing; nothing is downloaded.
if not image_path.is_dir():
    print(f'{image_path} doesnt exist')

In [4]:
# Read the annotations spreadsheet into a DataFrame and display it as the cell output.
odir_df = pd.read_excel(io=annotations_path)
odir_df

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0
3496,4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
3497,4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0
3498,4690,57,Male,4690_left.jpg,4690_right.jpg,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0


In [5]:
# Use the 'ID' column as the DataFrame index.
# The guard keeps this cell re-runnable: once 'ID' has become the index it is no longer
# a column, so an unconditional second set_index('ID') would raise KeyError.
if odir_df.index.name != 'ID':
    odir_df = odir_df.set_index('ID')
odir_df

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0
4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0
4690,57,Male,4690_left.jpg,4690_right.jpg,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0


In [6]:
# Count how often each diagnostic keyword occurs in the left-eye column.
# One spreadsheet cell may hold several keywords separated by the fullwidth comma '，'
# (U+FF0C) -- this is NOT the ASCII ',' so it must be copied exactly, not retyped.
# Keywords are counted exactly as stored (no whitespace stripping or case folding).
left_diagnostic_keys_array = odir_df['Left-Diagnostic Keywords'].to_numpy()
# Glue every row together with the same separator so one split yields the flat keyword list.
left_diagnostic_keys_string = '，'.join(left_diagnostic_keys_array)
left_diagnostic_keys_split_array = left_diagnostic_keys_string.split('，')
# np.unique returns the keywords sorted, together with their counts.
values, counts = np.unique(left_diagnostic_keys_split_array, return_counts=True)
# .tolist() yields plain Python str/int, so the repr-based listing below prints `159`
# instead of `np.int64(159)` on NumPy >= 2.
left_diagnostic_occurences = dict(zip(values.tolist(), counts.tolist()))
print(f'Left diagnostics array: {left_diagnostic_keys_array[:4]}')
print(f'Left diagnostics string: {left_diagnostic_keys_string[:50]}')
print(f'Left diagnostics split array: {left_diagnostic_keys_split_array[:4]}')
print(f'Number of unique diagnostics: {len(left_diagnostic_occurences)}')
print('Unique diagnostics and occurences:')
# One "keyword: count," entry per line, wrapped in braces.
print('{' + '\n'.join(f'{k!r}: {v!r},' for k, v in left_diagnostic_occurences.items()) + '}')

Left diagnostics array: ['cataract' 'normal fundus'
 'laser spot，moderate non proliferative retinopathy' 'normal fundus']
Left diagnostics string: cataract，normal fundus，laser spot，moderate non pro
Left diagnostics split array: ['cataract', 'normal fundus', 'laser spot', 'moderate non proliferative retinopathy']
Number of unique diagnostics: 82
Unique diagnostics and occurences:
{'age-related macular degeneration': 1,
'anterior segment image': 1,
'arteriosclerosis': 1,
'asteroid hyalosis': 1,
'atrophic change': 1,
'atrophy': 1,
'branch retinal artery occlusion': 1,
'branch retinal vein occlusion': 16,
'cataract': 159,
'central retinal vein occlusion': 7,
'chorioretinal atrophy': 15,
'chorioretinal atrophy with pigmentation proliferation': 3,
'choroidal nevus': 1,
'depigmentation of the retinal pigment epithelium': 3,
'diabetic retinopathy': 29,
'diffuse chorioretinal atrophy': 1,
'diffuse retinal atrophy': 1,
'drusen': 87,
'dry age-related macular degeneration': 108,
'epiretinal membra

In [7]:
# Count how often each diagnostic keyword occurs in the right-eye column.
# One spreadsheet cell may hold several keywords separated by the fullwidth comma '，'
# (U+FF0C) -- this is NOT the ASCII ',' so it must be copied exactly, not retyped.
# NOTE(review): keywords are counted exactly as stored; the recorded output lists
# 'abnormal pigment ' with a trailing space -- consider stripping if variants should merge.
right_diagnostic_keys_array = odir_df['Right-Diagnostic Keywords'].to_numpy()
# Glue every row together with the same separator so one split yields the flat keyword list.
right_diagnostic_keys_string = '，'.join(right_diagnostic_keys_array)
right_diagnostic_keys_split_array = right_diagnostic_keys_string.split('，')
# np.unique returns the keywords sorted, together with their counts.
values, counts = np.unique(right_diagnostic_keys_split_array, return_counts=True)
# .tolist() yields plain Python str/int, so the repr-based listing below prints `154`
# instead of `np.int64(154)` on NumPy >= 2.
right_diagnostic_occurences = dict(zip(values.tolist(), counts.tolist()))
print(f'Right diagnostics array: {right_diagnostic_keys_array[:4]}')
print(f'Right diagnostics string: {right_diagnostic_keys_string[:50]}')
print(f'Right diagnostics split array: {right_diagnostic_keys_split_array[:4]}')
print(f'Number of unique diagnostics: {len(right_diagnostic_occurences)}')
print('Unique diagnostics and occurences:')
# One "keyword: count," entry per line, wrapped in braces.
print('{' + '\n'.join(f'{k!r}: {v!r},' for k, v in right_diagnostic_occurences.items()) + '}')

Right diagnostics array: ['normal fundus' 'normal fundus' 'moderate non proliferative retinopathy'
 'branch retinal artery occlusion']
Right diagnostics string: normal fundus，normal fundus，moderate non prolifera
Right diagnostics split array: ['normal fundus', 'normal fundus', 'moderate non proliferative retinopathy', 'branch retinal artery occlusion']
Number of unique diagnostics: 84
Unique diagnostics and occurences:
{'abnormal pigment ': 3,
'age-related macular degeneration': 2,
'anterior segment image': 1,
'atrophic change': 6,
'atrophy': 1,
'branch retinal artery occlusion': 2,
'branch retinal vein occlusion': 10,
'cataract': 154,
'central retinal artery occlusion': 4,
'central retinal vein occlusion': 3,
'central serous chorioretinopathy': 1,
'chorioretinal atrophy': 11,
'chorioretinal atrophy with pigmentation proliferation': 1,
'congenital choroidal coloboma': 1,
'depigmentation of the retinal pigment epithelium': 2,
'diabetic retinopathy': 25,
'drusen': 93,
'dry age-related ma

In [8]:
# Count keyword occurrences over both eyes by pooling the left and right keyword lists.
# NOTE(review): keywords are counted exactly as stored; the recorded output lists
# 'abnormal pigment ' with a trailing space -- consider stripping if variants should merge.
all_diagnostic_keys_split_array = np.concatenate((left_diagnostic_keys_split_array, right_diagnostic_keys_split_array))
# np.unique returns the keywords sorted, together with their counts.
values, counts = np.unique(all_diagnostic_keys_split_array, return_counts=True)
# .tolist() yields plain Python str/int, so the repr-based listing below prints `313`
# instead of `np.int64(313)` on NumPy >= 2.
all_diagnostic_occurences = dict(zip(values.tolist(), counts.tolist()))
print(f'Both eyes diagnostics split array: {all_diagnostic_keys_split_array[:4]}')
print(f'Number of unique diagnostics: {len(all_diagnostic_occurences)}')
print('Unique diagnostics and occurences:')
# One "keyword: count," entry per line, wrapped in braces.
print('{' + '\n'.join(f'{k!r}: {v!r},' for k, v in all_diagnostic_occurences.items()) + '}')

Both eyes diagnostics split array: ['cataract' 'normal fundus' 'laser spot'
 'moderate non proliferative retinopathy']
Number of unique diagnostics: 105
Unique diagnostics and occurences:
{'abnormal pigment ': 3,
'age-related macular degeneration': 3,
'anterior segment image': 2,
'arteriosclerosis': 1,
'asteroid hyalosis': 1,
'atrophic change': 7,
'atrophy': 2,
'branch retinal artery occlusion': 3,
'branch retinal vein occlusion': 26,
'cataract': 313,
'central retinal artery occlusion': 4,
'central retinal vein occlusion': 10,
'central serous chorioretinopathy': 1,
'chorioretinal atrophy': 26,
'chorioretinal atrophy with pigmentation proliferation': 4,
'choroidal nevus': 1,
'congenital choroidal coloboma': 1,
'depigmentation of the retinal pigment epithelium': 5,
'diabetic retinopathy': 54,
'diffuse chorioretinal atrophy': 1,
'diffuse retinal atrophy': 1,
'drusen': 180,
'dry age-related macular degeneration': 228,
'epiretinal membrane': 187,
'epiretinal membrane over the macula': 14,
'