##  **Separating images into classes, train, test and validation**
### **Instituto de Matemática e Estatística da Universidade de São Paulo (IME-USP)**
### Rodrigo de Castro Michelassi
### 01.04.24

In [1]:
import os
import shutil
import pandas as pd 
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
image_path = '/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/data/brset/physionet.org/files/brazilian-ophthalmological/1.0.0/fundus_photos'
train_path = '/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/data/brset/physionet.org/files/brazilian-ophthalmological/1.0.0/fundus_photos/train'
test_path = '/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/data/brset/physionet.org/files/brazilian-ophthalmological/1.0.0/fundus_photos/test'
val_path = '/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/data/brset/physionet.org/files/brazilian-ophthalmological/1.0.0/fundus_photos/validation'
labels = pd.read_csv('../labels.csv', sep=',')

labels['quality_label'] = [0] * len(labels['quality'])

labels['quality_label'] = labels['quality'].apply(lambda x: 1 if x == 'Adequate' else 0)

In [3]:
# divide the data into train, test and validation
train_df, test_df = train_test_split(labels, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)

train_df.head()

Unnamed: 0,image_id,patient_id,camera,patient_age,comorbidities,diabetes_time_y,insuline,patient_sex,exam_eye,diabetes,...,vascular_occlusion,hypertensive_retinopathy,drusens,hemorrhage,retinal_detachment,myopic_fundus,increased_cup_disc,other,quality,quality_label
13809,img13810,7263,NIKON NF5050,,,,,1,1,no,...,0,0,0,0,0,0,0,0,Adequate,1
5957,img05958,3134,Canon CR,53.0,"SAH, hypercholesterolemia",,,1,2,no,...,0,0,0,0,0,0,0,0,Adequate,1
11938,img11939,6283,Canon CR,32.0,,,,2,2,no,...,0,0,0,0,0,0,0,0,Adequate,1
369,img00370,188,Canon CR,83.0,0,,,1,1,no,...,0,0,1,0,0,0,1,0,Adequate,1
10708,img10709,5620,Canon CR,49.0,0,,,1,1,no,...,0,0,0,0,0,0,0,0,Adequate,1


In [4]:
def contBalance(df):
    contAdequate = 0
    contInadequate = 0
    for i in df['quality']:
        if i == 'Adequate':
            contAdequate += 1
        if i == 'Inadequate':
            contInadequate += 1

    print("Adequados: {} / Inadequados: {} / Balanceamento adequadas: {}".format(contAdequate, contInadequate, contAdequate / (contAdequate + contInadequate)))

print("Treino: ")
contBalance(train_df)
print("Teste: ")
contBalance(test_df)
print("Validação: ")
contBalance(val_df)

Adequados: 9134 / Inadequados: 1275 / Balanceamento adequadas: 0.8775098472475742
Adequados: 2841 / Inadequados: 413 / Balanceamento adequadas: 0.8730792870313461
Adequados: 2304 / Inadequados: 299 / Balanceamento adequadas: 0.8851325393776411


In [5]:
def moveDirectory(img_path, dest_path, df):
    for index, row in df.iterrows():
        image_name = f"{row['image_id']}.jpg"
        quality = row['quality_label']

        original_image_path = os.path.join(img_path, image_name)

        if os.path.exists(original_image_path):
            destination_path = os.path.join(dest_path, str(quality), image_name)

            shutil.move(original_image_path, destination_path)
        else:
            print(f"A imagem {image_name} não foi encontrada.")

    print("Processo de movimentação concluído.")

moveDirectory(img_path=image_path, dest_path=train_path, df=train_df)
moveDirectory(img_path=image_path, dest_path=test_path, df=test_df)
moveDirectory(img_path=image_path, dest_path=val_path, df=val_df)

Processo de movimentação concluído.
Processo de movimentação concluído.
Processo de movimentação concluído.
