##  **First attempt at an ML model for IQA on BRSet**
### **Instituto de Matemática e Estatística da Universidade de São Paulo (IME-USP)**
### Rodrigo de Castro Michelassi
### 06.03.24

### Importing packages and libraries

In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from itertools import islice
from matplotlib import pyplot as plt
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Report how many GPU devices TensorFlow detects on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


### Importing labels data

In [2]:
# Read the labels table (comma-separated, which is the pandas default)
# and preview its first ten rows.
labels = pd.read_csv('../labels.csv')
labels.head(n=10)

Unnamed: 0,image_id,patient_id,camera,patient_age,comorbidities,diabetes_time_y,insuline,patient_sex,exam_eye,diabetes,...,amd,vascular_occlusion,hypertensive_retinopathy,drusens,hemorrhage,retinal_detachment,myopic_fundus,increased_cup_disc,other,quality
0,img00001,1,Canon CR,48.0,diabetes1,12,yes,1,1,yes,...,0,0,0,0,0,0,0,1,0,Adequate
1,img00002,1,Canon CR,48.0,diabetes1,12,yes,1,2,yes,...,0,0,0,0,0,0,0,1,0,Adequate
2,img00003,2,Canon CR,18.0,diabetes1,7,yes,2,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate
3,img00004,2,Canon CR,18.0,diabetes1,7,yes,2,2,yes,...,0,0,0,0,0,0,0,0,0,Adequate
4,img00005,3,Canon CR,22.0,diabetes1,11,yes,1,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate
5,img00006,3,Canon CR,22.0,diabetes1,11,yes,1,2,yes,...,0,0,0,0,0,0,0,0,0,Adequate
6,img00007,4,Canon CR,22.0,diabetes1,1,yes,1,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate
7,img00008,4,Canon CR,22.0,diabetes1,1,yes,1,2,yes,...,0,0,0,0,0,0,0,0,0,Adequate
8,img00009,5,Canon CR,23.0,diabetes1,20,yes,1,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate
9,img00010,5,Canon CR,23.0,diabetes1,20,yes,1,2,yes,...,0,0,0,0,0,0,0,0,0,Adequate


### Importing image data into training and validation sets

> This approach has the limitation that we cannot split the data into train/validation/test sets, so there is no held-out test set to check whether the model is overfitting the training data

In [3]:
# NOTE(review): machine-specific absolute path; point it at the local copy of
# the BRSet fundus photos before running.
path = "/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/data/brset/physionet.org/files/brazilian-ophthalmological/1.0.0/fundus_photos"

# Settings shared by the training and validation generators, so both subsets
# are always loaded with the same batch size and image size.
BATCH_SIZE = 128
IMAGE_SIZE = (224, 224)

# Multiply pixel values by 1/255 and reserve 30% of the images for validation.
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.3,
)


def make_generator(subset, shuffle):
    """Build a directory iterator over one subset of the images under `path`.

    Args:
        subset: 'training' or 'validation', selecting which side of the
            `validation_split` configured on `datagen` is read.
        shuffle: whether the iterator shuffles the order of the samples.

    Returns:
        The iterator returned by `datagen.flow_from_directory`.
    """
    return datagen.flow_from_directory(
        path,
        batch_size=BATCH_SIZE,
        target_size=IMAGE_SIZE,
        class_mode='categorical',   # import the classes as one-hot encoded labels
        shuffle=shuffle,
        subset=subset,
    )


train_generator = make_generator('training', shuffle=True)

# The validation data is not shuffled: batch contents stay the same on every
# pass and predictions line up with `validation_generator.labels`.
validation_generator = make_generator('validation', shuffle=False)

Found 11387 images belonging to 2 classes.
Found 4879 images belonging to 2 classes.


### Check how many items from each class are in the training set

In [4]:
# Class-name -> index mapping exposed by the training generator.
class_indices = train_generator.class_indices
print("Mapeamento de classes:", class_indices)

# Reverse lookup: index -> class name.
index_to_class = dict(zip(class_indices.values(), class_indices.keys()))

# Translate every sample label to its class name, then count the occurrences
# of each class (a class without samples is reported with a count of 0).
sample_class_names = [index_to_class[sample_label] for sample_label in train_generator.labels]
class_counts = {name: sample_class_names.count(name) for name in class_indices}

print("Contagem de imagens por classe:", class_counts)


Mapeamento de classes: {'0': 0, '1': 1}
Contagem de imagens por classe: {'0': 1391, '1': 9996}


### Check how many items from each class are in the validation set

In [5]:
# Class-name -> index mapping exposed by the validation generator.
class_indices = validation_generator.class_indices
print("Mapeamento de classes:", class_indices)

# Invert the mapping so a label index can be turned back into its class name.
index_to_class = {index: name for name, index in class_indices.items()}

# Start every class at zero, then add one per sample of that class.
class_counts = dict.fromkeys(class_indices, 0)
for sample_class in map(index_to_class.__getitem__, validation_generator.labels):
    class_counts[sample_class] += 1

print("Contagem de imagens por classe:", class_counts)


Mapeamento de classes: {'0': 0, '1': 1}
Contagem de imagens por classe: {'0': 596, '1': 4283}


### Generators yield tuples $(x, y)$, with $x$ being a numpy array containing a batch of images, and $y$ a numpy array with the corresponding one-hot encoded labels

> Validation generator is a list of size $39$, containing tuples

> Each tuple holds a batch of up to $128$ images together with their $128$ labels (the last batch is smaller)

> With that said, the first $30$ tuples hold $30 \times 128 = 3840$ images

> The $9$ remaining tuples ($4879 - 3840 = 1039$ images) are meant to be used as a validation set

In [6]:
# Number of batches (tuples) in the validation generator.
n_batches = len(validation_generator)
print(f"Numero de tuplas: {n_batches}")

# Disabled helper that prints the size of each element of every batch:
# for i in range(len(validation_generator)):
#     x = validation_generator[i]
#     print(f"Tamanho de cada elemento da tupla {i}. x = {len(x[0])}, y = {len(x[1])}")

# TODO: create the validation set here
# val_generator = islice(validation_generator, 30, 39, None)

Numero de tuplas: 39


### Defining the model using VGG16 pre-trained on ImageNet and fine-tuning it on BRSet

In [7]:
# Training hyper-parameters.
num_classes, epochs = 2, 15

# VGG16 convolutional base with ImageNet weights and without its top
# (classifier) layers, fed with 224x224 RGB images.
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Freeze every layer of the base so that training does not update its weights.
for frozen_layer in base_model.layers:
    frozen_layer.trainable = False

# Stack the classification head on top of the frozen base, one layer at a time.
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(tf.keras.layers.Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Categorical cross-entropy matches the one-hot labels; accuracy and AUC are
# tracked during training.
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy', AUC(name='auc')],
)

### Training and saving weights step

> Problem: we could not separate the validation data from the test data, so we are not doing the test step

In [10]:
# Location where the checkpoint callback writes the model weights.
checkpoint_filepath = '/Users/rodrigomichelassi/Documents/USP/IQA-Motorola/algorithms/model/weights/model'

# Save weights only, and only when the validation accuracy improves.
model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Train on the training generator, evaluating on the validation generator
# after each epoch.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=epochs,
    callbacks=[model_callback],
    shuffle=True,
)

# Reload the weights stored by the checkpoint callback.
model.load_weights(checkpoint_filepath)

Epoch 1/5


KeyboardInterrupt: 

### Analyzing data obtained

In [None]:
def plot_history_metric(metric, label, title, ylabel, fname, legend_loc='upper right'):
    """Plot the training and validation curves of one metric and save the figure.

    Each call draws on its own figure, so curves of different metrics are
    never overlaid on the same axes.

    Args:
        metric: key of the training curve in `history.history`; the validation
            curve is read from the same key prefixed with 'val_'.
        label: short metric name used in the legend entries
            ('Training <label>' and 'Validation <label>').
        title: title of the figure.
        ylabel: label of the y axis.
        fname: file the figure is saved to.
        legend_loc: legend location passed to matplotlib.

    Returns:
        A tuple (train_values, val_values) with the two plotted curves.

    Raises:
        KeyError: if `metric` or 'val_' + `metric` is not in `history.history`.
    """
    train_values = history.history[metric]
    val_values = history.history[f'val_{metric}']

    fig, ax = plt.subplots()
    ax.plot(train_values, color='royalblue', label=f'Training {label}')
    ax.plot(val_values, color='cornflowerblue', label=f'Validation {label}')
    ax.set_title(title)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(ylabel)
    ax.legend(loc=legend_loc)
    fig.savefig(fname)

    return train_values, val_values


train_loss, val_loss = plot_history_metric(
    'loss', 'Loss', "Loss Comparison", "Categorical Cross-Entropy Loss", "Loss Graph 1.png"
)

# The model is compiled with the metric name 'accuracy'; fall back to the
# older 'acc' key only if that is what the history actually contains.
accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'
train_acc, val_acc = plot_history_metric(
    accuracy_key, 'Accuracy', "Accuracy Comparison", "Accuracy", "Accuracy Graph 1.png"
)

train_auc, val_auc = plot_history_metric(
    'auc', 'AUC', "AUC Comparison", "AUC", "AUC Graph 1.png"
)
