# MLFlow

In [None]:
# Install the DagsHub client and MLflow, both imported by the tracking-setup cell below.
!pip install dagshub mlflow



In [None]:
import mlflow
import dagshub

import mlflow.tensorflow
from mlflow.models.signature import infer_signature

# Connect MLflow tracking to the DagsHub repository.
dagshub.init(
    repo_owner='simoLoc',
    repo_name='ProgettoSE4AI',
    mlflow=True,
)

# Turn on automatic logging of hyper-parameters, metrics, etc. for the
# libraries in use. Additional values can still be logged manually with
# mlflow.log_metric().
mlflow.autolog()

# Explicitly enable autologging for TensorFlow as well.
mlflow.tensorflow.autolog()

# Select (creating it if needed) the MLflow experiment that will hold the runs.
mlflow.set_experiment("CNN Classification Report")



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=1949e468-f1df-48f7-93e2-c6e55fef5d01&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=e25a11ed810c9297ce14e9a9307ad4cdf417122f0c137cdfd07df0a3b23b552d




Output()

2025/06/01 16:13:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2025/06/01 16:13:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/06/01 16:13:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2025/06/01 16:13:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


<Experiment: artifact_location='mlflow-artifacts:/830f85127d2b48559d1927a1f9a1830a', creation_time=1748599390777, experiment_id='2', last_update_time=1748599390777, lifecycle_stage='active', name='CNN Classification Report', tags={}>

# Import Librerie

In [None]:
# Mount Google Drive at /content/drive; force_remount=True remounts even if
# the drive is already mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# libreria di metriche di fairness
!pip install aif360
!pip install 'aif360[Reductions]'

Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1
Collecting fairlearn~=0.7 (from aif360[Reductions])
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.12.0


In [None]:
import copy
import itertools
import json
import os
import shutil

import matplotlib.pyplot as plt # plotting
import numpy as np
import pandas as pd # data processing
import seaborn as sns
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tensorflow.keras.utils import Sequence
from tqdm import tqdm
from tqdm.keras import TqdmCallback

## Configurazione device

Un oggetto `tf.device` rappresenta il dispositivo sul quale avverrà l'esecuzione. Se è disponibile una GPU, la stringa passata in input sarà '/GPU:0'; altrimenti, nel nostro caso, verrà usata la CPU passando in input '/CPU:0'.

In [None]:
# Build a device context for the first GPU when TensorFlow lists at least one
# physical GPU, otherwise for the CPU.
# NOTE(review): the context is only created here; no visible cell enters it
# with `with device:` — confirm whether it is used further down.
device = tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0')
# Show the GPU status reported by the NVIDIA driver.
!nvidia-smi

Sun Jun  1 16:14:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P0             27W /   70W |     102MiB /  15360MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Apertura dataset già splittato in train e test

In [None]:
# Folder on the mounted shared drive that holds the dataset files; the
# trailing slash matters because file names are appended by string concatenation.
data_dir = '/content/drive/Shareddrives/ProgettoSE4AI/datasetUTK/'

There are 23705 rows and 3 columns
                                        age  gender  ethnicity
img_name                                                      
100_0_0_20170112213500903.jpg.chip.jpg  100       0          0
100_0_0_20170112215240346.jpg.chip.jpg  100       0          0
100_1_0_20170110183726390.jpg.chip.jpg  100       1          0
100_1_0_20170112213001988.jpg.chip.jpg  100       1          0
100_1_0_20170112213303693.jpg.chip.jpg  100       1          0


Creazione del `tf.data.Dataset` a partire dal dataframe inizializzato in precedenza; la pipeline effettua il parsing e il preprocessing delle immagini. In particolare, le trasformazioni effettuate sono:
* `Rescaling`, il quale effettua uno scaling dei valori in un range da 0 a 1, poiché i modelli di deep learning funzionano meglio con valori normalizzati;
* `Normalization`, il quale permette di standardizzare i dati per canale, sottraendo la media e dividendo per la deviazione standard, così da portarli su una scala centrata attorno allo zero. Il layer Keras prende in input media e varianza (il quadrato della deviazione standard), ciascuna con tre elementi (Red, Green e Blue).



In [None]:
# Load the pre-computed split: image paths and label arrays for train and test.
train_paths, train_labels_np = (
    np.load(data_dir + 'train_paths.npy'),
    np.load(data_dir + 'train_labels.npy'),
)
test_paths, test_labels_np = (
    np.load(data_dir + 'test_paths.npy'),
    np.load(data_dir + 'test_labels.npy'),
)


# Preprocessing layers.
# Multiply pixel values by 1/255.
rescale = tf.keras.layers.Rescaling(1.0 / 255.0)

# Per-channel standardisation: (x - mean) / std.
# Keras' Normalization layer is parameterised by the VARIANCE and divides by
# sqrt(variance), so the per-channel standard deviations (these look like the
# commonly used ImageNet RGB statistics) must be squared before being passed in.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

normalization = tf.keras.layers.Normalization(
    mean=_CHANNEL_MEAN,
    variance=[std ** 2 for std in _CHANNEL_STD]
)


# Parsing and preprocessing function
@tf.function
def parse_image(path):
    """Read one image file and return it as a preprocessed tensor.

    `path` is appended to the shared-drive project root, the file is decoded
    as a 3-channel image, resized to 200x200 and then passed through the
    module-level `rescale` and `normalization` layers, in that order.
    """
    raw_bytes = tf.io.read_file('/content/drive/Shareddrives/ProgettoSE4AI/' + path)
    # Force three channels; expand_animations=False keeps a single frame.
    decoded = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
    # The original images are 200x200.
    resized = tf.image.resize(decoded, [200, 200])
    return normalization(rescale(resized))


# Build the full dataset of (image, labels) pairs
def create_dataset(image_paths, labels_array):
    """Return a `tf.data.Dataset` yielding `(image, labels)` pairs.

    Each path in `image_paths` is loaded through `parse_image`; the matching
    row of `labels_array` is split into a dict with the keys 'age', 'gender'
    and 'ethnicity' taken from positions 0, 1 and 2 respectively.
    The mapping runs with autotuned parallelism and the result is prefetched.
    """
    def _to_example(path, label):
        image = parse_image(path)
        targets = {
            'age': label[0],
            'gender': label[1],
            'ethnicity': label[2]
        }
        return image, targets

    path_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels_array)

    paired = tf.data.Dataset.zip((path_ds, label_ds))
    mapped = paired.map(_to_example, num_parallel_calls=tf.data.AUTOTUNE)
    return mapped.prefetch(tf.data.AUTOTUNE)

# Unified datasets (image + label dict) for the train and test splits
train_dataset = create_dataset(train_paths, train_labels_np)
test_dataset = create_dataset(test_paths, test_labels_np)

# Sanity check: pull a single element and print its image shape and labels
for image, label in train_dataset.take(1):
    print("Image shape:", image.shape)
    print("Label:", label)

Image shape: (200, 200, 3)
Label: {'age': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'gender': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'ethnicity': <tf.Tensor: shape=(), dtype=int64, numpy=3>}


# Definizione CNN

In [None]:
def create_CNN_multi_output(input_shape,
                          num_ethnicity_classes=5,
                          num_age_classes=4,
                          dropout_rate=0.3,
                          activation='gelu',
                          learning_rate=1e-3):
    """Build and compile a three-headed classifier on an EfficientNetB0 backbone.

    Args:
        input_shape: shape of a single input image, e.g. ``(200, 200, 3)``.
        num_ethnicity_classes: number of units of the 'ethnicity' softmax head.
        num_age_classes: number of units of the 'age' softmax head.
        dropout_rate: dropout rate applied after the shared dense layer.
        activation: activation of the shared 32-unit dense layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` whose outputs are, in order, 'age'
        (softmax), 'gender' (single sigmoid unit) and 'ethnicity' (softmax).
        Sparse categorical cross-entropy is used for 'age' and 'ethnicity',
        binary cross-entropy for 'gender'; accuracy is tracked for every head.
    """
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Randomly initialised EfficientNetB0 feature extractor (no classifier top).
    # `classifier_activation` is not passed: Keras ignores it unless
    # include_top=True.
    # NOTE(review): per the Keras documentation EfficientNet models embed their
    # own input rescaling and expect pixels in [0, 255]; inputs that were
    # already rescaled/normalised upstream get scaled twice — confirm intent.
    backbone = tf.keras.applications.EfficientNetB0(
        include_top=False,
        weights=None,
        input_tensor=inputs,
        input_shape=input_shape,
        pooling=None,
    )

    # Shared trunk on top of the backbone feature maps.
    x = backbone.output
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(32, activation=activation)(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)

    # Output heads; the layer names are the keys used for losses, metrics and
    # for the label dict fed to the model.
    age_output = tf.keras.layers.Dense(num_age_classes, activation='softmax', name='age')(x)
    gender_output = tf.keras.layers.Dense(1, activation='sigmoid', name='gender')(x)
    ethnicity_output = tf.keras.layers.Dense(num_ethnicity_classes, activation='softmax', name='ethnicity')(x)

    model = tf.keras.Model(inputs, [age_output, gender_output, ethnicity_output])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss={
            'age': 'sparse_categorical_crossentropy',
            'gender': 'binary_crossentropy',
            'ethnicity': 'sparse_categorical_crossentropy',
        },
        metrics={
            'age': 'accuracy',
            'gender': 'accuracy',
            'ethnicity': 'accuracy'
        }
    )
    return model

# Training

In [None]:
# Hyper-parameter grid explored by the training loop: one list of candidate
# values per hyper-parameter.
param_grid = dict(
    dropout_rate=[0.2, 0.5],
    batch_size=[32],
    epochs=[10, 15],
    activation=['gelu'],
    learning_rate=[1e-3],
)

In [None]:
# Grid-search bookkeeping: every run's record, plus the best score, model and
# run record seen so far.
all_history, best_score = [], 0
best_model = best_run = None

In [None]:
# Grid search: train one model per hyper-parameter combination, each tracked
# as its own MLflow run. itertools.product iterates in the same order as the
# equivalent nested loops (first argument varies slowest).
# NOTE(review): `test_dataset` is used as validation data, so the configuration
# chosen below is selected on the test split — confirm this is intended.
for dropout_rate, batch_size, epochs, activation, learning_rate in itertools.product(
    param_grid['dropout_rate'],
    param_grid['batch_size'],
    param_grid['epochs'],
    param_grid['activation'],
    param_grid['learning_rate'],
):
    print(f"Dropout rate: {dropout_rate}")
    print(f"Batch size: {batch_size}")
    print(f"Epochs: {epochs}")
    print(f"Activation: {activation}")
    print(f"Learning rate: {learning_rate}")

    with mlflow.start_run():

        mlflow.log_param("dropout_rate", dropout_rate)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("activation", activation)
        mlflow.log_param("learning_rate", learning_rate)

        # Build a fresh model for this configuration.
        model = create_CNN_multi_output(
            input_shape=(200, 200, 3),
            activation=activation,
            dropout_rate=dropout_rate,
            learning_rate=learning_rate
        )

        # Batch the datasets with the batch size under test.
        batched_train = train_dataset.batch(batch_size)
        batched_test = test_dataset.batch(batch_size)

        # `validation_data` is the keyword Keras expects; it is what produces
        # the `val_*` entries read from the history below.
        history = model.fit(batched_train,
                            epochs=epochs,
                            validation_data=batched_test,
                            verbose=0,
                            callbacks=[TqdmCallback(verbose=1)])

        # Log the per-epoch metrics, one batched call per epoch instead of one
        # tracking-server request per individual value.
        epoch_count = max((len(values) for values in history.history.values()), default=0)
        for epoch in range(epoch_count):
            mlflow.log_metrics(
                {
                    metric_name: values[epoch]
                    for metric_name, values in history.history.items()
                    if epoch < len(values)
                },
                step=epoch,
            )

        # Store the complete history as a JSON artifact of the run.
        history_path = "history.json"
        with open(history_path, 'w') as f:
            json.dump(history.history, f)
        mlflow.log_artifact(history_path)

        # Log the trained model to the run.
        mlflow.keras.log_model(model, artifact_path="cnn_model")

        # Best validation accuracy of the 'ethnicity' head for this
        # configuration; falls back to 0 when the key is absent, so the key
        # must match the metric name produced by the model.
        # NOTE(review): this is the maximum over epochs, while the saved model
        # holds the weights of the last epoch.
        max_val_acc = max(history.history.get('val_ethnicity_accuracy', [0]))

        mlflow.log_metric("max_val_ethnicity_accuracy", max_val_acc)

        # Keep a record of this configuration together with its history.
        run = {
            'dropout_rate': dropout_rate,
            'batch_size': batch_size,
            'epochs': epochs,
            'activation': activation,
            'learning_rate': learning_rate,
            'history': history.history
        }
        all_history.append(run)

        model_path = f'/content/drive/Shareddrives/ProgettoSE4AI/train_model/CNN_Model_b{batch_size}_eps{epochs}_lr_{learning_rate}_dr{dropout_rate}.keras'
        # Make sure the target folder exists before saving.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        model.save(model_path)
        print(f"Model saved to: {model_path}")

        if max_val_acc > best_score:
            best_score = max_val_acc
            best_run = copy.copy(run)
            # A new model object is built on every iteration, so keeping a
            # direct reference is safe and no copy of the model is needed.
            best_model = model

Dropout rate: 0.2
Batch size: 64
Epochs: 5
Activation: gelu
Learning rate: 0.001


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

KeyboardInterrupt: 

In [None]:
# Persist the best model found by the grid search under a name that encodes
# its hyper-parameters.
# Fail with an explicit message when no training run completed (for example
# the training cell was interrupted), instead of an opaque TypeError on None.
if best_run is None or best_model is None:
    raise RuntimeError(
        "No completed training run available: run the grid-search cell to "
        "completion before saving the best model."
    )

batch_size = best_run['batch_size']
epochs = best_run['epochs']
learning_rate = best_run['learning_rate']
dropout_rate = best_run['dropout_rate']
best_model_path = f'/content/drive/Shareddrives/ProgettoSE4AI/train_model/Best_Model_b{batch_size}_eps{epochs}_lr_{learning_rate}_dr{dropout_rate}.keras'
best_model.save(best_model_path)

print(f"Best model saved to: {best_model_path}")

In [None]:
def _plot_history_curves(history, series, title, ylabel, filename):
    """Plot one figure with a curve per `(history_key, legend_label)` pair.

    Args:
        history: mapping from metric name to its list of per-epoch values.
        series: iterable of `(history_key, legend_label)` pairs to draw.
        title: figure title.
        ylabel: label of the y axis (the x axis is always 'Epoch').
        filename: path the figure is saved to before being shown.
    """
    plt.figure()
    for key, label in series:
        plt.plot(history[key], marker='o', label=label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()


# Per-epoch history of the best configuration found by the grid search.
history = best_run['history']

# Training vs Evaluation Loss
_plot_history_curves(
    history,
    [('loss', 'Training Loss'), ('val_loss', 'Evaluation Loss')],
    title='Training vs Evaluation Loss',
    ylabel='Loss',
    filename='training_vs_validation_loss.png',
)

# Train Loss per task
_plot_history_curves(
    history,
    [('age_loss', 'Age Loss'),
     ('gender_loss', 'Gender Loss'),
     ('ethnicity_loss', 'Ethnicity Loss')],
    title='Train Loss per Task',
    ylabel='Loss',
    filename='train_task_loss.png',
)

# Evaluation Loss per task
_plot_history_curves(
    history,
    [('val_age_loss', 'Eval Age Loss'),
     ('val_gender_loss', 'Eval Gender Loss'),
     ('val_ethnicity_loss', 'Eval Ethnicity Loss')],
    title='Evaluation Loss per Task',
    ylabel='Loss',
    filename='validation_task_loss.png',
)

# Train Accuracy per task
_plot_history_curves(
    history,
    [('age_accuracy', 'Age Accuracy'),
     ('gender_accuracy', 'Gender Accuracy'),
     ('ethnicity_accuracy', 'Ethnicity Accuracy')],
    title='Train Accuracy per Task',
    ylabel='Accuracy',
    filename='train_task_accuracy.png',
)

# Evaluation Accuracy per task
_plot_history_curves(
    history,
    [('val_age_accuracy', 'Eval Age Accuracy'),
     ('val_gender_accuracy', 'Eval Gender Accuracy'),
     ('val_ethnicity_accuracy', 'Eval Ethnicity Accuracy')],
    title='Evaluation Accuracy per Task',
    ylabel='Accuracy',
    filename='validation_task_accuracy.png',
)

In [None]:
# Human-readable names for the integer class codes of each attribute; the
# inner keys are the codes as strings ("0", "1", ...), in listing order.
dic = {
    attribute: {str(code): name for code, name in enumerate(names)}
    for attribute, names in (
        ("age", ("young", "adult", "senior", "elderly")),
        ("ethnicity", ("White", "Black", "Asian", "Indian", "Others")),
        ("gender", ("Male", "Female")),
    )
}
