In [1]:
# === PARETO FRONT STUDIES NOTEBOOK #3 ===

# This notebook trains one specific best trial ID with the Conv2D model and its corresponding loss scheduler.
# It generates a fingerprint (run ID) under which the training information and weights are saved; you can plot them later in notebook #4.

# == Before running:
# Configure the same TFrecord paths you used in pareto_analysis.
# Configure the intermediate_dir and experiment_name you used before.
# I suggest checking the best_trials.csv of your experiment and choosing the trial you want to work with (based on loss, trajectory, stability...).
# Once you determine the best trial you want to work with, set its id on the "selected_trial_id" variable.
# Set the number of epochs you want on model.fit

In [2]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import Adam
import keras
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import Sequence
from keras.layers import Conv2D, MaxPooling2D
from qkeras import *

from keras.utils import Sequence
from keras.callbacks import CSVLogger
from keras.callbacks import EarlyStopping

from OptimizedDataGenerator_v2 import OptimizedDataGenerator
import tensorflow_probability as tfp
from models_16x16.models import *

import os
import random

# Module-level numeric constants.
pi = 3.14159265359

# Lower / upper bounds; same values as the clipping defaults of custom_loss.
minval = 1e-9
maxval = 1e9

2025-07-24 01:01:07.129672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 01:01:07.129752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-24 01:01:07.130815: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-24 01:01:07.137799: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

KeyboardInterrupt



In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# == TFRecord directories: one for training, one for validation.
tfrecords_dir_train = "/home/callea/TFrecords_3src_filtered/train"
tfrecords_dir_validation = "/home/callea/TFrecords_3src_filtered/test"

# Both generators are built with identical options; only the source directory differs.
_generator_options = dict(shuffle=True, seed=13, quantize=True)

training_generator = OptimizedDataGenerator(
    load_from_tfrecords_dir=tfrecords_dir_train,
    **_generator_options,
)

validation_generator = OptimizedDataGenerator(
    load_from_tfrecords_dir=tfrecords_dir_validation,
    **_generator_options,
)

In [None]:
# Custom loss: NLL and a sum of standard deviations sigma regularizer (you can modify the regularizer).

# Non-trainable weight of the sigma regularizer. custom_loss reads it on every call;
# later cells assign the selected trial's lambda_init to it and hand it to the scheduler.
current_reg_weight = tf.Variable(0.0, trainable=False, dtype=tf.float32, name='reg_weight')

def custom_loss(y, p_base, minval=1e-9, maxval=1e9, scale = 512):
    """Negative log-likelihood of a 4-D Gaussian plus a weighted sum-of-sigmas penalty.

    The prediction ``p_base`` is interpreted column-wise:
      * columns 0, 2, 4, 6  -> the four means,
      * columns 1, 3, 5, 7  -> the diagonal of the lower-triangular scale matrix
        (clamped to be >= ``minval``),
      * columns 8..13       -> the six strictly-lower-triangular entries, row by row.

    Args:
        y: Target values evaluated under the predicted distribution
            (presumably shape (batch, 4) -- TODO confirm against the data generator).
        p_base: Model output with at least 14 columns, laid out as described above.
        minval: Lower clip for the likelihood; also the floor added to the diagonal
            and the epsilon inside the square root.
        maxval: Upper clip for the likelihood.
        scale: Unused by this function; kept so existing callers stay valid.

    Returns:
        Scalar tensor: the sum over the batch of
        ``NLL + current_reg_weight * sum(std_i)``.
    """
    # Means and (positive) diagonal entries are interleaved in the first 8 columns.
    mu = p_base[:, 0:8:2]
    Mdia = minval + tf.math.maximum(p_base[:, 1:8:2], 0.0)
    Mcov = p_base[:, 8:]

    # Placeholder for the entries above the diagonal.
    zeros = tf.zeros_like(Mdia[:, 0])

    # Each row is stacked to shape (4, batch); the final transpose yields (batch, 4, 4).
    row1 = tf.stack([Mdia[:, 0], zeros, zeros, zeros])
    row2 = tf.stack([Mcov[:, 0], Mdia[:, 1], zeros, zeros])
    row3 = tf.stack([Mcov[:, 1], Mcov[:, 2], Mdia[:, 2], zeros])
    row4 = tf.stack([Mcov[:, 3], Mcov[:, 4], Mcov[:, 5], Mdia[:, 3]])

    scale_tril = tf.transpose(tf.stack([row1, row2, row3, row4]), perm=[2, 0, 1])

    dist = tfp.distributions.MultivariateNormalTriL(loc=mu, scale_tril=scale_tril)

    # Clipping the likelihood bounds the per-sample NLL to [-log(maxval), -log(minval)].
    likelihood = tf.clip_by_value(dist.prob(y), minval, maxval)
    NLL = -1 * tf.math.log(likelihood)

    # Covariance = L @ L^T; its diagonal holds the per-dimension variances.
    cov_matrix = tf.matmul(scale_tril, scale_tril, transpose_b=True)
    variances = tf.linalg.diag_part(cov_matrix)
    stds = tf.sqrt(variances + minval)

    sigma_regularizer_1 = tf.reduce_sum(stds, axis=1)

    # NOTE(review): track_loss_values is not defined in the visible notebook cells;
    # presumably it comes from one of the star imports -- confirm.
    track_loss_values(NLL, sigma_regularizer_1)

    total_loss = NLL + (sigma_regularizer_1 * current_reg_weight)

    return tf.keras.backend.sum(total_loss)

In [None]:
# Builds the model and prints its layer summary (compilation happens in the next cell).
# The first argument (16,16,2) is presumably the input shape -- confirm against CreateModel.
model = CreateModel((16,16,2), n_filters=5, pool_size=3)
model.summary()

In [None]:
# Compile with Adam (learning rate 1e-3) and the custom NLL + sigma-regularizer loss.
model.compile(optimizer=Adam(learning_rate=1e-3), loss=custom_loss)

In [None]:
# Random 8-hex-digit identifier for this training run.
fingerprint = format(random.randrange(16**8), '08x')

# Every run gets its own checkpoint folder under ./trained_models.
base_dir = f'./trained_models/model-{fingerprint}-checkpoints'
for _directory in ("trained_models", base_dir):
    os.makedirs(_directory, exist_ok=True)

# Filename template; the {epoch}/{loss}/{val_loss} placeholders are left unformatted here.
checkpoint_filepath = ''.join(
    (base_dir, '/weights.{epoch:02d}-t{loss:.2f}-v{val_loss:.2f}.hdf5')
)

In [None]:
# Show the run fingerprint; it is part of this run's folder name under ./trained_models.
print(fingerprint)

In [None]:
# ==Important paths you need to configure!
# best_trials.csv is looked up at <intermediate_dir>/<experiment_name>/best_trials.csv.
intermediate_dir = "/home/callea/smart-pixels-ml/intermediate_logs"
experiment_name = "general_test"
best_csv_path = os.path.join(intermediate_dir, experiment_name, "best_trials.csv")

# Read the CSV and show the available options (trial ID and its Keras validation loss)
df = pd.read_csv(best_csv_path)
print("Trial IDs disponibles en best_trials.csv:")
print(df[["trial_id", "keras_val_loss"]]) 

In [None]:
# ==Select trial ID you want to train with the conv2D model
selected_trial_id = 0

# Look the trial up explicitly so that a wrong ID produces an actionable message
# instead of the bare "indexer is out-of-bounds" raised by .iloc[0] on an empty frame.
matching_trials = df[df["trial_id"] == selected_trial_id]
if matching_trials.empty:
    raise IndexError(
        f"trial_id {selected_trial_id!r} not found in best_trials.csv; "
        f"available trial IDs: {df['trial_id'].tolist()}"
    )

# First matching row: the hyper-parameters of the selected trial.
row = matching_trials.iloc[0]

In [None]:
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint, Callback
from schedulers import *
# Passed as `patience` to the EarlyStopping callback created below.
early_stopping_patience = 50

class CustomModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that keeps only the most recent checkpoint file on disk.

    After the parent class has handled the epoch, every file in the checkpoint
    directory whose name starts with ``weights`` is deleted except the one that
    carries the highest epoch number.
    """

    @staticmethod
    def _epoch_of(filename):
        """Return the epoch encoded in ``weights.<epoch>-...``, or -1 if it cannot be parsed."""
        try:
            return int(filename.split('.', 2)[1].split('-', 1)[0])
        except (IndexError, ValueError):
            return -1

    def on_epoch_end(self, epoch, logs=None):
        """Run the regular checkpoint logic, then prune all but the newest checkpoint."""
        super().on_epoch_end(epoch, logs)

        # Use the directory of this callback's own filepath template.
        checkpoint_dir = os.path.dirname(self.filepath) or '.'
        checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith('weights')]
        if len(checkpoints) > 1:
            # Order by the numeric epoch: a plain string sort puts 'weights.100-'
            # before 'weights.99-' and would delete the newest file once training
            # passes epoch 99. The filename is the tie-breaker.
            checkpoints.sort(key=lambda f: (self._epoch_of(f), f))
            for checkpoint in checkpoints[:-1]:
                os.remove(os.path.join(checkpoint_dir, checkpoint))

# Early stopping with the patience configured above; restore_best_weights=True is requested.
es = EarlyStopping(patience=early_stopping_patience, restore_best_weights=True)

# Checkpoint callback: weights only, monitoring val_loss, save_best_only, evaluated every epoch.
# The filename template comes from checkpoint_filepath.
mcp = CustomModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    save_best_only=True,
    save_freq='epoch',
    verbose=1
)

# Scheduler name stored for the selected trial in best_trials.csv.
scheduler_type = row["scheduler"]
# Initialise the regularizer weight read by custom_loss with the trial's lambda_init.
current_reg_weight.assign(float(row["lambda_init"])) 

def build_scheduler(row, reg_weight_var):
    """Instantiate the lambda scheduler described by one row of best_trials.csv.

    Args:
        row: Mapping-like record; must provide "scheduler" and "lambda_init", plus
            the scheduler-specific columns read below.
        reg_weight_var: Variable handed to the scheduler as ``reg_weight_var``.

    Returns:
        An instance of the scheduler class matching ``row["scheduler"]``.

    Raises:
        ValueError: If ``row["scheduler"]`` is not one of the supported names.
    """
    scheduler_type = row["scheduler"]

    # Arguments shared by every scheduler.
    params = dict(reg_weight_var=reg_weight_var, start=float(row["lambda_init"]))

    # Scheduler-specific arguments.
    if scheduler_type == "adaptive":
        params.update(step=float(row["step"]), patience=int(row["patience"]))
    elif scheduler_type in ("cosine", "linear", "sigmoid"):
        params["end"] = float(row["lambda_final"])
        params["stop_threshold"] = float(row["stop_threshold"])
        if scheduler_type == "sigmoid":
            params["sharpness"] = float(row["sharpness"])

    # Name -> class lookup table.
    registry = {
        "cosine": CosineScheduler,
        "linear": LinearScheduler,
        "sigmoid": SigmoidScheduler,
        "adaptive": AdaptiveScheduler,
    }

    scheduler_cls = registry.get(scheduler_type)
    if scheduler_cls is None:
        raise ValueError(f"Unsupported scheduler type: {scheduler_type}")

    print(f"Scheduler selected: {scheduler_type}")
    return scheduler_cls(**params)

# Build the scheduler for the selected trial; it receives current_reg_weight as its variable.
scheduler = build_scheduler(row, current_reg_weight)


# CSV log written to training_log.csv inside this run's checkpoint folder (append mode).
csv_logger = CSVLogger(f'{base_dir}/training_log.csv', append=True)

In [None]:
# Train the model; `epochs` is the upper bound on the number of epochs and can be adjusted here.
history = model.fit(
    x=training_generator,
    steps_per_epoch=len(training_generator),
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    epochs=1000,
    shuffle=False,
    callbacks=[es, mcp, csv_logger, scheduler],
    verbose=1,
)