In [1]:
# === PARETO FRONT STUDIES NOTEBOOK #3 ===

# This notebook helps train a specific best trial id with the conv2D model and its corresponding loss scheduler.
# This nb will generate a fingerprint with the training information and weights you can later plot on nb #4

# == Before running:
# Configure the same TFrecord paths you used in pareto_analysis.
# Configure the intermediate_dir and experiment_name you used before.
# I suggest checking out the best_trials.csv of your experiment and choosing the one you want to work with (based on loss, trajectory, stability...)
# Once you determine the best trial you want to work with, set its id on the "selected_trial_id" variable.
# Set the number of epochs you want on model.fit

In [1]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import Adam
import keras
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import Sequence
from keras.layers import Conv2D, MaxPooling2D
from qkeras import *

from keras.utils import Sequence
from keras.callbacks import CSVLogger
from keras.callbacks import EarlyStopping

from ODG_v2 import OptimizedDataGenerator
import tensorflow_probability as tfp
from models_16x16.models import *

import os
import random

# Approximate value of pi. NOTE(review): not referenced in the cells visible here — confirm it is still needed.
pi = 3.14159265359

# Clipping bounds; these are the same values as the minval/maxval defaults of custom_loss.
maxval=1e9
minval=1e-9

2025-07-22 04:48:20.900927: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-22 04:48:20.901008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-22 04:48:20.902085: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-22 04:48:20.909361: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-07-22 04:48:24.498688: I tensorflow/core/common_

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell execution, so edits to local .py modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# == TFRecord paths (configure these to match the ones used in pareto_analysis)
tfrecords_dir_train = "/home/callea/TFrecords_3src_filtered/train"
tfrecords_dir_validation = "/home/callea/TFrecords_3src_filtered/test"

# Generator that serves the training split.
training_generator = OptimizedDataGenerator(
    load_from_tfrecords_dir=tfrecords_dir_train,
    shuffle=True,
    seed=13,
    quantize=True,
)

# Generator that serves the validation split (same options as the training one).
validation_generator = OptimizedDataGenerator(
    load_from_tfrecords_dir=tfrecords_dir_validation,
    shuffle=True,
    seed=13,
    quantize=True,
)



Loading metadata from /home/callea/TFrecords_3src_filtered/train/metadata.json
Loading metadata from /home/callea/TFrecords_3src_filtered/test/metadata.json




In [4]:
# Custom loss: NLL plus a regularizer given by the sum of the predicted standard deviations
# (you can modify the regularizer).

# Weight applied to the sigma regularizer. It is a non-trainable tf.Variable so its value
# can be changed with .assign() from outside the loss without redefining custom_loss.
current_reg_weight = tf.Variable(0.0, trainable=False, dtype=tf.float32, name='reg_weight')

def custom_loss(y, p_base, minval=1e-9, maxval=1e9, scale = 512):
    """Negative log-likelihood of a 4-D Gaussian plus a weighted sigma regularizer.

    Args:
        y: Targets whose probability is evaluated under the predicted distribution.
        p_base: Model output, indexed as ``[batch, feature]``. Columns 0, 2, 4, 6 are
            the four means. Columns 1, 3, 5, 7 are the diagonal of the lower-triangular
            scale factor (negative values are clamped to 0, then ``minval`` is added).
            Columns from 8 onward hold the below-diagonal entries; the first six are
            used, filled row by row.
        minval: Lower clip for the likelihood; also the epsilon added to the diagonal
            and inside the square root.
        maxval: Upper clip for the likelihood.
        scale: Unused; kept so the signature stays backward-compatible.

    Returns:
        Scalar tensor: the sum over the batch of
        ``NLL + current_reg_weight * sum(standard deviations)``.
    """
    mu = p_base[:, 0:8:2]

    # Elements of the 4x4 lower-triangular scale matrix.
    Mdia = minval + tf.math.maximum(p_base[:, 1:8:2], 0.0)
    Mcov = p_base[:, 8:]

    # Placeholder for the entries above the diagonal.
    zeros = tf.zeros_like(Mdia[:, 0])

    # Each row is stacked as (4, batch); stacking the rows gives (4, 4, batch),
    # which is transposed to (batch, 4, 4).
    row1 = tf.stack([Mdia[:, 0], zeros, zeros, zeros])
    row2 = tf.stack([Mcov[:, 0], Mdia[:, 1], zeros, zeros])
    row3 = tf.stack([Mcov[:, 1], Mcov[:, 2], Mdia[:, 2], zeros])
    row4 = tf.stack([Mcov[:, 3], Mcov[:, 4], Mcov[:, 5], Mdia[:, 3]])
    scale_tril = tf.transpose(tf.stack([row1, row2, row3, row4]), perm=[2, 0, 1])

    dist = tfp.distributions.MultivariateNormalTriL(loc=mu, scale_tril=scale_tril)

    # Clip before the log so the NLL stays finite.
    likelihood = tf.clip_by_value(dist.prob(y), minval, maxval)
    NLL = -1 * tf.math.log(likelihood)

    # Covariance = L @ L^T; its diagonal holds the per-dimension variances.
    cov_matrix = tf.matmul(scale_tril, scale_tril, transpose_b=True)
    variances = tf.linalg.diag_part(cov_matrix)
    stds = tf.sqrt(variances + minval)

    sigma_regularizer_1 = tf.reduce_sum(stds, axis=1)

    # NOTE(review): track_loss_values is not defined in this cell; presumably it comes
    # from a star import (e.g. `schedulers`) that must run before training — confirm.
    track_loss_values(NLL, sigma_regularizer_1)

    total_loss = NLL + (sigma_regularizer_1 * current_reg_weight)

    return tf.keras.backend.sum(total_loss)

In [5]:
# Builds the model (it is compiled in a later cell) and prints its layer summary.
model = CreateModel((16,16,2), n_filters=5, pool_size=3)
model.summary()

2025-07-22 04:48:25.555132: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 16, 16, 2)]       0         
                                                                 
 q_separable_conv2d (QSepar  (None, 14, 14, 5)         33        
 ableConv2D)                                                     
                                                                 
 q_activation (QActivation)  (None, 14, 14, 5)         0         
                                                                 
 q_conv2d (QConv2D)          (None, 14, 14, 5)         30        
                                                                 
 q_activation_1 (QActivatio  (None, 14, 14, 5)         0         
 n)                                                              
                                                                 
 average_pooling2d (Average  (None, 4, 4, 5)           0     

In [6]:
# Compile with Adam (learning rate 1e-3) and the custom NLL + sigma-regularizer loss.
model.compile(optimizer=Adam(learning_rate=1e-3), loss=custom_loss)

In [7]:
# Parent folder that collects every training run.
os.makedirs("trained_models", exist_ok=True)

# Random 8-hex-digit identifier for this run (16**8 == 1 << 32).
fingerprint = '%08x' % random.randrange(1 << 32)

# Per-run checkpoint directory.
base_dir = f'./trained_models/model-{fingerprint}-checkpoints'
os.makedirs(base_dir, exist_ok=True)

# Filename template; the placeholders (epoch, training loss, validation loss) are
# filled in by the checkpoint callback when it saves.
checkpoint_filepath = base_dir + '/weights.{epoch:02d}-t{loss:.2f}-v{val_loss:.2f}.hdf5'

In [8]:
# Show the fingerprint used in this run's checkpoint directory name.
print(fingerprint)

dfbf1abd


In [9]:
# ==Important paths you need to configure!
# Use the same intermediate_dir and experiment_name as in the earlier notebooks.
intermediate_dir = "/home/callea/smart-pixels-ml/intermediate_logs"
experiment_name = "general_test5"
best_csv_path = os.path.join(intermediate_dir, experiment_name, "best_trials.csv")

# Read the CSV and show the available options
df = pd.read_csv(best_csv_path)
print("Trial IDs disponibles en best_trials.csv:")
print(df[["trial_id", "keras_val_loss"]]) 

Trial IDs disponibles en best_trials.csv:
   trial_id  keras_val_loss
0         1    -4652.891602
1         0    -9084.822266
2         8    -9320.451172


In [10]:
# Manually pick one trial (change this number, or ask for it with input()).
selected_trial_id = 0

# Fail early with a clear message when the id is not listed in best_trials.csv,
# instead of the opaque IndexError that .iloc[0] raises on an empty selection.
matching_trials = df[df["trial_id"] == selected_trial_id]
if matching_trials.empty:
    raise ValueError(
        f"Trial id {selected_trial_id!r} not found in best_trials.csv; "
        f"available ids: {df['trial_id'].tolist()}"
    )

# First matching row, as a Series holding that trial's scheduler settings.
row = matching_trials.iloc[0]

In [11]:
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint, Callback
# NOTE(review): presumably provides the *Scheduler classes used by build_scheduler
# (and possibly track_loss_values, which custom_loss calls) — confirm.
from schedulers import *
# Patience (in epochs) handed to the EarlyStopping callback created in this cell.
early_stopping_patience = 50

class CustomModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that keeps only the most recently written checkpoint file.

    After the parent callback has had the chance to save, every file in the
    notebook-level ``base_dir`` whose name starts with ``weights`` is removed,
    except the newest one.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Run the normal checkpoint logic, then prune older checkpoint files."""
        super().on_epoch_end(epoch, logs)
        checkpoints = [f for f in os.listdir(base_dir) if f.startswith('weights')]
        if len(checkpoints) > 1:
            # Order by modification time rather than by name. With the
            # '{epoch:02d}' filename pattern, 'weights.100-...' sorts before
            # 'weights.99-...', so a name sort would delete the newest
            # checkpoint once training reaches 100 epochs. The name is only a
            # tie-breaker for identical timestamps.
            checkpoints.sort(
                key=lambda name: (os.path.getmtime(os.path.join(base_dir, name)), name)
            )
            for checkpoint in checkpoints[:-1]:
                os.remove(os.path.join(base_dir, checkpoint))

# Early-stopping callback.
# NOTE(review): `es` is created here but does not appear in the callbacks list passed
# to model.fit, so it has no effect as written — confirm whether that is intended.
es = EarlyStopping(patience=early_stopping_patience, restore_best_weights=True)

# Checkpoint callback: saves weights only, and only when val_loss improves.
mcp = CustomModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    save_best_only=True,
    save_freq='epoch',
    verbose=1
)

# Scheduler name stored for the selected trial.
scheduler_type = row["scheduler"]
# Start the regularizer weight at the selected trial's initial lambda.
current_reg_weight.assign(float(row["lambda_init"])) 

def build_scheduler(row, reg_weight_var):
    """Instantiate the regularizer-weight scheduler described by a trial row.

    Args:
        row: Mapping-like record of the selected trial. Always read: "scheduler"
            and "lambda_init". Read for cosine/linear/sigmoid: "lambda_final" and
            "stop_threshold" (plus "sharpness" for sigmoid). Read for adaptive:
            "step" and "patience".
        reg_weight_var: Variable holding the regularizer weight, forwarded to the
            scheduler as ``reg_weight_var``.

    Returns:
        An instance of the scheduler class matching ``row["scheduler"]``.

    Raises:
        ValueError: If the scheduler name is not one of the supported ones.
    """
    kind = row["scheduler"]

    # Arguments shared by every scheduler.
    params = dict(reg_weight_var=reg_weight_var, start=float(row["lambda_init"]))

    # Scheduler-specific arguments.
    if kind in ("cosine", "linear", "sigmoid"):
        params.update(
            end=float(row["lambda_final"]),
            stop_threshold=float(row["stop_threshold"]),
        )
        if kind == "sigmoid":
            params["sharpness"] = float(row["sharpness"])
    elif kind == "adaptive":
        params.update(step=float(row["step"]), patience=int(row["patience"]))

    # Name -> class dispatch table.
    registry = {
        "cosine": CosineScheduler,
        "linear": LinearScheduler,
        "sigmoid": SigmoidScheduler,
        "adaptive": AdaptiveScheduler,
    }

    scheduler_cls = registry.get(kind)
    if scheduler_cls is None:
        raise ValueError(f"Unsupported scheduler type: {kind}")

    print(f"Scheduler selected: {kind}")
    return scheduler_cls(**params)

# Instantiate the regularizer-weight scheduler for the selected trial.
scheduler = build_scheduler(row, current_reg_weight)


# Log per-epoch metrics to a CSV file in the checkpoint directory; append=True keeps
# any rows already present in that file instead of overwriting them.
csv_logger = CSVLogger(f'{base_dir}/training_log.csv', append=True)

Scheduler selected: adaptive


In [12]:
# Train the model with the checkpoint, CSV-logging and regularizer-scheduler callbacks.
# NOTE(review): the EarlyStopping callback `es` created earlier is not in this callbacks
# list, so training always runs for the full number of epochs — confirm this is intended.
history = model.fit(
        x=training_generator,
        validation_data=validation_generator,
        callbacks=[mcp, csv_logger, scheduler],
        # Number of training epochs; set this to the value you want.
        epochs=50,
        shuffle=False,
        # One full pass over each generator per epoch.
        steps_per_epoch=len(training_generator),
        validation_steps=len(validation_generator),
        verbose=1
    )

 Initial reg_weight set to: 0.3940
Epoch 1/50


2025-07-22 04:48:30.369268: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2025-07-22 04:48:30.462703: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2025-07-22 04:48:30.667215: I tensorflow/core/util/cuda_solvers.cc:179] Creating GpuSolver handles for stream 0x5610016fc0a0
2025-07-22 04:48:31.802691: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f2005b7bfd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-07-22 04:48:31.802740: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 7g.40gb, Compute Capability 8.0
2025-07-22 04:48:31.809171: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1753152511.902083 3675716 device_compiler.h:186] Compile

Epoch 1: val_loss improved from inf to 20661.55469, saving model to ./trained_models/model-dfbf1abd-checkpoints/weights.01-t70326.56-v20661.55.hdf5
[Epoch 1] NLL improved to 11.99 → reg_weight stays at 0.3940
Epoch 2/50
Epoch 2: val_loss improved from 20661.55469 to 14195.40039, saving model to ./trained_models/model-dfbf1abd-checkpoints/weights.02-t17310.28-v14195.40.hdf5
[Epoch 2] NLL improved to 7.30 → reg_weight stays at 0.3940
Epoch 3/50
Epoch 3: val_loss improved from 14195.40039 to 7959.15088, saving model to ./trained_models/model-dfbf1abd-checkpoints/weights.03-t11496.60-v7959.15.hdf5
[Epoch 3] NLL improved to 5.38 → reg_weight stays at 0.3940
Epoch 4/50
Epoch 4: val_loss improved from 7959.15088 to 2923.23657, saving model to ./trained_models/model-dfbf1abd-checkpoints/weights.04-t5592.67-v2923.24.hdf5
[Epoch 4] NLL improved to 4.16 → reg_weight stays at 0.3940
Epoch 5/50
Epoch 5: val_loss improved from 2923.23657 to 479.63199, saving model to ./trained_models/model-dfbf1abd-

KeyboardInterrupt: 