In [None]:
import os

In [None]:
# Quiet TensorFlow before it gets imported further down (this can be dangerous:
# with the log level raised this far, messages about real problems may be hidden too).
os.environ.update(
    {
        # Flag string handed to XLA through TF_XLA_FLAGS
        "TF_XLA_FLAGS": "--tf_xla_enable_xla_devices",
        # Minimum severity for TensorFlow's C++ log output
        "TF_CPP_MIN_LOG_LEVEL": "3",
    }
)

In [None]:
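# Optional host-level tweak, to be run in a shell (not from this notebook):
# it writes 0 into the `numa_node` entry of PCI device 0000:01:00.0.
# NOTE(review): the GPU reported in the logged output further down sits at
# PCI bus id 0000:81:00.0 -- confirm the device address before using this.
# See: https://gist.github.com/zrruziev/b93e1292bf2ee39284f834ec7397ee9f
# sudo echo 0 | sudo tee -a /sys/bus/pci/devices/0000\:01\:00.0/numa_node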

In [None]:
import json
import random

# NOTE: the relative order of the star imports below is kept exactly as it was,
# because reordering them could change which definition wins for a shared name.
import tensorflow as tf

from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import Adam
import keras
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import Sequence
from keras.layers import Conv2D, MaxPooling2D
from qkeras import *

from keras.utils import Sequence
from keras.callbacks import CSVLogger
from keras.callbacks import EarlyStopping

from qkeras import *
from qkeras.utils import _add_supported_quantized_objects

import matplotlib.pyplot as plt
# Explicit import: `np` is used below and must not depend on a star import leaking it.
import numpy as np
import psutil

# Hand-written value of pi (kept at the original precision on purpose)
pi = 3.14159265359

# Large / small magnitude limits. Their use is not visible in this notebook --
# presumably clipping bounds consumed by the imported loss/model code; TODO confirm.
maxval, minval = 1e9, 1e-9

2024-12-02 16:02:23.261089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# You can disable the GPU, if a GPU is present
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Report how many GPUs TensorFlow can see
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

In [None]:
from dataloaders.OptimizedDataGenerator import OptimizedDataGenerator
from loss import *
from models.models import *

#### Scaling Lists for Different Pixel Pitches (dataset_2s):
* 100x25x100 um:  [150.0, 37.5, 10.0, 1.22]
* 50x25x100 um:   [75.0, 37.5, 10.0, 1.22]
* 50x20x100 um:   [75.0, 30.0, 10.0, 1.22]
* 50x15x100 um:   [75.0, 22.5, 10.0, 1.22]
* 50x12.5x100 um: [75.0, 18.75, 10.0, 1.22]
* 50x10x100 um:   [75.0, 15.0, 10.0, 1.22]

#### Scaling Lists for Different Pixel Pitches (dataset_3sr):
* 100x25x100 um:  [150.0, 37.5, 10.0, 10.0]
* 50x25x100 um:   [75.0, 37.5, 10.0, 10.0]
* 50x20x100 um:   [75.0, 30.0, 10.0, 10.0]
* 50x15x100 um:   [75.0, 22.5, 10.0, 10.0]
* 50x12.5x100 um: [75.0, 18.75, 10.0, 10.0]
* 50x10x100 um:   [75.0, 15.0, 10.0, 10.0]

In [3]:
# ---------------------------------------------------------------------------
# Data-loading configuration
#
# NOTE(review): `timeslices_all_enable`, `data_base_dir` and `tfrecords_base_dir`
# are read here but are not defined in any cell visible above; they must already
# exist in the kernel for this cell to run -- TODO define them in a config cell.
# ---------------------------------------------------------------------------

# Batch sizes and number of input files for the training / validation splits
batch_size = 5000
val_batch_size = 5000
train_file_size = 50
val_file_size = 10

# See: https://docs.google.com/document/d/1ZoqVyJOOAXhzt2egMWh3OoNJ6lWq5lNR6sjcYON4Vlo/edit?tab=t.0#heading=h.k6tyal7z5t5l
dataset_name = "dataset_2s"
# 50x12.5x100 micron pixel sensor => 13x21 pixel sensor array
sensor_geometry_name = "50x12P5x100"

# Either 20 or 2 timeslices
if timeslices_all_enable:
    timeslices_name = "timeslices20"
    timeslices_range = -1        # presumably "use every timeslice" -- TODO confirm
    timeslices_val = 20
else:
    timeslices_name = "timeslices2"
    timeslices_range = [0, 19]   # presumably the first and the last timeslice -- TODO confirm
    timeslices_val = 2

# Tag used in directory names, e.g. "bs5000"
batch_size_name = f"bs{batch_size}"

# Input: parquets
data_dir = f"{data_base_dir}/dataset_2s_50x12P5_parquets/unflipped/recon3D/"
labels_dir = f"{data_base_dir}/dataset_2s_50x12P5_parquets/unflipped/labels/"

# Output: tfrecords
tfrecords_dir_train = f"{tfrecords_base_dir}/tfrecords_{dataset_name}_{sensor_geometry_name}_{timeslices_name}_{batch_size_name}_train"
tfrecords_dir_val = f"{tfrecords_base_dir}/tfrecords_{dataset_name}_{sensor_geometry_name}_{timeslices_name}_{batch_size_name}_val"

def _build_data_generator(batch_size, file_count, load_from_tfrecords_dir, tfrecords_dir):
    """Create an OptimizedDataGenerator with the options shared by both splits.

    The training and validation generators differ only in the four arguments
    below; every other option is fixed here so the two cannot drift apart.

    NOTE(review): the hard-coded parquet paths point at dataset_3sr 100x25x150,
    while `dataset_name`, `sensor_geometry_name`, the (unused) `data_dir` /
    `labels_dir` and the scaling list (the 50x12.5x100 um row of the dataset_2s
    table) all describe dataset_2s 50x12.5 -- confirm which one is intended.

    Args:
        batch_size: value forwarded as `batch_size`.
        file_count: value forwarded as `file_count`.
        load_from_tfrecords_dir: value forwarded as `load_from_tfrecords_dir`.
        tfrecords_dir: value forwarded as `tfrecords_dir`.

    Returns:
        The constructed OptimizedDataGenerator.
    """
    return OptimizedDataGenerator(
        data_directory_path = "/data/dajiang/smartPixels/dataset_3s/dataset_3sr_100x25x150_parquets/unflipped/recon3D/",
        labels_directory_path = "/data/dajiang/smartPixels/dataset_3s/dataset_3sr_100x25x150_parquets/unflipped/labels/",
        is_directory_recursive = False,
        file_type = "parquet",
        data_format = "3D",
        batch_size = batch_size,
        file_count = file_count,
        to_standardize= True,
        include_y_local= False,
        # A fresh list per call, so the two generators never share a mutable object
        labels_list = ['x-midplane','y-midplane','cotAlpha','cotBeta'],
        scaling_list = [75.0, 18.75, 10.0, 1.22],
        input_shape = (timeslices_val,13,21),
        transpose = (0,2,3,1),
        files_from_end = True,
        shuffle = True,

        load_from_tfrecords_dir = load_from_tfrecords_dir,
        tfrecords_dir = tfrecords_dir,
        use_time_stamps = timeslices_range,
        max_workers = 1, # Don't make this too large (will use up all RAM)
        seed = 10,
        quantize = True # Quantization ON
    )


training_generator = _build_data_generator(
    batch_size = batch_size,
    file_count = train_file_size,
    load_from_tfrecords_dir = "/data/dajiang/smartPixels/tfrecords/tfrecords_dataset_3sr_100x25x150_20t_bs5000_train",
    tfrecords_dir = tfrecords_dir_train,
)

validation_generator = _build_data_generator(
    batch_size = val_batch_size,
    file_count = val_file_size,
    load_from_tfrecords_dir = "/data/dajiang/smartPixels/tfrecords/tfrecords_dataset_3sr_100x25x150_20t_bs5000_val",
    tfrecords_dir = tfrecords_dir_val,
)



In [4]:
# Pull every validation batch out of the generator, stack the batches into two
# NumPy arrays (inputs and labels) and save them to disk as .npy files.
#
# NOTE(review): `npy_base_dir` is not defined in any cell visible above; it must
# already exist in the kernel -- TODO define it in a config cell.

num_batches = len(validation_generator)  # The total number of batches for the validation dataset
print(num_batches)

X_val_batches = []
y_val_batches = []

for i_batch in range(num_batches): # Loop over all batches
    X_val, y_val = validation_generator[i_batch]
    # Each batch provides .numpy(); convert before stacking
    X_val = X_val.numpy()
    y_val = y_val.numpy()
    X_val_batches.append(X_val)
    y_val_batches.append(y_val)

# np.concatenate already returns a new array, so no extra np.array(...) copy is needed
X_val_all = np.concatenate(X_val_batches)
y_val_all = np.concatenate(y_val_batches)

os.makedirs(npy_base_dir, exist_ok=True)
np.save(f"{npy_base_dir}/X_{timeslices_name}_val.npy", X_val_all)
np.save(f"{npy_base_dir}/y_{timeslices_name}_val.npy", y_val_all)

47


2024-12-02 16:03:10.736112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3234 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB MIG 1g.5gb, pci bus id: 0000:81:00.0, compute capability: 8.0


batch = 0
(5000, 13, 21, 20)
(5000, 4)
batch = 1
(5000, 13, 21, 20)
(5000, 4)
batch = 2
(5000, 13, 21, 20)
(5000, 4)
batch = 3
(5000, 13, 21, 20)
(5000, 4)
batch = 4
(5000, 13, 21, 20)
(5000, 4)
batch = 5
(5000, 13, 21, 20)
(5000, 4)


In [5]:
# Build the network (compilation happens in the next cell).
# Input is a 13x21 pixel array with `timeslices_val` channels in the last axis.
model_input_shape = (13, 21, timeslices_val)
model = CreateModel(model_input_shape, n_filters=5, pool_size=3)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 13, 21, 20)]      0         
                                                                 
 q_separable_conv2d (QSepar  (None, 11, 19, 5)         285       
 ableConv2D)                                                     
                                                                 
 q_activation (QActivation)  (None, 11, 19, 5)         0         
                                                                 
 q_conv2d (QConv2D)          (None, 11, 19, 5)         30        
                                                                 
 q_activation_1 (QActivatio  (None, 11, 19, 5)         0         
 n)                                                              
                                                                 
 average_pooling2d (Average  (None, 3, 6, 5)           0     

In [6]:
# Compile with Adam (learning rate 1e-3) and the project's custom loss
adam_optimizer = Adam(learning_rate=0.001)
model.compile(loss=custom_loss, optimizer=adam_optimizer)

In [7]:
# training
# Checkpoint directory named after pitch, batch size and date.
# NOTE(review): when training runs, `checkpoint_filepath` and `mcp` are reassigned
# further down in this cell, so the callback built here is not the one passed to
# model.fit -- confirm whether this legacy setup is still needed.
pitch = '100x25x150'
date = '14Nov2024'
base_dir = '/home/dajiang/smart-pixels-ml/weights/weights_7pitches/weights-{}-bs{}-{}-checkpoints'.format(pitch, batch_size, date)

# makedirs(..., exist_ok=True) instead of mkdir: re-running the cell no longer
# raises FileExistsError, and missing parent directories are created as well.
os.makedirs(base_dir, exist_ok=True)
# Keras fills in {epoch}, {loss} and {val_loss} when it writes each checkpoint
checkpoint_filepath = base_dir + '/weights.{epoch:02d}-t{loss:.2f}-v{val_loss:.2f}.hdf5'
mcp = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    save_best_only=False,  # keep a checkpoint for every epoch
)

# Output locations for the best model: full model, weights only, and architecture.
# NOTE(review): `model_base_dir` and `model_name` are not defined in any cell
# visible above; they must already exist in the kernel.
_best_model_stem = f"{model_base_dir}/weights_7pitches/best_model_{model_name}"

# Full model (two file extensions)
best_model_hdf5 = f"{_best_model_stem}.hdf5"
best_model_keras = f"{_best_model_stem}.keras"
# Weights only (two file extensions)
best_model_weights_hdf5 = f"{_best_model_stem}_weights.hdf5"
best_model_weights_keras = f"{_best_model_stem}_weights.keras"
# Architecture as JSON
best_model_architecture_json = f"{_best_model_stem}_architecture.json"

# Either train the model and save the best epoch, or load a previously saved model.
# NOTE(review): `load_model_from_hdf5_enabled`, `batch_norm_enabled` and
# `model_base_dir` are not defined in any cell visible above.
if not load_model_from_hdf5_enabled:
    # training
    # NOTE(review): `es` and `print_scale` are created below but never passed to
    # model.fit (only `mcp` is) -- confirm whether that is intentional.
    es = EarlyStopping(
        patience=50,
        restore_best_weights=True
    )

    checkpoint_base_dir = f"{model_base_dir}/weights_7pitches/{dataset_name}_{sensor_geometry_name}_{timeslices_name}_{batch_size_name}"  + ("_bnorm" if batch_norm_enabled else "") + "-checkpoints"

    os.makedirs(checkpoint_base_dir, exist_ok=True)
    # Keras fills in {epoch}, {loss} and {val_loss} when it writes each checkpoint
    checkpoint_filepath = checkpoint_base_dir + '/weights.{epoch:02d}-t{loss:.2f}-v{val_loss:.2f}.hdf5'
    mcp = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        save_best_only=False,  # one checkpoint per epoch; the best one is picked afterwards
    )

    class ScalePrintingCallback(keras.callbacks.Callback):
        """Print the `scale` attribute of the model's last layer after each epoch."""

        def on_epoch_end(self, epoch, logs=None):
            scale_layer = self.model.layers[-1]
            # Shows both the raw value and its softplus
            print(
                f"scaling layer ({epoch}):",
                scale_layer.scale,
                tf.math.softplus(scale_layer.scale)
            )

    print_scale = ScalePrintingCallback()

    history = model.fit(x=training_generator,
                        validation_data=validation_generator,
                        callbacks=[mcp],
                        epochs=100,
                        shuffle=False, # shuffling now occurs within the data-loader
                        verbose=1)

    # Revert to best model
    # Pick the best epoch of THIS run from the training history instead of parsing
    # every file name in the checkpoint directory: the directory is reused across
    # runs (exist_ok=True), so it can hold stale checkpoints or unrelated files,
    # and file names only carry the losses rounded to two decimals.
    train_losses = history.history['loss']
    val_losses = history.history['val_loss']
    best_epoch_index = int(np.argmin(val_losses))
    # Rebuild the file name the same way ModelCheckpoint does (epochs are 1-based there)
    bestfile = os.path.basename(
        checkpoint_filepath.format(
            epoch=best_epoch_index + 1,
            loss=train_losses[best_epoch_index],
            val_loss=val_losses[best_epoch_index],
        )
    )
    model.load_weights(f"{checkpoint_base_dir}/{bestfile}")

    # Save (best) model information to file
    model.save(best_model_hdf5)
    model.save(best_model_keras)
    model.save_weights(best_model_weights_hdf5)
    model.save_weights(best_model_weights_keras)
    model_json = model.to_json()
    with open(best_model_architecture_json, "w") as json_file:
        json_file.write(model_json)

else:
    # Custom objects needed to deserialize the model: the project loss plus
    # whatever QKeras adds to the dictionary.
    co = {"custom_loss": custom_loss}
    _add_supported_quantized_objects(co)
    # This overrides the previously compiled model
    # TODO: load just weights
    model = load_model(best_model_hdf5, custom_objects=co)
    model.summary()

2024-11-15 02:16:31.017164: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8906
2024-11-15 02:16:31.249887: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-11-15 02:16:31.689255: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-11-15 02:16:31.710408: I tensorflow/core/util/cuda_solvers.cc:179] Creating GpuSolver handles for stream 0x7fa2a4bdbae0
2024-11-15 02:16:31.786865: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f847cb3f780 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-11-15 02:16:31.786916: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB MIG 1g.5gb, Compute Capability 8.0
2024-11-15 02:16:31.816761: I tensorflow/compiler/mlir/tensorflow/utils/dump

In [None]:
# Show the training/validation loss curves: re-display the saved PNG when the
# model was loaded from disk, otherwise plot them from `history` and save the PNG.
# NOTE(review): `model_base_dir`, `model_name` and `load_model_from_hdf5_enabled`
# are not defined in any cell visible above.
training_validation_loss_png = f"{model_base_dir}/weights_7pitches/training_validation_loss_{model_name}.png"
if load_model_from_hdf5_enabled:
    # Render the saved figure inline with matplotlib. The previous PIL
    # `Image.show()` launches an external OS viewer, which shows nothing in the
    # notebook and does not work on a headless host.
    img = plt.imread(training_validation_loss_png)
    fig, ax = plt.subplots()
    ax.imshow(img)
    ax.axis('off')  # the PNG already contains its own axes
    plt.show()
else:
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'], label='Training Loss')
    ax.plot(history.history['val_loss'], label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss')
    ax.legend()
    fig.savefig(training_validation_loss_png)  # Save as PNG
    plt.show()