In [1]:
# Environment Variables: 

YES = 1
NO = 0

DATA_DIR = '/home/pujan/Research/RHEED/Data/' # Change to your DATA PATH
using_GPU = YES

In [2]:
# Imports for Training

import os
import numpy as np
import matplotlib.pyplot as plt
import h5py
from scipy.ndimage import median_filter
from scipy.optimize import least_squares
from scipy.optimize import curve_fit
from dask import delayed, compute
from dask.distributed import Client
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import tensorflow as tf
from qkeras import *

%matplotlib inline
output_scaler = StandardScaler()
if(using_GPU):
    print(tf.config.list_physical_devices('GPU'))

2025-03-31 17:42:53.017549: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-31 17:42:53.050592: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-31 17:42:53.050635: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-31 17:42:53.050664: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-31 17:42:53.058477: I tensorflow/core/platform/cpu_feature_g

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:5', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:6', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:7', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:8', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:9', device_type='GPU')]


In [3]:
# Read H5 Data File:

RHEED_data_file = DATA_DIR + 'RHEED_4848_test6.h5'
spot = 'spot_2'
h5 = h5py.File(RHEED_data_file, 'r')

raw_data = []
for growth in h5.keys():
    raw_data.extend(h5[growth][spot])
raw_data = np.array(raw_data).astype(np.float32)
raw_data = np.expand_dims(raw_data, axis=-1).astype(np.float32) # if (batch_size, height, width, channels)

print(f'[Raw Images Shape]: {raw_data.shape}')

[Raw Images Shape]: (150985, 48, 48, 1)


In [4]:
# Normalize w/ image max

normalized_images = []
normalized_factor = []
for image in tqdm(raw_data):
    normalized_images.append(image / np.max(image))
    normalized_factor.append(np.max(image))
normalized_images = np.array(normalized_images).astype(np.float32)
normalized_factor = np.array(normalized_factor).astype(np.float32)


print(f'[Normalized Images Shape]: {normalized_images.shape}')

100%|██████████| 150985/150985 [00:01<00:00, 109449.84it/s]


[Normalized Images Shape]: (150985, 48, 48, 1)


In [5]:
# Estimate Labels:
load_labels = YES # (Takes <1 min to load, ~40 mins to generate)
p_count = 100 
# Import From File
if load_labels:
    RHEED_label_file = DATA_DIR + 'Estimated_Labels.npy'
    estimated_labels = np.load(RHEED_label_file)

# Generate
else:
    from multiprocessing import Pool
    with Pool(100) as p:
        estimated_labels = np.array(list(tqdm(p.imap(estimate_label, normalized_images),
                                     total=len(normalized_images),
                                     position=0)))

# avg_params = np.mean(estimated_labels, axis=0) # output_scaler.mean_ ?
# std_params = np.std(estimated_labels, axis=0) # output_scaler.var_ ?

print(f'[Estimated Labels Shape]: {estimated_labels.shape}')

[Estimated Labels Shape]: (150985, 5)


In [6]:
# Create DataSet:
batch_size = 1000

with tf.device('CPU'):
    dataset = tf.data.Dataset.from_tensor_slices(normalized_images)
    dataset = dataset.shuffle(normalized_images.shape[0], reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)

output_scaler.fit(estimated_labels)

2025-03-31 17:43:38.362479: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2143] Unable to enable peer access between device ordinals 0 and 9, status: INTERNAL: failed to enable peer access from 0x562bb1bb4db0 to 0x562bab8f9300: CUDA_ERROR_TOO_MANY_PEERS: peer mapping resources exhausted
2025-03-31 17:43:38.381894: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2143] Unable to enable peer access between device ordinals 1 and 9, status: INTERNAL: failed to enable peer access from 0x562ba9445ce0 to 0x562bab8f9300: CUDA_ERROR_TOO_MANY_PEERS: peer mapping resources exhausted
2025-03-31 17:43:38.395296: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2143] Unable to enable peer access between device ordinals 2 and 9, status: INTERNAL: failed to enable peer access from 0x562ba9eb71f0 to 0x562bab8f9300: CUDA_ERROR_TOO_MANY_PEERS: peer mapping resources exhausted
2025-03-31 17:43:38.406651: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2143] Unable to enable peer access between

In [7]:
# TF Functions: (TENSORFLOW)
print_example_guassian = NO
print_example_loss = NO


# mean_x, mean_y, cov_x, cov_y, theta
def generate_guassian(batch, image_shape):
    batch_size = batch.shape[0]
    mean_x, mean_y, cov_x, cov_y, theta = tf.unstack(batch, axis=-1)
    x = tf.range(image_shape[1], dtype=tf.float32)[:, tf.newaxis]
    x = tf.tile(x, [1, image_shape[0]])

    y = tf.range(image_shape[0], dtype=tf.float32)[tf.newaxis, :]
    y = tf.tile(y, [image_shape[1], 1])

    x = tf.tile(tf.expand_dims(x, 0), [batch_size, 1, 1])
    y = tf.tile(tf.expand_dims(y, 0), [batch_size, 1, 1])

    rota_matrix = tf.stack([tf.cos(theta), -tf.sin(theta), tf.sin(theta), tf.cos(theta)], axis=-1)
    rota_matrix = tf.reshape(rota_matrix, (batch_size, 2, 2))

    xy = tf.stack([x - tf.reshape(mean_x, (-1, 1, 1)), y - tf.reshape(mean_y, (-1, 1, 1))], axis=-1)
    xy = tf.einsum('bijk,bkl->bijl', xy, rota_matrix)

    img = tf.exp(-0.5 * (xy[:, :, :, 0]**2 / tf.reshape(cov_x, (-1, 1, 1))**2 + xy[:, :, :, 1]**2 / tf.reshape(cov_y, (-1, 1, 1))**2))

    return tf.expand_dims(img, axis=-1) # if (batch_size, height, width, channels)
    return tf.expand_dims(img, axis=1)  # if (batch_size, channels, height, width)

def custom_weighted_mse_loss(I, J, n):
    W = tf.pow(I, n)
    squared_diffs = tf.pow(I - J, 2)
    weighted_squared_diffs = W * squared_diffs

    return tf.reduce_mean(weighted_squared_diffs)

if print_example_loss:
    I = tf.random.normal((5, 1, 48, 48))
    J = tf.random.normal((5, 1, 48, 48))
    n = 2
    loss = custom_weighted_mse_loss(I, J, n)
    print("[Custom Weighted MSE Loss]:", loss.numpy())

if print_example_guassian:
    image_shape = (48, 48)
    batch = tf.convert_to_tensor(estimated_labels[0:5])
    generated_imgs = generate_guassian(batch, image_shape)
    plt.imshow(tf.squeeze(generated_imgs[0]))
    plt.show()

In [8]:
# Model Architecture QAT
integer_bits = 2
fraction_bits = 6
symmetric = 0
keep_negative = 1

total_bits = integer_bits + fraction_bits
model = tf.keras.Sequential(
    [
        QConv2DBatchnorm(
            filters=6, kernel_size=5, strides=1, padding='valid',
            kernel_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            bias_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            kernel_initializer='lecun_uniform',
            kernel_regularizer=tf.keras.regularizers.l1(0.0001),
            use_bias=True,
        ),
       
        QActivation(f"quantized_relu({total_bits}, {integer_bits})"),
        
        tf.keras.layers.MaxPool2D(pool_size=4, strides=4),

        QConv2DBatchnorm(
            filters=16, kernel_size=5, strides=1, padding='valid',
            kernel_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            bias_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            kernel_initializer='lecun_uniform',
            kernel_regularizer=tf.keras.regularizers.l1(0.0001),
            use_bias=True,
        ),

        QActivation(f"quantized_relu({total_bits}, {integer_bits})"),
        
        tf.keras.layers.MaxPool2D(pool_size=2, strides=2),

        tf.keras.layers.Flatten(),

        QDense(
            units=98,
            kernel_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            bias_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
        ),
        
        QActivation(f"quantized_relu({total_bits}, {integer_bits})"),

        QDense(
            units=52,
            kernel_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            bias_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
        ),
        QActivation(f"quantized_relu({total_bits}, {integer_bits})"),

        QDense(
            units=5,
            kernel_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
            bias_quantizer=f"quantized_bits({total_bits}, {integer_bits}, alpha=1)",
        )
    ]
)

# Compile the quantization-aware model
model.compile(optimizer='adam', loss=custom_weighted_mse_loss, run_eagerly=True)

In [10]:
# Training Loop
train_model = YES
save_model = NO
load_model = NO
model_summary = NO

if train_model:
    num_epochs = 200
    lr = 0.0001
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    n = 1

    for epoch in range(num_epochs):
        running_loss = 0.0

        if epoch % 10 == 0:
            n += 0.1

        for image_batch in tqdm(dataset): 
            with tf.GradientTape() as tape:
                embedding = model(image_batch)
                unscaled_param = tf.constant(embedding * output_scaler.var_ ** 0.5 + output_scaler.mean_)
                final = generate_guassian(unscaled_param, (48,48))
                loss = custom_weighted_mse_loss(image_batch, final, n)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            running_loss += loss.numpy()
        average_loss = running_loss / len(dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss}")

if (save_model and not load_model):
    model.save(Gaussian_Model_QAT_file)

if (load_model and not save_model):
    with tf.keras.utils.custom_object_scope({'custom_weighted_mse_loss': custom_weighted_mse_loss,
                                         'QConv2D': QConv2D,
                                         'QBatchNormalization': QBatchNormalization,
                                         'QDense': QDense,
                                         'QActivation': QActivation
                                         }):
        model = tf.keras.models.load_model(Gaussian_Model_QAT_file)

if model_summary:
    model.summary()

  0%|          | 0/151 [00:00<?, ?it/s]2025-03-31 17:44:20.389557: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2025-03-31 17:44:22.269130: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x562b9e6b7160 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-03-31 17:44:22.269161: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2025-03-31 17:44:22.269167: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2025-03-31 17:44:22.269172: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (2): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2025-03-31 17:44:22.269176: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (3): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2025-03-31 17:44:22.269180: I tensorfl



100%|██████████| 151/151 [00:11<00:00, 12.69it/s]


Epoch [1/200], Loss: 0.007239880481278462


100%|██████████| 151/151 [00:06<00:00, 21.82it/s]


Epoch [2/200], Loss: 0.005931984879915288


100%|██████████| 151/151 [00:06<00:00, 21.76it/s]


Epoch [3/200], Loss: 0.0049451367392988


 61%|██████    | 92/151 [00:04<00:02, 21.81it/s]


KeyboardInterrupt: 

In [None]:
model.save('/home/pujan/Research/RHEED/Data/Models/Gaussian_Model_QAT_2I_6F.h5')

In [21]:
model.save_weights('/home/pujan/Research/RHEED/Data/Models/Gaussian_Model_QAT_2I_6F_weights.h5')

In [None]:
from qkeras.autoqkeras.utils import print_qmodel_summary
print(print_qmodel_summary(model))

In [None]:
# Imports for Synthesizing
import hls4ml

In [None]:
from pprint import pprint
pprint(model.to_json())

In [None]:
reuse = 32 #reuse factor

precision = 'ap_fixed<{},{}>'.format((total_bits), integer_bits)


hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name')

for Layer in hls_config['LayerName'].keys():
    hls_config['LayerName'][Layer]['Strategy'] = 'Resource'
    hls_config['LayerName'][Layer]['ReuseFactor'] = reuse
    # hls_config['LayerName'][Layer]['weight'] = precision
    # hls_config['LayerName'][Layer]['scale'] = precision
    # hls_config['LayerName'][Layer]['bias'] = precision


In [None]:
cfg = hls4ml.converters.create_config(backend='Vitis')
cfg['IOType']     = 'io_parallel' #io_stream
cfg['HLSConfig']  = hls_config
cfg['KerasModel'] = model

hls_model = hls4ml.converters.keras_to_hls(cfg)
hls_model.compile()

# hls_model._compile()

In [None]:
hls_model.build( 
    reset=False, 
    csim=True, 
    synth=True,
    cosim=False,
    validation=False,
    export=False,
    vsynth=True
)

In [None]:
# Generate Plausible Gaussians for Validation:
num_generated_gaussians = 1000

generated_gaussians_labels = []
for num in range(num_generated_gaussians):
    new_gaussian_label = []
    for (avg, std) in zip(avg_params, std_params):
        new_gaussian_label.append(avg + np.random.normal(loc=0, scale=std))
    generated_gaussians_labels.append(new_gaussian_label)
generated_gaussians_labels = np.array(generated_gaussians_labels)

generated_gaussians_images = generate_guassian(tf.convert_to_tensor(generated_gaussians_labels, dtype=tf.float32), (48,48))

print(f'[Generated Gaussian Labels Shape]: {generated_gaussians_labels.shape}')
print(f'[Generated Guassian Images Shape]: {generated_gaussians_images.shape}')

In [None]:
predicted_gaussian_labels = np.array(hls_model.predict(np.array(generated_gaussians_images[:5])))
print(predicted_gaussian_labels)

In [None]:
predicted_gaussian_labels = np.array(model.predict(np.array(generated_gaussians_images[:5])))
print(predicted_gaussian_labels)