In [1]:
import os
import sys 
os.chdir("/workspaces/dev/modules")
sys.path.append("/workspaces/dev/models/PillNet")
os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit=false"
os.environ["XLA_FLAGS"] = "--xla_cpu_multi_thread_eigen=false intra_op_parallelism_threads=1 inter_op_parallelism_threads=1"

In [2]:
from tensorflow import keras
import tensorflow as tf
from datetime import datetime

2025-03-20 19:31:25.228872: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-20 19:31:25.239251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742499085.251300  120200 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742499085.254617  120200 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 19:31:25.268152: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
from Pills.MaskedSequence import MaskedSequence
from PillNetPT import PillNetPT
from image_segmentation.Callback import EpochTracker

In [4]:
SOURCE_PATH = '/workspaces/dev/_Shared/Datasets/pills/data'
LABEL_PATH = '/workspaces/dev/_Shared/Datasets/pills/class_label.csv'
SHAPE_PATH = '/workspaces/dev/_Shared/Datasets/pills/class_shape_id.csv'
SHAPE_ID_PATH = '/workspaces/dev/_Shared/Datasets/pills/id_shape.csv'
BACKGROUND_IMAGEES_PATH = '/workspaces/dev/data/pills/background'

In [5]:
TRAIN_DATA_SIZE = 10
VALIDATION_DATA_SIZE = 1
BATCH_SIZE = 8
INPUT_SIZE = (256, 256, 3)

In [6]:
model = PillNetPT()
# model.compile(jit_compile=False)
model.compile(jit_compile=False, run_eagerly=True)
model.build(BATCH_SIZE, INPUT_SIZE)
model.summary()

I0000 00:00:1742499088.106135  120200 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9502 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9


In [7]:
train_sequence = MaskedSequence(
  TRAIN_DATA_SIZE, BATCH_SIZE, input_shape = INPUT_SIZE[:2],
  material_path=SOURCE_PATH, label_csv_path=LABEL_PATH, 
  shape_csv_path=SHAPE_PATH, id_shape_csv_path=SHAPE_ID_PATH,
  background_images_path=BACKGROUND_IMAGEES_PATH,
  use_multiprocessing=True, workers = 12
)

In [8]:
validation_sequence = MaskedSequence(
  TRAIN_DATA_SIZE, BATCH_SIZE, input_shape = INPUT_SIZE[:2],
  material_path=SOURCE_PATH, label_csv_path=LABEL_PATH, 
  shape_csv_path=SHAPE_PATH, id_shape_csv_path=SHAPE_ID_PATH,
  background_images_path=BACKGROUND_IMAGEES_PATH,
  use_multiprocessing=True, workers = 8
)

In [9]:
train_dataset = tf.data.Dataset.from_generator(
  lambda: train_sequence, 
  output_signature=(
    tf.TensorSpec(shape=(BATCH_SIZE, *INPUT_SIZE), dtype=tf.float32),
    tf.TensorSpec(shape=(BATCH_SIZE, *INPUT_SIZE), dtype=tf.float32),
  )
)

In [10]:
checkpoint_cb = keras.callbacks.ModelCheckpoint(
  filepath=f"/workspaces/dev/models/PillNet/checkpoints/PillNetPT_{datetime.now().strftime('%Y_%m_%d')}.keras",
  monitor="feature_matching_metric",
  mode = 'min',
  save_best_only=True,
  save_weights_only=False,
  verbose = 1
)

In [11]:
early_stopping_cb = keras.callbacks.EarlyStopping(
  monitor="feature_matching_metric",
  mode = 'min',
  patience=10,
  restore_best_weights=True,
  verbose = 1
)

In [12]:
# keras.config.disable_traceback_filtering()
model.fit(
  train_sequence,
  validation_data=validation_sequence,
  epochs=2,
  callbacks=[checkpoint_cb, early_stopping_cb, EpochTracker()],
  verbose=1
)

Epoch 1/2


I0000 00:00:1742499138.115229  120200 cuda_dnn.cc:529] Loaded cuDNN version 90300



 current epoch:  0
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m3s[0m 4s/step - feature_matching_metric: 0.0735 - mae: 0.3121
 current epoch:  0


2025-03-20 19:32:22.151900: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.09GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - feature_matching_metric: 0.0727 - mae: 0.3238
Epoch 1: feature_matching_metric improved from inf to 0.07179, saving model to /workspaces/dev/models/PillNet/checkpoints/PillNetPT_2025_03_20.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7s/step - feature_matching_metric: 0.0724 - mae: 0.3278 - val_feature_matching_metric: 0.0764 - val_mae: 0.3701
Epoch 2/2

 current epoch:  1
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 1s/step - feature_matching_metric: 0.0890 - mae: 0.3534
 current epoch:  1
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - feature_matching_metric: 0.0865 - mae: 0.3396
Epoch 2: feature_matching_metric did not improve from 0.07179
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5s/step - feature_matching_metric: 0.0857 - mae: 0.3350 - val_feature_matching_metric: 0.1117 - val_mae: 0.2016
Restoring model weights from the end 

In [13]:
raise Exception("End")

Exception: End

In [None]:
model.save(f"/workspaces/dev/models/PillNet/checkpoints/PillNetPT_sub_{datetime.now().strftime('%Y_%m_%d_sub')}.keras")