In [1]:
import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from lib.vit_keras.vit_keras import vit
import os

In [2]:
# --- Dataset layout -------------------------------------------------------
# Root folder of the dataset; the split folders below are joined onto it.
DATASET_DIR = "_data/combined"
TRAIN_DIR, VAL_DIR = "train", "val"
# Sub-folder names for the input images and the mask images.
IMAGES_DIR, MASKS_DIR = "images", "leaf_instances"

# --- Model / training settings --------------------------------------------
# Target size every image and mask is resized to.
IMAGE_SIZE = (384, 384)
# Number of classes: used for one-hot encoding the masks and for the model head.
CLASSES = 93
# Tag used in log messages and checkpoint file names.
NAME = "VisionTransformer"
EPOCHS = 200

In [3]:
def gen_dataset(path, mask_subdir, batch_size, input_shape, shuffle_buffer=64):
    """Build a batched ``tf.data`` pipeline of (image, one-hot mask) pairs.

    Images are read from ``<path>/<IMAGES_DIR>`` and masks from
    ``<path>/<mask_subdir>``. Both folders are listed in sorted filename order
    and zipped element-wise, so the i-th image is paired with the i-th mask.

    :param path: Directory of one dataset split (contains the image and mask folders).
    :param mask_subdir: Name of the mask folder inside ``path``.
    :param batch_size: Number of pairs per batch; an incomplete final batch is dropped.
    :param input_shape: Sequence whose first two entries are the target (height, width).
    :param shuffle_buffer: Size of the shuffle buffer applied to the *paired*
        samples. Pass ``0`` or ``None`` to keep the sorted order.
    :return: Dataset yielding ``(images, masks)`` where images are scaled to
        [0, 1] and masks are one-hot encoded with ``CLASSES`` channels.
    """
    import warnings  # local import so this cell does not depend on an edited import cell

    target_size = tuple(input_shape[:2])

    # shuffle=False is essential: with the default (shuffle=True, no seed) each
    # loader shuffles its own file list independently, so zipping them would
    # pair every image with the mask of some other image.
    images = keras.utils.image_dataset_from_directory(
        os.path.join(path, IMAGES_DIR),
        labels=None,
        batch_size=1,
        image_size=target_size,
        crop_to_aspect_ratio=True,
        shuffle=False,
    ).map(lambda img: img / 255)

    # Masks store class ids. Nearest-neighbour resizing keeps them intact;
    # the default bilinear interpolation would blend neighbouring ids into
    # values that belong to unrelated classes.
    masks = keras.utils.image_dataset_from_directory(
        os.path.join(path, mask_subdir),
        labels=None,
        color_mode='grayscale',
        batch_size=1,
        image_size=target_size,
        interpolation='nearest',
        crop_to_aspect_ratio=True,
        shuffle=False,
    )

    print("Dataset Sizes:", len(images), len(masks))
    if len(images) != len(masks):
        # zip() silently stops at the shorter dataset, and a missing file shifts
        # every following pair, so make the problem visible.
        warnings.warn(
            f"{path}: found {len(images)} images but {len(masks)} masks; "
            "image/mask pairs are likely misaligned."
        )

    # Undo the loaders' batch_size=1 so samples can be shuffled and re-batched.
    pairs = tf.data.Dataset.zip((images, masks)).map(
        lambda img, mask: (tf.squeeze(img, axis=0), tf.squeeze(mask, axis=0))
    )
    if shuffle_buffer:
        # Shuffle before one-hot encoding: the buffer then holds single-channel
        # masks instead of CLASSES-channel tensors.
        pairs = pairs.shuffle(shuffle_buffer)

    datagen = (
        pairs
        .map(lambda img, mask: (img, to_categorical(mask, num_classes=CLASSES)),
             num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size=batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Sanity check: print the shape and maximum value of the first batch.
    for s in datagen.take(1).as_numpy_iterator():
        print("X", s[0].shape, tf.reduce_max(s[0]).numpy())
        print("Y", s[1].shape, tf.reduce_max(s[1]).numpy())
    return datagen

In [4]:
# Build the ViT-L/32 network from the bundled vit_keras package, untrained
# (pretrained=False, pretrained_top=False).
#
# NOTE(review): the traceback raised by model.fit in this notebook reports
# output.shape=(8, 93) against target.shape=(8, 384, 384, 93): this model emits
# one CLASSES-sized vector per image, while gen_dataset yields per-pixel one-hot
# masks. Training cannot start until the model produces a per-pixel output
# (or the targets are reduced to per-image labels) — TODO.
# NOTE(review): activation='sigmoid' is combined with CategoricalCrossentropy in
# the compile cell; presumably softmax was intended — confirm.
model = vit.vit_l32(
    image_size=IMAGE_SIZE[0],  # only the first entry of IMAGE_SIZE is passed
    activation='sigmoid',
    pretrained=False,
    include_top=True,
    pretrained_top=False,
    classes=CLASSES
)

2024-07-19 10:17:13.635398: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46866 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:06:00.0, compute capability: 8.6


In [5]:
# Resolve the per-split folders underneath the dataset root.
train_dir, val_dir = (
    os.path.join(DATASET_DIR, split_name) for split_name in (TRAIN_DIR, VAL_DIR)
)

In [6]:
# Build the training and validation pipelines with identical settings
# (batch size 8, masks from MASKS_DIR, images resized to IMAGE_SIZE).
train_dataset, val_dataset = (
    gen_dataset(split_dir, MASKS_DIR, 8, IMAGE_SIZE)
    for split_dir in (train_dir, val_dir)
)

Found 10599 files.
Found 10607 files.
Dataset Sizes: 10599 10607
X (8, 384, 384, 3) 1.0
Y (8, 384, 384, 93) 1.0
Found 69 files.
Found 69 files.


2024-07-19 10:17:17.516417: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Dataset Sizes: 69 69
X (8, 384, 384, 3) 1.0
Y (8, 384, 384, 93) 1.0


2024-07-19 10:17:18.037432: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
def multiclass_iou_loss(y_true, y_pred, smooth=1e-6):
    """
    Compute the soft IoU (Jaccard) loss for multiclass segmentation.

    For every class the intersection and union are accumulated over all axes
    except the trailing class axis (i.e. over the whole batch), and the loss is
    ``1 - IoU`` averaged over the classes.

    :param y_true: True labels, one-hot encoded, shape (batch_size, height, width, num_classes)
    :param y_pred: Predictions, shape (batch_size, height, width, num_classes)
    :param smooth: Smoothing factor to avoid division by zero
    :return: Scalar tensor, average IoU loss across all classes
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Labels may arrive as integers/bools; align them with the prediction dtype
    # so the element-wise product below is well defined.
    y_true = tf.cast(y_true, y_pred.dtype)

    # Every axis except the last (class) axis. Computed from the dynamic rank so
    # the function does not need a statically known shape.
    reduce_axes = tf.range(tf.rank(y_pred) - 1)

    intersection = tf.reduce_sum(y_true * y_pred, axis=reduce_axes)
    total = tf.reduce_sum(y_true, axis=reduce_axes) + tf.reduce_sum(y_pred, axis=reduce_axes)
    union = total - intersection

    # One IoU value per class; `smooth` keeps absent classes from dividing by zero.
    iou_per_class = (intersection + smooth) / (union + smooth)
    return tf.reduce_mean(1.0 - iou_per_class)

def combined_bce_iou_loss(y_true, y_pred):
    """
    Sum of binary cross-entropy and the multiclass IoU loss.

    :param y_true: True labels, passed unchanged to both loss terms
    :param y_pred: Predictions, passed unchanged to both loss terms
    :return: ``binary_crossentropy(y_true, y_pred) + multiclass_iou_loss(y_true, y_pred)``;
             the IoU term is added to whatever tensor the cross-entropy call returns
    """
    return (
        tf.keras.losses.binary_crossentropy(y_true, y_pred)
        + multiclass_iou_loss(y_true, y_pred)
    )

In [8]:
# Optimizer and loss with library defaults.
opt = keras.optimizers.SGD()
loss = tf.keras.losses.CategoricalCrossentropy()
model.compile(
    loss=loss,
    optimizer=opt,
    metrics=[
        keras.metrics.CategoricalAccuracy(),
        # Explicitly named so the "best model" checkpoint below can monitor it
        # without relying on Keras' auto-generated metric name.
        keras.metrics.OneHotMeanIoU(num_classes=CLASSES, name="one_hot_mean_iou"),
    ],
)

# Last path component of the dataset root, used to tag checkpoint files.
dataset_tag = os.path.basename(DATASET_DIR)

callbacks = [
    #keras.callbacks.EarlyStopping(patience=5),
    # One checkpoint per epoch; `{epoch:02d}` is filled in by Keras at save time.
    keras.callbacks.ModelCheckpoint(
        filepath=f'checkpoints/model_{NAME}.{{epoch:02d}}_{dataset_tag}.keras',
    ),
    keras.callbacks.TensorBoard(log_dir='./logs'),
    # Keep only the model with the highest validation mean IoU. The monitored
    # key must be "val_" + the name of a compiled metric; the previous value
    # ('val_one_hot_mean_io_u:') matched no metric, so nothing was ever saved.
    keras.callbacks.ModelCheckpoint(
        filepath=f'out/best_{NAME}_{dataset_tag}.keras',
        save_best_only=True,
        mode='max',
        monitor='val_one_hot_mean_iou',
    ),
]


In [9]:
model.summary()

print(f"Beginning training of model {NAME}")

model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks, validation_data=val_dataset)

print("Training finished, starting test evaluation")

# No separate test split is loaded in this notebook, so the final evaluation
# runs on the validation dataset. (The previous `val_data` was an undefined
# name and raised NameError only after the whole training run had finished.)
result = model.evaluate(val_dataset)
print(result)

Beginning training of model VisionTransformer
Epoch 1/200


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(8, 384, 384, 93), output.shape=(8, 93)