# Imports

In [None]:
import os
import json
from tqdm.auto import tqdm
import xml.etree.ElementTree as ET

import tensorflow as tf
from tensorflow import keras

import keras_cv
from keras_cv import bounding_box
from keras_cv import visualization

2024-08-12 15:06:05.474870: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 15:06:06.022191: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 15:06:06.178862: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-12 15:06:07.381527: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
#print(os.listdir('/root/dataset/subset_dataset'))
dataset_location = '/root/dataset/subset_dataset'

# Load Data

In [6]:
"""
A dictionary is created to map each class name to a unique numerical identifier. This
mapping is used to encode and decode the class labels during training and inference in
object detection tasks.
"""

class_ids = [
    "dieback_ash"
]
class_mapping = dict(zip(range(len(class_ids)), class_ids))

# Path to images and annotations
images_location = os.path.join(dataset_location, 'images')
labels_loation = os.path.join(dataset_location, 'labels')

# Get all XML file paths in path_annot and sort them
json_files = sorted(
    [
        os.path.join(labels_loation, file_name)
        for file_name in os.listdir(labels_loation)
        if file_name.endswith(".json")
    ]
)

# Get all JPEG image file paths in path_images and sort them
jpg_files = sorted(
    [
        os.path.join(images_location, file_name)
        for file_name in os.listdir(images_location)
        if file_name.endswith(".jpg")
    ]
)

def parse_label(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)

    image_name = data['imagePath']
    image_path = os.path.join(images_location, image_name)

    boxes = []
    classes = []
    for obj in data['shapes']:
        cls = obj['label']
        classes.append(cls)

        bbox = obj['points']
        xmin = float(bbox[0][0])
        ymin = float(bbox[0][1])
        xmax = float(bbox[1][0])
        ymax = float(bbox[1][1])
        boxes.append([xmin, ymin, xmax, ymax])

    class_ids = [
        list(class_mapping.keys())[list(class_mapping.values()).index(cls)]
        for cls in classes
    ]
    return image_path, boxes, class_ids


image_paths = []
bbox = []
classes = []
for json_file in tqdm(json_files):
    image_path, boxes, class_ids = parse_label(json_file)
    image_paths.append(image_path)
    bbox.append(boxes)
    classes.append(class_ids)

  0%|          | 0/195 [00:00<?, ?it/s]

In [7]:
SPLIT_RATIO = 0.2
BATCH_SIZE = 1
LEARNING_RATE = 0.001
EPOCH = 5
GLOBAL_CLIPNORM = 10.0


bbox = tf.ragged.constant(bbox)
classes = tf.ragged.constant(classes)
image_paths = tf.ragged.constant(image_paths)

data = tf.data.Dataset.from_tensor_slices((image_paths, classes, bbox))

"""
Splitting data in training and validation data
"""

# Determine the number of validation samples
num_val = int(len(json_files) * SPLIT_RATIO)

# Split the dataset into train and validation sets
val_data = data.take(num_val)
train_data = data.skip(num_val)


def load_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    return image


def load_dataset(image_path, classes, bbox):
    # Read Image
    image = load_image(image_path)
    bounding_boxes = {
        "classes": tf.cast(classes, dtype=tf.float32),
        "boxes": bbox,
    }
    return {"images": tf.cast(image, tf.float32), "bounding_boxes": bounding_boxes}

I0000 00:00:1723471615.473896   67722 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:29:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1723471617.129504   67722 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:29:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1723471617.129552   67722 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:29:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1723471617.152522   67722 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:29:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1723471617.152570   67722 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:29:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

# Augmentation

In [8]:
## Data Augmentation
"""
One of the most challenging tasks when constructing object detection pipelines is data
augmentation. It involves applying various transformations to the input images to
increase the diversity of the training data and improve the model's ability to
generalize. However, when working with object detection tasks, it becomes even more
complex as these transformations need to be aware of the underlying bounding boxes and
update them accordingly.

KerasCV provides native support for bounding box augmentation. KerasCV offers an
extensive collection of data augmentation layers specifically designed to handle bounding
boxes. These layers intelligently adjust the bounding box coordinates as the image is
transformed, ensuring that the bounding boxes remain accurate and aligned with the
augmented images.

By leveraging KerasCV's capabilities, developers can conveniently integrate bounding
box-friendly data augmentation into their object detection pipelines. By performing
on-the-fly augmentation within a tf.data pipeline, the process becomes seamless and
efficient, enabling better training and more accurate object detection results.
"""

augmenter = keras.Sequential(
    layers=[
        keras_cv.layers.RandomFlip(mode="horizontal", bounding_box_format="xyxy"),
        keras_cv.layers.RandomShear(
            x_factor=0.2, y_factor=0.2, bounding_box_format="xyxy"
        ),
        keras_cv.layers.JitteredResize(
            target_size=(640, 640), scale_factor=(0.75, 1.3), bounding_box_format="xyxy"
        ),
    ]
)

# Dataset Creation


In [9]:
"""
## Creating Training Dataset
"""

train_ds = train_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.shuffle(BATCH_SIZE * 1)
train_ds = train_ds.ragged_batch(BATCH_SIZE, drop_remainder=True)
train_ds = train_ds.map(augmenter, num_parallel_calls=tf.data.AUTOTUNE)

"""
## Creating Validation Dataset
"""

resizing = keras_cv.layers.JitteredResize(
    target_size=(640, 640),
    scale_factor=(0.75, 1.3),
    bounding_box_format="xyxy",
)

val_ds = val_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.shuffle(BATCH_SIZE * 1)
val_ds = val_ds.ragged_batch(BATCH_SIZE, drop_remainder=True)
val_ds = val_ds.map(resizing, num_parallel_calls=tf.data.AUTOTUNE)


# Visualisation

In [11]:
test = train_ds.take(1).as_numpy_iterator()
test = next(test)
print(test)
test_image = test[0]
bboxes = test[1]['boxes'][0]
visualization.plot_bounding_box_gallery(
    test_image,
    value_range=(0, 255),
    rows=1,
    cols=1,
    y_true=bboxes,
    scale=5,
    font_scale=0.7,
    bounding_box_format='xyxy',
    class_mapping=class_mapping,
)

(array([[[[ 66.02933 ,  99.135925,  61.721073],
         [ 65.66833 ,  96.99063 ,  63.442562],
         [ 60.215446,  88.790665,  58.861217],
         ...,
         [172.24806 , 193.21626 , 118.06815 ],
         [165.00566 , 181.99548 , 110.98962 ],
         [188.8495  , 204.1371  , 134.86139 ]],

        [[ 60.57143 ,  90.80391 ,  56.810745],
         [ 78.03141 , 106.962135,  76.065704],
         [ 79.210754, 105.57271 ,  77.920746],
         ...,
         [165.89082 , 186.88937 , 111.8598  ],
         [158.7063  , 175.09338 , 104.70555 ],
         [186.11546 , 201.1686  , 132.116   ]],

        [[ 59.48938 ,  87.466286,  56.19877 ],
         [ 90.378586, 116.47752 ,  88.348045],
         [ 93.07867 , 116.37177 ,  91.83502 ],
         ...,
         [175.06381 , 197.16397 , 120.38739 ],
         [167.0734  , 184.8273  , 112.93527 ],
         [183.48718 , 199.95866 , 129.18257 ]],

        ...,

        [[196.28662 , 219.66537 , 158.24529 ],
         [174.91133 , 202.60332 , 136.88687 

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [11]:
"""
## Visualization
"""

"""
def visualize_dataset(inputs, value_range, rows, cols, bounding_box_format):
    inputs = next(iter(inputs.take(1)))
    images, bounding_boxes = inputs["images"], inputs["bounding_boxes"]
    visualization.plot_bounding_box_gallery(
        images,
        value_range=value_range,
        rows=rows,
        cols=cols,
        y_true=bounding_boxes,
        scale=5,
        font_scale=0.7,
        bounding_box_format=bounding_box_format,
        class_mapping=class_mapping,
    )


visualize_dataset(
    train_ds, bounding_box_format="xyxy", value_range=(0, 255), rows=2, cols=2
)

visualize_dataset(
    val_ds, bounding_box_format="xyxy", value_range=(0, 255), rows=2, cols=2
)

"""
"""
We need to extract the inputs from the preprocessing dictionary and get them ready to be
fed into the model.
"""
def dict_to_tuple(inputs):
    return inputs["images"], inputs["bounding_boxes"]


train_ds = train_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

val_ds = val_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

# Model Creation and Training

In [13]:
"""
## Creating Model
"""

"""
YOLOv8 is a cutting-edge YOLO model that is used for a variety of computer vision tasks,
such as object detection, image classification, and instance segmentation. Ultralytics,
the creators of YOLOv5, also developed YOLOv8, which incorporates many improvements and
changes in architecture and developer experience compared to its predecessor. YOLOv8 is
the latest state-of-the-art model that is highly regarded in the industry.
"""

"""
Below table compares the performance metrics of five different YOLOv8 models with
different sizes (measured in pixels): YOLOv8n, YOLOv8s, YOLOv8m, YOLOv8l, and YOLOv8x.
The metrics include mean average precision (mAP) values at different
intersection-over-union (IoU) thresholds for validation data, inference speed on CPU with
ONNX format and A100 TensorRT, number of parameters, and number of floating-point
operations (FLOPs) (both in millions and billions, respectively). As the size of the
model increases, the mAP, parameters, and FLOPs generally increase while the speed
decreases. YOLOv8x has the highest mAP, parameters, and FLOPs but also the slowest
inference speed, while YOLOv8n has the smallest size, fastest inference speed, and lowest
mAP, parameters, and FLOPs.

| Model                                                                                |
size<br><sup>(pixels) | mAP<sup>val<br>50-95 | Speed<br><sup>CPU ONNX<br>(ms) |
Speed<br><sup>A100 TensorRT<br>(ms) | params<br><sup>(M) | FLOPs<br><sup>(B) |
| ------------------------------------------------------------------------------------ |
--------------------- | -------------------- | ------------------------------ |
----------------------------------- | ------------------ | ----------------- |
| YOLOv8n | 640                   | 37.3                 | 80.4
| 0.99                                | 3.2                | 8.7               |
| YOLOv8s | 640                   | 44.9                 | 128.4
| 1.20                                | 11.2               | 28.6              |
| YOLOv8m | 640                   | 50.2                 | 234.7
| 1.83                                | 25.9               | 78.9              |
| YOLOv8l | 640                   | 52.9                 | 375.2
| 2.39                                | 43.7               | 165.2             |
| YOLOv8x | 640                   | 53.9                 | 479.1
| 3.53                                | 68.2               | 257.8             |
"""

"""
You can read more about YOLOV8 and its architecture in this
[RoboFlow Blog](https://blog.roboflow.com/whats-new-in-yolov8/)
"""

"""
First we will create a instance of backbone which will be used by our yolov8 detector
class.

YOLOV8 Backbones available in KerasCV:

1.   Without Weights:

    1.   yolo_v8_xs_backbone
    2.   yolo_v8_s_backbone
    3.   yolo_v8_m_backbone
    4.   yolo_v8_l_backbone
    5.   yolo_v8_xl_backbone

2. With Pre-trained coco weight:

    1.   yolo_v8_xs_backbone_coco
    2.   yolo_v8_s_backbone_coco
    2.   yolo_v8_m_backbone_coco
    2.   yolo_v8_l_backbone_coco
    2.   yolo_v8_xl_backbone_coco



"""

backbone = keras_cv.models.YOLOV8Backbone.from_preset(
    "yolo_v8_s_backbone_coco"  # We will use yolov8 small backbone with coco weights
)

"""
Next, let's build a YOLOV8 model using the `YOLOV8Detector`, which accepts a feature
extractor as the `backbone` argument, a `num_classes` argument that specifies the number
of object classes to detect based on the size of the `class_mapping` list, a
`bounding_box_format` argument that informs the model of the format of the bbox in the
dataset, and a finally, the feature pyramid network (FPN) depth is specified by the
`fpn_depth` argument.

It is simple to build a YOLOV8 using any of the aforementioned backbones thanks to
KerasCV.

"""

yolo = keras_cv.models.YOLOV8Detector(
    num_classes=len(class_mapping),
    bounding_box_format="xyxy",
    backbone=backbone,
    fpn_depth=1,
)

"""
## Compile the Model
"""

"""
Loss used for YOLOV8


1. Classification Loss: This loss function calculates the discrepancy between anticipated
class probabilities and actual class probabilities. In this instance,
`binary_crossentropy`, a prominent solution for binary classification issues, is
Utilized. We Utilized binary crossentropy since each thing that is identified is either
classed as belonging to or not belonging to a certain object class (such as a person, a
car, etc.).

2. Box Loss: `box_loss` is the loss function used to measure the difference between the
predicted bounding boxes and the ground truth. In this case, the Complete IoU (CIoU)
metric is used, which not only measures the overlap between predicted and ground truth
bounding boxes but also considers the difference in aspect ratio, center distance, and
box size. Together, these loss functions help optimize the model for object detection by
minimizing the difference between the predicted and ground truth class probabilities and
bounding boxes.


"""

optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    global_clipnorm=GLOBAL_CLIPNORM,
)

yolo.compile(
    optimizer=optimizer, classification_loss="binary_crossentropy", box_loss="ciou"
)

"""
## COCO Metric Callback

We will be using `BoxCOCOMetrics` from KerasCV to evaluate the model and calculate the
Map(Mean Average Precision) score, Recall and Precision. We also save our model when the
mAP score improves.
"""


class EvaluateCOCOMetricsCallback(keras.callbacks.Callback):
    def __init__(self, data, save_path):
        super().__init__()
        self.data = data
        self.metrics = keras_cv.metrics.BoxCOCOMetrics(
            bounding_box_format="xyxy",
            evaluate_freq=1e9,
        )

        self.save_path = save_path
        self.best_map = -1.0

    def on_epoch_end(self, epoch, logs):
        self.metrics.reset_state()
        for batch in self.data:
            images, y_true = batch[0], batch[1]
            y_pred = self.model.predict(images, verbose=0)
            self.metrics.update_state(y_true, y_pred)

        metrics = self.metrics.result(force=True)
        logs.update(metrics)

        current_map = metrics["MaP"]
        if current_map > self.best_map:
            self.best_map = current_map
            self.model.save(self.save_path)  # Save the model when mAP improves

        return logs


"""
## Train the Model
"""

yolo.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    #callbacks=[EvaluateCOCOMetricsCallback(val_ds, "/root/models/trained/model.h5")],
)

"""
## Visualize Predictions
"""


def visualize_detections(model, dataset, bounding_box_format):
    images, y_true = next(iter(dataset.take(1)))
    y_pred = model.predict(images)
    y_pred = bounding_box.to_ragged(y_pred)
    visualization.plot_bounding_box_gallery(
        images,
        value_range=(0, 255),
        bounding_box_format=bounding_box_format,
        y_true=y_true,
        y_pred=y_pred,
        scale=4,
        rows=2,
        cols=2,
        show=True,
        font_scale=0.7,
        class_mapping=class_mapping,
    )


visualize_detections(yolo, dataset=val_ds, bounding_box_format="xyxy")

Epoch 1/3


: 

: 

: 

In [10]:
backbone = keras_cv.models.YOLOV8Backbone.from_preset(
    "yolo_v8_m_backbone_coco"  # We will use yolov8 small backbone with coco weights
)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    global_clipnorm=GLOBAL_CLIPNORM,
)

yolo.compile(
    optimizer=optimizer, classification_loss="binary_crossentropy", box_loss="ciou"
)


In [11]:
import tensorflow as tf
import keras_cv

# Create 2 images
images = tf.ones(shape=(2, 512, 512, 3))
labels = {
    "boxes": tf.ragged.constant([
        [
            [0, 0, 100, 100],
            [100, 100, 200, 200],
            [300, 300, 100, 100],
        ],
        # Add a second image with one bbox
        [
            [0, 0, 100, 100]
        ], 
    ], dtype=tf.float32),
    
    "classes": tf.ragged.constant([[1, 1, 1], [1]], dtype=tf.int64),
}

model = keras_cv.models.YOLOV8Detector(
    num_classes=20,
    bounding_box_format="xyxy",
    backbone=keras_cv.models.YOLOV8Backbone.from_preset(
        "yolo_v8_m_backbone_coco"
    ),
    fpn_depth=2
)

# Evaluate model without box decoding and NMS
model(images)

# Prediction with box decoding and NMS
model.predict(images)

# Train model
model.compile(
    classification_loss='binary_crossentropy',
    box_loss='ciou',
    optimizer=tf.optimizers.SGD(global_clipnorm=10.0),
    jit_compile=False,
)
model.fit(images, labels)

2024-08-12 14:19:27.295563: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
W0000 00:00:1723468768.759068   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.848947   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.868195   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.880363   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.908357   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.917121   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.918366   52499 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1723468769.919499   52499 gpu_t

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13s/step


I0000 00:00:1723468792.705938   52744 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


ValueError: Cannot split a ragged dimension. Got `value` with shape <DynamicRaggedShape lengths=[None, None, None] num_row_partitions=2> and `axis` 2.