In [None]:
# Install the notebook's third-party dependencies into the kernel's environment
# (%pip targets the environment the running kernel uses).
%pip install opencv-python matplotlib kagglehub python-dotenv jupyter ipykernel

In [None]:
# Reinstall NumPy constrained to the range >=1.23.5,<2 before installing TensorFlow.
# NOTE(review): presumably the TensorFlow build installed below is not compatible
# with NumPy 2.x — confirm the reason for the pin.
%pip uninstall -y numpy
%pip install "numpy>=1.23.5,<2"
# NOTE(review): the "-macos" / "-metal" package names suggest a macOS-specific
# setup — confirm the right packages before running on another platform.
%pip install tensorflow-macos tensorflow-metal tensorflow-hub

Imports

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import cv2
import os
import kagglehub
import numpy as np
import matplotlib.pyplot as plt
import random

Clearing cache

In [None]:
# Reset Keras' global state left over from earlier runs in this kernel,
# then ask Python's garbage collector to run a collection pass.
import gc
from tensorflow.keras import backend as K

K.clear_session()
gc.collect()

GPU Check

In [None]:
# Ask TensorFlow which GPU devices it can see and report the result.
gpus = tf.config.list_physical_devices("GPU")
if not gpus:
    print("no gpu found using cpu")
else:
    print("Using gpu: ", gpus)

Config

In [None]:
# Names of the 80 COCO object categories.
# NOTE(review): presumably the list position equals the numeric class id used in
# the dataset's label files — confirm against the dataset documentation.
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
    'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [None]:
# Shared hyper-parameters used throughout the notebook.
IMG_SIZE = 320  # images are resized to IMG_SIZE x IMG_SIZE pixels
GRID_SIZE = 10  # targets and predictions are laid out on a GRID_SIZE x GRID_SIZE grid
# Width of the one-hot class vector stored in every grid cell.
# NOTE(review): COCO_CLASSES lists 80 names — confirm that restricting the model
# to 3 classes is intended.
NUM_CLASSES = 3
BATCH_SIZE = 8  # number of samples per batch in the training dataset
CONF_THRESHOLD = 0.4  # default objectness cut-off used when decoding predictions
# NOTE(review): not referenced in the visible code — presumably reserved for
# non-maximum suppression; confirm.
IOU_THRESHOLD = 0.5

## Dataset

Loading the dataset

In [None]:
# Fetch the COCO128 dataset through kagglehub; `path` is the location it returns.
path = kagglehub.dataset_download("ultralytics/coco128")

# Show where the dataset landed and what its top-level directory contains.
print(path, os.listdir(path), sep="\n")

In [None]:
# Image and label folders of the train2017 split inside the downloaded dataset.
dataset_root = os.path.join(path, "coco128")
IMG_DIR = os.path.join(dataset_root, "images", "train2017")
LABEL_DIR = os.path.join(dataset_root, "labels", "train2017")

In [None]:
# Sanity check: number of directory entries in the image and label folders.
for caption, folder in (("images: ", IMG_DIR), ("labels: ", LABEL_DIR)):
    print(caption, len(os.listdir(folder)))

Loading the image and label in sorted pairs

In [None]:
# Pair every image with the label file that shares its base name
# (e.g. "abc.jpg" <-> "abc.txt").
#
# Sorting the two directory listings independently only lines up when both
# folders hold exactly the same set of base names: a single image without a
# label, or a stray file such as ".DS_Store", would silently shift every
# following pair and attach the wrong boxes to the wrong image.
#
# `image_files` and `label_files` stay two parallel, sorted lists so that
# zipping them yields matching pairs. Images without a label file are left out.
label_name_by_stem = {
    os.path.splitext(label_name)[0]: label_name
    for label_name in os.listdir(LABEL_DIR)
}

image_files = [
    image_name
    for image_name in sorted(os.listdir(IMG_DIR))
    if os.path.splitext(image_name)[0] in label_name_by_stem
]
label_files = [
    label_name_by_stem[os.path.splitext(image_name)[0]]
    for image_name in image_files
]

# Inline safety net: the two lists must stay index-aligned.
assert len(image_files) == len(label_files), "image/label lists are not aligned"

Label Encoding

In [None]:
def encode_labels(boxes, labels):
    """Encode one image's boxes into a grid-shaped training target.

    Args:
        boxes: iterable of ``[cx, cy, w, h]`` rows. ``cx`` / ``cy`` are scaled
            by ``GRID_SIZE`` to pick the responsible cell, so they are expected
            to be fractions of the image size in ``[0, 1]``.
        labels: iterable of integer class ids, parallel to ``boxes``.

    Returns:
        ``float32`` array of shape ``(GRID_SIZE, GRID_SIZE, 5 + NUM_CLASSES)``.
        For a cell that holds an object, channel 0 is 1.0 (objectness),
        channels 1:5 are ``[dx, dy, w, h]`` (``dx`` / ``dy`` are the centre's
        offset inside the cell) and channels 5: are the one-hot class vector.
        All other cells stay zero.

    Notes:
        * Objects whose class id is outside ``[0, NUM_CLASSES)`` are skipped:
          they have no one-hot slot, and indexing with them would either raise
          ``IndexError`` or (for negative ids) overwrite the box channels.
        * If several objects fall into the same cell, the last one wins.
    """
    target = np.zeros((GRID_SIZE, GRID_SIZE, 5 + NUM_CLASSES), dtype=np.float32)

    for box, clss in zip(boxes, labels):
        clss = int(clss)
        if not 0 <= clss < NUM_CLASSES:
            continue

        cx, cy, w, h = box

        # grid cell — clamped so that a centre lying exactly on the right or
        # bottom border (cx == 1.0 or cy == 1.0) still maps to the last cell
        # instead of indexing one past the end of the grid
        grid_x = min(max(int(cx * GRID_SIZE), 0), GRID_SIZE - 1)
        grid_y = min(max(int(cy * GRID_SIZE), 0), GRID_SIZE - 1)

        # cell offset
        dx = cx * GRID_SIZE - grid_x
        dy = cy * GRID_SIZE - grid_y

        cell = target[grid_y, grid_x]  # view into `target`, writes go through
        # objectness -> there is an object in this cell
        cell[0] = 1.0
        # box regression -> where the object is
        cell[1:5] = [dx, dy, w, h]
        # class one-hot -> what the object is; reset first so the class vector
        # always matches the box stored above when a cell is written twice
        cell[5:] = 0.0
        cell[5 + clss] = 1.0

    return target

In [None]:
def load_sample(image_path, label_path):
    """Load one image and its label file as an ``(image, target)`` pair.

    Args:
        image_path: path of the image file to read with OpenCV.
        label_path: path of a text file with one object per line, written as
            five whitespace-separated numbers ``class cx cy w h``.

    Returns:
        Tuple ``(image, target)``: ``image`` is the RGB image resized to
        ``IMG_SIZE x IMG_SIZE`` and divided by 255; ``target`` is the grid
        produced by ``encode_labels``.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``image_path``.
        ValueError: if a non-blank label line does not hold exactly five numbers.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None; fail
        # here with a clear message instead of a cryptic error in cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image = image / 255.0

    boxes = []
    labels = []

    with open(label_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no object.
                continue
            clss, cx, cy, w, h = map(float, fields)
            boxes.append([cx, cy, w, h])
            labels.append(int(clss))

    boxes = np.array(boxes)
    labels = np.array(labels)

    target = encode_labels(boxes, labels)

    return image, target

Dataset Creation

In [None]:
def create_dataset():
    """Generator that yields one ``(image, target)`` pair per listed file pair.

    Walks ``image_files`` and ``label_files`` in lockstep, loads each pair with
    ``load_sample`` and yields both arrays cast to ``float32``.
    """
    for image_name, label_name in zip(image_files, label_files):
        pixels, grid = load_sample(
            os.path.join(IMG_DIR, image_name),
            os.path.join(LABEL_DIR, label_name),
        )
        yield pixels.astype(np.float32), grid.astype(np.float32)

In [None]:
# Shapes and dtypes of the (image, target) pairs produced by `create_dataset`.
sample_signature = (
    tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(GRID_SIZE, GRID_SIZE, 5 + NUM_CLASSES), dtype=tf.float32),
)

# Wrap the generator in a tf.data pipeline, then batch and prefetch it.
train_ds = (
    tf.data.Dataset.from_generator(create_dataset, output_signature=sample_signature)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

## Model Architecture

In [None]:
# Detection model: an ImageNet-pretrained MobileNetV2 feature extractor followed
# by a small convolutional head that predicts 5 + NUM_CLASSES values per cell.
inputs = tf.keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# NOTE(review): images are fed in the [0, 1] range (pixels / 255), while
# tf.keras.applications.mobilenet_v2.preprocess_input scales to [-1, 1] —
# confirm whether the pretrained weights should see rescaled inputs.
backbone = tf.keras.applications.MobileNetV2(
    input_tensor=inputs,
    include_top=False,
    weights="imagenet",
)

# freeze backbone
backbone.trainable = False

x = backbone.output
x = tf.keras.layers.Conv2D(256, 3, padding="same", activation="relu")(x)
# No activation: the head emits raw values; the loss is configured with
# from_logits=True for the objectness and class channels.
outputs = tf.keras.layers.Conv2D(
    5 + NUM_CLASSES,
    1,
    padding="same",
    activation=None,
)(x)

# The targets are GRID_SIZE x GRID_SIZE, so the head's spatial size must match.
assert tuple(outputs.shape[1:3]) == (GRID_SIZE, GRID_SIZE), (
    f"model grid {tuple(outputs.shape[1:3])} does not match GRID_SIZE={GRID_SIZE}"
)

# Must be `inputs` (the Input tensor created above); the bare name `input` is
# Python's builtin function and is not a valid model input.
model = tf.keras.Model(inputs, outputs)

In [None]:
# Render the model's layer graph with tensor shapes shown and layer names hidden.
# NOTE(review): plot_model typically needs pydot and Graphviz, which the install
# cell at the top of the notebook does not list — confirm they are available.
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=False)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## Loss Function

In [None]:
def yolo_loss(y_true, y_pred):
    """Combined detection loss: objectness + box + class terms.

    Both tensors are sliced along the last axis as: channel 0 = objectness,
    channels 1:5 = box values, channels 5: = class scores. ``y_pred`` is treated
    as logits for the objectness and class terms (``from_logits=True``).

    The box and class terms are multiplied by the ground-truth objectness mask,
    so cells without an object contribute zeros to both of their inputs.
    """
    # Ground-truth objectness; the 0:1 slice keeps the trailing axis so the
    # mask broadcasts across the box and class channels.
    mask = y_true[..., 0:1]

    objectness_term = tf.keras.losses.BinaryCrossentropy(from_logits=True)(
        y_true[..., 0], y_pred[..., 0]
    )

    true_boxes = y_true[..., 1:5] * mask
    pred_boxes = y_pred[..., 1:5] * mask
    box_term = tf.keras.losses.MeanSquaredError()(true_boxes, pred_boxes)

    true_classes = y_true[..., 5:] * mask
    pred_classes = y_pred[..., 5:] * mask
    class_term = tf.keras.losses.CategoricalCrossentropy(from_logits=True)(
        true_classes, pred_classes
    )

    return objectness_term + box_term + class_term

## Training hell

In [None]:
# First training stage: Adam with learning rate 1e-4 and the custom loss,
# for 30 epochs over the training dataset.
head_optimizer = tf.keras.optimizers.Adam(1e-4)
model.compile(optimizer=head_optimizer, loss=yolo_loss)

model.fit(train_ds, epochs=30)

Fine-tuning: unfreezing the last 20 layers of the backbone

In [None]:
# Unfreeze the last 20 backbone layers for fine-tuning.
# BatchNormalization layers are deliberately left frozen: once trainable they
# would start updating their moving statistics from the small fine-tuning
# batches, which can wreck the pretrained features (see the Keras transfer
# learning guide).
for layer in backbone.layers[-20:]:
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        continue
    layer.trainable = True

In [None]:
# Re-compile so the changed `trainable` flags take effect, using a 10x smaller
# learning rate than the first stage for fine-tuning.
# (The original `Adam[1e-5]` with a missing comma was a SyntaxError: the
# optimizer is constructed by calling it, and arguments need separating commas.)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss=yolo_loss,
)

model.fit(train_ds, epochs=10)

## Testing

In [None]:
def decode_predictions(pred, conf_thresh=CONF_THRESHOLD):
    """Turn one raw grid prediction into pixel boxes, class ids and scores.

    Args:
        pred: array indexed as ``pred[gy, gx, channel]`` with
            ``5 + NUM_CLASSES`` channels per cell: objectness logit, then
            ``dx, dy, w, h``, then class logits. This is the model's raw output
            for a single image (the head has no activation and the loss uses
            ``from_logits=True``).
        conf_thresh: minimum objectness *probability* for a cell to be kept.

    Returns:
        Tuple ``(boxes, classes, scores)`` of parallel lists: ``boxes`` holds
        ``[x1, y1, x2, y2]`` integer pixel corners in an
        ``IMG_SIZE x IMG_SIZE`` image, ``classes`` the arg-max class id and
        ``scores`` the objectness probability times the class probability.
    """
    boxes = []
    classes = []
    scores = []

    for gy in range(GRID_SIZE):
        for gx in range(GRID_SIZE):
            cell = pred[gy, gx]

            # Sigmoid of the objectness logit, written with tanh so that large
            # negative logits cannot overflow in exp().
            objectness = 0.5 * (1.0 + np.tanh(0.5 * cell[0]))
            if objectness < conf_thresh:
                continue

            dx, dy, w, h = cell[1:5]

            # All class channels (5:), not just channel 5, converted to
            # probabilities with a max-shifted (numerically stable) softmax.
            class_logits = np.asarray(cell[5:], dtype=np.float64)
            class_probs = np.exp(class_logits - np.max(class_logits))
            class_probs = class_probs / class_probs.sum()

            class_id = int(np.argmax(class_probs))
            score = float(objectness * class_probs[class_id])

            # Cell index + in-cell offset -> centre as a fraction of the image.
            cx = (gx + dx) / GRID_SIZE
            cy = (gy + dy) / GRID_SIZE

            x1 = int((cx - w / 2) * IMG_SIZE)
            y1 = int((cy - h / 2) * IMG_SIZE)
            x2 = int((cx + w / 2) * IMG_SIZE)
            y2 = int((cy + h / 2) * IMG_SIZE)

            boxes.append([x1, y1, x2, y2])
            classes.append(class_id)
            scores.append(score)

    return boxes, classes, scores

In [None]:
# Load the test image and preprocess it the same way as the training images:
# BGR -> RGB, resize to IMG_SIZE x IMG_SIZE, scale pixels by 1/255.
TEST_IMAGE_PATH = "test/image.png"

img = cv2.imread(TEST_IMAGE_PATH)
if img is None:
    # cv2.imread returns None (it does not raise) when the file is missing or
    # unreadable; stop here with a clear message.
    raise FileNotFoundError(f"Could not read test image: {TEST_IMAGE_PATH}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
img = img / 255.0

# Add a leading batch axis for predict(), then take the single result back out.
pred = model.predict(img[None, ...])[0]

boxes, classes, scores = decode_predictions(pred)

In [None]:
def draw_boxes(img, boxes, classes, scores):
    """Return a copy of ``img`` with every box and its caption drawn on it.

    Args:
        img: image array to draw on; it is copied, the caller's array is not
            modified.
        boxes: iterable of ``(x1, y1, x2, y2)`` integer pixel corners.
        classes: iterable of integer class ids, parallel to ``boxes``.
        scores: iterable of numeric scores, parallel to ``boxes``.

    Returns:
        The annotated copy. Each caption reads ``"<label>:<score>"`` where the
        label is the ``COCO_CLASSES`` name for the id, or the raw id when it
        has no entry in that list.
    """
    img = img.copy()

    for (x1, y1, x2, y2), clss, sc in zip(boxes, classes, scores):
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Prefer the readable class name; fall back to the numeric id.
        label = COCO_CLASSES[clss] if 0 <= clss < len(COCO_CLASSES) else clss

        # Keep the caption on the canvas when the box touches the top or left
        # edge (otherwise the text would be drawn outside the image).
        text_origin = (max(x1, 0), max(y1 - 5, 12))

        cv2.putText(
            img,
            f"{label}:{sc:.2f}",
            text_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 0, 0),
            1,
        )

    return img

In [None]:
# Scale the image back to 8-bit pixel values, draw the detections on it and
# display the result without axes.
canvas = (img * 255).astype(np.uint8)
vis = draw_boxes(canvas, boxes, classes, scores)
plt.imshow(vis)
plt.axis("off")