In [1]:
from data import build_data_loader_train_detection, denormalize_transform

import os
import torch
from pathlib import Path
from ultralytics import YOLO
from ultralytics import RTDETR
import cv2
import torchvision.transforms as T
import numpy as np
import gc

data_loader = build_data_loader_train_detection( 'dataset_detection', batch_size = 4, max_objects = 200, max_poly_points = 64, crop_size = 512, mode = 'seg' )

In [2]:
BASE_FOLDER = "dataset_detection_pre"
OUTPUT_IMAGE_FOLDER = 'images/train'
OUTPUT_ANNOTATED_IMAGE_FOLDER = 'annotated_images/train'
OUTPUT_LABEL_FOLDER = 'labels/train'
TRAIN_TXT_PATH = 'train.txt'
DATA_YAML_PATH = 'data.yaml'

In [3]:
os.makedirs( os.path.join( BASE_FOLDER, OUTPUT_IMAGE_FOLDER ), exist_ok = True )
os.makedirs( os.path.join( BASE_FOLDER, OUTPUT_ANNOTATED_IMAGE_FOLDER ), exist_ok = True )
os.makedirs( os.path.join( BASE_FOLDER, OUTPUT_LABEL_FOLDER ), exist_ok = True )

In [4]:
# model = YOLO( 'yolo/yolo11x.pt', task = 'detect' )
y12 = YOLO( 'yolo/yolo12x.pt', task = 'detect' )
y11 = YOLO( 'yolo/yolo11x.pt', task = 'detect' )
dterr = RTDETR( "yolo/rtdetr-x.pt" )
to_pil = T.ToPILImage()
train_txt_lines = []

In [5]:
yolo_to_custom = {
    0: 1,    # person ➜ Person
    1: 9,    # bicycle ➜ Bicycle
    2: 10,   # car ➜ LMVs
    3: 8,    # motorcycle ➜ Motorcycle
    5: 11,   # bus ➜ HMVs
    6: 11,   # train ➜ HMVs
    7: 11,   # truck ➜ HMVs
    9: 7,    # traffic light ➜ Traffic Light
    10: 6,   # fire hydrant ➜ Fire Hydrant
    11: 4,   # stop sign ➜ Stop Sign
    12: 3,   # parking meter ➜ Parking Meter
    14: 2,   # bird ➜ Birds
    16: 12,  # dog ➜ Animals
    17: 12,  # horse ➜ Animals
    18: 12,  # sheep ➜ Animals
    19: 12,  # cow ➜ Animals
    20: 12,  # elephant ➜ Animals
    21: 12,  # bear ➜ Animals
    22: 12,  # zebra ➜ Animals
    23: 12,  # giraffe ➜ Animals
    56: 20,  # chair ➜ Furniture
    58: 21,  # potted plant ➜ Pot Plant
    13: 20,  # bench ➜ Furniture
    33: 22,  # kite ➜ Sign Boards (loosely)
    45: 20,  # bowl ➜ Furniture
}

In [6]:
def compute_iou_topleft(box1, box2):
    """
    Compute IoU for two boxes in [x_left, y_top, w, h] (normalized) format.
    """
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2

    box1_x1, box1_y1 = x1, y1
    box1_x2, box1_y2 = x1 + w1, y1 + h1

    box2_x1, box2_y1 = x2, y2
    box2_x2, box2_y2 = x2 + w2, y2 + h2

    inter_x1 = max(box1_x1, box2_x1)
    inter_y1 = max(box1_y1, box2_y1)
    inter_x2 = min(box1_x2, box2_x2)
    inter_y2 = min(box1_y2, box2_y2)

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    area1 = w1 * h1
    area2 = w2 * h2
    union = area1 + area2 - inter_area
    if union == 0:
        return 0
    return inter_area / union

def center_to_topleft(box_center):
    """
    Convert a box from center format [cx, cy, w, h] to top-left format [x_left, y_top, w, h].
    """
    cx, cy, w, h = box_center
    return [cx - w/2, cy - h/2, w, h]

def deduplicate_detections(detections, iou_threshold=0.5):
    """
    Remove duplicate detections from a list by keeping the larger detection.
    
    detections: list of tuples (custom_class, box)
      where box is in [x_left, y_top, w, h] (normalized) format.
    iou_threshold: if IoU between two boxes of the same class exceeds this threshold,
                   they are considered duplicates.
                   
    Returns:
      A deduplicated list of detections, keeping the detection with the larger area
      for overlapping detections.
    """
    deduped = []
    for det in detections:
        cls_det, box_det = det
        area_det = box_det[2] * box_det[3]  # area = w * h
        duplicate_found = False
        for idx, d in enumerate(deduped):
            cls_existing, box_existing = d
            if cls_existing == cls_det:
                iou = compute_iou_topleft(box_det, box_existing)
                if iou > iou_threshold:
                    area_existing = box_existing[2] * box_existing[3]
                    # If new detection has a bigger area, replace the existing one.
                    if area_det > area_existing:
                        deduped[idx] = det
                    duplicate_found = True
                    break
        if not duplicate_found:
            deduped.append(det)
    return deduped

def merge_detections(result, ds_labels, ds_boxes, yolo_to_custom, iou_threshold=0.5):
    """
    Merges dataset detections with YOLO detections for one image.

    - result: YOLO detection result for one image (each box has .cls and .xywhn)
    - ds_labels: tensor of shape [N] containing dataset detection labels (0 means no detection)
    - ds_boxes: tensor of shape [N, 4] containing dataset boxes in [x_left, y_top, w, h] normalized format
    - yolo_to_custom: dictionary mapping YOLO class ids to your custom class ids
    - iou_threshold: IoU threshold for matching

    For detections with the same label and IoU > iou_threshold, only the detection with the larger area is kept.

    Returns:
      A list of tuples (custom_class, box) where box is in [x_left, y_top, w, h] normalized format.
      Dataset detections are kept, and a YOLO detection is added only if it does not conflict
      with an existing dataset detection of the same class. In case of conflict, the bigger detection is kept.
    """
    final_detections = []
    
    # 1. Add dataset detections (non-zero labels) as they are.
    for label, box in zip(ds_labels, ds_boxes):
        if label.item() != 0:
            final_detections.append( ( int( label.item() ) - 1, box.tolist() ) )
    
    # 2. Process YOLO detections.
    for yolo_box in result.boxes:
        yolo_cls = int( yolo_box.cls )
        if yolo_cls not in yolo_to_custom:
            continue
        custom_cls = yolo_to_custom[yolo_cls] - 1
        # Get YOLO detection in center format and convert to top-left format.
        yolo_center = yolo_box.xywhn[0].tolist()  # [cx, cy, w, h]
        yolo_tl = center_to_topleft(yolo_center)   # now in [x_left, y_top, w, h]
        area_new = yolo_tl[2] * yolo_tl[3]  # area of new detection
        confidence = yolo_box.conf[0].item()
        if confidence < 0.5:
            continue
        
        conflict_found = False
        indices_to_remove = []
        # Check for conflict with any existing detection of the same class.
        for idx, (existing_cls, existing_box) in enumerate(final_detections):            
            iou = compute_iou_topleft( yolo_tl, existing_box )
            if iou > iou_threshold and confidence < 0.7:
                conflict_found = True
                break
            if existing_cls == custom_cls:
                if iou > iou_threshold:
                    area_existing = existing_box[2] * existing_box[3]
                    if area_new > area_existing:
                        # New detection is bigger, mark the existing one for removal.
                        indices_to_remove.append(idx)
                    else:
                        # Existing detection is bigger; skip adding the new one.
                        conflict_found = True
                        break
        if not conflict_found:
            # Remove any conflicting detections that are smaller.
            for idx in sorted(indices_to_remove, reverse=True):
                del final_detections[idx]
            final_detections.append((custom_cls, yolo_tl))
    
    return final_detections

In [None]:
i = 0
max_images = 10000

for batch in data_loader:
    # Convert tensor images to NumPy arrays (and multiply by 255 if your transform normalizes to [0,1])
    images = list(batch[0].unbind(0))
    images = [denormalize_transform(img).permute(1, 2, 0).cpu().numpy() * 255 for img in images]
    
    # Get dataset detections:
    # ds_labels: tensor of shape [batch_size, N] with labels (in [x_left, y_top, w, h] format for boxes)
    # ds_boxes: tensor of shape [batch_size, N, 4] with boxes in [x_left, y_top, w, h] normalized format
    ds_labels = list( batch[1].unbind(0) )
    ds_boxes = list( batch[2].unbind(0) )
    
    r12 = y12.predict( images, verbose = False, conf = 0.25, iou = 0.45, agnostic_nms = True )
    r11 = y11.predict( images, verbose = False, conf = 0.25, iou = 0.45, agnostic_nms = True )
    rtdetr = dterr.predict( images, verbose = False )

    for idx in range(len(images)):

        final_detections_1 = merge_detections( rtdetr[idx], ds_labels[idx], ds_boxes[idx], yolo_to_custom, iou_threshold = 0.3 )
        final_detections_2 = merge_detections( r12[idx], ds_labels[idx], ds_boxes[idx], yolo_to_custom, iou_threshold = 0.3 )
        final_detections_3 = merge_detections( r11[idx], ds_labels[idx], ds_boxes[idx], yolo_to_custom, iou_threshold = 0.3 )

        # Combine detections from all three sources
        combined_detections = final_detections_1 + final_detections_2 + final_detections_3

        # Deduplicate overlapping detections of the same class
        final_detections = deduplicate_detections( combined_detections, iou_threshold = 0.1 )

        # Skip sample if no final detections.
        if not final_detections:
            continue

        filename = f'image_{i}.png'
        output_annotated_img_path = os.path.join(BASE_FOLDER, OUTPUT_ANNOTATED_IMAGE_FOLDER, filename)
        output_img_path = os.path.join(BASE_FOLDER, OUTPUT_IMAGE_FOLDER, filename)
        
        image = images[idx]
        annotated_image = image.copy()
        h_img, w_img, _ = annotated_image.shape
        
        # Draw each final detection on the image.
        for custom_cls, box in final_detections:
            # box is in [x_left, y_top, w, h] (normalized).
            # Convert normalized coordinates to absolute pixel values.
            x_left = int(box[0] * w_img)
            y_top = int(box[1] * h_img)
            box_w = int(box[2] * w_img)
            box_h = int(box[3] * h_img)
            x2 = x_left + box_w
            y2 = y_top + box_h

            if custom_cls > 23:
                a = 10
            
            cv2.rectangle(annotated_image, (x_left, y_top), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated_image, str(custom_cls), (x_left, y_top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        
        # Save annotated image.
        cv2.imwrite(output_annotated_img_path, annotated_image)
        cv2.imwrite(output_img_path, cv2.cvtColor( image, cv2.COLOR_BGR2RGB ) )
        
        # Save label file in YOLO format but with your box format ([x_left, y_top, w, h]).
        label_path = os.path.join(BASE_FOLDER, OUTPUT_LABEL_FOLDER, Path(filename).stem + '.txt')
        with open(label_path, 'w') as f:
            for custom_cls, box in final_detections:
                f.write(f"{custom_cls} {box[0]:.6f} {box[1]:.6f} {box[2]:.6f} {box[3]:.6f}\n")
        
        train_txt_lines.append(str(Path(output_annotated_img_path).resolve()))
        with open( os.path.join( BASE_FOLDER, TRAIN_TXT_PATH ), 'a' ) as f:
            f.write( str(Path(output_annotated_img_path).resolve()) + '\n' )
        print(f"{i} of {max_images}", end='\r')

        del annotated_image        
        del final_detections
        gc.collect()
        torch.cuda.empty_cache()

        i += 1
        if i >= max_images:
            break
    
    del rtdetr
    del r12
    del r11
    del images
    del ds_labels
    del ds_boxes
    gc.collect()
    torch.cuda.empty_cache()
    if i >= max_images:
        break

[ WARN:0@8.146] global loadsave.cpp:848 imwrite_ Unsupported depth image for selected encoder is fallbacked to CV_8U.


9999 of 10000

In [8]:
# Save train.txt
# with open( os.path.join( BASE_FOLDER, TRAIN_TXT_PATH ), 'w' ) as f:
#    f.write('\n'.join(train_txt_lines))