# RT-DETR Pretraining with SHIFT-Discrete Dataset

## Check GPU Availability

In [1]:
!nvidia-smi

Tue Jul 22 14:31:00 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:04:00.0 Off |                    0 |
| N/A   41C    P0              34W / 250W |  12772MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE-16GB           Off | 00000000:06:00.0 Off |  

In [2]:
# Set CUDA Device Number
DEVICE_NUM = 6
ADDITIONAL_GPU = 0

from os import environ
environ["CUDA_VISIBLE_DEVICES"] = ",".join([f"{i+DEVICE_NUM}" for i in range(0, ADDITIONAL_GPU+1)])
environ["CUDA_VISIBLE_DEVICES"]

'6'

## Imports

In [3]:
from os import path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from ttadapters.datasets import BaseDataset, DatasetHolder, DataLoaderHolder
from ttadapters.datasets import SHIFTClearDatasetForObjectDetection, SHIFTCorruptedDatasetForObjectDetection
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from accelerate import Accelerator, notebook_launcher

# import wandb
import supervision as sv
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from ultralytics import YOLO

In [4]:
if torch.cuda.is_available():
    if ADDITIONAL_GPU:
        device = torch.device("cuda")
    else:
        device = torch.device(f"cuda")  # torch.device(f"cuda:{DEVICE_NUM}")
else:
    device = torch.device("cpu")
    DEVICE_NUM = -1

print(f"INFO: Using device - {device}" + (f":{DEVICE_NUM}" if ADDITIONAL_GPU else ""))

INFO: Using device - cuda


In [5]:
# Tqdm Test
for _ in tqdm(range(100)):
    pass

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
PROJECT_NAME = "APT_SHIFT_Pretraining"
RUN_NAME = "RT-DETR_50"

# # WandB Initialization
# wandb.init(project=PROJECT_NAME, name=RUN_NAME)

## Define Dataset

In [7]:
DATA_ROOT = path.join(".", "data")

dataset = DatasetHolder(
    train=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, train=True),
    valid=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, valid=True),
    test=SHIFTCorruptedDatasetForObjectDetection(root=DATA_ROOT, valid=True)
)

[07/22/2025 14:31:09] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/train. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7fa1312a2950>
[07/22/2025 14:31:09] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' ...


INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/22/2025 14:31:11] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' Done.
[07/22/2025 14:31:24] SHIFT DevKit - INFO - Loading annotation takes 14.61 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0016-1b62']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -7.53     219.91
boxes2d              torch.Size([1, 26, 4])                    5.00     974.00
boxes2d_classes      torch.Size([1, 26])                       0.00       3.00
boxes2d_track_ids    torch.Size([1, 26])                       0.00      25.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/22/2025 14:31:29] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7fa1312a2950>
[07/22/2025 14:31:29] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' ...
[07/22/2025 14:31:29] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' Done.


Video name: 0016-1b62
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/22/2025 14:31:31] SHIFT DevKit - INFO - Loading annotation takes 1.50 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0116-4859']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -0.90     138.34
boxes2d              torch.Size([1, 6, 4])                   246.00     859.00
boxes2d_classes      torch.Size([1, 6])                        1.00       5.00
boxes2d_track_ids    torch.Size([1, 6])                        0.00       5.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/22/2025 14:31:31] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7fa1312a2950>
[07/22/2025 14:31:31] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' ...


Video name: 0116-4859
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/22/2025 14:31:33] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' Done.
[07/22/2025 14:31:49] SHIFT DevKit - INFO - Loading annotation takes 18.10 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['007b-4e72']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                  -311.22     226.46
boxes2d              torch.Size([1, 3, 4])                   233.00     802.00
boxes2d_classes      torch.Size([1, 3])                        0.00       1.00
boxes2d_track_ids    torch.Size([1, 3])                        0.00       2.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00

Video name: 007b-4e72
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,

In [8]:
dataset.train[1]['front'].keys()

dict_keys(['original_hw', 'input_hw', 'frame_ids', 'name', 'videoName', 'intrinsics', 'extrinsics', 'boxes2d', 'boxes2d_classes', 'boxes2d_track_ids', 'images'])

In [9]:
dataset.train[999]

{'front': {'original_hw': (800, 1280),
  'input_hw': (800, 1280),
  'frame_ids': 490,
  'name': '00000490_img_front.jpg',
  'videoName': '0c9d-eefc',
  'intrinsics': tensor([[640.,   0., 640.],
          [  0., 640., 400.],
          [  0.,   0.,   1.]]),
  'extrinsics': tensor([[-5.7429e-01,  7.7804e-01, -2.5465e-01,  1.6100e+02],
          [-7.0979e-01, -6.2821e-01, -3.1867e-01, -2.0023e+01],
          [-4.0791e-01, -2.2626e-03,  9.1302e-01,  1.5929e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
  'boxes2d': tensor([[ 457.,  405.,  525.,  467.],
          [ 599.,  391.,  612.,  403.],
          [ 599.,  398.,  677.,  459.],
          [ 835.,  391., 1280.,  605.],
          [ 655.,  396.,  668.,  402.],
          [ 392.,  394.,  404.,  401.],
          [ 665.,  396.,  676.,  402.],
          [ 842.,  390.,  848.,  397.],
          [1207.,  380., 1217.,  399.]]),
  'boxes2d_classes': tensor([1, 2, 1, 1, 1, 2, 1, 0, 0]),
  'boxes2d_track_ids': tensor([ 4,  1,  0

In [10]:
dataset.train[1000]['front']['images'].shape  # should be (batch_size, num_channels, height, width)

torch.Size([1, 3, 800, 1280])

## DataLoader

In [11]:
# Set Batch Size
BATCH_SIZE = 1, 32, 32

# Dataset Configs
CLASSES = dataset.train.classes
NUM_CLASSES = len(CLASSES)

print(f"INFO: Set batch size - Train: {BATCH_SIZE[0]}, Valid: {BATCH_SIZE[1]}, Test: {BATCH_SIZE[2]}")
print(f"INFO: Number of classes - {NUM_CLASSES} {CLASSES}")

INFO: Set batch size - Train: 1, Valid: 32, Test: 32
INFO: Number of classes - 6 ['pedestrian', 'car', 'truck', 'bus', 'motorcycle', 'bicycle']


In [12]:
class DatasetAdapterForTransformers(BaseDataset):
    def __init__(self, original_dataset, camera='front'):
        self.dataset = original_dataset
        self.camera = camera

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx][self.camera]
        image = item['images'].squeeze(0)

        # Convert to COCO_Detection Format
        annotations = []
        target = dict(image_id=idx, annotations=annotations)
        for box, cls in zip(item['boxes2d'], item['boxes2d_classes']):
            x1, y1, x2, y2 = box.tolist()  # from Pascal VOC format (x1, y1, x2, y2)
            width, height = x2 - x1, y2 - y1
            annotations.append(dict(
                bbox=[x1, y1, width, height],  # to COCO format: [x, y, width, height]
                category_id=cls.item(),
                area=width * height,
                iscrowd=0
            ))

        # Following prepare_coco_detection_annotation's expected format
        # RT-DETR ImageProcessor converts the COCO bbox to center format (cx, cy, w, h) during preprocessing
        # But, eventually re-converts the bbox to Pascal VOC (x1, y1, x2, y2) format after post-processing
        return dict(image=image, target=target)

In [13]:
def collate_fn(batch, preprocessor=None):
    images = [item['image'] for item in batch]
    if preprocessor is not None:
        target = [item['target'] for item in batch]
        return preprocessor(images=images, annotations=target, return_tensors="pt")
    else:
        # If no preprocessor is provided, just assume images are already in tensor format
        return dict(
            pixel_values=dict(pixel_values=torch.stack(images)),
            labels=[dict(
                class_labels=item['boxes2d_classes'].long(),
                boxes=item["boxes2d"].float()
            ) for item in batch]
        )

## Load Model

In [14]:
from transformers import RTDetrForObjectDetection, RTDetrImageProcessorFast, RTDetrConfig
from transformers.image_utils import AnnotationFormat

In [15]:
USE_PRETRAINED_MODEL = False

In [16]:
reference_model_id = "PekingU/rtdetr_r50vd"

# Load the reference model configuration
reference_config = RTDetrConfig.from_pretrained(reference_model_id, torch_dtype=torch.float32, return_dict=True)
reference_config.num_labels = NUM_CLASSES

# Load the reference model image processor
reference_preprocessor = RTDetrImageProcessorFast.from_pretrained(reference_model_id)
reference_preprocessor.format = AnnotationFormat.COCO_DETECTION  # COCO Format / Detection BBOX Format

In [17]:
# import os
# import numpy as np
# import torch
# from types import SimpleNamespace
# from dataclasses import dataclass
# from ultralytics.nn.tasks import DetectionModel, v8DetectionLoss, E2EDetectLoss
# from transformers.modeling_outputs import ModelOutput

# @dataclass
# class DetectionOutput(ModelOutput):
#     logits: np.ndarray     # (batch_size, num_boxes, num_classes)
#     pred_boxes: np.ndarray = None  # (batch_size, num_boxes, 4)

# # transformer 모델처럼 객체 안에서 loss를 내보내는것이 필요.
# class MyDetectionModel(DetectionModel):
#     def __init__(self,
#                  pre_train: str | None = None,
#                  cfg: str = "yolo11n.yaml",
#                  ch: int = 3,
#                  nc: int | None = None,
#                  verbose: bool = True):
#         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

#         # .pt 체크포인트 로드
#         if pre_train and os.path.isfile(pre_train):
#             self.load(pre_train, verbose=True)

#         # loss hyperparameters 셋업
#         default_hyp = dict(box=0.05, cls=0.58, dfl=1.5)
#         hyp = self.yaml.get("hyp", {}) or default_hyp
#         hyp_ns = SimpleNamespace(**hyp)
#         self.args = hyp_ns           # HuggingFace Trainer가 읽도록
#         self.model.args = hyp_ns     # v8DetectionLoss가 읽도록

#     def init_criterion(self):
#         # end2end 모드면 E2E loss | 아니면 v8DetectionLoss
#         # yolo11에서는 train 시 v8DetectionLoss.
#         # test 시에 뭘 사용할지는 조사 필요
#         return E2EDetectLoss(self) if getattr(self, "end2end", False) else v8DetectionLoss(self)

#     def forward(self,
#                 pixel_values: torch.Tensor,
#                 labels: list[dict] | None = None,
#                 **kwargs):
#         # inference 
#         if labels is None:
#             # super().predict() 가 (preds, train_out) 튜플일 수도 있으니 안전하게 언패킹
#             pred_out = super().predict(pixel_values)
#             preds = pred_out[0] if isinstance(pred_out, tuple) else pred_out

#             # 각 이미지별 [Ni, 5+nc] 텐서에서
#             #   - boxes: [:,:4]
#             #   - logits: [:,5:]
#             logits_list, boxes_list = [], []
#             for p in preds:
#                 p = p.detach().cpu()
#                 boxes_list.append(p[:, :4].numpy())     # [Ni, 4]
#                 logits_list.append(p[:, 5:].numpy())   # [Ni, nc]

#             B = len(logits_list)
#             logits_arr     = np.empty(B, dtype=object)
#             pred_boxes_arr = np.empty(B, dtype=object)
#             for i in range(B):
#                 logits_arr[i]     = logits_list[i]
#                 pred_boxes_arr[i] = boxes_list[i]

#             return DetectionOutput(logits=logits_arr, pred_boxes=pred_boxes_arr)

#         # training 분기: HF Trainer 형식의 labels → YOLO batch dict
#         device = pixel_values.device
#         batch_idx, cls, bboxes = [], [], []
#         for i, l in enumerate(labels):
#             ci = l["class_labels"].to(device)  # (Ni,)
#             bi = l["boxes"].to(device)         # (Ni,4), normalized cx,cy,w,h
#             n = ci.size(0)
#             batch_idx.append(torch.full((n,), i, device=device, dtype=torch.long))
#             cls.append(ci)
#             bboxes.append(bi)

#         batch = {
#             "img":       pixel_values.to(device, non_blocking=True),
#             "batch_idx": torch.cat(batch_idx, 0),
#             "cls":       torch.cat(cls,       0),
#             "bboxes":    torch.cat(bboxes,    0),
#         }

#         # (3) loss 계산
#         crit = self.init_criterion()
#         preds = super().predict(pixel_values)
#         raw_loss_vec, _ = crit(preds, batch)
#         # print("raw_loss_vec :",raw_loss_vec)
#         loss = raw_loss_vec.sum()
#         # print("loss", loss)

#         # (4) HF Trainer 호환 dict 리턴
#         return {"loss": loss}

In [28]:
import os, numpy as np, torch
from types import SimpleNamespace
from dataclasses import dataclass
from ultralytics.nn.tasks import DetectionModel, v8DetectionLoss, E2EDetectLoss
from transformers.modeling_outputs import ModelOutput

from typing import Optional

# 1) DetectionOutput 정의
@dataclass
class DetectionOutput(ModelOutput):
    logits:     np.ndarray | None = None   # (B, N, C)
    pred_boxes: np.ndarray | None = None # Optional[np.ndarray]    # (B, N, 4)

class HFDetectionModel(torch.nn.Module):
    def __init__(self, yolo_cfg, pretrained_weights=None, device='cuda'):
        super().__init__()
        self.device = device

        # 2) YOLO 모델 구조만 로드
        self.yolo = DetectionModel(cfg=yolo_cfg, ch=3, nc=None, verbose=False).to(device)

        # 3) 안전하게 checkpoint 불러오기
        if pretrained_weights and os.path.isfile(pretrained_weights):
            # (a) pickle global 허용
            import ultralytics.nn.tasks as tasks
            torch.serialization.add_safe_globals([tasks.DetectionModel, tasks.BaseModel])
            # (b) checkpoint dict 로드 (weights_only=False!)
            ckpt = torch.load(pretrained_weights, map_location=device, weights_only=False)
            # (c) dict 형태로 load
            self.yolo.load(ckpt, verbose=False)

        # 4) 하이퍼파라미터 세팅
        hyp = self.yolo.yaml.get('hyp', {}) or dict(box=0.05, cls=0.58, dfl=1.5)
        hyp_ns = SimpleNamespace(**hyp)
        self.yolo.args       = hyp_ns
        self.yolo.model.args = hyp_ns

    def init_criterion(self):
        return E2EDetectLoss(self.yolo) if getattr(self.yolo, 'end2end', False) \
               else v8DetectionLoss(self.yolo)

    def forward(self, pixel_values, labels=None):
        # ─── 1) 언제나 predict 로부터 raw detections 얻기 ─────────────────
        device = pixel_values.device
        B = pixel_values.shape[0]
        
        # ─── 1) 항상 predict() 로 raw detection 얻기 ───────────────────────────
        with torch.no_grad():
            pred = self.yolo.predict(pixel_values.to(device))
        # predict() 가 (preds, train_out) 튜플일 수 있으니 unpack
        if isinstance(pred, tuple):
            preds = pred[0]
        else:
            preds = pred

        # preds: List[Tensor] 길이=B, 각 텐서 shape=(Ni, 5+nc)
        #   → boxes: [:,:4], logits: [:,5:]
        boxes_arr  = np.array([p[:, :4].cpu().numpy()   for p in preds], dtype=object)
        logits_arr = np.array([p[:, 5:].cpu().numpy()   for p in preds], dtype=object)


        # ─── training 분기 ────────────────
        # (batch dict 준비)
        if labels is not None:
            
            batch_idx, cls, bboxes = [], [], []
            for i, lab in enumerate(labels):
                ci = lab['class_labels'].to(device)
                bi = lab['boxes'].to(device)
                n  = ci.size(0)
                batch_idx.append(torch.full((n,), i, device=device, dtype=torch.long))
                cls.append(ci); bboxes.append(bi)

            batch = {
                'img':       pixel_values.to(device),
                'batch_idx': torch.cat(batch_idx, 0),
                'cls':       torch.cat(cls,       0),
                'bboxes':    torch.cat(bboxes,    0),
            }
            

            # (3) YOLO 내부 loss 직접 호출
            pred = self.yolo.predict(pixel_values.to(self.device))
            crit = self.yolo.init_criterion()
            raw_loss_vec, _ = crit(pred, batch)
            loss = raw_loss_vec.sum()
            
            return DetectionOutput(
                loss       = loss,
                logits     = logits_arr,
                pred_boxes = boxes_arr
            )
        
        
        
        return DetectionOutput(
            logits     = logits_arr,
            pred_boxes = boxes_arr
        )
    
    #         device = pixel_values.device
#         batch_idx, cls, bboxes = [], [], []
#         for i, l in enumerate(labels):
#             ci = l["class_labels"].to(device)  # (Ni,)
#             bi = l["boxes"].to(device)         # (Ni,4), normalized cx,cy,w,h
#             n = ci.size(0)
#             batch_idx.append(torch.full((n,), i, device=device, dtype=torch.long))
#             cls.append(ci)
#             bboxes.append(bi)

#         batch = {
#             "img":       pixel_values.to(device, non_blocking=True),
#             "batch_idx": torch.cat(batch_idx, 0),
#             "cls":       torch.cat(cls,       0),
#             "bboxes":    torch.cat(bboxes,    0),
#         }

#         # (3) loss 계산
#         crit = self.init_criterion()
#         preds = super().predict(pixel_values)
#         raw_loss_vec, _ = crit(preds, batch)
#         # print("raw_loss_vec :",raw_loss_vec)
#         loss = raw_loss_vec.sum()
#         # print("loss", loss)

#         # (4) HF Trainer 호환 dict 리턴
#         return {"loss": loss}

In [19]:
# class Detect_yolo(nn.Module):
#     def __init__(self, yolo):
#         super().__init__()
#         self.model = yolo.model  # 내부 nn.Module
#         self.criterion = self.init_criterion()
#     def compute_loss(self):
        

#     def forward(self, pixel_values, labels=None):
#         # pixel_values → x 로 넘겨줌
#         output = self.model(pixel_values, labels)
#         print(f"yolo_output : {output}")
#         return output

In [29]:
# if USE_PRETRAINED_MODEL:
#     # Load the pre-trained model
#     model = RTDetrForObjectDetection.from_pretrained(reference_model_id, config=reference_config, torch_dtype=torch.float32, ignore_mismatched_sizes=True)
# else:
#     # Initialize a new model with the reference configuration
#     model = RTDetrForObjectDetection(config=reference_config)
# model.to(device)

# if USE_PRETRAINED_MODEL:
#     # Load the pre-trained model
#     model = MyDetectionModel(pre_train="yolo11n.pt", cfg="yolo11n.yaml")
# else:
#     # Initialize a new model with the reference configuration
#     model = MyDetectionModel(cfg="yolo11n.yaml")
# model.to(device)

if USE_PRETRAINED_MODEL:
    # Load the pre-trained model
    model = HFDetectionModel(
    yolo_cfg="yolo11n.yaml",
    pretrained_weights="yolo11n.pt",
    device="cuda"
)
else:
    # Initialize a new model with the reference configuration
    model = HFDetectionModel(
    yolo_cfg="yolo11n.yaml",
    pretrained_weights="yolo11n.pt",
    device="cuda"
)
model.to(device)

HFDetectionModel(
  (yolo): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, tra

In [30]:
test_d = DatasetAdapterForTransformers(dataset.train)[5]
test_d

{'image': tensor([[[ 64.,  65.,  67.,  ..., 121., 121., 121.],
          [ 62.,  61.,  64.,  ..., 121., 121., 121.],
          [ 58.,  60.,  61.,  ..., 121., 121., 121.],
          ...,
          [105., 105., 105.,  ...,  60.,  64.,  72.],
          [103., 103., 103.,  ...,  67.,  67.,  67.],
          [100., 100., 100.,  ...,  76.,  72.,  66.]],
 
         [[ 27.,  26.,  26.,  ..., 139., 139., 139.],
          [ 25.,  24.,  22.,  ..., 139., 139., 139.],
          [ 22.,  23.,  22.,  ..., 139., 139., 139.],
          ...,
          [106., 106., 106.,  ...,  59.,  63.,  71.],
          [104., 104., 104.,  ...,  66.,  66.,  66.],
          [101., 101., 101.,  ...,  75.,  71.,  65.]],
 
         [[  9.,   9.,   8.,  ..., 153., 153., 153.],
          [  7.,   6.,   6.,  ..., 153., 153., 153.],
          [  6.,   7.,   7.,  ..., 153., 153., 153.],
          ...,
          [111., 111., 111.,  ...,  65.,  69.,  77.],
          [109., 109., 109.,  ...,  72.,  72.,  72.],
          [106., 106.,

In [31]:
reference_preprocessor(images=test_d['image'], annotations=test_d['target'])

{'pixel_values': tensor([[[[0.2514, 0.2654, 0.2874,  ..., 0.4745, 0.4745, 0.4745],
          [0.2384, 0.2522, 0.2799,  ..., 0.4745, 0.4745, 0.4745],
          [0.2345, 0.2456, 0.2704,  ..., 0.4768, 0.4768, 0.4768],
          ...,
          [0.4108, 0.4118, 0.4114,  ..., 0.2585, 0.2519, 0.2749],
          [0.4072, 0.4083, 0.4117,  ..., 0.2805, 0.2562, 0.2625],
          [0.3951, 0.3963, 0.4037,  ..., 0.3235, 0.2901, 0.2716]],

         [[0.1014, 0.0963, 0.1000,  ..., 0.5451, 0.5451, 0.5451],
          [0.0919, 0.0887, 0.0952,  ..., 0.5451, 0.5451, 0.5451],
          [0.0912, 0.0901, 0.0929,  ..., 0.5474, 0.5474, 0.5474],
          ...,
          [0.4147, 0.4157, 0.4154,  ..., 0.2545, 0.2480, 0.2710],
          [0.4111, 0.4123, 0.4156,  ..., 0.2766, 0.2522, 0.2586],
          [0.3990, 0.4002, 0.4076,  ..., 0.3196, 0.2862, 0.2676]],

         [[0.0324, 0.0304, 0.0396,  ..., 0.6000, 0.6000, 0.6000],
          [0.0254, 0.0253, 0.0359,  ..., 0.6000, 0.6000, 0.6000],
          [0.0317, 0.0325

In [32]:
# Set Epoch Count & Learning Rate
EPOCHS = 20
LEARNING_RATE = 2e-5

training_args = TrainingArguments(
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE[0],
    per_device_eval_batch_size=BATCH_SIZE[1],
    eval_accumulation_steps=BATCH_SIZE[1],
    batch_eval_metrics=True,
    remove_unused_columns=False,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=10,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="mAP@0.50:0.95",
    greater_is_better=True,
    #metric_for_best_model="eval_loss",
    #greater_is_better=False,
    # report_to="wandb",
    output_dir="./results/"+RUN_NAME,
    logging_dir="./logs/"+RUN_NAME,
    run_name=RUN_NAME,
    #fp16=True,
)

testing_args = TrainingArguments(
    per_device_eval_batch_size=BATCH_SIZE[2],
    batch_eval_metrics=True,
    remove_unused_columns=False,
)

In [33]:
from transformers.trainer_utils import EvalPrediction
from torchvision.ops import box_convert
from dataclasses import dataclass


@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor


def de_normalize_boxes(boxes, height, width):
    # 1. cxcywh → xyxy
    boxes_xyxy_norm = box_convert(boxes, 'cxcywh', 'xyxy')

    # 2. de-normalize (convert to actual pixel coordinates)
    boxes_xyxy_norm[:, [0, 2]] *= width
    boxes_xyxy_norm[:, [1, 3]] *= height
    return boxes_xyxy_norm


def map_compute_metrics(eval_pred: EvalPrediction, compute_result=False, preprocessor=reference_preprocessor, threshold=0.3):
    print("eval_pred :", eval_pred)
    preds_tuple = eval_pred.predictions
    print("preds_tuple : ", preds_tuple)
    
    # 길이에 따라 unpack
    if len(preds_tuple) == 3:
        _, logits, pred_boxes = preds_tuple
    else:
        logits, pred_boxes = preds_tuple
    
    preds = ModelOutput(logits=logits, pred_boxes=pred_boxes)
    labels = eval_pred.label_ids

    results = preprocessor.post_process_object_detection(
        preds, target_sizes=[label['orig_size'].cpu().tolist() for label in labels], threshold=threshold
    )

    predictions = [sv.Detections.from_transformers(result) for result in results]
    targets = [sv.Detections(
        xyxy=de_normalize_boxes(label['boxes'], *label['orig_size']).cpu().numpy(),
        class_id=label['class_labels'].cpu().numpy(),
    ) for label in labels]

    m_ap = sv.MeanAveragePrecision.from_detections(
        predictions=predictions,
        targets=targets,
    )
    try: # [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
        per_class_map = {f"{CLASSES[i]}_mAP@0.50:0.95": sum(v)/len(v) for i, v in enumerate(m_ap.per_class_ap50_95.tolist())}  # Shape: [num_classes, 10]
    except AttributeError:
        per_class_map = {}

    result = {
        "mAP@0.50:0.95": m_ap.map50_95,
        "mAP@0.50": m_ap.map50,
        "mAP@0.75": m_ap.map75,
        **per_class_map
    }

    print(f"\rINFO: Computed Metrics - {result}", end="")

    return result

In [34]:
from functools import partial

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=DatasetAdapterForTransformers(dataset.train),
    eval_dataset=DatasetAdapterForTransformers(dataset.valid),
    data_collator=partial(collate_fn, preprocessor=reference_preprocessor),
    compute_metrics=partial(map_compute_metrics, preprocessor=reference_preprocessor),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

tester = Trainer(
    model=model,
    args=testing_args,
    eval_dataset=DatasetAdapterForTransformers(dataset.test),
    data_collator=partial(collate_fn, preprocessor=reference_preprocessor),
    compute_metrics=partial(map_compute_metrics, preprocessor=reference_preprocessor)
)

[2025-07-22 14:37:37,780 other.py:512 check_os_kernel] Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-07-22 14:37:37,792 other.py:512 check_os_kernel] Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Train

In [35]:
def start_train():
    accelerator = Accelerator()
    try:
        print("INFO: Trying to resume from previous checkpoint")
        trainer.train(resume_from_checkpoint=False)
    except Exception as e:
        print(f"ERROR: Failed to resume from checkpoint - {e}")
        print("INFO: Starting training from scratch")
        trainer.train(resume_from_checkpoint=False)

In [36]:
if ADDITIONAL_GPU:
    notebook_launcher(start_train, args=(), num_processes=ADDITIONAL_GPU)
else:
    start_train()

[2025-07-22 14:37:43,939 other.py:512 check_os_kernel] Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


INFO: Trying to resume from previous checkpoint
ERROR: Failed to resume from checkpoint - could not broadcast input array from shape (4,80,80) into shape (1,4)
INFO: Starting training from scratch


ValueError: could not broadcast input array from shape (4,80,80) into shape (1,4)

## Evaluate

### Auto Evaluation

In [None]:
trainer.evaluate()

In [None]:
tester.evaluate()

### Manual Evaluation

In [None]:
checkpoint = 31100

In [None]:
try:
    model = RTDetrForObjectDetection.from_pretrained(f"{training_args.output_dir}/checkpoint-{checkpoint}/", torch_dtype=torch.float32, return_dict=True, local_files_only=True)
    model.to(device)
except Exception:
    pass

In [None]:
class LabelDataset(BaseDataset):
    def __init__(self, original_dataset, camera='front'):
        self.dataset = original_dataset
        self.camera = camera

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx][self.camera]
        return item['boxes2d'], item['boxes2d_classes']

In [None]:
def naive_collate_fn(batch):
    return batch

In [None]:
targets = []
predictions = []

img_size = [640, 640]
original_size = [800, 1280]
batch_size = 32

raw_data = DataLoader(LabelDataset(dataset.valid), batch_size=batch_size, collate_fn=naive_collate_fn)
loader = DataLoader(DatasetAdapterForTransformers(dataset.valid), batch_size=batch_size, collate_fn=partial(collate_fn, preprocessor=reference_preprocessor))
for idx, lables, inputs in zip(tqdm(range(len(raw_data))), raw_data, loader):
    with torch.no_grad():
        outputs = model(pixel_values=inputs['pixel_values'].to(device))

    results = reference_preprocessor.post_process_object_detection(
        outputs, target_sizes=[original_size for _ in range(len(inputs))], threshold=0.3
    )

    detections = [sv.Detections.from_transformers(results[i]) for i in range(batch_size)]
    annotations = [sv.Detections(
        xyxy=lables[i][0].cpu().numpy(),
        class_id=lables[i][1].cpu().numpy(),
    ) for i in range(batch_size)]

    targets.extend(annotations)
    predictions.extend(detections)

In [None]:
len(predictions) == len(targets), len(predictions), len(targets)

In [None]:
mean_average_precision = sv.MeanAveragePrecision.from_detections(
    predictions=predictions,
    targets=targets,
)

print(f"mAP@0.95: {mean_average_precision.map50_95:.2f}")
print(f"map50: {mean_average_precision.map50:.2f}")
print(f"map75: {mean_average_precision.map75:.2f}")
print(f"per_class_ap50_95: {mean_average_precision.per_class_ap50_95}")