# RT-DETR Pretraining with SHIFT-Discrete Dataset

## Check GPU Availability

In [1]:
!nvidia-smi

Wed Jul 23 15:44:47 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:04:00.0 Off |                    0 |
| N/A   42C    P0              34W / 250W |  13146MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE-16GB           Off | 00000000:06:00.0 Off |  

In [2]:
# Set CUDA Device Number
DEVICE_NUM = 6
ADDITIONAL_GPU = 0

from os import environ
environ["CUDA_VISIBLE_DEVICES"] = ",".join([f"{i+DEVICE_NUM}" for i in range(0, ADDITIONAL_GPU+1)])
environ["CUDA_VISIBLE_DEVICES"]

'6'

## Imports

In [3]:
from os import path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from ttadapters.datasets import BaseDataset, DatasetHolder, DataLoaderHolder
from ttadapters.datasets import SHIFTClearDatasetForObjectDetection, SHIFTCorruptedDatasetForObjectDetection
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from accelerate import Accelerator, notebook_launcher

# import wandb
import supervision as sv
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from ultralytics import YOLO

In [4]:
if torch.cuda.is_available():
    if ADDITIONAL_GPU:
        device = torch.device("cuda")
    else:
        device = torch.device(f"cuda")  # torch.device(f"cuda:{DEVICE_NUM}")
else:
    device = torch.device("cpu")
    DEVICE_NUM = -1

print(f"INFO: Using device - {device}" + (f":{DEVICE_NUM}" if ADDITIONAL_GPU else ""))

INFO: Using device - cuda


In [5]:
# Tqdm Test
for _ in tqdm(range(100)):
    pass

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
PROJECT_NAME = "APT_SHIFT_Pretraining"
RUN_NAME = "RT-DETR_50"

# # WandB Initialization
# wandb.init(project=PROJECT_NAME, name=RUN_NAME)

## Define Dataset

In [7]:
DATA_ROOT = path.join(".", "data")

dataset = DatasetHolder(
    train=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, train=True),
    valid=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, valid=True),
    test=SHIFTCorruptedDatasetForObjectDetection(root=DATA_ROOT, valid=True)
)

[07/23/2025 15:44:59] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/train. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 15:44:59] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' ...


INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 15:45:01] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' Done.
[07/23/2025 15:45:12] SHIFT DevKit - INFO - Loading annotation takes 13.32 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0016-1b62']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -7.53     219.91
boxes2d              torch.Size([1, 26, 4])                    5.00     974.00
boxes2d_classes      torch.Size([1, 26])                       0.00       3.00
boxes2d_track_ids    torch.Size([1, 26])                       0.00      25.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/23/2025 15:45:17] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 15:45:17] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' ...
[07/23/2025 15:45:17] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' Done.


Video name: 0016-1b62
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 15:45:19] SHIFT DevKit - INFO - Loading annotation takes 1.49 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0116-4859']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -0.90     138.34
boxes2d              torch.Size([1, 6, 4])                   246.00     859.00
boxes2d_classes      torch.Size([1, 6])                        1.00       5.00
boxes2d_track_ids    torch.Size([1, 6])                        0.00       5.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/23/2025 15:45:19] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 15:45:19] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' ...


Video name: 0116-4859
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 15:45:21] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' Done.
[07/23/2025 15:45:33] SHIFT DevKit - INFO - Loading annotation takes 14.15 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['007b-4e72']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                  -311.22     226.46
boxes2d              torch.Size([1, 3, 4])                   233.00     802.00
boxes2d_classes      torch.Size([1, 3])                        0.00       1.00
boxes2d_track_ids    torch.Size([1, 3])                        0.00       2.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00

Video name: 007b-4e72
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,

In [8]:
dataset.train[1]['front'].keys()

dict_keys(['original_hw', 'input_hw', 'frame_ids', 'name', 'videoName', 'intrinsics', 'extrinsics', 'boxes2d', 'boxes2d_classes', 'boxes2d_track_ids', 'images'])

In [9]:
dataset.train[999]

{'front': {'original_hw': (800, 1280),
  'input_hw': (800, 1280),
  'frame_ids': 490,
  'name': '00000490_img_front.jpg',
  'videoName': '0c9d-eefc',
  'intrinsics': tensor([[640.,   0., 640.],
          [  0., 640., 400.],
          [  0.,   0.,   1.]]),
  'extrinsics': tensor([[-5.7429e-01,  7.7804e-01, -2.5465e-01,  1.6100e+02],
          [-7.0979e-01, -6.2821e-01, -3.1867e-01, -2.0023e+01],
          [-4.0791e-01, -2.2626e-03,  9.1302e-01,  1.5929e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
  'boxes2d': tensor([[ 457.,  405.,  525.,  467.],
          [ 599.,  391.,  612.,  403.],
          [ 599.,  398.,  677.,  459.],
          [ 835.,  391., 1280.,  605.],
          [ 655.,  396.,  668.,  402.],
          [ 392.,  394.,  404.,  401.],
          [ 665.,  396.,  676.,  402.],
          [ 842.,  390.,  848.,  397.],
          [1207.,  380., 1217.,  399.]]),
  'boxes2d_classes': tensor([1, 2, 1, 1, 1, 2, 1, 0, 0]),
  'boxes2d_track_ids': tensor([ 4,  1,  0

In [10]:
dataset.train[1000]['front']['images'].shape  # should be (batch_size, num_channels, height, width)

torch.Size([1, 3, 800, 1280])

## DataLoader

In [11]:
# Set Batch Size
BATCH_SIZE = 1, 32, 32

# Dataset Configs
CLASSES = dataset.train.classes
NUM_CLASSES = len(CLASSES)

print(f"INFO: Set batch size - Train: {BATCH_SIZE[0]}, Valid: {BATCH_SIZE[1]}, Test: {BATCH_SIZE[2]}")
print(f"INFO: Number of classes - {NUM_CLASSES} {CLASSES}")

INFO: Set batch size - Train: 1, Valid: 32, Test: 32
INFO: Number of classes - 6 ['pedestrian', 'car', 'truck', 'bus', 'motorcycle', 'bicycle']


In [12]:
class DatasetAdapterForTransformers(BaseDataset):
    def __init__(self, original_dataset, camera='front'):
        self.dataset = original_dataset
        self.camera = camera

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx][self.camera]
        image = item['images'].squeeze(0)

        # Convert to COCO_Detection Format
        annotations = []
        target = dict(image_id=idx, annotations=annotations)
        for box, cls in zip(item['boxes2d'], item['boxes2d_classes']):
            x1, y1, x2, y2 = box.tolist()  # from Pascal VOC format (x1, y1, x2, y2)
            width, height = x2 - x1, y2 - y1
            annotations.append(dict(
                bbox=[x1, y1, width, height],  # to COCO format: [x, y, width, height]
                category_id=cls.item(),
                area=width * height,
                iscrowd=0
            ))

        # Following prepare_coco_detection_annotation's expected format
        # RT-DETR ImageProcessor converts the COCO bbox to center format (cx, cy, w, h) during preprocessing
        # But, eventually re-converts the bbox to Pascal VOC (x1, y1, x2, y2) format after post-processing
        return dict(image=image, target=target)

In [13]:
def collate_fn(batch, preprocessor=None):
    images = [item['image'] for item in batch]
    if preprocessor is not None:
        target = [item['target'] for item in batch]
        return preprocessor(images=images, annotations=target, return_tensors="pt")
    else:
        # If no preprocessor is provided, just assume images are already in tensor format
        return dict(
            pixel_values=dict(pixel_values=torch.stack(images)),
            labels=[dict(
                class_labels=item['boxes2d_classes'].long(),
                boxes=item["boxes2d"].float()
            ) for item in batch]
        )

## Load Model

In [14]:
from transformers import RTDetrForObjectDetection, RTDetrImageProcessorFast, RTDetrConfig
from transformers.image_utils import AnnotationFormat

In [15]:
USE_PRETRAINED_MODEL = False

In [16]:
reference_model_id = "PekingU/rtdetr_r50vd"

# Load the reference model configuration
reference_config = RTDetrConfig.from_pretrained(reference_model_id, torch_dtype=torch.float32, return_dict=True)
reference_config.num_labels = NUM_CLASSES

# Load the reference model image processor
reference_preprocessor = RTDetrImageProcessorFast.from_pretrained(reference_model_id)
reference_preprocessor.format = AnnotationFormat.COCO_DETECTION  # COCO Format / Detection BBOX Format

In [24]:
from ultralytics.utils import LOGGER, RANK
from ultralytics.utils.dist import generate_ddp_command, ddp_cleanup
from ultralytics.engine.trainer import BaseTrainer
import subprocess
from typing import Any, Dict, List, Optional, Tuple, Union

from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel

BATCH_SIZE = 1, 32, 32
datasets = DatasetHolder(
    train=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, train=True),
    valid=SHIFTClearDatasetForObjectDetection(root=DATA_ROOT, valid=True),
    test=SHIFTCorruptedDatasetForObjectDetection(root=DATA_ROOT, valid=True)
)


class YOLO_trainer :
    def __init__(self, datasets: DatasetHolder, BATCH_SIZE):
        self.dataset = datasets
        self.train_dataset = DatasetAdapterForTransformers(datasets.train)
        self.valid_dataset = DatasetAdapterForTransformers(datasets.valid)
        self.train_batch_size, self.valid_batch_size = BATCH_SIZE[0], BATCH_SIZE[1]
    def get_dataloader(self, rank: int = 0, mode: str = "train"):
        """
        Construct and return dataloader for the specified mode.

        Args:
            dataset_path (str): Path to the dataset.
            batch_size (int): Number of images per batch.
            rank (int): Process rank for distributed training.
            mode (str): 'train' for training dataloader, 'val' for validation dataloader.

        Returns:
            (DataLoader): PyTorch dataloader object.
        """
        assert mode in {"train", "val"}, f"Mode must be 'train' or 'val', not {mode}."
        
        if mode == 'train':
            return DataLoader(dataset=self.train_dataset, 
                              batch_size=self.train_batch_size,
                              shuffle=True,
                              num_workers=4
                              )
        else :
            return DataLoader(dataset=self.valid_dataset, 
                              batch_size=self.valid_batch_size,
                              shuffle=False,
                              num_workers=4
                              )
    
    def preprocess_batch(self, pixel_values: Dict, labels: Dict) -> Dict:
        if self.training:
            
            batch_idx, cls, bboxes = [], [], []
            for i, lab in enumerate(labels):
                ci = lab['class_labels'].to(device)
                bi = lab['boxes'].to(device)
                n  = ci.size(0)
                batch_idx.append(torch.full((n,), i, device=device, dtype=torch.long))
                cls.append(ci); bboxes.append(bi)

            batch = {
                'img':       pixel_values.to(device).float() / 255,
                'batch_idx': torch.cat(batch_idx, 0),
                'cls':       torch.cat(cls,       0),
                'bboxes':    torch.cat(bboxes,    0),
            }
        return batch

    def get_model(self, cfg: Optional[str] = None, weights: Optional[str] = None, verbose: bool = True):
        """
        Return a YOLO detection model.

        Args:
            cfg (str, optional): Path to model configuration file.
            weights (str, optional): Path to model weights.
            verbose (bool): Whether to display model information.

        Returns:
            (DetectionModel): YOLO detection model.
        """
        model = DetectionModel(cfg, nc=None, ch=3, verbose=verbose and RANK == -1)
        if weights:
            ckpt = torch.load(weights, map_location="cpu", weights_only=False)  # ① 파일을 dict로 읽음
            model.load(ckpt)                               # ② dict을 load()에 넘김
        return model
    
    def get_validator(self):
        """Return a DetectionValidator for YOLO model validation."""
        self.loss_names = "box_loss", "cls_loss", "dfl_loss"
        return yolo.detect.DetectionValidator(
            self.test_loader, save_dir=self.save_dir, args=copy(self.args), _callbacks=self.callbacks
        )
    
    def train(self, epochs: int = 100, val_interval: int = 1):
        # train_dataloader
        train_dataloader = self.get_dataloader(mode="train")
        # valid_dataloader
        val_dataloader = self.get_dataloader(mode="val")
        
        # model load
        model = self.get_model(cfg='yolo11n.yaml', weights='yolo11n.pt')
        
        # train
        
        
        for epoch in range(epochs):
            model.train()
            total_epoch_loss = 0.0
            
            train_pbar = tqdm(
                train_dataloader,
                desc=f"Epoch {epoch}/{epochs} ▶️",
                leave=False,
                unit="batch"
            )
            
            for batch in train_pbar:
                pixel_values = batch["image"]                   # Tensor[B,3,H,W]
                labels       = batch["target"]["annotations"]
                train_batch = self.preprocess_batch(pixel_values, labels)
        
                self.optimizer.zero_grad()
                total_loss, loss_items = model(train_batch)
        
                total_loss.backward()
                self.optimizer.step()
                
                total_epoch_loss += total_loss.item()
            
            avg_train_loss = total_epoch_loss / len(train_dataloader)
            print(f"[Epoch {epoch/epochs}] -> Train Loss: {avg_train_loss:.4f} | Last batch losses: {loss_items.tolist()}")
            
            if epoch % val_interval == 0:
                model.eval()
                with torch.no_grad():
                    stats = self.validator(trainer=self)
                print(f"[Epoch {epoch}/{epochs}] ->  Validation: {stats}")
                model.train()

        return print("train finish~!")
        
trainer = YOLO_trainer(datasets, BATCH_SIZE)

  
trainer.train()
        

[07/23/2025 16:24:27] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/train. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 16:24:27] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' ...


INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 16:24:28] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/train/front/det_2d.json' Done.
[07/23/2025 16:25:01] SHIFT DevKit - INFO - Loading annotation takes 34.02 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0016-1b62']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -7.53     219.91
boxes2d              torch.Size([1, 26, 4])                    5.00     974.00
boxes2d_classes      torch.Size([1, 26])                       0.00       3.00
boxes2d_track_ids    torch.Size([1, 26])                       0.00      25.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/23/2025 16:25:04] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 16:25:04] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' ...
[07/23/2025 16:25:04] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/normal/discrete/images/val/front/det_2d.json' Done.


Video name: 0016-1b62
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 16:25:05] SHIFT DevKit - INFO - Loading annotation takes 1.66 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['0116-4859']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                    -0.90     138.34
boxes2d              torch.Size([1, 6, 4])                   246.00     859.00
boxes2d_classes      torch.Size([1, 6])                        1.00       5.00
boxes2d_track_ids    torch.Size([1, 6])                        0.00       5.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00



[07/23/2025 16:25:06] SHIFT DevKit - INFO - Base: ./data/SHIFT/discrete/images/val. Backend: <shift_dev.utils.backend.ZipBackend object at 0x7f4b01509690>
[07/23/2025 16:25:06] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' ...


Video name: 0116-4859
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Subset split for 'SHIFT_SUBSET' dataset is already done. Skipping...
INFO: Downloading 'SHIFT_SUBSET' from file server to ./data/SHIFT/discrete...
INFO: Dataset archive found in the root directory. Skipping download.


[07/23/2025 16:25:07] SHIFT DevKit - INFO - Loading annotation from './data/SHIFT_SUBSET/corrupted/discrete/images/val/front/det_2d.json' Done.
[07/23/2025 16:25:19] SHIFT DevKit - INFO - Loading annotation takes 13.47 seconds.


Batch 0:

Item                 Shape                               Min        Max       
--------------------------------------------------------------------------------
original_hw          [tensor([800]), tensor([1280])]
input_hw             [tensor([800]), tensor([1280])]
frame_ids            torch.Size([1])                           0.00       0.00
name                 ['00000000_img_front.jpg']
videoName            ['007b-4e72']
intrinsics           torch.Size([1, 3, 3])                     0.00     640.00
extrinsics           torch.Size([1, 4, 4])                  -311.22     226.46
boxes2d              torch.Size([1, 3, 4])                   233.00     802.00
boxes2d_classes      torch.Size([1, 3])                        0.00       1.00
boxes2d_track_ids    torch.Size([1, 3])                        0.00       2.00
images               torch.Size([1, 1, 3, 800, 1280])          0.00     255.00

Video name: 007b-4e72
Sample indices within a video: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,

Epoch 0/100 ▶️:   0%|          | 0/20800 [00:00<?, ?batch/s]

BadZipFile: Caught BadZipFile in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_269915/1044741527.py", line 10, in __getitem__
    item = self.dataset[idx][self.camera]
           ~~~~~~~~~~~~^^^^^
  File "/workspace/ptta/ttadapters/datasets/SHIFT.py", line 182, in __getitem__
    queried = super().__getitem__(idx)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/shift_dev/dataloader/shift_dataset.py", line 449, in __getitem__
    self.scalabel_datasets[f"{view}/{group}"][idx]
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/shift_dev/dataloader/base/scalabel.py", line 328, in __getitem__
    data = self._load_inputs(frame)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/shift_dev/dataloader/base/scalabel.py", line 253, in _load_inputs
    image = load_image(frame.url, self.data_backend)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/shift_dev/dataloader/base/scalabel.py", line 55, in load_image
    im_bytes = backend.get(url)
               ^^^^^^^^^^^^^^^^
  File "/workspace/ptta/.venv/lib/python3.11/site-packages/shift_dev/utils/backend.py", line 366, in get
    content = zf.read()
              ^^^^^^^^^
  File "/anaconda3/envs/bio_ai/lib/python3.11/zipfile.py", line 952, in read
    buf += self._read1(self.MAX_N)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/anaconda3/envs/bio_ai/lib/python3.11/zipfile.py", line 1056, in _read1
    self._update_crc(data)
  File "/anaconda3/envs/bio_ai/lib/python3.11/zipfile.py", line 984, in _update_crc
    raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file '1fa8-dddc/00000040_img_front.jpg'


In [None]:
if USE_PRETRAINED_MODEL:
    # Load the pre-trained model
    model = HFDetectionModel(
    yolo_cfg="yolo11n.yaml",
    pretrained_weights="yolo11n.pt",
    device="cuda"
)
else:
    # Initialize a new model with the reference configuration
    model = HFDetectionModel(
    yolo_cfg="yolo11n.yaml",
    pretrained_weights=None,
    device="cuda"
)
model.to(device)

In [None]:
test_d = DatasetAdapterForTransformers(dataset.train)[5]
test_d

In [None]:
reference_preprocessor(images=test_d['image'], annotations=test_d['target'])

In [None]:
# Set Epoch Count & Learning Rate
EPOCHS = 20
LEARNING_RATE = 2e-5

training_args = TrainingArguments(
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE[0],
    per_device_eval_batch_size=BATCH_SIZE[1],
    eval_accumulation_steps=BATCH_SIZE[1],
    batch_eval_metrics=True,
    remove_unused_columns=False,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=10,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="mAP@0.50:0.95",
    greater_is_better=True,
    #metric_for_best_model="eval_loss",
    #greater_is_better=False,
    # report_to="wandb",
    output_dir="./results/"+RUN_NAME,
    logging_dir="./logs/"+RUN_NAME,
    run_name=RUN_NAME,
    #fp16=True,
)

testing_args = TrainingArguments(
    per_device_eval_batch_size=BATCH_SIZE[2],
    batch_eval_metrics=True,
    remove_unused_columns=False,
)

In [None]:
from transformers.trainer_utils import EvalPrediction
from torchvision.ops import box_convert
from dataclasses import dataclass


@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor


def de_normalize_boxes(boxes, height, width):
    # 1. cxcywh → xyxy
    boxes_xyxy_norm = box_convert(boxes, 'cxcywh', 'xyxy')

    # 2. de-normalize (convert to actual pixel coordinates)
    boxes_xyxy_norm[:, [0, 2]] *= width
    boxes_xyxy_norm[:, [1, 3]] *= height
    return boxes_xyxy_norm


def map_compute_metrics(eval_pred: EvalPrediction, compute_result=False, preprocessor=reference_preprocessor, threshold=0.3):
    print("eval_pred :", eval_pred)
    preds_tuple = eval_pred.predictions
    print("preds_tuple : ", preds_tuple)
    logits, pred_boxes = preds_tuple
    
    preds = ModelOutput(logits=logits, pred_boxes=pred_boxes)
    labels = eval_pred.label_ids

    results = preprocessor.post_process_object_detection(
        preds, target_sizes=[label['orig_size'].cpu().tolist() for label in labels], threshold=threshold
    )

    predictions = [sv.Detections.from_transformers(result) for result in results]
    targets = [sv.Detections(
        xyxy=de_normalize_boxes(label['boxes'], *label['orig_size']).cpu().numpy(),
        class_id=label['class_labels'].cpu().numpy(),
    ) for label in labels]

    m_ap = sv.MeanAveragePrecision.from_detections(
        predictions=predictions,
        targets=targets,
    )
    try: # [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
        per_class_map = {f"{CLASSES[i]}_mAP@0.50:0.95": sum(v)/len(v) for i, v in enumerate(m_ap.per_class_ap50_95.tolist())}  # Shape: [num_classes, 10]
    except AttributeError:
        per_class_map = {}

    result = {
        "mAP@0.50:0.95": m_ap.map50_95,
        "mAP@0.50": m_ap.map50,
        "mAP@0.75": m_ap.map75,
        **per_class_map
    }

    print(f"\rINFO: Computed Metrics - {result}", end="")

    return result

In [None]:
from functools import partial

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=DatasetAdapterForTransformers(dataset.train),
    eval_dataset=DatasetAdapterForTransformers(dataset.valid),
    data_collator=partial(collate_fn, preprocessor=reference_preprocessor),
    compute_metrics=partial(map_compute_metrics, preprocessor=reference_preprocessor),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

tester = Trainer(
    model=model,
    args=testing_args,
    eval_dataset=DatasetAdapterForTransformers(dataset.test),
    data_collator=partial(collate_fn, preprocessor=reference_preprocessor),
    compute_metrics=partial(map_compute_metrics, preprocessor=reference_preprocessor)
)

## Train

In [None]:
def start_train():
    accelerator = Accelerator()
    try:
        print("INFO: Trying to resume from previous checkpoint")
        trainer.train(resume_from_checkpoint=False)
    except Exception as e:
        print(f"ERROR: Failed to resume from checkpoint - {e}")
        print("INFO: Starting training from scratch")
        trainer.train(resume_from_checkpoint=False)

In [None]:
if ADDITIONAL_GPU:
    notebook_launcher(start_train, args=(), num_processes=ADDITIONAL_GPU)
else:
    start_train()

## Evaluate

### Auto Evaluation

In [None]:
trainer.evaluate()

In [None]:
tester.evaluate()

### Manual Evaluation

In [None]:
checkpoint = 31100

In [None]:
try:
    model = RTDetrForObjectDetection.from_pretrained(f"{training_args.output_dir}/checkpoint-{checkpoint}/", torch_dtype=torch.float32, return_dict=True, local_files_only=True)
    model.to(device)
except Exception:
    pass

In [None]:
class LabelDataset(BaseDataset):
    def __init__(self, original_dataset, camera='front'):
        self.dataset = original_dataset
        self.camera = camera

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx][self.camera]
        return item['boxes2d'], item['boxes2d_classes']

In [None]:
def naive_collate_fn(batch):
    return batch

In [None]:
targets = []
predictions = []

img_size = [640, 640]
original_size = [800, 1280]
batch_size = 32

raw_data = DataLoader(LabelDataset(dataset.valid), batch_size=batch_size, collate_fn=naive_collate_fn)
loader = DataLoader(DatasetAdapterForTransformers(dataset.valid), batch_size=batch_size, collate_fn=partial(collate_fn, preprocessor=reference_preprocessor))
for idx, lables, inputs in zip(tqdm(range(len(raw_data))), raw_data, loader):
    with torch.no_grad():
        outputs = model(pixel_values=inputs['pixel_values'].to(device))

    results = reference_preprocessor.post_process_object_detection(
        outputs, target_sizes=[original_size for _ in range(len(inputs))], threshold=0.3
    )

    detections = [sv.Detections.from_transformers(results[i]) for i in range(batch_size)]
    annotations = [sv.Detections(
        xyxy=lables[i][0].cpu().numpy(),
        class_id=lables[i][1].cpu().numpy(),
    ) for i in range(batch_size)]

    targets.extend(annotations)
    predictions.extend(detections)

In [None]:
len(predictions) == len(targets), len(predictions), len(targets)

In [None]:
mean_average_precision = sv.MeanAveragePrecision.from_detections(
    predictions=predictions,
    targets=targets,
)

print(f"mAP@0.95: {mean_average_precision.map50_95:.2f}")
print(f"map50: {mean_average_precision.map50:.2f}")
print(f"map75: {mean_average_precision.map75:.2f}")
print(f"per_class_ap50_95: {mean_average_precision.per_class_ap50_95}")