In [None]:
def inference_on_dataset_online_adaptation(cfg, model, data_loader, optimizer, evaluator, d_idx, wandb, teacher_model=None, val_data_loader=None, val_evaluator=None, loss_ema99=0, loss_ema95=0, loss_ema90=0, is_used=0, domain_name=None):
    """
    Run online test-time adaptation over `data_loader` while evaluating the predictions.

    Each batch goes through one of two paths:

    * adaptation path: `model(inputs)` is called and must return
      `(outputs, losses, feature_sim)`; the summed loss is back-propagated and
      `optimizer.step()` is executed when the batch is selected for an update;
    * inference-only path: `model.inference(inputs)` is called under
      `torch.no_grad()` and no loss is computed.

    Batch selection is driven by `cfg.TEST.ADAPTATION.SKIP_REDUNDANT`. When it is
    `None` every batch triggers an update. Otherwise a batch triggers an update
    if one of the enabled criteria holds (checked in this order):

    * `'stat'`:   `losses["global_align"] > div_thr`, where
      `div_thr = 2 * sum(model.s_div.values()) * SKIP_TAU`;
    * `'period'`: `idx % SKIP_PERIOD == 0`;
    * `'ema'`:    `losses["global_align"] / (loss_ema99 + 1e-7) > SKIP_BETA`.

    After a batch that did not lead to an update, the adaptation path is only
    re-entered once `pause_iter` reaches a multiple of `SKIP_PERIOD` (or, in
    `'period'` mode, when `idx` does); in between, batches take the
    inference-only path.

    Args:
        cfg: config node; reads `SOLVER.IMS_PER_BATCH_TEST`,
            `SOLVER.CLIP_GRADIENTS.{ENABLED,CLIP_VALUE}` and
            `TEST.ADAPTATION.{SKIP_REDUNDANT,SKIP_TAU,SKIP_PERIOD,SKIP_BETA,WEIGHT_REG}`.
        model: the model being adapted. Besides being callable it must expose
            `inference`, `s_div`, `backbone` and a writable `online_adapt` flag.
        data_loader: an iterable object with a length.
        optimizer: optimizer stepping the adapted parameters. The first
            parameter of every param group is snapshotted for the optional
            weight regularization term.
        evaluator (DatasetEvaluator): the evaluator to run. `None` is replaced
            by a no-op evaluator.
        d_idx (int): index of the current domain; only used to compute the
            logging step.
        wandb: wandb module/run used to log validation metrics, or `None`.
        teacher_model: accepted for interface compatibility; not used here.
        val_data_loader: optional loader; when given, `inference_on_dataset` is
            run on it every 50 iterations with `model.online_adapt` disabled.
        val_evaluator: evaluator used for the periodic validation.
        loss_ema99, loss_ema95, loss_ema90 (float): running exponential moving
            averages of `losses["global_align"]` carried in from the caller.
        is_used (int): running count of batches selected for an update.
        domain_name: forwarded to `evaluator.evaluate(domain_name=...)`.

    Returns:
        tuple: `(results, loss_ema99, loss_ema95, loss_ema90, is_used,
        total_compute_time)`, where `results` is the return value of
        `evaluator.evaluate()` (an empty dict when it returns `None`) and
        `total_compute_time` is the accumulated compute time in seconds,
        restarted at iteration `num_warmup`.
    """
    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()
    if val_evaluator is not None:
        val_evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    batch_size = cfg.SOLVER.IMS_PER_BATCH_TEST
    skip_mode = cfg.TEST.ADAPTATION.SKIP_REDUNDANT

    cur_used = True
    # Number of iterations since the last batch that did not lead to an update.
    # Initialised explicitly so the skip condition below never depends on the
    # order in which the branches happen to run.
    pause_iter = 0

    # Threshold for the 'stat' criterion; SKIP_TAU is only read when skipping is enabled.
    div_thr = 2 * sum(model.s_div.values())
    if skip_mode is not None:
        div_thr = div_thr * cfg.TEST.ADAPTATION.SKIP_TAU

    # Snapshot of the initial weights, used for weight regularization.
    init_weights = [group['params'][0].clone().detach() for group in optimizer.param_groups]

    with EventStorage():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                # Restart the timers once the warm-up iterations are over.
                start_time = time.perf_counter()
                total_compute_time = 0
            if idx == total:
                break

            start_compute_time = time.perf_counter()

            cur_step = (d_idx * total + idx) * batch_size

            # `pause_iter` and the 'period' test are only evaluated when the
            # previous batch was not used, which implies skip_mode is not None.
            run_adaptation = (
                cur_used
                or pause_iter % cfg.TEST.ADAPTATION.SKIP_PERIOD == 0
                or ('period' in skip_mode and idx % cfg.TEST.ADAPTATION.SKIP_PERIOD == 0)
            )

            if run_adaptation:
                outputs, losses, _feature_sim = model(inputs)

                # Weight regularization: penalise drift from the initial weights.
                # NOTE: the original author remarked that this term appears to be unused.
                if cfg.TEST.ADAPTATION.WEIGHT_REG > 0.0:
                    stick_loss = 0
                    for group, init_w in zip(optimizer.param_groups, init_weights):
                        stick_loss += torch.mean((group['params'][0] - init_w) ** 2)
                    losses["stick"] = cfg.TEST.ADAPTATION.WEIGHT_REG * stick_loss
                total_loss = sum([losses[k] for k in losses])

                # Decide whether this batch is used for a parameter update.
                if skip_mode is None:
                    cur_used = True
                elif 'stat' in skip_mode and losses["global_align"] > div_thr:
                    cur_used = True
                elif 'period' in skip_mode and idx % cfg.TEST.ADAPTATION.SKIP_PERIOD == 0:
                    cur_used = True
                elif 'ema' in skip_mode and losses["global_align"] / (loss_ema99 + 1e-7) > cfg.TEST.ADAPTATION.SKIP_BETA:
                    cur_used = True
                else:
                    cur_used = False
                is_used += int(cur_used)

                if total_loss > 0 and cur_used:
                    total_loss.backward()
                    if cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
                        torch.nn.utils.clip_grad_norm_(model.backbone.parameters(),
                                                       cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE)
                    optimizer.step()
                else:
                    # No update happened: start counting paused iterations.
                    pause_iter = 1
                optimizer.zero_grad()

                loss_str = " ".join(["{}: {:.6f}, ".format(k, losses[k].item()) for k in losses])

                if "global_align" in losses:
                    global_align = losses["global_align"].item()
                    loss_ema99 = 0.99 * loss_ema99 + 0.01 * global_align
                    loss_ema95 = 0.95 * loss_ema95 + 0.05 * global_align
                    loss_ema90 = 0.9 * loss_ema90 + 0.1 * global_align
                # Drop the references to the loss tensors before the next iteration.
                del losses, total_loss
            else:
                with torch.no_grad():
                    outputs = model.inference(inputs)
                loss_str = ""
                pause_iter += 1

            if torch.cuda.is_available():
                # Wait for pending GPU work so the timing below is meaningful.
                torch.cuda.synchronize()

            total_compute_time += time.perf_counter() - start_compute_time

            with torch.no_grad():
                evaluator.process(inputs, outputs)

            if val_data_loader is not None and idx % 50 == 0:
                # Periodic validation with online adaptation switched off.
                model.online_adapt = False
                val_results, _ = inference_on_dataset(model, val_data_loader, val_evaluator)
                if wandb is not None:
                    wandb.log({'val-mAP': val_results['bbox']['AP'], 'val-mAP50': val_results['bbox']['AP50']}, step=cur_step)
                model.online_adapt = True

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                print_str = "Inference done {}/{}. {:.4f} s / img. ETA={} ".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    )
                print_str += loss_str
                print_str += "lr: {}".format(optimizer.param_groups[0]['lr'])
                log_every_n_seconds(
                    logging.INFO,
                    print_str,
                    n=5,
                )

    results = evaluator.evaluate(domain_name=domain_name)
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results, loss_ema99, loss_ema95, loss_ema90, is_used, total_compute_time

In [None]:
def test_continual_domain_shift_discrete(cls, cfg, wandb=None):
    """
    Evaluate online adaptation over a fixed sequence of discrete domains.

    The model, optimizer and teacher model are built once with
    `configure_model(..., revert=True)` and then reused for every domain, so
    the adapted state carries over from one domain to the next.

    Args:
        cls: the trainer class; must provide `build_test_loader(cfg, name)`
            and `build_evaluator(cfg, name)`.
        cfg (CfgNode): reads `DATASETS.TEST[0]` as the dataset name prefix and
            `TEST.ONLINE_ADAPTATION`.
        wandb: optional wandb module/run forwarded to the adaptation loop.

    Returns:
        tuple: `(results, backward_num)`. `results` maps each domain dataset
        name to its evaluation result (unwrapped to the single result when
        only one domain was evaluated); `backward_num` maps each domain
        dataset name to the `is_used` counter returned by the adaptation loop.

    Raises:
        ValueError: if `cfg.TEST.ONLINE_ADAPTATION` is disabled. Only the
            online-adaptation path is implemented; previously this case
            failed later with a NameError on `results_i`.
    """
    if not cfg.TEST.ONLINE_ADAPTATION:
        raise ValueError(
            "test_continual_domain_shift_discrete requires cfg.TEST.ONLINE_ADAPTATION "
            "to be enabled; no non-adaptive evaluation path is implemented."
        )

    results = OrderedDict()
    elapsed_time = OrderedDict()
    backward_num = OrderedDict()
    dataset_name = cfg.DATASETS.TEST[0]
    bytes_per_gb = 1024 ** 3

    # Built once so that adaptation is continual across the domains below.
    model, optimizer, teacher_model = configure_model(cfg, DefaultTrainer, revert=True)

    for d_idx, attr in enumerate(['dawn/dusk', 'clear']):
        # Time-of-day domains use the "<dataset>-clear-<attr>" naming scheme,
        # every other attribute uses "<dataset>-<attr>-daytime".
        if attr in ['night', 'dawn/dusk']:
            d_name = "{}-clear-{}".format(dataset_name, attr)
        else:
            d_name = "{}-{}-daytime".format(dataset_name, attr)

        data_loader = cls.build_test_loader(cfg, d_name)
        evaluator = cls.build_evaluator(cfg, d_name)

        # The returned loss EMAs are not fed into the next domain here.
        results_i, _ema99, _ema95, _ema90, is_used, total_compute_time = inference_on_dataset_online_adaptation(
            cfg, model, data_loader, optimizer, evaluator, d_idx, wandb,
            teacher_model=teacher_model, domain_name=attr,
        )
        backward_num[d_name] = is_used
        results[d_name] = results_i
        elapsed_time[d_name] = total_compute_time

        # In distributed runs only the main (rank 0) process reports results.
        if comm.is_main_process():
            assert isinstance(
                results_i, dict
            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                results_i
            )
            logger.info("Evaluation results for {} in csv format:".format(d_name))
            # Printed once per domain, right after that domain finishes.
            print_csv_format(results_i)

        # Memory statistics of CUDA device 0, one statistic per line.
        mem_str = "torch.cuda.memory_allocated: %fGB\n" % (torch.cuda.memory_allocated(0) / bytes_per_gb)
        mem_str += "torch.cuda.max_memory_allocated: %fGB\n" % (torch.cuda.max_memory_allocated(0) / bytes_per_gb)
        mem_str += "torch.cuda.memory_reserved: %fGB\n" % (torch.cuda.memory_reserved(0) / bytes_per_gb)
        mem_str += "torch.cuda.max_memory_reserved: %fGB" % (torch.cuda.max_memory_reserved(0) / bytes_per_gb)
        print(mem_str)
        logger.info(mem_str)

    print(backward_num)
    logger.info('backward_num:{}'.format(','.join([str(v) for v in backward_num.values()])))
    print('Elapsed Time')
    print(elapsed_time)
    logger.info('Elapsed Time: {}'.format(','.join([str(int(v)) for v in elapsed_time.values()])))
    # NOTE(review): the value logged here is the mean compute time per domain
    # in seconds, not frames per second, despite the "Avg FPS" label.
    logger.info('Avg FPS: {:.3f}s'.format(sum(elapsed_time.values()) / len(elapsed_time)))
    if len(results) == 1:
        results = list(results.values())[0]
    return results, backward_num

In [None]:
def main(args):
    """Build the config from `args` and run the discrete continual-domain-shift evaluation.

    The evaluation results and per-domain update counts are computed but not
    returned or stored by this entry point.
    """
    config = setup(args)
    _results, _backward_counts = Trainer.test_continual_domain_shift_discrete(config, wandb)
    

In [1]:
!nvidia-smi

Sun Sep  7 12:11:38 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:04:00.0 Off |                    0 |
| N/A   50C    P0              44W / 250W |   4418MiB / 16384MiB |     94%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE-16GB           Off | 00000000:06:00.0 Off |  

In [None]:
import sys
from pathlib import Path

# Current folder: ptta/other_method/DUA/
# Add the directory directly above ptta to sys.path
# NOTE(review): with the working directory stated above, `parents[1]` resolves
# to ptta/ itself (parents[0] is ptta/other_method) — confirm which directory
# is actually intended.
PROJECT_PARENT = Path.cwd().parents[1]  # -> intended: the parent directory of ptta/
sys.path.insert(0, str(PROJECT_PARENT))

from os import path

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

from ttadapters.datasets import BaseDataset, DatasetHolder, DataLoaderHolder
from ttadapters.datasets import SHIFTClearDatasetForObjectDetection, SHIFTCorruptedDatasetForObjectDetection, SHIFTDiscreteSubsetForObjectDetection
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from accelerate import Accelerator, notebook_launcher

from supervision.metrics.mean_average_precision import MeanAveragePrecision
from supervision.detection.core import Detections

# import wandb
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from transformers import RTDetrForObjectDetection, RTDetrImageProcessorFast, RTDetrConfig
from transformers.image_utils import AnnotationFormat
from safetensors.torch import load_file

In [3]:
import torch
from torch import nn, optim

# CUDA device index to use (0~7).
DEVICE_NUM = 2
# When truthy, DEVICE_NUM is set as the current CUDA device and the generic
# "cuda" device is used; otherwise the device is pinned as "cuda:<DEVICE_NUM>".
ADDITIONAL_GPU = 0
DATA_TYPE = torch.bfloat16

if not torch.cuda.is_available():
    # CPU fallback; -1 marks that no CUDA device index is in use.
    device = torch.device("cpu")
    DEVICE_NUM = -1
elif ADDITIONAL_GPU:
    torch.cuda.set_device(DEVICE_NUM)
    device = torch.device("cuda")
else:
    device = torch.device(f"cuda:{DEVICE_NUM}")

# The index is appended explicitly only when the generic "cuda" device is used.
print(f"INFO: Using device - {device}" + (f":{DEVICE_NUM}" if ADDITIONAL_GPU else ""))

INFO: Using device - cuda:2


In [None]:
# Identifier of the reference checkpoint whose configuration and image
# processor are reused below.
reference_model_id = "PekingU/rtdetr_r50vd"

# Model configuration: start from the reference checkpoint, then override the
# number of labels and the image size.
reference_config = RTDetrConfig.from_pretrained(
    reference_model_id,
    torch_dtype=torch.float32,
    return_dict=True,
)
reference_config.num_labels = 6
reference_config.image_size = 800

# Image processor: COCO detection annotation format, an 800x800 size entry,
# and resizing switched off.
reference_preprocessor = RTDetrImageProcessorFast.from_pretrained(reference_model_id)
reference_preprocessor.format = AnnotationFormat.COCO_DETECTION
reference_preprocessor.size = {"height": 800, "width": 800}
reference_preprocessor.do_resize = False

In [None]:
# Instantiate the detector from the customised reference configuration.
model_pretrained = RTDetrForObjectDetection(config=reference_config)

# Read the checkpoint tensors on CPU; the model is moved to `device` at the end.
model_states = load_file("/home/elicer/ptta/RT-DETR_R50vd_SHIFT_CLEAR.safetensors", device="cpu")

# strict=False tolerates key mismatches, so report them explicitly: otherwise a
# checkpoint that does not match the architecture would load without any sign
# that some weights were left untouched.
load_result = model_pretrained.load_state_dict(model_states, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "WARNING: checkpoint does not fully match the model - "
        f"missing keys: {list(load_result.missing_keys)}, "
        f"unexpected keys: {list(load_result.unexpected_keys)}"
    )

# Freeze every parameter of the pretrained model.
for param in model_pretrained.parameters():
    param.requires_grad = False

# Move the model to the selected device (kept as the cell's trailing expression).
model_pretrained.to(device)

In [None]:
from typing import Optional, Callable

class SHIFTCorruptedTaskDatasetForObjectDetection(SHIFTDiscreteSubsetForObjectDetection):
    """Discrete-subset SHIFT detection dataset selected by a task name.

    The `task` string is converted with `task_to_subset_types` and handed to
    the parent class as `subset_type`; all other arguments are forwarded
    unchanged.
    """

    def __init__(
        self,
        root: str,
        force_download: bool = False,
        train: bool = True,
        valid: bool = False,
        transform: Optional[Callable] = None,
        task: str = "clear",
        target_transform: Optional[Callable] = None,
    ):
        # Resolve the task name first, then delegate construction to the parent.
        subset_types = task_to_subset_types(task)
        super().__init__(
            root=root,
            force_download=force_download,
            train=train,
            valid=valid,
            subset_type=subset_types,
            transform=transform,
            target_transform=target_transform,
        )

In [None]:
# Dataset for the selected task.
dataset = SHIFTCorruptedTaskDatasetForObjectDetection(
    root=DATA_ROOT,
    train=True,
    valid=True,
    task=task,
)

# Two loaders over the same dataset with the same batch size: one wrapped in
# LabelDataset with the naive collate function, one wrapped in the transformers
# adapter and collated through the reference preprocessor.
raw_data = DataLoader(
    LabelDataset(dataset),
    batch_size=batch_size,
    collate_fn=naive_collate_fn,
)
dataloader_discrete = DataLoader(
    DatasetAdapterForTransformers(dataset),
    batch_size=batch_size,
    collate_fn=partial(collate_fn, preprocessor=reference_preprocessor),
)
for idx, lables, inputs in zip(tqdm(range(len(raw_data))), raw_data, dataloader_discrete):
    sizes = [label['orig_size'].cpu().tolist() for label in inputs['labels']]

    with torch.no_grad():
        outputs = model(pixel_values=inputs['pixel_values'].to(device))