In [1]:
import io
import os
import pickle
import tarfile
import urllib
import os
import sys
import csv
import math
import itertools
import logging
import argparse
from pathlib import Path
from collections import OrderedDict

import torch
import numpy as np
from PIL import Image
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign
from torchmetrics.detection import MeanAveragePrecision
import torch.nn as nn

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
import torch
import torchvision.transforms.functional as TF
from tqdm import tqdm
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign
from torchmetrics.detection import MeanAveragePrecision
import torch.nn as nn



In [2]:
DINOV3_GITHUB_LOCATION = "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3"

if os.getenv("DINOV3_LOCATION") is not None:
    DINOV3_LOCATION = os.getenv("DINOV3_LOCATION")
else:
    DINOV3_LOCATION = DINOV3_GITHUB_LOCATION

print(f"DINOv3 location set to {DINOV3_LOCATION}")

DINOv3 location set to /home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3


Downloading: "file:///home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3/notebooks/dinov3_vit7b16_pretrain_sat493m.pth" to /home/rishabh.mondal/.cache/torch/hub/checkpoints/dinov3_vit7b16_pretrain_sat493m.pth
100%|██████████| 25.0G/25.0G [00:53<00:00, 502MB/s] 


In [5]:
model

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 4096, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (rope_embed): RopePositionEmbedding()
  (blocks): ModuleList(
    (0-39): 40 x SelfAttentionBlock(
      (norm1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (attn): SelfAttention(
        (qkv): LinearKMaskedBias(in_features=4096, out_features=12288, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=4096, out_features=4096, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (norm2): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (mlp): SwiGLUFFN(
        (w1): Linear(in_features=4096, out_features=8192, bias=True)
        (w2): Linear(in_features=4096, out_features=8192, bias=True)
        (w3): Linear(in_features=8192, out_features=4096, bias=True)
      )
      (ls2): LayerScale()
    )
  )
  (norm): LayerNorm

In [18]:

# -------------------------
# Logging
# -------------------------
def setup_logging(log_dir: str):
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'training.log')
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] - %(message)s',
        handlers=[logging.FileHandler(log_file), logging.StreamHandler(sys.stdout)]
    )

In [19]:
# -------------------------
# CSV Logger
# -------------------------
class CSVLogger:
    def __init__(self, csv_path: str):
        self.csv_path = csv_path
        with open(self.csv_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["epoch", "train_loss", "val_map", "val_map50", "lr"])

    def log(self, epoch, train_loss, val_map, val_map50, lr):
        val_map = float('nan') if val_map is None else float(val_map)
        val_map50 = float('nan') if val_map50 is None else float(val_map50)
        with open(self.csv_path, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([epoch, train_loss, val_map, val_map50, lr])

In [20]:
class BrickKilnDataset(Dataset):
    def __init__(self, root: str, split: str, input_size: int = 224):
        self.root = Path(root)
        self.split = split
        self.img_dir = self.root / "images"
        self.label_dir = self.root / "yolo_obb_labels"

        # Keep as [0,1], no ImageNet normalization (works better with learned 1x1 RGB->12 adapter)
        self.transform = transforms.Compose([
            transforms.Resize((input_size, input_size)),
            transforms.ToTensor(),
        ])

        self.img_files = []
        all_files = sorted([f for f in os.listdir(self.img_dir) if f.lower().endswith(".png")])
        logging.info(f"Scanning {len(all_files)} PNGs in {self.img_dir}...")
        for img_name in tqdm(all_files, desc=f"Verify {split} data"):
            if self._has_valid_annotations(img_name):
                self.img_files.append(img_name)
        logging.info(f"Found {len(self.img_files)} valid images in {self.img_dir}")

    def _has_valid_annotations(self, img_name: str) -> bool:
        label_path = self.label_dir / f"{Path(img_name).stem}.txt"
        if not label_path.exists():
            return False
        with open(label_path, 'r') as f:
            for line in f:
                if len(line.strip().split()) == 9:
                    return True
        return False

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx: int):
        img_name = self.img_files[idx]
        img_path = self.img_dir / img_name
        label_path = self.label_dir / f"{Path(img_name).stem}.txt"

        img = Image.open(img_path).convert("RGB")
        img_tensor = self.transform(img)
        _, h, w = img_tensor.shape

        boxes, labels = [], []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 9:
                    continue
                cls_id = int(parts[0]) + 1  # reserve 0 for background
                obb = np.array([float(p) for p in parts[1:]])
                xs, ys = obb[0::2] * w, obb[1::2] * h
                xmin, ymin, xmax, ymax = np.min(xs), np.min(ys), np.max(xs), np.max(ys)
                if xmax > xmin and ymax > ymin:
                    boxes.append([xmin, ymin, xmax, ymax])
                    labels.append(cls_id)

        target = {
            "boxes": torch.as_tensor(boxes, dtype=torch.float32),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
        }
        return img_tensor, target


def collate_fn(batch):
    batch = [item for item in batch if item[1]["boxes"].shape[0] > 0]
    if not batch:
        return None, None
    return tuple(zip(*batch))

In [21]:
train_dataset = BrickKilnDataset(root="/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/sentinelkilndb_bechmarking_data/train", split="train", input_size=224)
val_dataset = BrickKilnDataset(root="/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/sentinelkilndb_bechmarking_data/test", split="val", input_size=224)
train_loader= DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn, num_workers=8)
test_loader= DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn, num_workers=8)


Verify train data: 100%|██████████| 71856/71856 [00:02<00:00, 34567.08it/s]
Verify val data: 100%|██████████| 18492/18492 [00:00<00:00, 37739.51it/s]


In [22]:
import math
import torch
import torch.nn as nn
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

class DinoV3BackboneWrapper(nn.Module):
    """
    Wraps a DINOv3 ViT (e.g., dinov3_vitl16) to look like a torchvision backbone.

    Returns a single FPN level '0' with stride 16:
        features = {'0': Tensor[B, C, H/16, W/16]}
    and exposes:
        .out_channels = C
    """
    def __init__(self, dino_model: nn.Module, patch_stride: int = 16):
        super().__init__()
        self.dino = dino_model
        self.patch_stride = patch_stride

        # Try to infer channel dim (embed dim)
        # Common names: embed_dim, num_features, etc.
        C = getattr(dino_model, "embed_dim", None)
        if C is None:
            C = getattr(dino_model, "num_features", None)
        if C is None:
            # fallback: probe with a tiny dummy (32x32, will be rounded up)
            with torch.no_grad():
                x = torch.zeros(1, 3, 32, 32)
                tokens, Ht, Wt = self._get_patch_tokens(x)
                C = tokens.shape[-1]
        self.out_channels = C

    @torch.no_grad()
    def _maybe_h_w(self, x):
        # Height/Width of patch grid (round up)
        B, _, H, W = x.shape
        Ht = math.ceil(H / self.patch_stride)
        Wt = math.ceil(W / self.patch_stride)
        return Ht, Wt

    def _get_patch_tokens(self, x):
        """
        Returns:
            tokens: [B, Ht*Wt, C] — patch tokens (no cls)
            Ht, Wt: patch grid size
        """
        # Preferred: many DINOv3 builds return a dict from forward_features
        try:
            out = self.dino.forward_features(x)  # may return dict or tensor
            if isinstance(out, dict):
                # Common DINOv3 keys:
                if "x_norm_patchtokens" in out:
                    tokens = out["x_norm_patchtokens"]           # [B, Ht*Wt, C]
                    Ht = out.get("H", None)
                    Wt = out.get("W", None)
                    if Ht is None or Wt is None:
                        Ht, Wt = self._maybe_h_w(x)
                    return tokens, Ht, Wt
                if "tokens" in out and out["tokens"] is not None:
                    t = out["tokens"]                             # [B, 1+Ht*Wt, C]?
                    # drop cls if present
                    if t.shape[1] == (self._maybe_h_w(x)[0] * self._maybe_h_w(x)[1] + 1):
                        t = t[:, 1:, :]
                    return t, *self._maybe_h_w(x)
            # If it's a tensor: assume [B, 1+N, C] or [B, N, C]
            if isinstance(out, torch.Tensor):
                t = out
                # find patch count
                Ht, Wt = self._maybe_h_w(x)
                N = Ht * Wt
                if t.shape[1] == N + 1:
                    t = t[:, 1:, :]
                elif t.shape[1] != N:
                    # If shapes mismatch, just compute Ht/Wt from t
                    # assume no cls, make it square-ish if possible
                    N = t.shape[1]
                    Wt = int(round(math.sqrt(N)))
                    Ht = N // Wt
                return t, Ht, Wt
        except Exception:
            pass

        # Fallback: DINOv2/v3 often exposes get_intermediate_layers
        if hasattr(self.dino, "get_intermediate_layers"):
            # return last block tokens (no cls)
            t = self.dino.get_intermediate_layers(x, n=1, return_class_token=False)[0]  # [B, N, C]
            Ht, Wt = self._maybe_h_w(x)
            return t, Ht, Wt

        # Last resort: call the model and hope it returns tokens
        t = self.dino(x)  # [B, N, C] or [B, 1+N, C]
        Ht, Wt = self._maybe_h_w(x)
        if t.dim() == 3 and t.shape[1] == (Ht*Wt + 1):
            t = t[:, 1:, :]
        return t, Ht, Wt

    def forward(self, x: torch.Tensor):
        """
        x: [B,3,H,W] in [0,1] or normalized — up to your preprocessing.
        """
        tokens, Ht, Wt = self._get_patch_tokens(x)   # [B, N, C]
        B, N, C = tokens.shape
        # reshape tokens -> feature map
        feat = tokens.transpose(1, 2).contiguous().view(B, C, Ht, Wt)  # [B,C,H/16,W/16]
        return {"0": feat}


In [23]:
def build_faster_rcnn_with_dino(dino_model, num_classes: int, image_size: int):
    """
    dino_model: your torch.hub-loaded DINOv3 ViT (e.g., dinov3_vitl16)
    num_classes: K + 1  (Faster R-CNN includes background class at 0)
    image_size: set min/max size to avoid default 800px resize (optional)
    """
    backbone = DinoV3BackboneWrapper(dino_model, patch_stride=16)

    # Single-level (stride 16) anchors — tune to your object scale distribution.
    anchor_generator = AnchorGenerator(
        sizes=((16, 32, 64, 128),),                 # in feature pixels (rough prior). You can also put actual pixels in image space by considering stride.
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    roi_pooler = MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,                # includes background
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler,
        min_size=image_size,                    # lock input size if you pre-resize upstream
        max_size=image_size,
        # You can also pass rpn_pre_nms_top_n_train/test etc. here if needed
    )
    return model


In [24]:
# -------------------------
# Build Faster R-CNN (lock size, neutral mean/std)
# -------------------------
def create_model(dino_model, num_classes: int, image_size: int):
    backbone = DinoV3BackboneWrapper(dino_model, patch_stride=16)

    anchor_generator = AnchorGenerator(
        sizes=((16,), (32,), (64,), (128,)),
        aspect_ratios=((0.5, 1.0, 2.0),) * 4
    )
    roi_pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=7, sampling_ratio=2)

    model = FasterRCNN(
        backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler,
        # IMPORTANT: prevent torchvision from resizing to 800
        min_size=image_size,
        max_size=image_size,
        image_mean=[0.0, 0.0, 0.0],
        image_std=[1.0, 1.0, 1.0],
    )
    return model

In [25]:
detector = create_model(model, num_classes=4, image_size=224)
detector


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0])
      Resize(min_size=(224,), max_size=224, mode='bilinear')
  )
  (backbone): DinoV3BackboneWrapper(
    (dino): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (rope_embed): RopePositionEmbedding()
      (blocks): ModuleList(
        (0-23): 24 x SelfAttentionBlock(
          (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): SelfAttention(
            (qkv): LinearKMaskedBias(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

In [13]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
detector.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(224,), max_size=224, mode='bilinear')
  )
  (backbone): DinoV3BackboneWrapper(
    (dino): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (rope_embed): RopePositionEmbedding()
      (blocks): ModuleList(
        (0-23): 24 x SelfAttentionBlock(
          (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): SelfAttention(
            (qkv): LinearKMaskedBias(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=

In [14]:
# import torch
# from torchmetrics.detection import MeanAveragePrecision
# from tqdm import tqdm

# # --------- helpers ----------
# def to_device_batch(images, targets, device):
#     images = [im.to(device, non_blocking=True) for im in images]
#     targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#     return images, targets

# @torch.no_grad()
# def evaluate(detector, loader, device, class_agnostic: bool = False, desc: str = "Evaluating"):
#     detector.eval()
#     metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.5], class_metrics=False)

#     for images, targets in tqdm(loader, desc=desc, leave=False):
#         images, targets = to_device_batch(images, targets, device)
#         preds = detector(images)

#         if class_agnostic:
#             # collapse labels to one class
#             tm_preds = [{
#                 "boxes":  p["boxes"].detach().cpu(),
#                 "scores": p["scores"].detach().cpu(),
#                 "labels": torch.ones_like(p["labels"]).detach().cpu(),
#             } for p in preds]
#             tm_tgts = [{
#                 "boxes":  t["boxes"].detach().cpu(),
#                 "labels": torch.ones_like(t["labels"]).detach().cpu(),
#             } for t in targets]
#         else:
#             tm_preds = [{
#                 "boxes":  p["boxes"].detach().cpu(),
#                 "scores": p["scores"].detach().cpu(),
#                 "labels": p["labels"].detach().cpu(),
#             } for p in preds]
#             tm_tgts = [{
#                 "boxes":  t["boxes"].detach().cpu(),
#                 "labels": t["labels"].detach().cpu(),
#             } for t in targets]

#         metric.update(tm_preds, tm_tgts)

#     res = metric.compute()
#     return float(res["map_50"].item())

# # ========== ZERO-SHOT (before training) ==========
# map50_0 = evaluate(detector, test_loader, device, class_agnostic=False, desc="Zero-shot mAP@0.50")
# ca50_0  = evaluate(detector, test_loader, device, class_agnostic=True,  desc="Zero-shot CA mAP@0.50")

# print(f"[Zero-shot] mAP:50 = {map50_0:.4f} | CA_mAP:50 = {ca50_0:.4f}")


In [15]:
# detector.train()
# imgs, targets = next(iter(train_loader))  # from your YOLO->FRCNN dataset
# imgs = [im.cuda() for im in imgs]
# targets = [{k: v.cuda() for k,v in t.items()} for t in targets]

# losses = detector(imgs, targets)
# print({k: float(v) for k, v in losses.items()})


In [None]:
import os, math, time, torch
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from torchmetrics.detection import MeanAveragePrecision
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# -------------------------
# Helpers
# -------------------------
def to_device_batch(images, targets, device):
    images = [im.to(device, non_blocking=True) for im in images]
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    return images, targets

@torch.no_grad()
def evaluate(detector, loader, device, class_agnostic: bool = False, desc: str = "Eval"):
    detector.eval()
    metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.5], class_metrics=False)

    for images, targets in tqdm(loader, desc=desc, leave=False):
        images, targets = to_device_batch(images, targets, device)
        preds = detector(images)

        if class_agnostic:
            tm_preds = [{
                "boxes":  p["boxes"].detach().cpu(),
                "scores": p["scores"].detach().cpu(),
                "labels": torch.ones_like(p["labels"]).detach().cpu(),
            } for p in preds]
            tm_tgts = [{
                "boxes":  t["boxes"].detach().cpu(),
                "labels": torch.ones_like(t["labels"]).detach().cpu(),
            } for t in targets]
        else:
            tm_preds = [{
                "boxes":  p["boxes"].detach().cpu(),
                "scores": p["scores"].detach().cpu(),
                "labels": p["labels"].detach().cpu(),
            } for p in preds]
            tm_tgts = [{
                "boxes":  t["boxes"].detach().cpu(),
                "labels": t["labels"].detach().cpu(),
            } for t in targets]

        metric.update(tm_preds, tm_tgts)

    res = metric.compute()
    # guard against missing keys
    return float(res.get("map_50", torch.tensor(0.)).item())

# -------------------------
# Optimizer / AMP / TB
# -------------------------
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
detector.to(device)

params = [p for p in detector.parameters() if p.requires_grad]
optimizer = AdamW(params, lr=1e-4, weight_decay=1e-4)
scaler = GradScaler(enabled=torch.cuda.is_available())

log_dir = "./runs/dino_frcnn"
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)

num_epochs = 20
print_every = 50  # iters
global_step = 0
best_map50 = -1.0
ckpt_path = "./best_map50.pth"

# ========== ZERO-SHOT (before training) ==========
map50_0 = evaluate(detector, test_loader, device, class_agnostic=False, desc="Zero-shot mAP@0.50")
ca50_0  = evaluate(detector, test_loader, device, class_agnostic=True,  desc="Zero-shot CA mAP@0.50")
print(f"[Zero-shot] mAP:50 = {map50_0:.4f} | CA_mAP:50 = {ca50_0:.4f}")
writer.add_scalar("val/mAP50_zero_shot", map50_0, 0)
writer.add_scalar("val/CA_mAP50_zero_shot", ca50_0, 0)

# -------------------------
# Training + per-epoch Validation
# -------------------------
for epoch in range(1, num_epochs + 1):
    detector.train()
    epoch_losses = {"loss_classifier": 0.0, "loss_box_reg": 0.0, "loss_objectness": 0.0, "loss_rpn_box_reg": 0.0}
    iters = 0

    pbar = tqdm(train_loader, desc=f"Train E{epoch:02d}", leave=False)
    for images, targets in pbar:
        images, targets = to_device_batch(images, targets, device)
        optimizer.zero_grad(set_to_none=True)

        with autocast(enabled=torch.cuda.is_available()):
            loss_dict = detector(images, targets)   # dict of losses
            loss = sum(loss_dict.values())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # accumulate & log
        iters += 1
        for k in epoch_losses:
            epoch_losses[k] += loss_dict[k].detach().item()

        # tqdm postfix
        avg_tot = sum(epoch_losses.values()) / iters
        pbar.set_postfix({
            "loss": f"{avg_tot:.4f}",
            "cls": f"{(epoch_losses['loss_classifier']/iters):.4f}",
            "box": f"{(epoch_losses['loss_box_reg']/iters):.4f}",
            "obj": f"{(epoch_losses['loss_objectness']/iters):.4f}",
            "rpn": f"{(epoch_losses['loss_rpn_box_reg']/iters):.4f}",
        })

        # TensorBoard per-iteration (optional; comment if too chatty)
        writer.add_scalar("train/loss_total", avg_tot, global_step)
        writer.add_scalar("train/loss_cls", epoch_losses["loss_classifier"]/iters, global_step)
        writer.add_scalar("train/loss_box", epoch_losses["loss_box_reg"]/iters, global_step)
        writer.add_scalar("train/loss_obj", epoch_losses["loss_objectness"]/iters, global_step)
        writer.add_scalar("train/loss_rpn", epoch_losses["loss_rpn_box_reg"]/iters, global_step)
        global_step += 1

    # ---- Validation (two metrics) ----
    map50 = evaluate(detector, test_loader, device, class_agnostic=False, desc=f"Eval E{epoch:02d} mAP@0.50")
    ca50  = evaluate(detector, test_loader, device, class_agnostic=True,  desc=f"Eval E{epoch:02d} CA mAP@0.50")
    print(f"[Epoch {epoch:02d}] mAP:50 = {map50:.4f} | CA_mAP:50 = {ca50:.4f}")

    # TensorBoard per-epoch
    writer.add_scalar("val/mAP50", map50, epoch)
    writer.add_scalar("val/CA_mAP50", ca50, epoch)
    # (Optional) LR logging
    for i, g in enumerate(optimizer.param_groups):
        writer.add_scalar(f"opt/lr_group{i}", g.get("lr", 0.0), epoch)

    # Save best
    if map50 > best_map50:
        best_map50 = map50
        torch.save({"epoch": epoch,
                    "model_state": detector.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "best_map50": best_map50}, ckpt_path)
        print(f"✅ New best mAP:50 = {best_map50:.4f} — checkpoint saved to {ckpt_path}")

writer.close()


2025-09-01 17:24:52.019018: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756727692.035612  103674 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756727692.041461  103674 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756727692.055830  103674 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756727692.055848  103674 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756727692.055850  103674 computation_placer.cc:177] computation placer alr

[Zero-shot] mAP:50 = 0.0000 | CA_mAP:50 = 0.0000


  with autocast(enabled=torch.cuda.is_available()):
                                                                                                                         

[Epoch 01] mAP:50 = 0.0000 | CA_mAP:50 = 0.0000
✅ New best mAP:50 = 0.0000 — checkpoint saved to ./best_map50.pth


                                                                                                                         

[Epoch 02] mAP:50 = 0.0000 | CA_mAP:50 = 0.0000


                                                                                                                         

[Epoch 03] mAP:50 = 0.0000 | CA_mAP:50 = 0.0000


Eval E04 CA mAP@0.50:  40%|███▉      | 32/81 [01:03<01:33,  1.91s/it]                                                    