In [1]:
# ---------- Standard Library ----------
import os
import sys
import io
import csv
import math
import tarfile
import pickle
import urllib
import logging
import argparse
import itertools
from pathlib import Path
from collections import OrderedDict

# ---------- Third-Party Core ----------
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from scipy import signal
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression

# ---------- PyTorch ----------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import torchvision.transforms.functional as TF
from torchvision import transforms

# ---------- TorchVision Detection ----------
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

# ---------- TorchMetrics ----------
from torchmetrics.detection import MeanAveragePrecision

import math
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign


In [2]:

# -------------------------
# Dataset (PNG RGB only) + YOLO-OBB (9-tuple) -> AABB
# -------------------------
class BrickKilnDataset(Dataset):
    def __init__(self, root: str, split: str, input_size: int = 800):
        self.root = Path(root)
        self.split = split
        self.img_dir = self.root / "images"
        self.label_dir = self.root / "yolo_obb_labels"

        # Keep as [0,1], no ImageNet normalization (works better with learned 1x1 RGB->12 adapter)
        self.transform = transforms.Compose([
            transforms.Resize((input_size, input_size)),
            transforms.ToTensor(),
        ])

        self.img_files = []
        all_files = sorted([f for f in os.listdir(self.img_dir) if f.lower().endswith(".png")])
        logging.info(f"Scanning {len(all_files)} PNGs in {self.img_dir}...")
        for img_name in tqdm(all_files, desc=f"Verify {split} data"):
            if self._has_valid_annotations(img_name):
                self.img_files.append(img_name)
        logging.info(f"Found {len(self.img_files)} valid images in {self.img_dir}")

    def _has_valid_annotations(self, img_name: str) -> bool:
        label_path = self.label_dir / f"{Path(img_name).stem}.txt"
        if not label_path.exists():
            return False
        with open(label_path, 'r') as f:
            for line in f:
                if len(line.strip().split()) == 9:
                    return True
        return False

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx: int):
        img_name = self.img_files[idx]
        img_path = self.img_dir / img_name
        label_path = self.label_dir / f"{Path(img_name).stem}.txt"

        img = Image.open(img_path).convert("RGB")
        img_tensor = self.transform(img)
        _, h, w = img_tensor.shape

        boxes, labels = [], []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 9:
                    continue
                cls_id = int(parts[0]) + 1  # reserve 0 for background
                obb = np.array([float(p) for p in parts[1:]])
                xs, ys = obb[0::2] * w, obb[1::2] * h
                xmin, ymin, xmax, ymax = np.min(xs), np.min(ys), np.max(xs), np.max(ys)
                if xmax > xmin and ymax > ymin:
                    boxes.append([xmin, ymin, xmax, ymax])
                    labels.append(cls_id)

        target = {
            "boxes": torch.as_tensor(boxes, dtype=torch.float32),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
        }
        return img_tensor, target


def collate_fn(batch):
    batch = [item for item in batch if item[1]["boxes"].shape[0] > 0]
    if not batch:
        return None, None
    return tuple(zip(*batch))



In [3]:
DINOV3_GITHUB_LOCATION = "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3"

if os.getenv("DINOV3_LOCATION") is not None:
    DINOV3_LOCATION = os.getenv("DINOV3_LOCATION")
else:
    DINOV3_LOCATION = DINOV3_GITHUB_LOCATION

print(f"DINOv3 location set to {DINOV3_LOCATION}")



MODEL_DINOV3_VIT7B = "dinov3_vit7b16"

MODEL_NAME = MODEL_DINOV3_VIT7B
model = torch.hub.load(
    repo_or_dir=DINOV3_LOCATION,
    model=MODEL_NAME,
    source="local",
    weights="/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3/notebooks/dinov3_vit7b16_pretrain_sat493m.pth",
    skip_validation=True, 
)

DINOv3 location set to /home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/dinov3


In [4]:

class DinoV3BackboneWrapper(nn.Module):
    """
    Wraps a DINOv3 ViT (e.g., dinov3_vitl16) to look like a torchvision backbone.

    Returns a single FPN level '0' with stride 16:
        features = {'0': Tensor[B, C, H/16, W/16]}
    and exposes:
        .out_channels = C
    """
    def __init__(self, dino_model: nn.Module, patch_stride: int = 16):
        super().__init__()
        self.dino = dino_model
        self.patch_stride = patch_stride

        # Try to infer channel dim (embed dim)
        # Common names: embed_dim, num_features, etc.
        C = getattr(dino_model, "embed_dim", None)
        if C is None:
            C = getattr(dino_model, "num_features", None)
        if C is None:
            # fallback: probe with a tiny dummy (32x32, will be rounded up)
            with torch.no_grad():
                x = torch.zeros(1, 3, 32, 32)
                tokens, Ht, Wt = self._get_patch_tokens(x)
                C = tokens.shape[-1]
        self.out_channels = C

    @torch.no_grad()
    def _maybe_h_w(self, x):
        # Height/Width of patch grid (round up)
        B, _, H, W = x.shape
        Ht = math.ceil(H / self.patch_stride)
        Wt = math.ceil(W / self.patch_stride)
        return Ht, Wt

    def _get_patch_tokens(self, x):
        """
        Returns:
            tokens: [B, Ht*Wt, C] — patch tokens (no cls)
            Ht, Wt: patch grid size
        """
        # Preferred: many DINOv3 builds return a dict from forward_features
        try:
            out = self.dino.forward_features(x)  # may return dict or tensor
            if isinstance(out, dict):
                # Common DINOv3 keys:
                if "x_norm_patchtokens" in out:
                    tokens = out["x_norm_patchtokens"]           # [B, Ht*Wt, C]
                    Ht = out.get("H", None)
                    Wt = out.get("W", None)
                    if Ht is None or Wt is None:
                        Ht, Wt = self._maybe_h_w(x)
                    return tokens, Ht, Wt
                if "tokens" in out and out["tokens"] is not None:
                    t = out["tokens"]                             # [B, 1+Ht*Wt, C]?
                    # drop cls if present
                    if t.shape[1] == (self._maybe_h_w(x)[0] * self._maybe_h_w(x)[1] + 1):
                        t = t[:, 1:, :]
                    return t, *self._maybe_h_w(x)
            # If it's a tensor: assume [B, 1+N, C] or [B, N, C]
            if isinstance(out, torch.Tensor):
                t = out
                # find patch count
                Ht, Wt = self._maybe_h_w(x)
                N = Ht * Wt
                if t.shape[1] == N + 1:
                    t = t[:, 1:, :]
                elif t.shape[1] != N:
                    # If shapes mismatch, just compute Ht/Wt from t
                    # assume no cls, make it square-ish if possible
                    N = t.shape[1]
                    Wt = int(round(math.sqrt(N)))
                    Ht = N // Wt
                return t, Ht, Wt
        except Exception:
            pass

        # Fallback: DINOv2/v3 often exposes get_intermediate_layers
        if hasattr(self.dino, "get_intermediate_layers"):
            # return last block tokens (no cls)
            t = self.dino.get_intermediate_layers(x, n=1, return_class_token=False)[0]  # [B, N, C]
            Ht, Wt = self._maybe_h_w(x)
            return t, Ht, Wt

        # Last resort: call the model and hope it returns tokens
        t = self.dino(x)  # [B, N, C] or [B, 1+N, C]
        Ht, Wt = self._maybe_h_w(x)
        if t.dim() == 3 and t.shape[1] == (Ht*Wt + 1):
            t = t[:, 1:, :]
        return t, Ht, Wt

    def forward(self, x: torch.Tensor):
        """
        x: [B,3,H,W] in [0,1] or normalized — up to your preprocessing.
        """
        tokens, Ht, Wt = self._get_patch_tokens(x)   # [B, N, C]
        B, N, C = tokens.shape
        # reshape tokens -> feature map
        feat = tokens.transpose(1, 2).contiguous().view(B, C, Ht, Wt)  # [B,C,H/16,W/16]
        # print("feat_shape",feat.shape)
        return {"0": feat}


In [5]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

def create_model(dino_model, num_classes: int, image_size: int = 800, freeze_backbone: bool = True):
    backbone = DinoV3BackboneWrapper(dino_model, patch_stride=16)

    # Freeze backbone if requested
    if freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False

    anchor_generator = AnchorGenerator(
        sizes=((16, 32, 64, 128, 256),), 
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        min_size=image_size,
        max_size=image_size,
    )
    return model


In [6]:
device=torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
model=create_model(model, num_classes=4, image_size=800)
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=800, mode='bilinear')
  )
  (backbone): DinoV3BackboneWrapper(
    (dino): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 4096, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (rope_embed): RopePositionEmbedding()
      (blocks): ModuleList(
        (0-39): 40 x SelfAttentionBlock(
          (norm1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (attn): SelfAttention(
            (qkv): LinearKMaskedBias(in_features=4096, out_features=12288, bias=False)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=4096, out_features=4096, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (norm2): LayerNorm((4096,), eps=1e-05, elementwise_affin

In [7]:
# -------------------------
# Train / Validate
# -------------------------
def train_one_epoch(model, optimizer, data_loader, device):
    model.train()
    print(model)
    total_loss = 0.0
    steps = 0
    for images, targets in tqdm(data_loader, desc="Training"):
        if images is None:
            continue
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        losses.backward()
        optimizer.step()

        total_loss += losses.item()
        steps += 1

    return total_loss / max(1, steps)


In [8]:
import torch
from tqdm import tqdm
from torchmetrics.detection import MeanAveragePrecision

@torch.no_grad()
def validate(model, data_loader, device):
    """
    Run evaluation on a detection model with TorchMetrics mAP.

    Args:
        model: detection model (torchvision style)
        data_loader: DataLoader yielding (images, targets)
        device: torch.device("cuda") or torch.device("cpu")

    Returns:
        map_all: mAP@[0.50:0.95] averaged over IoU thresholds
        map_50:  mAP@0.50 (IoU=0.50)
    """
    model.eval()
    metric = MeanAveragePrecision(box_format="xyxy", iou_type="bbox", class_metrics=False)

    for images, targets in tqdm(data_loader, desc="Validation"):
        if images is None:
            continue
        # move to device
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # forward
        outputs = model(images)

        # move outputs and targets back to CPU for metric
        outputs = [{k: v.detach().cpu() for k, v in o.items()} for o in outputs]
        targets = [{k: v.detach().cpu() for k, v in t.items()} for t in targets]

        # update metric
        metric.update(outputs, targets)

    res = metric.compute()
    map_all = res.get("map", torch.tensor(0.)).item()      # mAP@[.5:.95]
    map_50  = res.get("map_50", torch.tensor(0.)).item()   # mAP@0.50

    return map_all, map_50


In [9]:
train_dataset = BrickKilnDataset(root="/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/sentinelkilndb_bechmarking_data/train", split='train', input_size=224)
val_dataset = BrickKilnDataset(root="/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/sentinelkilndb_bechmarking_data/val", split='val', input_size=224)
train_loader= DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=8,pin_memory=True)
test_loader= DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=8,pin_memory=True)


Verify train data: 100%|██████████| 71856/71856 [00:02<00:00, 34280.50it/s]
Verify val data: 100%|██████████| 23952/23952 [00:00<00:00, 33730.69it/s]


In [None]:
import os
import torch

# ---- Learning rates / hyperparameters ----
backbone_lr = 1e-5      # smaller LR for backbone
head_lr = 1e-4          # larger LR for heads
weight_decay = 0.04
num_epochs = 20

# ---- Param groups: split backbone vs heads ----
backbone_params, head_params = [], []
for name, p in model.named_parameters():
    if not p.requires_grad:
        continue
    if name.startswith("backbone.dino"):
        backbone_params.append(p)
    else:
        head_params.append(p)

optimizer = torch.optim.AdamW([
    {"params": backbone_params, "lr": backbone_lr},
    {"params": head_params, "lr": head_lr},
], weight_decay=weight_decay)

lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=num_epochs
)

# -------------------------
# Training loop with checkpointing
# -------------------------
os.makedirs("checkpoints", exist_ok=True)

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, optimizer, train_loader, device)
    lr_scheduler.step()
    val_map, val_map50 = validate(model, test_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {train_loss:.4f} - "
          f"Val mAP: {val_map:.4f} - "
          f"Val mAP50: {val_map50:.4f}")

    # ---- Save checkpoint ----
    ckpt_path = f"checkpoints_dino_7b/dino_frcnn_epoch{epoch+1:02d}.pth"
    torch.save({
        "epoch": epoch + 1,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": lr_scheduler.state_dict(),
        "train_loss": train_loss,
        "val_map": val_map,
        "val_map50": val_map50,
    }, ckpt_path)
    print(f"✅ Saved checkpoint: {ckpt_path}")


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=800, mode='bilinear')
  )
  (backbone): DinoV3BackboneWrapper(
    (dino): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 4096, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (rope_embed): RopePositionEmbedding()
      (blocks): ModuleList(
        (0-39): 40 x SelfAttentionBlock(
          (norm1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (attn): SelfAttention(
            (qkv): LinearKMaskedBias(in_features=4096, out_features=12288, bias=False)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=4096, out_features=4096, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (norm2): LayerNorm((4096,), eps=1e-05, elementwise_affin

Training:  79%|███████▊  | 1161/1476 [12:42:51<3:26:53, 39.41s/it]