In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms.v2 as T
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.tv_tensors import Image as TVImage, BoundingBoxes as TVBoxes
from PIL import Image as PILImage
import os
import matplotlib.pyplot as plt
# This is for the progress bar.
from tqdm import tqdm
import seaborn as sns
from pathlib import Path
import numpy as np
import pandas as pd
from collections import defaultdict
import argparse

#torch.device('cuda')

In [2]:
# faster RCNN resnet50
def _largest_ndarray(obj):
    if isinstance(obj, dict):
        for k in ("image","img","array","data","X","arr"):
            if k in obj and isinstance(obj[k], np.ndarray):
                return obj[k]
        cands = [v for v in obj.values() if isinstance(v, np.ndarray)]
        return max(cands, key=lambda a: a.size)
    if isinstance(obj, (list, tuple)):
        cands = [v for v in obj if isinstance(v, np.ndarray)]
        return max(cands, key=lambda a: a.size)
    return obj

def load_image_hwc_uint8(npy_path: str) -> np.ndarray:
    """
    假设 .npy 存的是 HWC, ndim=3。
    - 若 dtype 非 uint8, 自动判断 0~1 / -1~1 并映射到 0~255
    - 若为单通道或带 alpha, 自动处理到 3 通道 RGB
    """
    a = np.load(npy_path, allow_pickle=False, mmap_mode="r")  # <<< enable mmap，已知是 HWC, ndim=3

    if a.dtype != np.uint8:
        a = a.astype(np.float32, copy=False)
        vmin, vmax = float(a.min()), float(a.max())
        if 0.0 <= vmin and vmax <= 1.0:
            a = (a * 255.0).round()
        elif -1.0 <= vmin and vmax <= 1.0:
            a = ((a + 1.0) * 0.5 * 255.0).round()
        # 否则认为已经接近 0..255，不再缩放
        a = np.clip(a, 0, 255).astype(np.uint8)

    # 通道处理：1通道 -> 3通道；4通道 -> 丢 alpha
    c = a.shape[2]
    if c == 1:
        a = np.repeat(a, 3, axis=2)
    elif c == 4:
        a = a[..., :3]
    return a  # HWC uint8(3)

def sanitize_boxes_and_labels(boxes: torch.Tensor,
                              labels: torch.Tensor,
                              H: int, W: int,
                              min_size: float = 1.0):
    """
    boxes: Tensor [N,4] (xyxy, float32)
    labels: Tensor [N]  (int64)
    将 boxes 限制在 [0,W]x[0,H]，并过滤宽/高 <= min_size 的框。
    返回过滤后的 (boxes, labels)；允许返回 N==0。
    """
    if boxes.numel() == 0:
        return boxes.reshape(0,4).float(), labels.reshape(0).long()

    boxes = boxes.clone()
    # clip 到图像内
    boxes[:, 0::2] = boxes[:, 0::2].clamp(0, float(W))
    boxes[:, 1::2] = boxes[:, 1::2].clamp(0, float(H))

    # 计算宽高
    ws = boxes[:, 2] - boxes[:, 0]
    hs = boxes[:, 3] - boxes[:, 1]
    keep = (ws > min_size) & (hs > min_size)

    if keep.sum() == 0:
        # 返回空目标，torchvision 支持空 GT
        return boxes.new_zeros((0,4)), labels.new_zeros((0,), dtype=torch.long)

    return boxes[keep].float(), labels[keep].long()

class NpyDetDataset(Dataset):
    """
    返回：
      image: FloatTensor [3,H,W] (0..1)
      target: dict(boxes: [N,4] xyxy, labels: [N], image_id, size=[H,W])
    """
    def __init__(self, base_dir, split="train", transform=None):
        base = Path(base_dir)
        self.img_dir = base / split / "images"
        self.lbl_dir = base / split / "labels"
        self.stems = sorted([p.stem for p in self.lbl_dir.glob("*.csv")])
        self.transform = transform

        # 一次性把所有 CSV 读到内存，避免训练期I/O瓶颈
        self.ann = {s: pd.read_csv(self.lbl_dir / f"{s}.csv") for s in self.stems}

    def __len__(self):
        return len(self.stems)

    def __getitem__(self, idx):
        stem = self.stems[idx]
        img_path = self.img_dir / f"{stem}.npy"
        img = load_image_hwc_uint8(img_path)
        h, w = img.shape[:2]

        df = self.ann[stem].copy()
        x_c = df["x_center"].astype("float32").to_numpy()
        y_c = df["y_center"].astype("float32").to_numpy()
        bw  = df["width"].astype("float32").to_numpy()
        bh  = df["height"].astype("float32").to_numpy()

        # 自动识别是否为归一化
        if x_c.max() <= 1.0 and y_c.max() <= 1.0 and bw.max() <= 1.0 and bh.max() <= 1.0:
            x_c *= w; y_c *= h; bw *= w; bh *= h

        x1 = x_c - bw/2; y1 = y_c - bh/2
        x2 = x_c + bw/2; y2 = y_c + bh/2
        boxes = np.stack([x1,y1,x2,y2], axis=1)
        boxes[:, [0,2]] = boxes[:, [0,2]].clip(0, w)
        boxes[:, [1,3]] = boxes[:, [1,3]].clip(0, h)

        labels = df["class"].astype("int64").to_numpy()

        if not img.flags.writeable or not img.flags['C_CONTIGUOUS']:
            img = np.ascontiguousarray(img)   # 或 img = img.copy()

        img_t = torch.from_numpy(img).permute(2,0,1).to(torch.float32).div_(255.0)

        
        target = {
            "boxes":  torch.from_numpy(boxes).float(),
            "labels": torch.from_numpy(labels).long(),
            "image_id": torch.tensor([idx]),
            "size": torch.tensor([h,w])
        }

        if self.transform is not None:
            # v2 需要 tv_tensors 包装，才能自动同步几何变换
            img_tv    = TVImage(img_t)  # Tensor子类，保留元信息
            bboxes_tv = TVBoxes(target["boxes"], format="XYXY", canvas_size=(h, w))
            tv_target = {"boxes": bboxes_tv, "labels": target["labels"]}
    
            img_tv, tv_target = self.transform(img_tv, tv_target)
    
            # 更新尺寸与目标
            _, H2, W2 = img_tv.shape
            boxes2  = tv_target["boxes"]
            if hasattr(boxes2, "as_tensor"):  # tv_tensors/datapoints
                boxes2 = boxes2.as_tensor()
            labels2 = tv_target["labels"]
            
            # 再清洗一次（裁剪/缩放后可能产生 0 宽/高）
            boxes2, labels2 = sanitize_boxes_and_labels(boxes2, labels2, H=H2, W=W2, min_size=1.0)
            
            img_t = img_tv
            target = {
                "boxes": boxes2,
                "labels": tv_target["labels"],
                "image_id": torch.tensor([idx]),
                "size": torch.tensor([img_t.shape[-2], img_t.shape[-1]]),
            }

        return img_t, target

def det_collate_fn(batch):
    imgs, targets = list(zip(*batch))
    return list(imgs), list(targets)

def build_model(
    num_classes,
    small_object=False,
    # 推理后处理（eval 时生效）
    score_thresh=0.30,     # 统一分类阈值（你也可以用外部 per-class 过滤替代），默认是很低的 0.05 左右
    nms_thresh=0.4,       # NMS IoU 从 0.5 降到 0.45 或 0.4 试试
    max_dets=100,          # 每图最多保留多少个检测 或 100, 视任务而定
    # 训练采样/匹配（让负样本多一些，减少 FP）
    roi_batch_size=1024,
    roi_pos_fraction=0.10,
    roi_fg_iou=0.60,
    roi_bg_iou=0.40,
    # RPN Top-N（可略优化 proposal 质量）
    rpn_pre_topn_train=2000,
    rpn_pre_topn_test=1000,
    rpn_post_topn_train=1000,
    rpn_post_topn_test=500,
):
    # 预训练 Faster R-CNN + ResNet50-FPN
    model = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT", weights_backbone="DEFAULT")

    # 可选：更小的 anchors，利于小目标
    if small_object:
        # 对 FPN 的 5 个特征层分别设定 anchor 尺寸
        anchor_sizes = ((16,), (32,), (64,), (128,), (256,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        model.rpn.anchor_generator = AnchorGenerator(sizes=anchor_sizes,
                                                     aspect_ratios=aspect_ratios)

    # 换分类头（num_classes 包含 background）

    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # ===== 推理后处理（仅 eval 用到）=====
    rh = model.roi_heads
    # 不同版本可能叫 score_thresh 或 box_score_thresh；都设置以兼容
    if hasattr(rh, "score_thresh"):       rh.score_thresh = score_thresh
    if hasattr(rh, "box_score_thresh"):   rh.box_score_thresh = score_thresh
    if hasattr(rh, "nms_thresh"):         rh.nms_thresh = nms_thresh
    if hasattr(rh, "box_nms_thresh"):     rh.box_nms_thresh = nms_thresh
    if hasattr(rh, "detections_per_img"): rh.detections_per_img = max_dets

    # ===== 训练采样/匹配（影响 FP/学习难度）=====
    if hasattr(rh, "box_batch_size_per_image"): rh.box_batch_size_per_image = roi_batch_size
    if hasattr(rh, "positive_fraction"):        rh.positive_fraction        = roi_pos_fraction
    if hasattr(rh, "box_fg_iou_thresh"):        rh.box_fg_iou_thresh        = roi_fg_iou
    if hasattr(rh, "box_bg_iou_thresh"):        rh.box_bg_iou_thresh        = roi_bg_iou

    # ===== RPN Top-N（略提高 proposal 质量/数量）=====
    rpn = model.rpn
    if hasattr(rpn, "pre_nms_top_n") and isinstance(rpn.pre_nms_top_n, dict):
        rpn.pre_nms_top_n["training"] = rpn_pre_topn_train
        rpn.pre_nms_top_n["testing"]  = rpn_pre_topn_test
    if hasattr(rpn, "post_nms_top_n") and isinstance(rpn.post_nms_top_n, dict):
        rpn.post_nms_top_n["training"] = rpn_post_topn_train
        rpn.post_nms_top_n["testing"]  = rpn_post_topn_test
        
    return model

def box_iou_np(boxes1, boxes2):
    """boxes: [N,4] & [M,4] in xyxy -> IoU [N,M]"""
    if len(boxes1)==0 or len(boxes2)==0:
        return np.zeros((len(boxes1), len(boxes2)), dtype=np.float32)
    x11,y11,x12,y12 = boxes1[:,0],boxes1[:,1],boxes1[:,2],boxes1[:,3]
    x21,y21,x22,y22 = boxes2[:,0],boxes2[:,1],boxes2[:,2],boxes2[:,3]
    inter_w = np.maximum(0, np.minimum(x12, x22) - np.maximum(x11, x21))
    inter_h = np.maximum(0, np.minimum(y12, y22) - np.maximum(y11, y21))
    inter = inter_w * inter_h
    area1 = np.maximum(0, x12-x11) * np.maximum(0, y12-y11)
    area2 = np.maximum(0, x22-x21) * np.maximum(0, y22-y21)
    union = area1[:,None] + area2[None,:] - inter
    return inter / np.clip(union, 1e-6, None)

def train_one_epoch(model, loader, optimizer, device, scaler=None):
    model.train()
    loss_sum = 0.0
    for imgs, targets in loader:
        imgs = [img.to(device) for img in imgs]
        targets = [{k: v.to(device) for k,v in t.items()} for t in targets]

        optimizer.zero_grad(set_to_none=True)
        if scaler:
            with torch.autocast(device_type=device, dtype=torch.float16 if device=='cuda' else torch.bfloat16):
                loss_dict = model(imgs, targets)
                losses = sum(loss_dict.values())
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss_dict = model(imgs, targets)
            losses = sum(loss_dict.values())
            losses.backward()
            optimizer.step()

        loss_sum += losses.item()
    return loss_sum / max(1, len(loader))

BLOCKED = {3, 8, 12, 24, 26}  
CLASS_THR = { }
def filter_predictions(preds, blocked=None, class_thresholds=None, default_thresh=0.30):
    """
    preds: list of dicts from torchvision (boxes, labels, scores)
    blocked: 需要完全屏蔽的类别
    class_thresholds: 可选，dict{cls:thr} 每类不同分数阈值
    """
    blocked = set(blocked or [])
    out = []
    for p in preds:
        labels, scores = p["labels"], p["scores"]
        keep = scores >= default_thresh
        if blocked:
            for c in blocked: keep &= (labels != c)
        if class_thresholds:
            thr = torch.full_like(scores, default_thresh)
            for c,t in class_thresholds.items():
                thr = torch.where(labels==c, torch.as_tensor(t, device=scores.device, dtype=scores.dtype), thr)
            keep &= (scores >= thr)
        out.append({k: v[keep] for k,v in p.items() if k in ("boxes","labels","scores")})
    return out

@torch.no_grad()
def evaluate_loss_and_pr(
    model,
    loader,
    device,
    num_classes: int,
    iou_thresh: float = 0.5,
    score_thresh: float = 0.05,
):
    """
    适用于 torchvision 检测模型（Faster R-CNN 等）
    - 假设类别 id 为 0..num_classes-1（无“背景类”标签）
    - 返回：{'loss', 'TP','FP','FN','precision','recall'}
    """
    was_training = model.training

    loss_sum = 0.0
    n_loss_batches = 0
    TP = np.zeros(num_classes, dtype=np.int64)
    FP = np.zeros(num_classes, dtype=np.int64)
    FN = np.zeros(num_classes, dtype=np.int64)

    for imgs, targets in loader:
        imgs_dev = [im.to(device) for im in imgs]
        targets_dev = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # ---- 1) 计算验证 loss（需 train() 才会返回 loss dict）----
        model.train()
        loss_dict = model(imgs_dev, targets_dev)
        loss_val = sum(loss_dict.values()).item()
        loss_sum += loss_val
        n_loss_batches += 1

        # ---- 2) 做预测并统计 per-class PR（需 eval()）----
        model.eval()
        raw_preds = model(imgs_dev)
        preds = filter_predictions(raw_preds, blocked=BLOCKED, class_thresholds=CLASS_THR, default_thresh=0.30)
        
        for pred, tgt in zip(preds, targets_dev):
            pb = pred["boxes"].detach().cpu().numpy()
            pl = pred["labels"].detach().cpu().numpy()
            ps = pred["scores"].detach().cpu().numpy()

            tb = tgt["boxes"].detach().cpu().numpy()
            tl = tgt["labels"].detach().cpu().numpy()

            # 按类别逐一匹配
            for c in range(num_classes):
                pb_c = pb[pl == c]
                ps_c = ps[pl == c]
                tb_c = tb[tl == c]

                if len(pb_c) == 0 and len(tb_c) == 0:
                    continue
                if len(tb_c) == 0:
                    FP[c] += len(pb_c)
                    continue
                if len(pb_c) == 0:
                    FN[c] += len(tb_c)
                    continue

                order = np.argsort(-ps_c)  # 先匹配高分框
                pb_c = pb_c[order]
                iou = box_iou_np(pb_c, tb_c)

                matched = set()
                for i in range(len(pb_c)):
                    j = int(np.argmax(iou[i]))
                    if iou[i, j] >= iou_thresh and j not in matched:
                        TP[c] += 1
                        matched.add(j)
                    else:
                        FP[c] += 1
                FN[c] += (len(tb_c) - len(matched))

    # 恢复原模式
    model.train() if was_training else model.eval()

    precision = TP / np.clip(TP + FP, 1, None)
    recall    = TP / np.clip(TP + FN, 1, None)
    mean_loss = loss_sum / max(1, n_loss_batches)

    return {
        "loss": mean_loss,
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
    }


In [23]:
#main function

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", default="/kaggle/input/electrical-component/dataset/", help="root directory")
    ap.add_argument("--bs", type=int, default=2)
    ap.add_argument("--epochs", type=int, default=10)
    ap.add_argument("--lr", type=float, default=1e-4)
    ap.add_argument("--small-object", action="store_true", help="smaller anchors")
    args = ap.parse_args([]) # Pass an empty list to avoid parsing notebook arguments
    num_cpus = os.cpu_count() or 2          # Kaggle 常见是 2
    sizes = [640, 704, 768, 832, 896]
    
    train_tf = T.Compose([
        T.RandomHorizontalFlip(0.5),
        T.RandomVerticalFlip(0.2),
        T.RandomIoUCrop(
            min_scale=0.5, max_scale=1.0,  # 提高 min_scale 降低裁剪难度
            sampler_options=[0.3, 0.7],    # 精简采样档位
        ),
        # T.RandomIoUCrop(min_scale=0.3, max_scale=1.0,
        #                 min_aspect_ratio=0.75, max_aspect_ratio=1.33,
        #                 sampler_options=[0.1, 0.3, 0.5, 0.7, 0.9]),
        T.RandomChoice([T.Resize((s, s)) for s in sizes]),  # ← 多尺度随机
        T.ColorJitter(0.2, 0.2, 0.2, 0.05),
        # 如果你这里也报错说没有 SanitizeBoundingBoxes，就先删掉这一行或看方案 B 的 fallback
        # T.SanitizeBoundingBoxes(min_size=2),
    ])

    test_tf = None  # 验证/测试通常不做随机增广（可以只做 Resize/Normalize 等）
    
    train_set = NpyDetDataset(args.base, split="train", transform=train_tf)
    test_set  = NpyDetDataset(args.base, split="test",  transform=test_tf)

    train_loader = DataLoader(train_set, batch_size=args.bs, shuffle=True,
                              num_workers=2, pin_memory=True, pin_memory_device="cuda",
                                prefetch_factor=4,                  # 默认 2，适当加大
                                persistent_workers=True,            # 复用 worker，跨 epoch 不重启
                                collate_fn=det_collate_fn)
    test_loader  = DataLoader(test_set,  batch_size=args.bs, shuffle=False,
                              num_workers=1, pin_memory=True, pin_memory_device="cuda",
                                prefetch_factor=2,                  # 默认 2，适当加大
                                persistent_workers=True,            # 复用 worker，跨 epoch 不重启
                                collate_fn=det_collate_fn)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = build_model(32, small_object=args.small_object).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
    scaler = torch.amp.GradScaler('cuda', enabled=(device=="cuda"))

    best_val = float("inf")
    for epoch in range(1, args.epochs + 1):
        tr = train_one_epoch(model, train_loader, optimizer, device, scaler)
        va = evaluate_loss_and_pr(model, test_loader, device, num_classes=32,      # 你的类别数
        iou_thresh=0.5,      # PR 的 IoU 阈值
        score_thresh=0.2     # 置信度过滤
)
        print(f"Epoch {epoch:02d} | train loss {tr:.4f} | val loss {va['loss']:.4f}")
        for c in range(32):
          print(f"class {c:02d} | P={va['precision'][c]:.3f} "
          f"R={va['recall'][c]:.3f}  TP={va['TP'][c]} FP={va['FP'][c]} FN={va['FN'][c]}")
        if va['loss'] < best_val:
            best_val = va['loss']
            torch.save(model.state_dict(), "best_frcnn_resnet50fpn.pt")
            print("  (saved best)")

main() # Call the main function directly

  img_t = torch.from_numpy(img).permute(2,0,1).to(torch.float32).div_(255.0)
  img_t = torch.from_numpy(img).permute(2,0,1).to(torch.float32).div_(255.0)


KeyboardInterrupt: 