# Class-agnostic Detector Training (Faster R-CNN R50-FPN)

Plan:
- Use torchvision Faster R-CNN ResNet50-FPN, 1-class (glyph) + background.
- Model handles resizing internally; set min_size=1333, max_size~2000; AMP on; bs=2 if fits.
- Data: train.csv (unicode x y w h); folds from folds_group.csv; images in train_images/.
- Train a 1-epoch smoke test on fold 0 (or capped steps) with logging and checkpoint.
- Next: extend to 6–8 epochs and add OOF eval + threshold sweep.

In [2]:
# One-class Faster R-CNN training: 1-epoch smoke test (fold 0)
import os, sys, time, math, random, gc, json
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torchvision
from torchvision.ops import box_convert

torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Torch:', torch.__version__, 'CUDA:', torch.cuda.is_available(), 'Device:', device)
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(0))

CWD = Path('.')
train_csv = CWD / 'train.csv'
folds_csv = CWD / 'folds_group.csv'
train_dir = CWD / 'train_images'
assert train_csv.exists(), 'train.csv missing'
assert folds_csv.exists(), 'folds_group.csv missing'
assert train_dir.exists(), 'train_images dir missing'

df_train = pd.read_csv(train_csv)
df_folds = pd.read_csv(folds_csv)

# Parse labels: quintuplets unicode x y w h
def parse_labels_full(labels: str):
    if not isinstance(labels, str) or labels.strip() == '':
        return []
    toks = labels.strip().split()
    out = []
    if len(toks) % 5 != 0:
        return out
    for i in range(0, len(toks), 5):
        u, x, y, w, h = toks[i:i+5]
        try:
            x = int(x); y = int(y); w = int(w); h = int(h)
            out.append((u, x, y, w, h))
        except Exception:
            continue
    return out

class KuzDetDataset(Dataset):
    def __init__(self, df_train: pd.DataFrame, df_folds: pd.DataFrame, fold: int, split: str='train'):
        self.split = split
        folds_map = dict(df_folds.values)
        self.items = []
        for r in df_train.itertuples(index=False):
            image_id = getattr(r, 'image_id') if hasattr(r, 'image_id') else r[0]
            labels = getattr(r, 'labels') if hasattr(r, 'labels') else r[1]
            f = folds_map.get(image_id, None)
            if f is None:
                continue
            if (split == 'train' and f != fold) or (split == 'val' and f == fold):
                boxes = parse_labels_full(labels)
                self.items.append((image_id, boxes))
        print(f'{split} items:', len(self.items))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        image_id, boxes = self.items[idx]
        img_path = train_dir / f'{image_id}.jpg'
        if not img_path.exists():
            # fallback to png if any
            png = train_dir / f'{image_id}.png'
            if png.exists():
                img_path = png
        img = Image.open(img_path).convert('RGB')
        w0, h0 = img.size
        # Build targets
        if len(boxes) > 0:
            xyxy = []
            for (_, x, y, w, h) in boxes:
                x1 = max(0, x); y1 = max(0, y); x2 = min(w0, x + w); y2 = min(h0, y + h)
                if x2 > x1 and y2 > y1:
                    xyxy.append([x1, y1, x2, y2])
            if len(xyxy) == 0:
                xyxy = np.zeros((0, 4), dtype=np.float32)
            boxes_t = torch.as_tensor(xyxy, dtype=torch.float32)
            labels_t = torch.ones((boxes_t.shape[0],), dtype=torch.int64)  # class-agnostic: 1
            area_t = (boxes_t[:, 2] - boxes_t[:, 0]) * (boxes_t[:, 3] - boxes_t[:, 1]) if boxes_t.numel() else torch.zeros((0,), dtype=torch.float32)
        else:
            boxes_t = torch.zeros((0, 4), dtype=torch.float32)
            labels_t = torch.zeros((0,), dtype=torch.int64)
            area_t = torch.zeros((0,), dtype=torch.float32)
        target = {
            'boxes': boxes_t,
            'labels': labels_t,
            'image_id': torch.tensor([idx]),
            'area': area_t,
            'iscrowd': torch.zeros((labels_t.shape[0],), dtype=torch.int64)
        }
        return torchvision.transforms.functional.pil_to_tensor(img).float() / 255.0, target, image_id

def collate_fn(batch):
    imgs, targets, ids = list(zip(*batch))
    return list(imgs), list(targets), list(ids)

fold = 0
batch_size = 2
num_workers = min(4, os.cpu_count() or 2)
train_ds = KuzDetDataset(df_train, df_folds, fold=fold, split='train')
val_ds = KuzDetDataset(df_train, df_folds, fold=fold, split='val')
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=collate_fn, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=collate_fn, pin_memory=True)

# Model
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)
# Replace head for 2 classes (background + glyph)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=2)
# Resize settings
model.transform.min_size = (1333,)
model.transform.max_size = 2000
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1], gamma=0.1)
scaler = GradScaler(enabled=(device.type=='cuda'))

def train_one_epoch(model, loader, optimizer, scaler, epoch, max_steps=None):
    model.train()
    running = 0.0
    t0 = time.time()
    last_log = t0
    steps = 0
    for i, (imgs, targets, ids) in enumerate(loader):
        imgs = [im.to(device, non_blocking=True) for im in imgs]
        tgts = [{k: v.to(device) if torch.is_tensor(v) else v for k, v in t.items()} for t in targets]
        optimizer.zero_grad(set_to_none=True)
        with autocast(enabled=(device.type=='cuda')):
            loss_dict = model(imgs, tgts)
            loss = sum(loss for loss in loss_dict.values())
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running += loss.item()
        steps += 1
        if (time.time() - last_log) > 5:
            mem = torch.cuda.memory_allocated()/1024**3 if device.type=='cuda' else 0.0
            print(f'Epoch {epoch} Iter {i} loss {loss.item():.3f} avg {running/steps:.3f} mem {mem:.2f}GB elapsed {time.time()-t0:.1f}s', flush=True)
            last_log = time.time()
        if max_steps and steps >= max_steps:
            break
    dt = time.time() - t0
    print(f'Epoch {epoch} done: {steps} steps, avg loss {running/max(1,steps):.4f}, time {dt:.1f}s')

# Smoke test: 1 epoch, cap steps
max_steps = 200
epochs = 1
for ep in range(1, epochs+1):
    train_one_epoch(model, train_loader, optimizer, scaler, ep, max_steps=max_steps)
    try:
        lr_scheduler.step()
    except Exception:
        pass

# Save checkpoint
ckpt_path = Path(f'detector_frcnn_r50_1cls_fold{fold}_ep{epochs}.pth')
torch.save({'model': model.state_dict(), 'epoch': epochs}, ckpt_path)
print('Saved checkpoint to', ckpt_path.resolve())

Torch: 2.4.1+cu121 CUDA: True Device: cuda
GPU: NVIDIA A10-24Q


train items: 2595
val items: 649


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /app/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

 33%|███▎      | 52.0M/160M [00:00<00:00, 545MB/s]

 67%|██████▋   | 108M/160M [00:00<00:00, 568MB/s] 

100%|██████████| 160M/160M [00:00<00:00, 570MB/s]




  scaler = GradScaler(enabled=(device.type=='cuda'))


  with autocast(enabled=(device.type=='cuda')):


Epoch 1 Iter 1 loss 5.403 avg 5.491 mem 0.64GB elapsed 6.4s


Epoch 1 Iter 4 loss 1.507 avg 3.127 mem 0.69GB elapsed 11.9s


Epoch 1 Iter 12 loss 1.485 avg 2.140 mem 0.62GB elapsed 16.9s


Epoch 1 Iter 24 loss 1.121 avg 1.708 mem 0.62GB elapsed 23.6s


Epoch 1 Iter 37 loss 0.874 avg 1.450 mem 0.63GB elapsed 29.0s


Epoch 1 Iter 51 loss 1.088 avg 1.294 mem 0.72GB elapsed 34.3s


Epoch 1 Iter 64 loss 0.685 avg 1.207 mem 0.62GB elapsed 39.6s


Epoch 1 Iter 78 loss 0.633 avg 1.130 mem 0.62GB elapsed 44.9s


Epoch 1 Iter 91 loss 0.807 avg 1.070 mem 0.74GB elapsed 50.0s


Epoch 1 Iter 105 loss 0.634 avg 1.026 mem 0.60GB elapsed 55.2s


Epoch 1 Iter 119 loss 0.603 avg 0.978 mem 0.62GB elapsed 60.5s


Epoch 1 Iter 132 loss 0.711 avg 0.947 mem 0.70GB elapsed 65.5s


Epoch 1 Iter 146 loss 0.479 avg 0.910 mem 0.65GB elapsed 70.9s


Epoch 1 Iter 154 loss 0.587 avg 0.896 mem 0.67GB elapsed 76.2s


Epoch 1 Iter 167 loss 0.455 avg 0.870 mem 0.70GB elapsed 81.2s


Epoch 1 Iter 179 loss 0.720 avg 0.850 mem 0.61GB elapsed 86.3s


Epoch 1 Iter 192 loss 0.513 avg 0.828 mem 0.62GB elapsed 91.4s


Epoch 1 done: 200 steps, avg loss 0.8223, time 94.2s
Saved checkpoint to /var/lib/simon/agent_run_states/kuzushiji-recognition-20250929-180012/detector_frcnn_r50_1cls_fold0_ep1.pth


In [4]:
# Inference: generate detections for val fold and test; save to parquet for recognizer
import torch
import pandas as pd
from tqdm import tqdm

model.eval()
# Raise detector caps as per expert advice
try:
    model.roi_heads.detections_per_img = 2000
    model.rpn.pre_nms_top_n_test = 12000
    model.rpn.post_nms_top_n_test = 6000
    print('Applied raised detection caps: detections_per_img=2000, pre_nms_top_n_test=12000, post_nms_top_n_test=6000')
except Exception as e:
    print('Warning: could not set raised caps:', e)

# Use dense threshold; filter later during sweeps
score_thresh = 0.01

def run_inference(loader, desc):
    preds = []
    t0 = time.time()
    with torch.no_grad():
        for imgs, targets, ids in tqdm(loader, desc=desc, total=len(loader), mininterval=1.0):
            imgs = [im.to(device) for im in imgs]
            outputs = model(imgs)
            for out, image_id in zip(outputs, ids):
                boxes = out['boxes'].detach().cpu().numpy() if out is not None and 'boxes' in out else np.zeros((0,4),dtype=np.float32)
                scores = out['scores'].detach().cpu().numpy() if out is not None and 'scores' in out else np.zeros((0,),dtype=np.float32)
                keep = scores >= score_thresh
                boxes = boxes[keep]
                scores = scores[keep]
                for (x1,y1,x2,y2), s in zip(boxes, scores):
                    w = max(0.0, x2 - x1); h = max(0.0, y2 - y1)
                    preds.append((image_id, float(x1), float(y1), float(w), float(h), float(s)))
    dt = time.time() - t0
    print(f'{desc} done in {dt:.1f}s; total preds: {len(preds)}')
    return pd.DataFrame(preds, columns=['image_id','x','y','w','h','score'])

# Val predictions (fold 0 only) to oof parquet
val_preds = run_inference(val_loader, desc='Val infer fold0')
val_preds.to_parquet(f'det_oof_fold{fold}.parquet', index=False)
print('Saved val preds to', f'det_oof_fold{fold}.parquet')

# Test loader
class TestDataset(Dataset):
    def __init__(self, img_dir: Path):
        self.imgs = []
        for p in sorted(img_dir.iterdir()):
            if p.suffix.lower() in {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}:
                self.imgs.append(p)
        print('Test images:', len(self.imgs))
    def __len__(self):
        return len(self.imgs)
    def __getitem__(self, idx):
        p = self.imgs[idx]
        img = Image.open(p).convert('RGB')
        image_id = p.stem
        return torchvision.transforms.functional.pil_to_tensor(img).float()/255.0, {'image_id': torch.tensor([idx])}, image_id

test_dir = CWD / 'test_images'
test_ds = TestDataset(test_dir)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=collate_fn, pin_memory=True)
test_preds = run_inference(test_loader, desc='Test infer')
test_preds.to_parquet('det_test_preds.parquet', index=False)
print('Saved test preds to det_test_preds.parquet')

# Also prepare a quick detector-only submission with placeholder unicode (will be replaced after recognizer)
df_sample = pd.read_csv('sample_submission.csv')
g = test_preds.groupby('image_id')
rows = []
for image_id in df_sample['image_id']:
    if image_id in g.groups:
        grp = g.get_group(image_id)
        # convert to centers and round
        cx = (grp['x'] + grp['w']/2.0).round().astype(int).tolist()
        cy = (grp['y'] + grp['h']/2.0).round().astype(int).tolist()
        # placeholder unicode token (will be replaced later by recognizer), keep minimal to validate format
        toks = []
        for x_, y_ in zip(cx, cy):
            toks.extend(['U+003F', str(int(x_)), str(int(y_))])
        rows.append(' '.join(toks))
    else:
        rows.append('')
sub = pd.DataFrame({'image_id': df_sample['image_id'], 'labels': rows})
sub.to_csv('submission_detector_only.csv', index=False)
print('Wrote submission_detector_only.csv (placeholder unicodes). Will replace after recognizer.')

Applied raised detection caps: detections_per_img=2000, pre_nms_top_n_test=12000, post_nms_top_n_test=6000


Val infer fold0:   0%|          | 0/649 [00:00<?, ?it/s]

Val infer fold0:   1%|▏         | 9/649 [00:01<01:13,  8.67it/s]

Val infer fold0:   3%|▎         | 21/649 [00:02<01:02, 10.09it/s]

Val infer fold0:   5%|▌         | 33/649 [00:03<00:58, 10.56it/s]

Val infer fold0:   7%|▋         | 45/649 [00:04<00:56, 10.78it/s]

Val infer fold0:  10%|█         | 68/649 [00:06<00:53, 10.92it/s]

Val infer fold0:  12%|█▏        | 79/649 [00:07<00:52, 10.93it/s]

Val infer fold0:  14%|█▍        | 90/649 [00:08<00:51, 10.94it/s]

Val infer fold0:  16%|█▌        | 102/649 [00:09<00:49, 10.97it/s]

Val infer fold0:  17%|█▋        | 113/649 [00:10<00:49, 10.93it/s]

Val infer fold0:  19%|█▉        | 124/649 [00:11<00:48, 10.93it/s]

Val infer fold0:  21%|██        | 136/649 [00:12<00:46, 10.99it/s]

Val infer fold0:  23%|██▎       | 148/649 [00:13<00:45, 11.03it/s]

Val infer fold0:  25%|██▍       | 160/649 [00:14<00:44, 11.02it/s]

Val infer fold0:  27%|██▋       | 172/649 [00:15<00:43, 11.07it/s]

Val infer fold0:  28%|██▊       | 184/649 [00:16<00:42, 11.04it/s]

Val infer fold0:  30%|███       | 196/649 [00:17<00:41, 11.05it/s]

Val infer fold0:  32%|███▏      | 208/649 [00:19<00:40, 10.99it/s]

Val infer fold0:  34%|███▎      | 219/649 [00:20<00:39, 10.93it/s]

Val infer fold0:  35%|███▌      | 230/649 [00:21<00:38, 10.93it/s]

Val infer fold0:  37%|███▋      | 241/649 [00:22<00:37, 10.94it/s]

Val infer fold0:  39%|███▉      | 253/649 [00:23<00:36, 10.98it/s]

Val infer fold0:  41%|████      | 264/649 [00:24<00:35, 10.96it/s]

Val infer fold0:  42%|████▏     | 275/649 [00:25<00:34, 10.96it/s]

Val infer fold0:  44%|████▍     | 286/649 [00:26<00:33, 10.95it/s]

Val infer fold0:  46%|████▌     | 297/649 [00:27<00:32, 10.96it/s]

Val infer fold0:  47%|████▋     | 308/649 [00:28<00:31, 10.90it/s]

Val infer fold0:  49%|████▉     | 319/649 [00:29<00:30, 10.90it/s]

Val infer fold0:  51%|█████     | 331/649 [00:30<00:29, 10.96it/s]

Val infer fold0:  53%|█████▎    | 342/649 [00:31<00:28, 10.92it/s]

Val infer fold0:  55%|█████▍    | 354/649 [00:32<00:26, 10.99it/s]

Val infer fold0:  56%|█████▋    | 366/649 [00:33<00:25, 11.02it/s]

Val infer fold0:  58%|█████▊    | 378/649 [00:34<00:24, 10.98it/s]

Val infer fold0:  60%|█████▉    | 389/649 [00:35<00:23, 10.95it/s]

Val infer fold0:  62%|██████▏   | 401/649 [00:36<00:22, 11.01it/s]

Val infer fold0:  64%|██████▎   | 413/649 [00:37<00:21, 10.92it/s]

Val infer fold0:  65%|██████▌   | 424/649 [00:38<00:20, 10.89it/s]

Val infer fold0:  67%|██████▋   | 435/649 [00:39<00:19, 10.90it/s]

Val infer fold0:  69%|██████▊   | 446/649 [00:40<00:18, 10.89it/s]

Val infer fold0:  70%|███████   | 457/649 [00:41<00:17, 10.90it/s]

Val infer fold0:  72%|███████▏  | 468/649 [00:42<00:16, 10.89it/s]

Val infer fold0:  74%|███████▍  | 480/649 [00:43<00:15, 10.92it/s]

Val infer fold0:  76%|███████▌  | 491/649 [00:44<00:14, 10.91it/s]

Val infer fold0:  78%|███████▊  | 503/649 [00:46<00:13, 10.95it/s]

Val infer fold0:  79%|███████▉  | 514/649 [00:47<00:12, 10.93it/s]

Val infer fold0:  81%|████████  | 525/649 [00:48<00:11, 10.90it/s]

Val infer fold0:  83%|████████▎ | 536/649 [00:49<00:10, 10.90it/s]

Val infer fold0:  84%|████████▍ | 548/649 [00:50<00:09, 10.94it/s]

Val infer fold0:  86%|████████▌ | 559/649 [00:51<00:08, 10.92it/s]

Val infer fold0:  88%|████████▊ | 570/649 [00:52<00:07, 10.93it/s]

Val infer fold0:  90%|████████▉ | 581/649 [00:53<00:06, 10.95it/s]

Val infer fold0:  91%|█████████ | 592/649 [00:54<00:05, 10.94it/s]

Val infer fold0:  93%|█████████▎| 603/649 [00:55<00:04, 10.96it/s]

Val infer fold0:  95%|█████████▍| 614/649 [00:56<00:03, 10.96it/s]

Val infer fold0:  96%|█████████▋| 625/649 [00:57<00:02, 10.93it/s]

Val infer fold0:  98%|█████████▊| 636/649 [00:58<00:01, 10.95it/s]

Val infer fold0: 100%|█████████▉| 647/649 [00:59<00:00, 10.96it/s]

Val infer fold0: 100%|██████████| 649/649 [00:59<00:00, 10.91it/s]




Val infer fold0 done in 59.5s; total preds: 134993
Saved val preds to det_oof_fold0.parquet
Test images: 361


Test infer:   0%|          | 0/361 [00:00<?, ?it/s]

Test infer:   2%|▏         | 7/361 [00:01<00:51,  6.90it/s]

Test infer:   5%|▌         | 19/361 [00:02<00:36,  9.39it/s]

Test infer:   8%|▊         | 30/361 [00:03<00:32, 10.08it/s]

Test infer:  12%|█▏        | 42/361 [00:04<00:30, 10.54it/s]

Test infer:  15%|█▍        | 54/361 [00:05<00:28, 10.80it/s]

Test infer:  18%|█▊        | 66/361 [00:06<00:27, 10.88it/s]

Test infer:  22%|██▏       | 78/361 [00:07<00:25, 10.95it/s]

Test infer:  25%|██▍       | 90/361 [00:08<00:24, 11.00it/s]

Test infer:  28%|██▊       | 102/361 [00:09<00:23, 11.03it/s]

Test infer:  32%|███▏      | 114/361 [00:10<00:22, 11.03it/s]

Test infer:  35%|███▍      | 126/361 [00:11<00:21, 11.07it/s]

Test infer:  38%|███▊      | 138/361 [00:12<00:20, 10.97it/s]

Test infer:  41%|████▏     | 149/361 [00:13<00:19, 10.91it/s]

Test infer:  45%|████▍     | 161/361 [00:14<00:18, 10.98it/s]

Test infer:  48%|████▊     | 172/361 [00:15<00:17, 10.94it/s]

Test infer:  51%|█████     | 183/361 [00:16<00:16, 10.88it/s]

Test infer:  54%|█████▍    | 195/361 [00:18<00:15, 11.01it/s]

Test infer:  57%|█████▋    | 207/361 [00:19<00:15, 10.05it/s]

Test infer:  60%|██████    | 218/361 [00:20<00:13, 10.27it/s]

Test infer:  64%|██████▍   | 231/361 [00:21<00:11, 10.92it/s]

Test infer:  67%|██████▋   | 243/361 [00:22<00:10, 10.83it/s]

Test infer:  70%|███████   | 254/361 [00:23<00:09, 10.73it/s]

Test infer:  73%|███████▎  | 265/361 [00:24<00:08, 10.75it/s]

Test infer:  76%|███████▋  | 276/361 [00:25<00:07, 10.77it/s]

Test infer:  80%|███████▉  | 287/361 [00:26<00:06, 10.79it/s]

Test infer:  83%|████████▎ | 298/361 [00:27<00:05, 10.82it/s]

Test infer:  86%|████████▌ | 310/361 [00:28<00:04, 10.90it/s]

Test infer:  89%|████████▉ | 322/361 [00:29<00:03, 11.05it/s]

Test infer:  93%|█████████▎| 334/361 [00:30<00:02, 11.16it/s]

Test infer:  96%|█████████▌| 346/361 [00:32<00:01, 11.25it/s]

Test infer:  99%|█████████▉| 358/361 [00:33<00:00, 11.31it/s]

Test infer: 100%|██████████| 361/361 [00:33<00:00, 10.82it/s]




Test infer done in 33.4s; total preds: 78959
Saved test preds to det_test_preds.parquet


Wrote submission_detector_only.csv (placeholder unicodes). Will replace after recognizer.


In [5]:
# OOF threshold sweep on fold 0 detections to maximize micro-F1 with center-distance matching
import math, json, time
import pandas as pd
import numpy as np
from pathlib import Path

CWD = Path('.')
train_csv = CWD / 'train.csv'
folds_csv = CWD / 'folds_group.csv'
oof_path = CWD / f'det_oof_fold{fold}.parquet'
assert train_csv.exists() and folds_csv.exists() and oof_path.exists(), 'Required files missing for sweep'

df_train = pd.read_csv(train_csv)
df_folds = pd.read_csv(folds_csv)
fold_map = dict(df_folds.values)

def parse_labels_full(labels: str):
    if not isinstance(labels, str) or labels.strip() == '':
        return []
    toks = labels.strip().split()
    if len(toks) % 5 != 0:
        return []
    out = []
    for i in range(0, len(toks), 5):
        u, x, y, w, h = toks[i:i+5]
        try:
            x = int(x); y = int(y); w = int(w); h = int(h)
            out.append((u, x, y, w, h))
        except:
            pass
    return out

# Build GT centers for fold 0 images
gt_rows = []
for r in df_train.itertuples(index=False):
    image_id = getattr(r, 'image_id') if hasattr(r, 'image_id') else r[0]
    if fold_map.get(image_id, None) != 0:
        continue
    labels = getattr(r, 'labels') if hasattr(r, 'labels') else r[1]
    for (_, x, y, w, h) in parse_labels_full(labels):
        cx = x + w/2.0; cy = y + h/2.0
        gt_rows.append((image_id, float(cx), float(cy)))
df_gt = pd.DataFrame(gt_rows, columns=['image_id','cx','cy'])
print('Fold0 GT points:', df_gt.shape)

# Load OOF predictions and compute centers
df_pred = pd.read_parquet(oof_path)
df_pred = df_pred.copy()
df_pred['cx'] = df_pred['x'] + df_pred['w']/2.0
df_pred['cy'] = df_pred['y'] + df_pred['h']/2.0
print('OOF preds loaded:', df_pred.shape, 'score stats:', df_pred['score'].describe().to_dict())

def eval_threshold(th, cap=None, dmax=25.0):
    # Filter by score and optional per-image cap (after filtering)
    dd = df_pred.loc[df_pred['score'] >= th, ['image_id','cx','cy','score']].copy()
    if cap is not None:
        dd = dd.sort_values(['image_id','score'], ascending=[True, False])
        dd['rn'] = dd.groupby('image_id').cumcount()
        dd = dd[dd['rn'] < cap].drop(columns=['rn'])
    # Group by image for matching
    g_gt = df_gt.groupby('image_id')
    g_pr = dd.groupby('image_id')
    TP = 0; FP = 0; FN = 0
    # Iterate over union of image ids present in GT for fold 0
    for image_id, gt_g in g_gt:
        gt_pts = gt_g[['cx','cy']].to_numpy() if len(gt_g) else np.zeros((0,2), dtype=np.float32)
        pr_g = g_pr.get_group(image_id) if image_id in g_pr.groups else None
        pr_pts = pr_g[['cx','cy','score']].to_numpy() if pr_g is not None else np.zeros((0,3), dtype=np.float32)
        # Greedy matching by score desc
        matched_gt = np.zeros((len(gt_pts),), dtype=bool) if len(gt_pts) else np.zeros((0,), dtype=bool)
        if len(pr_pts) and len(gt_pts):
            pr_ord = np.argsort(-pr_pts[:,2])
            for idx in pr_ord:
                px, py, _ = pr_pts[idx]
                # Find nearest unmatched GT within dmax
                if not len(gt_pts):
                    continue
                d2 = ((gt_pts[:,0]-px)**2 + (gt_pts[:,1]-py)**2)
                j = int(np.argmin(d2))
                if matched_gt[j]:
                    continue
                if math.sqrt(float(d2[j])) <= dmax:
                    matched_gt[j] = True
                    TP += 1
                else:
                    FP += 1
            # Remaining unmatched GTs are FN
            FN += int((~matched_gt).sum())
        else:
            # No preds -> all GT are FN; preds exist but no GT shouldn't happen on fold val
            FN += len(gt_pts)
            FP += 0 if len(pr_pts)==0 else len(pr_pts)
    prec = TP / max(1, TP+FP)
    rec = TP / max(1, TP+FN)
    f1 = 2*prec*rec / max(1e-12, (prec+rec)) if (prec+rec) > 0 else 0.0
    return dict(TP=TP, FP=FP, FN=FN, precision=prec, recall=rec, f1=f1)

thresholds = [round(x,3) for x in np.arange(0.05, 0.50, 0.05)]
caps = [1500, 2000]
dmax_list = [25.0, 30.0]
results = []
t_start = time.time()
for dmax in dmax_list:
    for cap in caps:
        best = (-1.0, None)
        print(f'-- Sweep dmax={dmax} cap={cap} --', flush=True)
        for th in thresholds:
            m = eval_threshold(th, cap=cap, dmax=dmax)
            results.append(dict(threshold=th, cap=cap, dmax=dmax, **m))
            if m['f1'] > best[0]:
                best = (m['f1'], th)
            print(f"th={th:.3f} F1={m['f1']:.4f} P={m['precision']:.4f} R={m['recall']:.4f} TP={m['TP']} FP={m['FP']} FN={m['FN']}")
        print(f'Best for dmax={dmax} cap={cap}: th={best[1]} F1={best[0]:.4f}', flush=True)
print('Sweep took', f'{time.time()-t_start:.1f}s')

df_res = pd.DataFrame(results)
df_res.to_csv('det_threshold_sweep_fold0.csv', index=False)
best_row = df_res.sort_values('f1', ascending=False).iloc[0]
best_cfg = {
    'fold': int(fold),
    'best_threshold': float(best_row['threshold']),
    'best_cap': int(best_row['cap']),
    'best_dmax': float(best_row['dmax']),
    'best_f1': float(best_row['f1'])
}
Path('det_threshold_best_fold0.json').write_text(json.dumps(best_cfg, indent=2))
print('Best config:', best_cfg)

Fold0 GT points: (115648, 3)
OOF preds loaded: (134993, 8) score stats: {'count': 134993.0, 'mean': 0.752571961456331, 'std': 0.303068402622556, 'min': 0.05000726506114006, '25%': 0.6306458711624146, '50%': 0.9149374961853027, '75%': 0.9648087620735168, 'max': 0.9984909296035767}
-- Sweep dmax=25.0 cap=1500 --


th=0.050 F1=0.9233 P=0.9441 R=0.9034 TP=104475 FP=6181 FN=11173


th=0.100 F1=0.9235 P=0.9482 R=0.9001 TP=104094 FP=5688 FN=11554


th=0.150 F1=0.9233 P=0.9508 R=0.8973 TP=103770 FP=5364 FN=11878


th=0.200 F1=0.9227 P=0.9530 R=0.8943 TP=103427 FP=5101 FN=12221


th=0.250 F1=0.9215 P=0.9547 R=0.8906 TP=102998 FP=4887 FN=12650


th=0.300 F1=0.9201 P=0.9563 R=0.8865 TP=102520 FP=4688 FN=13128


th=0.350 F1=0.9182 P=0.9579 R=0.8817 TP=101965 FP=4485 FN=13683


th=0.400 F1=0.9157 P=0.9592 R=0.8760 TP=101306 FP=4313 FN=14342


th=0.450 F1=0.9125 P=0.9604 R=0.8692 TP=100523 FP=4142 FN=15125
Best for dmax=25.0 cap=1500: th=0.1 F1=0.9235


-- Sweep dmax=25.0 cap=2000 --


th=0.050 F1=0.9233 P=0.9441 R=0.9034 TP=104475 FP=6181 FN=11173


th=0.100 F1=0.9235 P=0.9482 R=0.9001 TP=104094 FP=5688 FN=11554


th=0.150 F1=0.9233 P=0.9508 R=0.8973 TP=103770 FP=5364 FN=11878


th=0.200 F1=0.9227 P=0.9530 R=0.8943 TP=103427 FP=5101 FN=12221


th=0.250 F1=0.9215 P=0.9547 R=0.8906 TP=102998 FP=4887 FN=12650


th=0.300 F1=0.9201 P=0.9563 R=0.8865 TP=102520 FP=4688 FN=13128


th=0.350 F1=0.9182 P=0.9579 R=0.8817 TP=101965 FP=4485 FN=13683


th=0.400 F1=0.9157 P=0.9592 R=0.8760 TP=101306 FP=4313 FN=14342


th=0.450 F1=0.9125 P=0.9604 R=0.8692 TP=100523 FP=4142 FN=15125
Best for dmax=25.0 cap=2000: th=0.1 F1=0.9235


-- Sweep dmax=30.0 cap=1500 --


th=0.050 F1=0.9356 P=0.9593 R=0.9130 TP=105587 FP=4477 FN=10061


th=0.100 F1=0.9354 P=0.9626 R=0.9096 TP=105199 FP=4092 FN=10449


th=0.150 F1=0.9348 P=0.9647 R=0.9067 TP=104863 FP=3834 FN=10785


th=0.200 F1=0.9340 P=0.9664 R=0.9037 TP=104510 FP=3634 FN=11138


th=0.250 F1=0.9326 P=0.9678 R=0.9000 TP=104078 FP=3467 FN=11570


th=0.300 F1=0.9310 P=0.9690 R=0.8958 TP=103595 FP=3313 FN=12053


th=0.350 F1=0.9288 P=0.9702 R=0.8909 TP=103027 FP=3164 FN=12621


th=0.400 F1=0.9261 P=0.9711 R=0.8851 TP=102361 FP=3041 FN=13287


th=0.450 F1=0.9228 P=0.9721 R=0.8783 TP=101570 FP=2916 FN=14078
Best for dmax=30.0 cap=1500: th=0.05 F1=0.9356


-- Sweep dmax=30.0 cap=2000 --


th=0.050 F1=0.9356 P=0.9593 R=0.9130 TP=105587 FP=4477 FN=10061


th=0.100 F1=0.9354 P=0.9626 R=0.9096 TP=105199 FP=4092 FN=10449


th=0.150 F1=0.9348 P=0.9647 R=0.9067 TP=104863 FP=3834 FN=10785


th=0.200 F1=0.9340 P=0.9664 R=0.9037 TP=104510 FP=3634 FN=11138


th=0.250 F1=0.9326 P=0.9678 R=0.9000 TP=104078 FP=3467 FN=11570


th=0.300 F1=0.9310 P=0.9690 R=0.8958 TP=103595 FP=3313 FN=12053


th=0.350 F1=0.9288 P=0.9702 R=0.8909 TP=103027 FP=3164 FN=12621


th=0.400 F1=0.9261 P=0.9711 R=0.8851 TP=102361 FP=3041 FN=13287


th=0.450 F1=0.9228 P=0.9721 R=0.8783 TP=101570 FP=2916 FN=14078
Best for dmax=30.0 cap=2000: th=0.05 F1=0.9356


Sweep took 39.4s
Best config: {'fold': 0, 'best_threshold': 0.05, 'best_cap': 1500, 'best_dmax': 30.0, 'best_f1': 0.9355904869922733}


In [None]:
# Full detector training: Faster R-CNN R50-FPN, 1-class, 6-8 epochs with raised caps
import time, math, json, gc
from pathlib import Path
import torch
import torch.optim as optim

torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device, 'GPU:', torch.cuda.get_device_name(0) if device.type=='cuda' else None)

# Rebuild datasets/loaders if not in scope
if 'train_loader' not in globals() or 'val_loader' not in globals():
    df_train = pd.read_csv(train_csv)
    df_folds = pd.read_csv(folds_csv)
    train_ds = KuzDetDataset(df_train, df_folds, fold=fold, split='train')
    val_ds = KuzDetDataset(df_train, df_folds, fold=fold, split='val')
    batch_size = 2
    num_workers = min(4, os.cpu_count() or 2)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=collate_fn, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=collate_fn, pin_memory=True)

# Build model fresh for full training
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=2)
# Transform and size
model.transform.min_size = (1333,)
model.transform.max_size = 2000
# Raise caps
model.roi_heads.detections_per_img = 2000
model.rpn.pre_nms_top_n_train = 4000
model.rpn.post_nms_top_n_train = 2000
model.rpn.pre_nms_top_n_test = 12000
model.rpn.post_nms_top_n_test = 6000
model.to(device)

# Optim and scheduler per expert advice
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=5e-4)
milestones = [4, 6]
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)
scaler = torch.amp.GradScaler('cuda') if device.type=='cuda' else None

def train_one_epoch_full(model, loader, epoch):
    model.train()
    t0 = time.time(); last = t0; tot=0.0; n=0
    for i,(imgs,targets,ids) in enumerate(loader):
        imgs = [im.to(device, non_blocking=True) for im in imgs]
        tgts = [{k:(v.to(device) if torch.is_tensor(v) else v) for k,v in t.items()} for t in targets]
        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            with torch.amp.autocast('cuda'):
                loss_dict = model(imgs, tgts)
                loss = sum(loss for loss in loss_dict.values())
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss_dict = model(imgs, tgts)
            loss = sum(loss for loss in loss_dict.values())
            loss.backward(); optimizer.step()
        tot += float(loss.detach().item()); n += 1
        if time.time()-last > 10:
            mem = torch.cuda.memory_allocated()/1024**3 if device.type=='cuda' else 0.0
            print(f'Epoch {epoch} iter {i} loss {loss.item():.4f} avg {tot/n:.4f} mem {mem:.2f}GB elapsed {time.time()-t0:.1f}s', flush=True)
            last = time.time()
    print(f'Epoch {epoch} done avg {tot/max(1,n):.4f} time {time.time()-t0:.1f}s')

# Train 8 epochs; save after each
epochs = 8
for ep in range(1, epochs+1):
    train_one_epoch_full(model, train_loader, ep)
    try:
        lr_scheduler.step()
    except Exception:
        pass
    ckpt = Path(f'detector_frcnn_r50_1cls_fold{fold}_ep{ep}.pth')
    torch.save({'model': model.state_dict(), 'epoch': ep}, ckpt)
    print('Saved', ckpt)
    # quick val forward pass timing only (optional)
    with torch.no_grad():
        model.eval()
        cnt = 0; t0 = time.time()
        for imgs,targets,ids in val_loader:
            imgs = [im.to(device) for im in imgs]
            _ = model(imgs)
            cnt += 1
            if cnt >= 5:
                break
        print(f'Val smoke infer {cnt} batches in {time.time()-t0:.1f}s')
    model.train()
    gc.collect();
    if device.type=='cuda': torch.cuda.empty_cache()
print('Full detector training finished.')

In [6]:
# Refinement sweep: th in [0.04..0.08], cap=1500, dmax=30; optional dedup radius=6
import math, json, time
import pandas as pd
import numpy as np
from pathlib import Path

CWD = Path('.')
train_csv = CWD / 'train.csv'
folds_csv = CWD / 'folds_group.csv'
oof_path = CWD / f'det_oof_fold{fold}.parquet'
assert train_csv.exists() and folds_csv.exists() and oof_path.exists(), 'Missing inputs for refinement sweep'

df_train = pd.read_csv(train_csv)
df_folds = pd.read_csv(folds_csv)
fold_map = dict(df_folds.values)

def parse_labels_full(labels: str):
    if not isinstance(labels, str) or labels.strip() == '':
        return []
    toks = labels.strip().split()
    if len(toks) % 5 != 0:
        return []
    out = []
    for i in range(0, len(toks), 5):
        u, x, y, w, h = toks[i:i+5]
        try:
            x = int(x); y = int(y); w = int(w); h = int(h)
            out.append((u, x, y, w, h))
        except:
            pass
    return out

# GT centers for fold 0
gt_rows = []
for r in df_train.itertuples(index=False):
    image_id = getattr(r, 'image_id') if hasattr(r, 'image_id') else r[0]
    if fold_map.get(image_id, None) != 0:
        continue
    labels = getattr(r, 'labels') if hasattr(r, 'labels') else r[1]
    for (_, x, y, w, h) in parse_labels_full(labels):
        gt_rows.append((image_id, x + w/2.0, y + h/2.0))
df_gt = pd.DataFrame(gt_rows, columns=['image_id','cx','cy'])

df_pred = pd.read_parquet(oof_path).copy()
df_pred['cx'] = df_pred['x'] + df_pred['w']/2.0
df_pred['cy'] = df_pred['y'] + df_pred['h']/2.0

def dedup_centers(dd: pd.DataFrame, radius: float = 6.0) -> pd.DataFrame:
    out = []
    for img_id, g in dd.groupby('image_id'):
        g = g.sort_values('score', ascending=False).reset_index(drop=True)
        keep_idx = []
        kept = []
        for i, (cx, cy, sc) in enumerate(g[['cx','cy','score']].itertuples(index=False, name=None)):
            ok = True
            for (kx, ky) in kept:
                if (cx - kx)**2 + (cy - ky)**2 <= radius*radius:
                    ok = False; break
            if ok:
                keep_idx.append(i); kept.append((cx, cy))
        out.append(g.iloc[keep_idx])
    if len(out) == 0:
        return dd.iloc[0:0]
    return pd.concat(out, axis=0).reset_index(drop=True)

def eval_threshold(th, cap=1500, dmax=30.0, use_dedup=False, radius=6.0):
    dd = df_pred.loc[df_pred['score'] >= th, ['image_id','cx','cy','score']].copy()
    dd = dd.sort_values(['image_id','score'], ascending=[True, False])
    dd['rn'] = dd.groupby('image_id').cumcount()
    dd = dd[dd['rn'] < cap].drop(columns=['rn'])
    if use_dedup:
        dd = dedup_centers(dd, radius=radius)
    g_gt = df_gt.groupby('image_id')
    g_pr = dd.groupby('image_id')
    TP = 0; FP = 0; FN = 0
    for image_id, gt_g in g_gt:
        gt_pts = gt_g[['cx','cy']].to_numpy()
        pr_g = g_pr.get_group(image_id) if image_id in g_pr.groups else None
        pr_pts = pr_g[['cx','cy','score']].to_numpy() if pr_g is not None else np.zeros((0,3), dtype=np.float32)
        matched_gt = np.zeros((len(gt_pts),), dtype=bool)
        if len(pr_pts) and len(gt_pts):
            pr_ord = np.argsort(-pr_pts[:,2])
            for idx in pr_ord:
                px, py, _ = pr_pts[idx]
                d2 = ((gt_pts[:,0]-px)**2 + (gt_pts[:,1]-py)**2)
                j = int(np.argmin(d2))
                if matched_gt[j]:
                    FP += 1; continue
                if math.sqrt(float(d2[j])) <= dmax:
                    matched_gt[j] = True; TP += 1
                else:
                    FP += 1
            FN += int((~matched_gt).sum())
        else:
            FN += len(gt_pts)
            FP += 0 if len(pr_pts)==0 else len(pr_pts)
    prec = TP / max(1, TP+FP); rec = TP / max(1, TP+FN)
    f1 = 2*prec*rec / max(1e-12, (prec+rec)) if (prec+rec) > 0 else 0.0
    return dict(TP=TP, FP=FP, FN=FN, precision=prec, recall=rec, f1=f1)

thresholds = [0.04, 0.045, 0.05, 0.055, 0.06, 0.07, 0.08]
results = []
t0 = time.time()
for th in thresholds:
    m0 = eval_threshold(th, cap=1500, dmax=30.0, use_dedup=False)
    m1 = eval_threshold(th, cap=1500, dmax=30.0, use_dedup=True, radius=6.0)
    results.append(dict(threshold=th, cap=1500, dmax=30.0, dedup=False, **m0))
    results.append(dict(threshold=th, cap=1500, dmax=30.0, dedup=True, **m1))
    print(f'th={th:.3f} no-dedup F1={m0["f1"]:.4f} P={m0["precision"]:.4f} R={m0["recall"]:.4f} | dedup F1={m1["f1"]:.4f} P={m1["precision"]:.4f} R={m1["recall"]:.4f}')
print('Refinement sweep took', f'{time.time()-t0:.1f}s')

df_ref = pd.DataFrame(results)
df_ref.to_csv('det_threshold_sweep_refined_fold0.csv', index=False)
df_sorted = df_ref.sort_values(['f1','precision'], ascending=[False, False]).reset_index(drop=True)
best_row = df_sorted.iloc[0]
best_cfg = {
    'fold': int(fold),
    'best_threshold': float(best_row['threshold']),
    'best_cap': int(best_row['cap']),
    'best_dmax': float(best_row['dmax']),
    'best_f1': float(best_row['f1']),
    'dedup': bool(best_row['dedup']),
    'dedup_radius': 6.0
}
Path('det_threshold_best_fold0.json').write_text(json.dumps(best_cfg, indent=2))
print('Refined best config (overwrote det_threshold_best_fold0.json):', best_cfg)

th=0.040 no-dedup F1=0.8425 P=0.7822 R=0.9130 | dedup F1=0.8447 P=0.7860 R=0.9130


th=0.045 no-dedup F1=0.8425 P=0.7822 R=0.9130 | dedup F1=0.8447 P=0.7860 R=0.9130


th=0.050 no-dedup F1=0.8425 P=0.7822 R=0.9130 | dedup F1=0.8447 P=0.7860 R=0.9130


th=0.055 no-dedup F1=0.8458 P=0.7880 R=0.9127 | dedup F1=0.8479 P=0.7916 R=0.9127


th=0.060 no-dedup F1=0.8490 P=0.7939 R=0.9123 | dedup F1=0.8509 P=0.7973 R=0.9123


th=0.070 no-dedup F1=0.8542 P=0.8037 R=0.9115 | dedup F1=0.8560 P=0.8068 R=0.9115


th=0.080 no-dedup F1=0.8588 P=0.8124 R=0.9109 | dedup F1=0.8603 P=0.8151 R=0.9108
Refinement sweep took 31.4s
Refined best config (overwrote det_threshold_best_fold0.json): {'fold': 0, 'best_threshold': 0.08, 'best_cap': 1500, 'best_dmax': 30.0, 'best_f1': 0.8603257962160595, 'dedup': True, 'dedup_radius': 6.0}


In [None]:
# Assemble submission using tuned detector filter (+optional dedup) and best recognizer EMA checkpoint (optimized I/O: open image once per page)
import json, time
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image, ImageOps
import torch
import torch.nn as nn
from torchvision import transforms as T
from torchvision.models import resnet50

CWD = Path('.')
best_cfg_path = CWD / 'det_threshold_best_fold0.json'
det_test_path = CWD / 'det_test_preds.parquet'
classes_path = CWD / 'recognizer_classes.json'
best_ckpt_path = CWD / 'recognizer_resnet50_fold0_best.pth'
test_dir = CWD / 'test_images'
sample_path = CWD / 'sample_submission.csv'
assert best_cfg_path.exists() and det_test_path.exists() and classes_path.exists() and best_ckpt_path.exists() and sample_path.exists(), 'Missing artifacts for assembly'

best_cfg = json.loads(best_cfg_path.read_text())
th = float(best_cfg['best_threshold']); cap = int(best_cfg['best_cap'])
use_dedup = bool(best_cfg.get('dedup', False))
dedup_radius = float(best_cfg.get('dedup_radius', 6.0))
print('Using detector filter:', best_cfg)

det_df = pd.read_parquet(det_test_path)
print('Raw test preds:', det_df.shape, 'score stats:', det_df['score'].describe().to_dict())
det_df = det_df[det_df['score'] >= th].copy()
det_df.sort_values(['image_id','score'], ascending=[True, False], inplace=True)
det_df['rn'] = det_df.groupby('image_id').cumcount()
det_df = det_df[det_df['rn'] < cap].drop(columns=['rn'])

def dedup_centers_df(df_img: pd.DataFrame, radius: float = 6.0) -> pd.DataFrame:
    if len(df_img) <= 1:
        return df_img
    g = df_img.sort_values('score', ascending=False).reset_index(drop=True)
    keep_idx = []
    kept = []
    for i in range(len(g)):
        cx = float(g.loc[i, 'x'] + g.loc[i, 'w']/2.0)
        cy = float(g.loc[i, 'y'] + g.loc[i, 'h']/2.0)
        ok = True
        for (kx, ky) in kept:
            if (cx - kx)*(cx - kx) + (cy - ky)*(cy - ky) <= radius*radius:
                ok = False; break
        if ok:
            keep_idx.append(i); kept.append((cx, cy))
    return g.iloc[keep_idx].reset_index(drop=True)

if use_dedup:
    parts = []
    for img_id, g in det_df.groupby('image_id'):
        parts.append(dedup_centers_df(g, radius=dedup_radius))
    det_df = pd.concat(parts, axis=0).reset_index(drop=True) if parts else det_df.iloc[0:0]
print('Filtered test preds:', det_df.shape, 'dedup applied:' , use_dedup)

class_to_idx = json.loads(classes_path.read_text())
idx_to_class = {int(v): k for k, v in class_to_idx.items()}
num_classes = len(idx_to_class)
rec_model = resnet50(weights=None)
rec_model.fc = nn.Linear(rec_model.fc.in_features, num_classes)
state = torch.load(best_ckpt_path, map_location='cpu')
rec_model.load_state_dict(state['model'], strict=False)
rec_model.to(device)
rec_model.eval()

val_tf = T.Compose([T.ToTensor(), T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])

def clamp(v, lo, hi):
    return max(lo, min(hi, v))

def crop_from_box_img(img: Image.Image, x: float, y: float, w: float, h: float, pad_ratio: float=0.25, img_size: int=192):
    W, H = img.size
    dx = int(round(w * pad_ratio)); dy = int(round(h * pad_ratio))
    x1 = clamp(int(x) - dx, 0, W-1); y1 = clamp(int(y) - dy, 0, H-1)
    x2 = clamp(int(x + w) + dx, 1, W); y2 = clamp(int(y + h) + dy, 1, H)
    crop = img.crop((x1, y1, x2, y2))
    cw, ch = crop.size
    if cw != ch:
        m = max(cw, ch)
        pad_w = m - cw; pad_h = m - ch
        crop = ImageOps.expand(crop, border=(0,0,pad_w,pad_h), fill=0)
    return crop.resize((192, 192), Image.BILINEAR)

df_sample = pd.read_csv(sample_path)
grp = det_df.groupby('image_id')
rows_out = []
t0 = time.time()
for i, image_id in enumerate(df_sample['image_id'].tolist(), 1):
    if image_id in grp.groups:
        g = grp.get_group(image_id)
        # Open the page once (major speed-up vs opening per box)
        p = test_dir / f'{image_id}.jpg'
        if not p.exists():
            alt = test_dir / f'{image_id}.png'
            if alt.exists():
                p = alt
        img = Image.open(p).convert('RGB')
        # Build crops from the single opened image
        crops = [crop_from_box_img(img, x, y, w, h) for x,y,w,h in zip(g['x'].values, g['y'].values, g['w'].values, g['h'].values)]
        if len(crops) == 0:
            rows_out.append('')
        else:
            xs = torch.stack([val_tf(c) for c in crops]).to(device)
            with torch.no_grad():
                logits = rec_model(xs)
                pred_idx = logits.argmax(1).detach().cpu().numpy().tolist()
            pred_unicodes = [idx_to_class.get(int(k), 'U+003F') for k in pred_idx]
            cx = (g['x'].values + g['w'].values/2.0).round().astype(int).tolist()
            cy = (g['y'].values + g['h'].values/2.0).round().astype(int).tolist()
            toks = []
            for u, x_, y_ in zip(pred_unicodes, cx, cy):
                toks.extend([u, str(int(x_)), str(int(y_))])
            rows_out.append(' '.join(toks))
    else:
        rows_out.append('')
    if i % 25 == 0:
        print(f'Assembled {i}/{len(df_sample)} in {time.time()-t0:.1f}s', flush=True)

sub_df = pd.DataFrame({'image_id': df_sample['image_id'], 'labels': rows_out})
sub_df.to_csv('submission.csv', index=False)
print('Wrote submission.csv with shape', sub_df.shape)
print(sub_df.head(2))

Using detector filter: {'fold': 0, 'best_threshold': 0.08, 'best_cap': 1500, 'best_dmax': 30.0, 'best_f1': 0.8603257962160595, 'dedup': True, 'dedup_radius': 6.0}
Raw test preds: (78959, 6) score stats: {'count': 78959.0, 'mean': 0.7546936687092951, 'std': 0.29837606507519804, 'min': 0.05002911761403084, '25%': 0.6375642716884613, '50%': 0.9123088717460632, '75%': 0.9650175869464874, 'max': 0.9979872703552246}


Filtered test preds: (75767, 6) dedup applied: True


  state = torch.load(best_ckpt_path, map_location='cpu')
