In [1]:
import torch
from yolos.detector import Detector, SetCriterion, PostProcess
from yolos.matcher import HungarianMatcher

import random
import torch
import numpy as np

# Seed every RNG in play (Python, NumPy, torch CPU and all CUDA devices)
# before the model is instantiated.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

num_classes = 1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Detector built from a DeiT-small checkpoint, with 100 detection tokens.
model = Detector(
    num_classes=num_classes,
    pre_trained='/d/pfournie/YOLOS/deit_small_patch16_224-cd65a155.pth',
    det_token_num=100,
    backbone_name='small',
    init_pe_size=(560, 560),
)

# Report how many of the model's parameters are trainable.
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
nb_tot = sum(p.numel() for p in model.parameters())
print(f"Training {n_parameters} params out of {nb_tot}")
model.to(device)

# Loss setup: per-term weights, the matcher, and the list of loss terms
# handed to the criterion.
weight_dict = {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2}
losses = ['labels', 'boxes', 'cardinality']
matcher = HungarianMatcher(1, 5, 2)
criterion = SetCriterion(
    num_classes,
    matcher=matcher,
    weight_dict=weight_dict,
    eos_coef=0.1,
    losses=losses,
)
criterion.to(device)

postprocessors = {'bbox': PostProcess()}

No mid pe
Training 22731270 params out of 22731270


  checkpoint = torch.load(pretrained, map_location="cpu")


In [2]:
def build_optimizer(model):
    """Create an AdamW optimizer with separate head / backbone parameter groups.

    Trainable parameters are split into three groups:

    * head -- parameters whose name does not contain ``"backbone"``; they use
      the optimizer defaults (lr=0.001, weight_decay=0.0001).
    * backbone, no decay -- backbone parameters that are 1-D, whose name ends
      with ``".bias"``, or whose last name component is listed by
      ``model.backbone.no_weight_decay()``; lr=0.0001, weight_decay=0.
    * backbone, decay -- every other backbone parameter; lr=0.0001 with the
      default weight decay.

    Args:
        model: module exposing a ``backbone`` attribute and
            ``named_parameters()``. ``backbone.no_weight_decay()`` is optional.

    Returns:
        torch.optim.AdamW: optimizer over the three groups, in the order above.
    """
    # A backbone without the `no_weight_decay` hook contributes no extra
    # names. Leaving `skip` unbound in that case used to raise
    # UnboundLocalError on the first multi-dimensional, non-bias parameter.
    backbone = model.backbone
    skip = backbone.no_weight_decay() if hasattr(backbone, 'no_weight_decay') else ()

    head = []
    backbone_decay = []
    backbone_no_decay = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not optimized at all
        if "backbone" not in name:
            head.append(param)
        elif len(param.shape) == 1 or name.endswith(".bias") or name.split('.')[-1] in skip:
            backbone_no_decay.append(param)
        else:
            backbone_decay.append(param)

    param_dicts = [
        {"params": head},
        {"params": backbone_no_decay, "weight_decay": 0., "lr": 0.0001},
        {"params": backbone_decay, "lr": 0.0001},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=0.001,
                                  weight_decay=0.0001)
    return optimizer

# Instantiate the AdamW optimizer over the model's head/backbone parameter groups.
optimizer = build_optimizer(model)

### Dataset & dataloaders

In [3]:
import os
import torch

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
import torchvision.transforms.v2 as v2

import time
import random
import numpy as np
from tqdm.auto import tqdm
from pprint import pformat
from torchmetrics.detection.mean_ap import MeanAveragePrecision
import gc 

from torchvision.transforms import v2 as T
import torch
from torch.utils.data import DataLoader, Subset, RandomSampler

class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan pedestrian images paired with detection targets.

    ``root`` must contain two folders, ``PNGImages`` and ``PedMasks``. Files
    are paired by their position in the sorted directory listings.

    Each item is a dict ``{'image': img, 'target': target}`` where ``target``
    holds ``boxes`` (XYXY ``BoundingBoxes``), ``masks``, ``labels`` (all zero,
    single class), ``image_id``, ``orig_size`` and ``size``.
    """

    def __init__(self, root, transforms):
        """Index the image and mask files found under ``root``.

        Args:
            root: dataset directory containing ``PNGImages`` and ``PedMasks``.
            transforms: optional callable invoked as ``transforms(img, target)``;
                pass ``None`` to return untransformed samples.

        Raises:
            ValueError: if the two folders do not hold the same number of
                files, since positional pairing would then be wrong.
        """
        self.root = root
        self.transforms = transforms
        # Sort both listings so that image i and mask i refer to the same scene.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))
        if len(self.imgs) != len(self.masks):
            raise ValueError(
                f"Found {len(self.imgs)} images but {len(self.masks)} masks under "
                f"{root!r}; images and masks are paired by sorted position."
            )

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its detection target.

        Returns:
            dict with keys ``'image'`` and ``'target'`` (after ``transforms``
            when one was given).
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = read_image(img_path)
        mask = read_image(mask_path)

        # Instances are encoded as distinct pixel values; 0 is the background.
        # Filter on the value rather than dropping the first unique id, so a
        # mask without any background pixel does not lose a real instance.
        obj_ids = torch.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]
        num_objs = len(obj_ids)

        # Split the value-encoded mask into one binary mask per instance.
        masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)

        # One bounding box per instance mask.
        boxes = masks_to_boxes(masks)

        # There is only one class, labelled 0.
        # NOTE: this differs from the FCOS convention and should be unified.
        labels = torch.zeros((num_objs,), dtype=torch.int64)

        # Wrap sample and targets into torchvision tv_tensors.
        img = tv_tensors.Image(img)
        h, w = v2.functional.get_size(img)

        target = {
            "boxes": tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(h, w)),
            "masks": tv_tensors.Mask(masks),
            "labels": labels,
            # Integer id: the legacy torch.Tensor(...) constructor produced float32.
            "image_id": torch.tensor([idx], dtype=torch.int64),
            "orig_size": torch.as_tensor([int(h), int(w)]),
            "size": torch.as_tensor([int(h), int(w)]),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return {'image': img, 'target': target}

    def __len__(self):
        """Number of image files indexed under ``PNGImages``."""
        return len(self.imgs)

from dl_toolbox.transforms import NormalizeBB

# Preprocessing shared by the train and validation datasets: resize, random
# crop/pad to 560x560, convert to float, drop invalid boxes, switch boxes to
# CXCYWH, apply NormalizeBB (presumably scales boxes relative to the canvas
# size -- confirm in dl_toolbox), then channel-wise image normalisation.
tf = T.Compose([
    T.Resize(size=480, max_size=560),
    T.RandomCrop(size=(560, 560), pad_if_needed=True, fill=0),
    T.ToDtype(torch.float, scale=True),
    T.SanitizeBoundingBoxes(),
    T.ConvertBoundingBoxFormat(format='CXCYWH'),
    NormalizeBB(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Two dataset objects over the same folder, both using the pipeline above.
dataset = PennFudanDataset('/data/PennFudanPed', tf)
dataset_test = PennFudanDataset('/data/PennFudanPed', tf)

# Random permutation of the sample indices: the last 50 form the validation
# split, everything before them the training split.
indices = torch.randperm(len(dataset)).tolist()
train_set = Subset(dataset, indices[:-50])
val_set = Subset(dataset_test, indices[-50:])

from collections import defaultdict

def list_of_dicts_to_dict_of_lists(list_of_dicts):
    """Transpose a sequence of dicts into a single dict of lists.

    Every key seen in any input dict maps to the list of values found under
    that key, in input order. Keys missing from a given dict simply add
    nothing for that dict. Returns a plain ``dict``.
    """
    merged = {}
    for record in list_of_dicts:
        for key, value in record.items():
            merged.setdefault(key, []).append(value)
    return merged

def collate(batch):
    """Collate a list of ``{'image', 'target', ...}`` samples into one batch.

    Images are stacked into a single tensor along a new leading dimension;
    every other key is left as a per-sample list.
    """
    collated = list_of_dicts_to_dict_of_lists(batch)
    stacked_images = torch.stack(collated['image'])
    collated['image'] = stacked_images
    return collated

# Training loader: batches of 2 drawn through an explicit RandomSampler
# (without replacement); an incomplete last batch is dropped.
train_dataloader = DataLoader(
    train_set,
    batch_size=2,
    sampler=RandomSampler(train_set),
    collate_fn=collate,
    drop_last=True,
    num_workers=0,
    pin_memory=False,
)

# Validation loader: fixed order, incomplete last batch kept.
val_dataloader = DataLoader(
    val_set,
    batch_size=2,
    shuffle=False,
    collate_fn=collate,
    drop_last=False,
    num_workers=0,
    pin_memory=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Linear decay of every group's learning rate from 1x down to 0.01x of its
# initial value over 150*30 scheduler steps.
# NOTE(review): the training loop calls lr_scheduler.step() once per batch and
# the logged run shows 60 batches per epoch, so these 4500 steps are used up
# after 75 of the 150 epochs -- confirm this is intended.
lr_scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer,
    total_iters=150 * 30,
    start_factor=1.0,
    end_factor=0.01,
)

In [5]:
def unnorm_bounding_boxes(inpt):
    """Scale bounding boxes by their canvas size.

    The four coordinates of every box are multiplied by (W, H, W, H), taken
    from ``inpt.canvas_size`` (stored as (H, W)). The result is cast back to
    the input dtype and wrapped like ``inpt``.
    """
    raw = inpt.as_subclass(torch.Tensor)
    original_dtype = raw.dtype
    # Work in floating point; clone float inputs so the caller's data is untouched.
    if raw.is_floating_point():
        working = raw.clone()
    else:
        working = raw.float()
    height, width = inpt.canvas_size
    scale = torch.Tensor([width, height, width, height]).to(inpt.device)
    scaled = working * scale
    return tv_tensors.wrap(scaled.to(original_dtype), like=inpt)

In [6]:
from yolos.box_ops import box_cxcywh_to_xyxy
# Free unreferenced Python objects left over from earlier cells before training.
gc.collect()
#torch.cuda.empty_cache()
gc.collect()

# Re-seed every RNG so that re-running this cell alone starts from the same state.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Each epoch runs a full validation pass first and a training pass second, so
# the validation numbers printed for epoch N reflect the weights after N
# completed training epochs (epoch 0 is the pre-training baseline).
start_epoch = 0
for epoch in range(start_epoch, 150):
    # NOTE(review): time_ep is measured but never printed or stored in this cell.
    time_ep = time.time()
    
    # ---- validation pass ----
    valid_loss = 0
    valid_bbox_loss = 0
    valid_giou_loss = 0
    valid_ce_loss = 0
    model.eval()
    criterion.eval()
    with torch.no_grad():
        # A fresh metric object is created for every epoch.
        map_metric = MeanAveragePrecision(
            box_format='xyxy', # targets are un-normalised and passed through box_cxcywh_to_xyxy below before update()
            backend='faster_coco_eval'
        )
        for batch in tqdm(val_dataloader):
            image = batch["image"].to(device)
            targets = batch['target']
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            outputs = model(image)
            loss_dict = criterion(outputs, targets)
            # NOTE: this rebinds the notebook-level names `weight_dict` and
            # `losses` (the latter was the list of loss names in the setup cell).
            weight_dict = criterion.weight_dict
            # Weighted sum over the loss terms that have an entry in weight_dict.
            losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
            # Height/width of the batched image tensor, handed to the post-processor.
            b,c,h,w = image.shape
            results = postprocessors['bbox'](outputs, (h,w))
            # The loss above has already consumed the targets, so their boxes
            # can now be overwritten in place with the form given to the metric.
            for t in targets:
                t['boxes'] = unnorm_bounding_boxes(t['boxes'])
                t['boxes'] = box_cxcywh_to_xyxy(t['boxes'])
            map_metric.update(results, targets)
            valid_loss += losses.detach().item()
            valid_bbox_loss += loss_dict["loss_bbox"].detach().item()
            valid_giou_loss += loss_dict["loss_giou"].detach().item()
            valid_ce_loss += loss_dict["loss_ce"].detach().item()
        # Average the accumulated per-batch losses over the number of batches.
        valid_loss /= len(val_dataloader)
        valid_bbox_loss /= len(val_dataloader)
        valid_giou_loss /= len(val_dataloader)
        valid_ce_loss /= len(val_dataloader)
        mapmetrics = map_metric.compute()
        print(f"{epoch = }")
        print(f"{valid_loss = }")
        print(f"{valid_bbox_loss = }")
        print(f"{valid_giou_loss = }")
        print(f"{valid_ce_loss = }")
        print(pformat(mapmetrics))
        map_metric.reset()
        
    # ---- training pass ----
    train_loss = 0
    train_bbox_loss = 0
    train_giou_loss = 0
    train_ce_loss = 0
    model.train()
    criterion.train()
    for batch in tqdm(train_dataloader):
        image = batch["image"].to(device)
        outputs = model(image)
        targets = batch['target']
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict
        # Weighted sum over the loss terms that have an entry in weight_dict.
        losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
        optimizer.zero_grad()
        losses.backward()
        # Clip the gradient norm over all model parameters to 0.1 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        # The scheduler is advanced once per batch, not once per epoch.
        lr_scheduler.step()
        train_loss += losses.detach().item()
        train_bbox_loss += loss_dict["loss_bbox"].detach().item()
        train_giou_loss += loss_dict["loss_giou"].detach().item()
        train_ce_loss += loss_dict["loss_ce"].detach().item()
    # Average the accumulated per-batch losses over the number of batches.
    train_loss /= len(train_dataloader)
    train_bbox_loss /= len(train_dataloader)
    train_giou_loss /= len(train_dataloader)
    train_ce_loss /= len(train_dataloader)
    print(f"{epoch = }")
    #print(f"lr = {lr_scheduler.get_last_lr()[0]}"),
    print(f"{train_loss = }")
    print(f"{train_bbox_loss = }")
    print(f"{train_giou_loss = }")
    print(f"{train_ce_loss = }")
    #lr_scheduler.step(epoch)
    # Wall-clock duration of the whole epoch (validation + training).
    time_ep = time.time() - time_ep

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:01<00:00, 12.85it/s]


epoch = 0
valid_loss = 5.998540706634522
valid_bbox_loss = 0.6899915099143982
valid_giou_loss = 0.9110201692581177
valid_ce_loss = 0.7265427708625793
{'classes': tensor(0, dtype=torch.int32),
 'map': tensor(4.4341e-05),
 'map_50': tensor(0.0004),
 'map_75': tensor(0.),
 'map_large': tensor(4.4341e-05),
 'map_medium': tensor(0.),
 'map_per_class': tensor(-1.),
 'map_small': tensor(0.),
 'mar_1': tensor(0.0008),
 'mar_10': tensor(0.0030),
 'mar_100': tensor(0.0038),
 'mar_100_per_class': tensor(-1.),
 'mar_large': tensor(0.0043),
 'mar_medium': tensor(0.),
 'mar_small': tensor(0.)}


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:09<00:00,  6.34it/s]


epoch = 0
train_loss = 3.1916798214117685
train_bbox_loss = 0.3167770400643349
train_giou_loss = 0.5768394584457079
train_ce_loss = 0.4541156828403473


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:01<00:00, 13.85it/s]


epoch = 1
valid_loss = 3.219658088684082
valid_bbox_loss = 0.3059837365150452
valid_giou_loss = 0.5565073442459106
valid_ce_loss = 0.5767246913909913
{'classes': tensor(0, dtype=torch.int32),
 'map': tensor(0.0034),
 'map_50': tensor(0.0184),
 'map_75': tensor(0.0003),
 'map_large': tensor(0.0038),
 'map_medium': tensor(0.0018),
 'map_per_class': tensor(-1.),
 'map_small': tensor(0.),
 'mar_1': tensor(0.0098),
 'mar_10': tensor(0.0523),
 'mar_100': tensor(0.1485),
 'mar_100_per_class': tensor(-1.),
 'mar_large': tensor(0.1658),
 'mar_medium': tensor(0.0154),
 'mar_small': tensor(0.)}


 35%|█████████████████████████████████████▊                                                                      | 21/60 [00:03<00:06,  6.46it/s]


KeyboardInterrupt: 