In [1]:
import json
import os
import shutil

import torch
import yaml

from transformers import OwlViTForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.dataset import get_dataloaders

In [3]:
import importlib

import src
# importlib.reload(src.utils)
from src.utils import paco_to_owl_box, BoxUtil
from torchvision.ops import box_convert as _box_convert

In [4]:
def get_training_config(config_path="config.yaml"):
    """Load the ``training`` section of a YAML configuration file.

    Args:
        config_path: Path of the YAML file to read. Defaults to
            ``"config.yaml"``, resolved against the current working
            directory, so existing zero-argument calls behave as before.

    Returns:
        The value stored under the top-level ``"training"`` key.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the loaded mapping has no ``"training"`` key.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(config_path, "r", encoding="utf-8") as stream:
        data = yaml.safe_load(stream)
    return data["training"]

In [5]:
# Read the "training" section of the YAML config via the helper defined above.
training_cfg = get_training_config()

In [6]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Load the pretrained OWL-ViT checkpoint, then move its weights to `device`.
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
model = model.to(device)

In [8]:
# Batch size for this exploratory run, named so it is easy to find and tune.
BATCH_SIZE = 5
train_dataloader, test_dataloader = get_dataloaders(batch_size=BATCH_SIZE)

In [9]:
# Pull a single batch from the training loader for interactive inspection.
# `iter()` is the public way to obtain an iterator; calling `__iter__`
# directly bypasses the built-in protocol entry point.
batch1 = next(iter(train_dataloader))

In [10]:
# The first element of the batch holds the model inputs: it is indexed by
# "attention_mask" / "input_ids" below and later unpacked into the model call.
batch1_inputs = batch1[0]

In [11]:
# Reshape both tensors to 2-D with 16 columns; the row count is inferred (-1).
# NOTE(review): 16 is presumably the padded token length -- confirm against
# the dataset/tokenizer settings.
for _key in ("attention_mask", "input_ids"):
    batch1_inputs[_key] = batch1_inputs[_key].view(-1, 16)

In [12]:
# Move the inputs to the same device as the model, then run one forward pass.
inputs_on_device = batch1_inputs.to(device)
outputs = model(**inputs_on_device)

In [13]:
# Inspect the shape of the logits; they are handed to the matcher below
# under the "pred_logits" key.
outputs.logits.shape

torch.Size([5, 576, 3])

In [14]:
# Insert a singleton axis at position 1 of the batch's second element, then
# pass it with the third element to the project converter.
# NOTE(review): presumably these are the boxes and the image sizes -- verify
# against `src.utils.paco_to_owl_box`.
boxes_with_axis = batch1[1][:, None, :]
b = paco_to_owl_box(boxes_with_axis, batch1[2])

In [16]:
import src.DETR.matcher

In [17]:
# `importlib.reload` re-executes only the module object it is given and does
# not recurse into submodules. Reload the matcher module itself so the
# following `from src.DETR.matcher import HungarianMatcher` sees on-disk edits.
importlib.reload(src.DETR.matcher)
# The package reload stays as the last expression, so the cell still displays
# the `src.DETR` module as before.
importlib.reload(src.DETR)

<module 'src.DETR' (<_frozen_importlib_external._NamespaceLoader object at 0x14b184ad1810>)>

In [18]:
from src.DETR.matcher import HungarianMatcher

In [19]:
# Instantiate the project's Hungarian matcher with its default arguments.
matcher = HungarianMatcher()

In [43]:
# Repackage the model output under the two key names the matcher reads.
outputs_for_matcher = dict(
    pred_logits=outputs.logits.to(device),
    pred_boxes=outputs.pred_boxes.to(device),
)

In [44]:
# One target dict per entry of `b`, each labelled with class index 0.
# NOTE(review): the recorded output shows box values in the hundreds
# (e.g. 768.48). Confirm this is the same coordinate convention as
# `outputs.pred_boxes`, which the matcher compares these boxes against.
targets = [
    dict(labels=torch.tensor([0]).to(device), boxes=box.to(device))
    for box in b
]

In [46]:
# Display the list of target dicts (one per entry of `b`) built for the matcher.
targets

[{'labels': tensor([0], device='cuda:0'),
  'boxes': tensor([[371.3494, 560.7751, 448.9760, 768.4835]], device='cuda:0')},
 {'labels': tensor([0], device='cuda:0'),
  'boxes': tensor([[307.9680, 607.2321, 420.8640, 736.2561]], device='cuda:0')},
 {'labels': tensor([0], device='cuda:0'),
  'boxes': tensor([[608.3320,  40.3253, 736.4321, 231.2907]], device='cuda:0')},
 {'labels': tensor([0], device='cuda:0'),
  'boxes': tensor([[  0.0000,   0.0000, 742.6560, 769.0239]], device='cuda:0')},
 {'labels': tensor([0], device='cuda:0'),
  'boxes': tensor([[378.7361,  32.9280, 439.9160,  66.1600]], device='cuda:0')}]

In [45]:
# Run the matcher on the batch. Per the recorded output it returns one pair
# of index tensors per target dict -- presumably (prediction index, target
# index); confirm against `src.DETR.matcher`.
matcher(outputs_for_matcher, targets)

[(tensor([532]), tensor([0])),
 (tensor([15]), tensor([0])),
 (tensor([39]), tensor([0])),
 (tensor([423]), tensor([0])),
 (tensor([419]), tensor([0]))]

Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates