In [6]:
import os
import math

import torch
import numpy as np


In [53]:
class BoxCoder:
    """
    This class encodes and decodes a set of bounding boxes into the representation used for training the regressors
    From https://github.com/pytorch/vision/blob/main/torchvision/models/detection/_utils.py
    """
    def __init__(self, inverse_weights, bbox_xform_clip=math.log(1000.0 / 16)):
        """
        Args:
            inverse_weights (4-element tuple): used to scale down predicted deltas, typically set to 1 for all elements, 
                i.e., to be applied as 1/inverse_weights
            bbox_xform_clip (float)
        """
        self.inverse_weights=inverse_weights
        self.bbox_xform_clip=bbox_xform_clip

    def decode(self, pred_deltas, anchors):
        """
        Determine predicted bounding box locations based on predicted deltas and anchors. Predicted deltas applied as 
        the scales of width and height of anchor boxes to determine the offsets along x and y, and the width and height of
        the predicted boxes. 
        Args:
            pred_deltas (tensor): Nx4 float deltas/scales of x-offset, y-offset, width, and height where 
                N=BHWA for B batch size, H height, W width, and A anchors (where number of anchors=number of aspect ratios)
            anchors (list): list of B tensors representing anchors per image, each tensor of size HWAx4
        Return:
            pred_boxes (tensor): Nx1x4 or BHWAx1x4 predicted bounding box corner locations, where each row is x1,y1,x2,y2
        """
        assert isinstance(anchors, (list, tuple)), 'This function expects anchors of type list or tuple'
        assert isinstance(pred_deltas, torch.Tensor), 'This function expects pred_deltas of type torch.Tensor'
        anchors_per_images=[a.size(0) for a in anchors]
        concat_anchors=torch.cat(anchors, dim=0) # making the anchors the same size as pred_deltas -> BHWAx4
        anchors_sum=sum(anchors_per_images)
        print('anchors_per_images ', anchors_per_images, ' concat_anchors ', concat_anchors.shape, ' anchors_sum ', anchors_sum)
        if anchors_sum>0: pred_deltas=pred_deltas.reshape(anchors_sum, -1)
        print('pred_deltas ', pred_deltas.shape)
        pred_boxes=self.__decode__(pred_deltas=pred_deltas, anchors=concat_anchors)
        if anchors_sum>0: pred_boxes=pred_boxes.reshape(anchors_sum, -1, 4)
        return pred_boxes

    def __decode__(self, pred_deltas, anchors):
        """
        Determine predicted bounding box locations based on predicted deltas and anchors. Predicted deltas applied as 
        the scales of width and height of anchor boxes to determine the offsets along x and y, and the width and height of
        the predicted boxes. 
        Args:
            pred_deltas (tensor): Nx4 float deltas/scales of x-offset, y-offset, width, and height where 
                N=BHWA for B batch size, H height, W width, and A anchors (where number of anchors=number of aspect ratios)
            anchors (tensor): Nx4 float deltas/scales of x-offset, y-offset, width, and height where 
                N=BHWA for B batch size, H height, W width, and A anchors (where number of anchors=number of aspect ratios)
        Return:
            pred_boxes (tensor): Nx4 or BHWAx4 predicted bounding box corner locations, where each row is x1,y1,x2,y2
        """
        # make sure that anchors and predicted_deltas are of the same type
        anchors=anchors.type(pred_deltas.dtype)
        
        # elements in each row of anchors (refence boxes) are x1,y1,x2,y2 where x1<=x2 and y1<=y2 
        # we compute the width and height of refence boxes
        width=anchors[:, 2]-anchors[:, 0]  # 1D tensor of size N
        height=anchors[:, 3]-anchors[:, 1]  # 1D tensor of size N
        # then we compute the center of the reference boxes
        ctr_x=anchors[:, 0]+ 0.5*width  # 1D tensor of size N, this is the center
        ctr_y=anchors[:, 1]+ 0.5*height  # 1D tensor of size N, this is the center
        
        wx, wy, ww, wh=self.inverse_weights
        # we weights the delta estimates down
        # Nx1 dx here is delta scale relative to width for offset along x (e.g., offset from center along x)
        dx=pred_deltas[:, 0::4]/wx # similar to pred_deltas[:, 0].unsqueeze(-1) for x/wx
        # Nx1 dy here is delta scale relative to height for offset along y (e.g., offset from center along y)
        dy=pred_deltas[:, 1::4]/wy # similar to pred_deltas[:, 1].unsqueeze(-1) for y/wy
        # Nx1 dw is scale in log-space of width (i.e., apply exp before scaling width)
        dw=pred_deltas[:, 2::4]/ww # similar to pred_deltas[:, 2].unsqueeze(-1) for w/ww
        # Nx1 dh is scale in log-space of height (i.e., apply exp before scaling height)
        dh=pred_deltas[:, 3::4]/wh # similar to pred_deltas[:, 3].unsqueeze(-1) for h/wh
        print('dx ', dx.shape, dx.min().item(), dx.max().item())
        print('dy ', dy.shape, dy.min().item(), dy.max().item())
        print('dw ', dw.shape, dw.min().item(), dw.max().item())
        print('dh ', dh.shape, dh.min().item(), dh.max().item())
        
        # prevent sending too large values to torch.exp
        dw=torch.clamp(dw, max=self.bbox_xform_clip)
        dh=torch.clamp(dh, max=self.bbox_xform_clip)
        
        # Nx1 = Nx1 Nx1 Nx1
        pred_ctr_x=dx*width[:,None]+ctr_x[:,None]
        pred_ctr_y=dy*height[:,None]+ctr_y[:,None]
        pred_w=torch.exp(dw)*width[:,None]
        pred_h=torch.exp(dh)*height[:,None]
        
        # Nx1 distance from boundary to the box's center. We use these to determine the bounding box
        # corner location, x1,y1,x2,y2
        c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device)*pred_h
        c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device)*pred_w
        
        # bounding box location
        pred_boxes1=pred_ctr_x-c_to_c_w # Nx1
        pred_boxes2=pred_ctr_y-c_to_c_h # Nx1
        pred_boxes3=pred_ctr_x+c_to_c_w # Nx1
        pred_boxes4=pred_ctr_y+c_to_c_h # Nx1
        pred_boxes=torch.hstack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4)) # Nx4
        return pred_boxes  

In [50]:
data_dirpath='D:/data/mask_rcnn'

device=torch.device("cpu")
anchors=torch.load(os.path.join(data_dirpath, "anchors.pt"),map_location=device, weights_only=True)
pred_bbox_deltas=torch.load(os.path.join(data_dirpath, "pred_bbox_deltas.pt"),map_location=device, weights_only=True)

In [47]:
pred_boxes.reshape(box_sum, -1, 4).shape

torch.Size([370920, 1, 4])

In [54]:
box_coder = BoxCoder(inverse_weights=(1.0, 1.0, 1.0, 1.0))
pred_boxes=box_coder.decode(pred_deltas=pred_bbox_deltas, anchors=anchors)
pred_boxes.shape, pred_boxes.min(0).values, pred_boxes.max(0).values

anchors_per_images  [185460, 185460]  concat_anchors  torch.Size([370920, 4])  anchors_sum  370920
pred_deltas  torch.Size([370920, 4])
dx  torch.Size([370920, 1]) -1.0294348001480103 0.7720302939414978
dy  torch.Size([370920, 1]) -1.2709095478057861 1.5505026578903198
dw  torch.Size([370920, 1]) -2.542602777481079 1.6185859441757202
dh  torch.Size([370920, 1]) -2.5830938816070557 2.1535117626190186


(torch.Size([370920, 1, 4]),
 tensor([[-326.2812, -294.4279,   -2.9163,    1.6288]], grad_fn=<MinBackward0>),
 tensor([[ 797.8859,  931.3315,  973.0844, 1283.7443]], grad_fn=<MaxBackward0>))

In [15]:
#proposals = box_coder.decode(pred_bbox_deltas.detach(), anchors)
rel_codes=pred_bbox_deltas.detach()
boxes=anchors # anchors per image
print('rel_codes ', rel_codes.shape)
print('boxes ', len(boxes), [b.shape for b in boxes])

assert isinstance(boxes, (list, tuple)), 'This function expects boxes of type list or tuple'
assert isinstance(rel_codes, torch.Tensor), 'This function expects rel_Codes of type torch.Tensor'
boxes_per_images=[b.size(0) for b in boxes]
concat_boxes=torch.cat(boxes, dim=0)
box_sum=sum(boxes_per_images)
print('boxes_per_images ', boxes_per_images, ' concat_boxes ', concat_boxes.shape, ' box_sum ', box_sum)
if box_sum>0: rel_codes=rel_codes.reshape(box_sum, -1)
print('rel_codes ', rel_codes.shape)

rel_codes  torch.Size([370920, 4])
boxes  2 [torch.Size([185460, 4]), torch.Size([185460, 4])]
boxes_per_images  [185460, 185460]  concat_boxes  torch.Size([370920, 4])  box_sum  370920
rel_codes  torch.Size([370920, 4])


In [34]:
# decode_single  pred_boxes = self.decode_single(rel_codes, concat_boxes)
# decode_single(self, rel_codes: Tensor, boxes: Tensor) -> Tensor:
# From a set of original boxes and encoded relative box offsets, get the decoded boxes
# rel_codes --encoded boxes
# concat_boxes -- reference boxes
concat_boxes=concat_boxes.type(rel_codes.dtype)
# elements in each row of refence boxes are x1,y1,x2,y2 where x1<=x2 and y1<=y2 
# we compute the width and height of refence boxes
width=concat_boxes[:, 2]-concat_boxes[:, 0]  # 1D tensor of size N
height=concat_boxes[:, 3]-concat_boxes[:, 1]  # 1D tensor of size N
# then we compute the center of the reference boxes
ctr_x=concat_boxes[:, 0]+ 0.5*width  # 1D tensor of size N, this is the center
ctr_y=concat_boxes[:, 1]+ 0.5*height  # 1D tensor of size N, this is the center

wx, wy, ww, wh=box_coder.weights
# Nx1 we weights the delta estimates down
# dx here is delta scale relative to width for offset along x (e.g., offset from center along x)
dx=rel_codes[:, 0::4]/wx # similar to rel_codes[:, 0].unsqueeze(-1) for x/wx
# dy here is delta scale relative to height for offset along y (e.g., offset from center along y)
dy=rel_codes[:, 1::4]/wy # similar to rel_codes[:, 1].unsqueeze(-1) for y/wy
# dw is scale in log-space of width (i.e., apply exp before scaling width)
dw=rel_codes[:, 2::4]/ww # similar to rel_codes[:, 2].unsqueeze(-1) for w/ww
# dh is scale in log-space of height (i.e., apply exp before scaling height)
dh=rel_codes[:, 3::4]/wh # similar to rel_codes[:, 3].unsqueeze(-1) for h/wh
print('dx ', dx.shape, dx.min().item(), dx.max().item())
print('dy ', dy.shape, dy.min().item(), dy.max().item())
print('dw ', dw.shape, dw.min().item(), dw.max().item())
print('dh ', dh.shape, dh.min().item(), dh.max().item())

# prevent sending too large values to torch.exp
dw=torch.clamp(dw, max=box_coder.bbox_xform_clip)
dh=torch.clamp(dh, max=box_coder.bbox_xform_clip)

# Nx1 = Nx1 Nx1 Nx1
pred_ctr_x=dx*width[:,None]+ctr_x[:,None]
pred_ctr_y=dy*height[:,None]+ctr_y[:,None]
pred_w=torch.exp(dw)*width[:,None]
pred_h=torch.exp(dh)*height[:,None]

# Nx1 distance from boundary to the box's center. We use these to determine the bounding box
# corner location, x1,y1,x2,y2
c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device)*pred_h
c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device)*pred_w

# bounding box location
pred_boxes1=pred_ctr_x-c_to_c_w # Nx1
pred_boxes2=pred_ctr_y-c_to_c_h # Nx1
pred_boxes3=pred_ctr_x+c_to_c_w # Nx1
pred_boxes4=pred_ctr_y+c_to_c_h # Nx1
pred_boxes=torch.hstack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4)) # Nx4

dx  torch.Size([370920, 1]) -1.0294348001480103 0.7720302939414978
dy  torch.Size([370920, 1]) -1.2709095478057861 1.5505026578903198
dw  torch.Size([370920, 1]) -2.542602777481079 1.6185859441757202
dh  torch.Size([370920, 1]) -2.5830938816070557 2.1535117626190186


In [44]:
pred_boxes.shape

torch.Size([370920, 4])

In [37]:
torch.hstack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4)).shape

torch.Size([370920, 4])

1.391249812950831e+27