In [2]:
import torch
import numpy as np

### Anchor Bounding Box

This is the [`generate_anchors`](https://github.com/pytorch/vision/blob/main/torchvision/models/detection/anchor_utils.py)

In [3]:
sizes=((32,), (64,), (128,), (256,), (512,))
aspect_ratios=((0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0))

In [6]:
# for every (size, aspect_ratio) combination, output a zero-centered anchor
# We assume aspect_ratio=height/width 

device=torch.device('cpu')
dtype=torch.float32

cell_anchors=[]
for size, aspect_ratio in zip(sizes, aspect_ratios):
    size=torch.as_tensor(size, dtype=dtype, device=device) # 1D say of size S
    aspect_ratio=torch.as_tensor(aspect_ratio, dtype=dtype, device=device) #1D say of size A
    # why sqrt?
    h_ratio=torch.sqrt(aspect_ratio) # sqrt(height/width)
    w_ratio=1./h_ratio # sqrt(width/height)
    print('h_ratio ', h_ratio)
    print('w_ratio ', w_ratio)
    # Ax1 1xS -> AxS -> AS
    ws=(w_ratio[:,None]*size[None,:]).view(-1)
    hs=(h_ratio[:,None]*size[None,:]).view(-1)

    base_anchors=torch.stack([-ws, -hs, ws, hs], dim=1)/2
    cell_anchors.append(base_anchors)
print(len(cell_anchors), '\n', [a.shape for a in cell_anchors], '\n', cell_anchors)

h_ratio  tensor([0.7071, 1.0000, 1.4142])
w_ratio  tensor([1.4142, 1.0000, 0.7071])
h_ratio  tensor([0.7071, 1.0000, 1.4142])
w_ratio  tensor([1.4142, 1.0000, 0.7071])
h_ratio  tensor([0.7071, 1.0000, 1.4142])
w_ratio  tensor([1.4142, 1.0000, 0.7071])
h_ratio  tensor([0.7071, 1.0000, 1.4142])
w_ratio  tensor([1.4142, 1.0000, 0.7071])
h_ratio  tensor([0.7071, 1.0000, 1.4142])
w_ratio  tensor([1.4142, 1.0000, 0.7071])
5 
 [torch.Size([3, 4]), torch.Size([3, 4]), torch.Size([3, 4]), torch.Size([3, 4]), torch.Size([3, 4])] 
 [tensor([[-22.6274, -11.3137,  22.6274,  11.3137],
        [-16.0000, -16.0000,  16.0000,  16.0000],
        [-11.3137, -22.6274,  11.3137,  22.6274]]), tensor([[-45.2548, -22.6274,  45.2548,  22.6274],
        [-32.0000, -32.0000,  32.0000,  32.0000],
        [-22.6274, -45.2548,  22.6274,  45.2548]]), tensor([[-90.5097, -45.2548,  90.5097,  45.2548],
        [-64.0000, -64.0000,  64.0000,  64.0000],
        [-45.2548, -90.5097,  45.2548,  90.5097]]), tensor([[-181.01

### Compute anchors on grid

This is the [`grid_anchors`](https://github.com/pytorch/vision/blob/main/torchvision/models/detection/anchor_utils.py) function.

In [28]:
device=torch.device('cpu')
grid_sizes=[torch.Size([200, 328]), torch.Size([100, 164]), torch.Size([50, 82]), torch.Size([25, 41]), torch.Size([13, 21])] 
image_size=torch.Size([800, 1312])
strides=[
    [torch.empty((), dtype=torch.int64, device=device).fill_(image_size[0]//g[0]),
     torch.empty((), dtype=torch.int64, device=device).fill_(image_size[1]//g[1])]
    for g in grid_sizes
]
print('strides ', strides)

anchors=[]
assert cell_anchors is not None, 'cell_anchors should not be None'
assert len(grid_sizes)==len(strides)==len(cell_anchors), 'Anchors should be Tuple[Tuple[int]] '\
'because each feature map could potentially have different sizes and aspect ratios. There needs to be a match between the number of'\
'feature maps passed and the number of sizes and aspect ratios specified'

for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
    grid_height, grid_width=size
    stride_height, stride_width=stride
    device=base_anchors.device
    
    # For output anchor, compute [x_center, y_center, x_center, y_center]
    # We associate each grid location/index (gx,gy) to location on image via stride
    shifts_x=torch.arange(0, grid_width, dtype=torch.int32, device=device)*stride_width # location on image im_x = stride_x * gx
    shifts_y=torch.arange(0, grid_height, dtype=torch.int32, device=device)*stride_height # location on image im_y = stride_y * gy
    print('\tshifts_x ', shifts_x.shape, shifts_x.min().item(), shifts_x.max().item(), shifts_x.dtype)
    print('shifts_y ', shifts_y.shape, shifts_y.min().item(), shifts_y.max().item(), shifts_y.dtype)
    shift_y, shift_x=torch.meshgrid(shifts_y, shifts_x, indexing='ij')
    print('shift_x ', shift_x.shape, shift_x.min().item(), shift_x.max().item(), shift_x.dtype, shift_x[0][:10])
    print('shift_y ', shift_y.shape, shift_y.min().item(), shift_y.max().item(), shift_y.dtype, shift_y[:,0][:10])
    shift_x=shift_x.reshape(-1)
    shift_y=shift_y.reshape(-1)
    # each linked feature index/location corresponds to location on image
    shifts=torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1) # WHWH or XYXY
    print('shifts ', shifts.shape, shifts.min(dim=0).values, shifts.max(dim=0).values, shifts.dtype)

    # For every (base_anchor, output anchor) pair, offset each zero-centered base anchor by the center of the output anchor
    n_coords=base_anchors.shape[-1] # 4 for 2D, i.e., x1,y1,x2,y2
    # Let n=gxgy, nx1x4 + 1xnbx4 -> nxnbx4 -> (n*nb)x4
    anchors.append(shifts.view(shifts.shape[0], 1, shifts.shape[-1])+base_anchors.view(1,*base_anchors.shape).view(-1, n_coords))
    break
print('anchors ', [anchor.shape for anchor in anchors])

strides  [[tensor(4), tensor(4)], [tensor(8), tensor(8)], [tensor(16), tensor(16)], [tensor(32), tensor(32)], [tensor(61), tensor(62)]]
	shifts_x  torch.Size([328]) 0 1308 torch.int32
shifts_y  torch.Size([200]) 0 796 torch.int32
shift_x  torch.Size([200, 328]) 0 1308 torch.int32 tensor([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36], dtype=torch.int32)
shift_y  torch.Size([200, 328]) 0 796 torch.int32 tensor([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36], dtype=torch.int32)
shifts  torch.Size([65600, 4]) tensor([0, 0, 0, 0], dtype=torch.int32) tensor([1308,  796, 1308,  796], dtype=torch.int32) torch.int32
anchors  [torch.Size([65600, 3, 4])]
