In [1]:
import torch
import torchvision

In [51]:
def IoU(bbox1: torch.Tensor, bbox2: torch.Tensor):
    """
    bbox1: Shape(N, 4)
    bbox2: Shape(M, 4)
    """
    
    N = bbox1.size(0)
    M = bbox2.size(0)
    
    lt = torch.max(
        bbox1[:, :2].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, :2].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
    )
    
    rb = torch.min(
        bbox1[:, 2:].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
    )
    
    wh = torch.clamp(rb - lt, min=0) # [wh < 0] = 0 # clip at 0
    inter = wh[:, :, 0] * wh[:, :, 1] # [N, M]

    # Compute area of the bboxes
    area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1]) # [N, ]
    area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1]) # [M, ]
    area1 = area1.unsqueeze(1).expand_as(inter) # [N, ] -> [N, 1] -> [N, M]
    area2 = area2.unsqueeze(0).expand_as(inter) # [M, ] -> [1, M] -> [N, M]

    union = area1 + area2 - inter # [N, M, 2]
    iou = inter / union           # [N, M, 2]

    return iou

In [139]:
def Norm2DNorm(Boxes: torch.Tensor, S: int):
    Target = torch.zeros_like(Boxes)
    Target[..., :2] = Boxes[..., :2] / torch.Tensor([S]).to(torch.float) - .5 * Boxes[..., 2:]
    Target[..., 2:] = Boxes[..., :2] / torch.Tensor([S]).to(torch.float) + .5 * Boxes[..., 2:]
    return Target

In [263]:
def yololoss(S: int, B: int, C: int, LambdaObj: float=5., LambdaNoObj: float=.5):
    N = B * 5 + C
    CI = [4, 9]
    BI = [0, 1, 2, 3, 5, 6, 7, 8]
    LI = [B * 5 + idx for idx in range(C)]
    XYI = [0, 1]
    WHI = [2, 3]
    def __CALL__(Prediction: torch.Tensor, Target: torch.Tensor):
        BatchSize = Prediction.size(0)
        coordMask = (Target[..., 4] == 1).unsqueeze(-1).expand_as(Target)
        noobjMask = (Target[..., 4] == 0).unsqueeze(-1).expand_as(Target)

        coordP = Prediction[coordMask].reshape(-1, N)  # [coord_n, N]
        noobjP = Prediction[noobjMask].reshape(-1, N)  # [coord_n, N]

        coordT = Target[coordMask].reshape(-1, N)  # [coord_n, N]
        noobjT = Target[noobjMask].reshape(-1, N)  # [coord_n, N]

        # Class Label
        ClassP = coordP[..., LI].reshape(-1, C)  # [coord_n, C]
        ClassT = coordT[..., LI].reshape(-1, C)  # [coord_n, C]
        # No Object Confidence
        NoObjP = noobjP[..., CI].reshape(-1, 1)  # [nooobj_n, 1]
        NoObjT = noobjT[..., CI].reshape(-1, 1)  # [nooobj_n, 1]
        # Object Confidence
        ConfP = coordP[..., CI].reshape(-1, 1);  # [coord_n, 1]
        # BBox
        BBoxP = coordP[..., BI].reshape(-1, B, 4)  # [coord_n, B, 4(XYWH)]
        BBoxT = coordT[..., BI].reshape(-1, B, 4)  # [coord_n, B, 4(XYWH)]

        with torch.no_grad():
            BBoxP = Norm2DNorm(BBoxP.reshape(-1, 4), S)
            BBoxT = Norm2DNorm(BBoxT.reshape(-1, 4), S)
            iou, iouIndex = torch.max(IoU(BBoxP.reshape(-1, 4), BBoxT.reshape(-1, 4)), dim=0)

        NSize= BBoxP.size(0)
        BBoxP = BBoxP.unsqueeze(0).expand(NSize, NSize, 4)[list(range(NSize)), iouIndex]
        BBoxT = BBoxT.unsqueeze(0).expand(NSize, NSize, 4)[list(range(NSize)), iouIndex]
        ConfP = ConfP.unsqueeze(0).expand(NSize, NSize, 1)[list(range(NSize)), iouIndex]
        
        lossXY = torch.nn.functional.mse_loss(BBoxP[..., XYI], BBoxT[..., XYI], reduction="sum")
        lossWH = torch.nn.functional.mse_loss(torch.sqrt(BBoxP[..., WHI]), torch.sqrt(BBoxT[..., WHI]), reduction="sum")

        lossObj = torch.nn.functional.mse_loss(ConfP.reshape(-1), iou, reduction="sum")
        lossNObj = torch.nn.functional.mse_loss(NoObjP, NoObjT, reduction="sum")
        lossClass = torch.nn.functional.mse_loss(ClassP, ClassT, reduction="sum")
        loss = (LambdaObj * (lossXY + lossWH) + LambdaNoObj * (lossNObj) + (lossObj + lossClass)) / BatchSize
        return loss
    return __CALL__

In [295]:
BBox1 = torch.zeros(1, 7, 7, 13)
BBox2 = torch.zeros(1, 7, 7, 13)

In [302]:
BBox1[0, 1, 1] = torch.Tensor([0.11, 0.11, 0.4, 0.4, .9, 0.9, 0.9, 0.4, 0.4, .9, 0., 1., 0.])
BBox2[0, 1, 1] = torch.Tensor([0.1, 0.1, 0.4, 0.4, 1., 0.1, 0.1, 0.4, 0.4, 1., 0., 0., 1.])

In [303]:
yololoss(7, 2, 3)(BBox1, BBox2)

tensor(2.0148)

In [304]:
Boxes1 = torch.Tensor([
    [100, 100, 250, 250],
    [10, 10, 50, 50],
    [50, 75, 70, 80],
    [250, 250, 300, 300],
])
Target = torch.Tensor([
    [90, 90, 300, 300],
    [20, 20, 74, 75],
    [251, 251, 300, 300],
])

In [380]:
scores = torch.Tensor([0.1, 0.4, 0.2, 0.9])
iou = IoU(Target, Boxes1)
iou, index = iou.max(0)

In [381]:
TP = torch.zeros(10)
TF = torch.ones(10)

scores, index = torch.sort(scores, descending=True)
TP[torch.where(iou[index] > 0.5)] = 1.
TF[torch.where(iou[index] > 0.5)] = 0.

In [454]:
def AP(Scores: torch.Tensor, Correct: torch.Tensor) -> torch.Tensor:
        
    if torch.sum(Correct) == 0:
        # return torch.sum(Correct), Correct, Correct
        return 0.

    IndexSort = torch.sort(Scores, descending=True)[-1] # 降順
    # Scores = Scores[IndexSort]
    Correct = Correct[IndexSort]

    TP = torch.cumsum(Correct, dim=-1)
    Precision = TP / (torch.arange(TP.size(0)) + 1.)
    Recall = TP / torch.sum(Correct, dim=-1)
    
    # PrecisionFlip = Precision.flip(dims=(0,))
    # PrecisionFlip = torch.cummax(PrecisionFlip, dim=0)[0].flip(dims=(0,))

    Precision = torch.concat([torch.Tensor([0]), Precision, torch.Tensor([0])], dim=-1)
    Recall = torch.concat([torch.Tensor([0]), Recall, torch.Tensor([1])], dim=-1)

    Recall = Recall[1:] - Recall[:-1]
    
    return torch.sum(Recall * Precision[1:], dim=-1)

In [455]:
Scores = torch.Tensor([95, 92, 85, 80, 70, 60])
Correct = torch.Tensor([1, 1, 0, 1, 0, 1])

In [456]:
AP(Scores, Correct)

tensor(0.8542)

In [None]:
def IoUCul(P, T):
        """
        P (input): [Batch, coord_n, xywh]
        T (input): [Batch, coord_n, xywh]
        """

        XYI = [0, 1]
        WHI = [2, 3]

        S = 7
        P = P.clone()
        T = T.clone()

        PXYMIN = P[..., XYI] / float(S) - 0.5 * P[..., WHI]
        PXYMAX = P[..., XYI] / float(S) + 0.5 * P[..., WHI]

        TXYMIN = T[..., XYI] / float(S) - 0.5 * T[..., WHI]
        TXYMAX = T[..., XYI] / float(S) + 0.5 * T[..., WHI]

        lt = torch.max(PXYMIN, TXYMIN)
        rb = torch.min(PXYMAX, TXYMAX)

        wh = torch.clamp(rb - lt, min=0.)
        intersect = (wh[..., 0] * wh[..., 1])

        Area1 = (PXYMAX - PXYMIN)
        Area1 = Area1[..., 0] * Area1[..., 1]
        Area2 = (TXYMAX - TXYMIN)
        Area2 = Area2[..., 0] * Area2[..., 1]
        Union = Area1 + Area2 - intersect

        iou = intersect / Union
        return torch.max(iou.reshape(-1, 2), dim=1)