In [0]:
!pip3 uninstall pytorch-hrvvi-ext -y
!pip3 install -U git+https://github.com/sbl1996/pytorch-hrvvi-ext.git


In [1]:
import sys
import os

import torch
import hutil
import matplotlib.pyplot as plt
print(hutil.__version__)

1.4.14


In [0]:
%load_ext autoreload
%autoreload 2

In [3]:
gdrive = "/gdrive"
from google.colab import drive
drive.mount(gdrive, force_remount=True)
mydrive = os.path.join(gdrive, "My Drive")
!ls /gdrive/My\ Drive

def gpath(p):
    return os.path.join(mydrive, p)

Mounted at /gdrive
'Colab Notebooks'   eng-fra.pt	 images   repo	   weixin.pkl
 datasets	    fonts	 models   result


In [4]:
import os
import math
import random
from pathlib import Path

import numpy as np
from PIL import Image
from toolz import curry
from toolz.curried import get

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import LambdaLR, MultiStepLR
from torch.utils.data import DataLoader, ConcatDataset

from torchvision.models.resnet import resnet50, Bottleneck

from hutil import cuda, one_hot
from hutil.data import train_test_split, Fullset
from hutil.train import init_weights, Trainer
from hutil.datasets.voc import VOCDetection, DETECTION_CATEGORIES
from hutil.train.metrics import TrainLoss, MeanAveragePrecision
from hutil.ext.summary import summary
from hutil.detection import transform_bbox, box_collate_fn, draw_bboxes, BBox, non_max_suppression, iou_1m, transform_bboxes
from hutil.transforms import UseOrigin, Compose, RandomChoice
from hutil.transforms.detection import ToTensor, ToPercentCoords, RandomHorizontalFlip, RandomResizedCrop, Resize, CenterCrop
from hutil.inference import freeze
from hutil.model.utils import get_out_channels


█

In [0]:



def inverse_sigmoid(x):
    return math.log(x / (1-x))


def get_whs(scales, aspect_ratios):
    whs = torch.zeros(len(aspect_ratios), 2)
    whs[:, 0] = aspect_ratios.sqrt() * scales[0]
    whs[:, 1] = (1 / aspect_ratios.sqrt()) * scales[1]
    return whs.view(-1, 2)


def get_anchors(lx, ly, whs, width, height):
    anchors = torch.zeros(lx, ly, len(whs), 4)
    anchors[:, :, :, 0] = (torch.arange(
        lx, dtype=torch.float).view(lx, 1, 1).expand(lx, ly, len(whs)) + 0.5) / lx
    anchors[:, :, :, 1] = (torch.arange(
        ly, dtype=torch.float).view(1, ly, 1).expand(lx, ly, len(whs)) + 0.5) / ly
    anchors[:, :, :, 2] = whs[:, 0] / width
    anchors[:, :, :, 3] = whs[:, 1] / height
    return anchors


def compute_loc_target(gt_box, anchors):
    box_txty = (gt_box[:2] - anchors[..., :2]) \
        / anchors[..., 2:]
    box_twth = torch.log(gt_box[2:] / anchors[..., 2:])
    return torch.cat((box_txty, box_twth), dim=-1)


class RefineTransform:

    def __init__(self, f_anchors, num_classes, get_label=get("category_id"), get_bbox=get("bbox")):
        self.f_anchors = f_anchors
        self.num_classes = num_classes
        self.get_label = get_label
        self.get_bbox = get_bbox

    def __call__(self, img, anns):
        num_feature_maps = len(self.f_anchors)
        loc_targets = []
        cls_targets = []
        f_anchors = []
        for i in range(num_feature_maps):
            anchors = self.f_anchors[i].view(-1, 4)
            f_anchors.append(anchors)
            num_anchors = anchors.size(0)
            loc_targets.append(torch.zeros(num_anchors, 4))
            cls_targets.append(torch.zeros((num_anchors,), dtype=torch.long))

        for ann in anns:
            label = self.get_label(ann) + 1
            bbox = torch.tensor(
                transform_bbox(
                    self.get_bbox(ann), BBox.LTWH, BBox.XYWH))

            max_ious = []
            for anchors, loc_t, cls_t in zip(f_anchors, loc_targets, cls_targets):
                ious = iou_1m(bbox, anchors, format=BBox.XYWH)
                max_ious.append(ious.max(dim=0))

                iou_mask = ious > 0.5
                if iou_mask.sum() != 0:
                    cls_t[iou_mask] = label
                    loc_t[iou_mask] = compute_loc_target(
                        bbox, anchors[iou_mask])

            fi, (max_iou, ind) = max(
                enumerate(max_ious), key=lambda x: x[1][0])
            loc_targets[fi][ind] = compute_loc_target(bbox, f_anchors[fi][ind])
            cls_targets[fi][ind] = label

        return img, [loc_targets, cls_targets]


def focal_loss2(input, target, gamma, beta, reduction='mean'):
    target = target.unsqueeze(1)
    logit = gamma * input.gather(1, target) + beta
    input = input.scatter(1, target, logit)
    return F.cross_entropy(input, target.squeeze(1), reduction=reduction) / gamma


def binary_focal_loss2(input, target, gamma=2, beta=1, alpha=0.25, eps=1e-4, reduction='mean'):
    xt = gamma * input + beta * (2 * target - 1)
    eps = inverse_sigmoid(1-eps)
    xt = torch.clamp(xt, -eps, eps)
    return F.binary_cross_entropy_with_logits(
        xt, target,
        reduction=reduction,
        pos_weight=torch.tensor(alpha)) / gamma


class RefineLoss(nn.Module):

    def __init__(self, f_anchors, num_classes, neg_filter_threshold=0.01, p=0.01):
        super().__init__()
        self.f_anchors = f_anchors
        self.num_classes = num_classes
        self.neg_filter_threshold = inverse_sigmoid(neg_filter_threshold)
        self.p = p

    def forward(self, rps, dps, loc_targets, cls_targets):
        batch_size = rps[0].size(0)
        r_loc_loss = 0
        r_cls_loss = 0
        r_num_pos = 0
        d_loc_loss = 0
        d_cls_loss = 0
        for rp, dp, loc_t, cls_t, anchors in zip(rps, dps, loc_targets, cls_targets, self.f_anchors):
            pos = cls_t != 0
            num_pos = pos.sum().item()
            if num_pos == 0:
                continue
            r_num_pos += num_pos
            rp = rp.permute(0, 3, 2, 1).contiguous().view(batch_size, -1, 5)
            r_loc_p = rp[..., :4]
            r_cls_p = rp[..., 4]
            anchors = anchors.view(-1, 4)

            r_loc_loss += F.smooth_l1_loss(
                r_loc_p[pos], loc_t[pos], reduction='sum')
            r_cls_loss += F.binary_cross_entropy_with_logits(
                r_cls_p, pos.float(), reduction='sum')

            r_loc_p = r_loc_p.clone().detach()
            r_cls_p = r_cls_p.detach()

            neg_filter = (~pos) & (r_cls_p > self.neg_filter_threshold)

            dp = dp.permute(0, 3, 2, 1).contiguous().view(
                batch_size, -1, 4 + self.num_classes)
            d_loc_p = dp[..., :4]
            d_cls_p = dp[..., 4:]

            d_loc_t = loc_t - r_loc_p
            d_loc_t[..., :2].div_(r_loc_p[..., 2:].exp_())

            d_loc_loss += torch.clamp(F.smooth_l1_loss(
                d_loc_p[pos], d_loc_t[pos], reduction='sum'), 0, num_pos)

            d_cls_loss += F.cross_entropy(
                d_cls_p[pos], cls_t[pos], reduction='sum')

            d_cls_p_neg = d_cls_p[neg_filter]
            if len(d_cls_p_neg) != 0:
                d_cls_loss_neg = -F.log_softmax(d_cls_p_neg, dim=1)[:, 0]
                num_neg = min(3 * num_pos, len(d_cls_p_neg))
                d_cls_loss += torch.topk(
                    d_cls_loss_neg, num_neg, sorted=False)[0].sum()

        r_loc_loss /= r_num_pos
        r_cls_loss /= r_num_pos
        d_loc_loss /= r_num_pos
        d_cls_loss /= r_num_pos

        loss = r_loc_loss + r_cls_loss + d_loc_loss + d_cls_loss
        if random.random() < self.p:
            print("r_loc: %.4f | r_cls: %3.4f | d_loc: %.4f | d_cls: %.4f" % (
                r_loc_loss.item(), r_cls_loss.item(), d_loc_loss.item(), d_cls_loss.item()))
        return loss


class RefineInference:

    def __init__(self, f_anchors, width, height, num_classes, neg_filter_threshold=0.01, iou_threshold=0.45, r_topk=400, d_topk=200):
        self.f_anchors = f_anchors
        self.width = width
        self.height = height
        self.num_classes = num_classes
        self.neg_filter_threshold = inverse_sigmoid(neg_filter_threshold)
        self.iou_threshold = iou_threshold
        self.r_topk = r_topk
        self.d_topk = d_topk

    def __call__(self, rps, dps):
        detections = []
        batch_size = rps[0].size(0)

        for i in range(batch_size):
            boxes = []
            confs = []
            labels = []
            for rp, dp, anchors in zip(rps, dps, self.f_anchors):
                rp = rp[i].permute(2, 1, 0).contiguous().view(-1, 5)
                r_loc_p = rp[:, :4]
                r_cls_p = rp[:, 4]
                anchors = anchors.view(-1, 4)

                neg_filter = r_cls_p > self.neg_filter_threshold
                r_loc_p = r_loc_p[neg_filter]
                r_cls_p = r_cls_p[neg_filter]
                anchors = anchors[neg_filter]

                r_loc_p[:, :2].mul_(anchors[:, 2:]).add_(anchors[:, :2])
                r_loc_p[:, 2:].exp_().mul_(anchors[:, 2:])

                dp = dp[i].permute(2, 1, 0).contiguous().view(
                    -1, 4 + self.num_classes)[neg_filter]
                if len(dp) == 0:
                    continue
                d_loc_p = dp[:, :4]
                d_cls_p = dp[:, 4:]
                conf, label = F.softmax(d_cls_p, dim=1)[:, 1:].max(dim=1)

                d_loc_p[:, :2].mul_(r_loc_p[:, 2:]).add_(r_loc_p[:, :2])
                d_loc_p[:, 2:].exp_().mul_(r_loc_p[:, 2:])
                d_loc_p[:, [0, 2]] *= self.width
                d_loc_p[:, [1, 3]] *= self.height

                box = d_loc_p

                confs.append(conf)
                boxes.append(box)
                labels.append(label)
            
            if len(boxes) == 0:
                continue

            boxes = torch.cat(boxes, dim=0)
            confs = torch.cat(confs, dim=0)
            labels = torch.cat(labels, dim=0)

            confs, indices = confs.topk(min(len(confs), self.r_topk))
            boxes = boxes[indices]
            labels = labels[indices]

            boxes = transform_bboxes(
                boxes, format=BBox.XYWH, to=BBox.LTRB, inplace=True)
            indices = non_max_suppression(
                boxes, confs, self.iou_threshold)

            confs = confs[indices]
            boxes = boxes[indices]
            labels = labels[indices]

            indices = confs.topk(min(len(confs), self.d_topk))[1]

            for ind in indices:
                detections.append(
                    BBox(
                        image_name=i,
                        class_id=labels[ind].item(),
                        box=boxes[ind].tolist(),
                        confidence=confs[ind].item(),
                        box_format=BBox.LTRB,
                    )
                )
        return detections


In [0]:


def conv1x1(in_channels, out_channels, stride=1):
    return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)


def conv3x3(in_channels, out_channels, stride=1, padding=1):
    return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=padding)


class Bottleneck(nn.Module):

    def __init__(self, in_channels, out_channels, stride=1, expansion=4):
        super().__init__()
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels
        channels = out_channels // expansion

        self.conv1 = conv1x1(in_channels, channels)
        self.bn1 = nn.BatchNorm2d(channels)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = conv3x3(channels, channels, stride=stride)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu2 = nn.ReLU(inplace=True)

        self.conv3 = conv1x1(channels, out_channels)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu3 = nn.ReLU(inplace=True)

        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = conv1x1(in_channels, out_channels, stride=stride)

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu3(out)

        return out


class TransferConnection(nn.Module):
    def __init__(self, in_channels, out_channels, last=False):
        super().__init__()
        self.last = last
        self.conv1 = conv3x3(in_channels, out_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        if not last:
            self.deconv1 = nn.ConvTranspose2d(
                out_channels, out_channels, 4, stride=2, padding=1)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(out_channels, out_channels)
        self.relu3 = nn.ReLU(inplace=True)

    def forward(self, x, x_next=None):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        if not self.last:
            x = x + self.deconv1(x_next)
        x = self.relu2(x)
        x = self.conv2(x)
        x = self.relu3(x)
        return x


class RefineDet(nn.Module):
    def __init__(self, backbone, num_classes, num_anchors, f_channels):
        super().__init__()
        self.num_classes = num_classes
        self.conv1 = backbone.conv1
        self.bn1 = backbone.bn1
        self.relu = backbone.relu
        self.maxpool = backbone.maxpool

        self.layer1 = backbone.layer1
        self.layer2 = backbone.layer2
        self.layer3 = backbone.layer3
        self.layer4 = backbone.layer4

        stages = [
            get_out_channels(self.layer2),
            get_out_channels(self.layer3),
            get_out_channels(self.layer4),
        ]

        self.layer5 = Bottleneck(stages[2], f_channels, stride=2)

        self.rp1 = conv3x3(stages[0], num_anchors * (4 + 1))
        self.rp2 = conv3x3(stages[1], num_anchors * (4 + 1))
        self.rp3 = conv3x3(stages[2], num_anchors * (4 + 1))
        self.rp4 = conv3x3(f_channels, num_anchors * (4 + 1))

        self.tcb1 = TransferConnection(stages[0], f_channels)
        self.tcb2 = TransferConnection(stages[1], f_channels)
        self.tcb3 = TransferConnection(stages[2], f_channels)
        self.tcb4 = TransferConnection(f_channels, f_channels, last=True)

        self.dp1 = conv3x3(f_channels, num_anchors * (4 + num_classes))
        self.dp2 = conv3x3(f_channels, num_anchors * (4 + num_classes))
        self.dp3 = conv3x3(f_channels, num_anchors * (4 + num_classes))
        self.dp4 = conv3x3(f_channels, num_anchors * (4 + num_classes))

    def init_new_layers(self):
        def init_weight(m):
            name = type(m).__name__
            if name.find("Linear") != -1 or name.find("Conv") != -1:
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        self.layer5.apply(init_weight)

    def forward(self, x):
        b = x.size(0)
        c1 = self.conv1(x)
        c1 = self.bn1(c1)
        c1 = self.relu(c1)
        c2 = self.maxpool(c1)

        c2 = self.layer1(c2)
        c3 = self.layer2(c2)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)
        c6 = self.layer5(c5)

        rf3 = self.rp1(c3)
        rf4 = self.rp2(c4)
        rf5 = self.rp3(c5)
        rf6 = self.rp4(c6)

        dc6 = self.tcb4(c6)
        dc5 = self.tcb3(c5, dc6)
        dc4 = self.tcb2(c4, dc5)
        dc3 = self.tcb1(c3, dc4)

        df3 = self.dp1(dc3)
        df4 = self.dp2(dc4)
        df5 = self.dp3(dc5)
        df6 = self.dp4(dc6)

        return [rf3, rf4, rf5, rf6], [df3, df4, df5, df6]


In [0]:

WIDTH = 320
HEIGHT = 320
STRIDES = [8, 16, 32, 64]
LOCATIONS = [
    (40, 40),
    (20, 20),
    (10, 10),
    (5, 5),
]
ASPECT_RATIOS = torch.tensor([1, 2, 1/2])
F_WHS = [
    get_whs([4 * s, 4 * s], ASPECT_RATIOS)
    for s in STRIDES
]
F_ANCHORS = [
    get_anchors(lx, ly, whs, WIDTH, HEIGHT)
    for (lx, ly), whs in zip(LOCATIONS, F_WHS)
]
NUM_CLASSES = 21


In [0]:

train_transform = Compose([
#     RandomChoice([
#         UseOrigin(),
#         RandomResizedCrop(size=(HEIGHT, WIDTH), scale=(
#             0.1, 1), ratio=(1/2, 2), drop=True),
#     ]),
    Resize((HEIGHT, WIDTH)),
    RandomHorizontalFlip(0.5),
    ToPercentCoords(),
    RefineTransform(F_ANCHORS, NUM_CLASSES),
    ToTensor(),
])

test_transform = Compose([
    Resize((HEIGHT, WIDTH)),
    ToTensor(),
])

# train_transform = Compose([
#     RandomHorizontalFlip(),
#     Resize(HEIGHT),
#     CenterCrop(HEIGHT),
#     ToPercentCoords(),
#     RefineTransform(F_ANCHORS, NUM_CLASSES),
#     ToTensor(),
# ])

# test_transform = Compose([
#     Resize((HEIGHT, WIDTH)),
#     ToTensor(),
# ])

data_home = Path(".")
ds1 = VOCDetection(data_home / "VOC", year='2007', image_set='trainval', download=True)
ds2 = VOCDetection(data_home / "VOCTest", year='2007', image_set='test', download=True)
ds3 = VOCDetection(data_home / "VOC", year='2012', image_set='trainval', download=True)
ds = ConcatDataset([ds1, ds2, ds3])
# ds = VOCDetection(data_home, year='2012', image_set='trainval', download=True)
# rest, ds = train_test_split(
#     ds, test_ratio=0.01
# )
# ds_train = Fullset(ds, train_transform)
# ds_val = Fullset(ds, test_transform)

ds_train, ds_val = train_test_split(
    ds, test_ratio=0.05,
    transform=train_transform,
    test_transform=test_transform)


In [0]:
backbone = resnet50(pretrained=True)
del backbone.fc

In [0]:


net = RefineDet(backbone, NUM_CLASSES, len(ASPECT_RATIOS), 256)
# net.apply(init_weights(nonlinearity='relu'))
net.init_new_layers()
criterion = RefineLoss(cuda(F_ANCHORS), NUM_CLASSES, p=0.1)
optimizer = SGD(filter(lambda x: x.requires_grad,
                       net.parameters()), lr=0.001, momentum=0.9, dampening=0.9, weight_decay=5e-4)
# optimizer = Adam(filter(lambda x: x.requires_grad,
#                         net.parameters()), lr=1e-3, weight_decay=1e-4)

lr_scheduler = MultiStepLR(optimizer, [120, 160, 200], gamma=0.1)

metrics = {
    'loss': TrainLoss(),
}
inference = RefineInference(cuda(F_ANCHORS), WIDTH, HEIGHT, NUM_CLASSES, d_topk=200)
test_metrics = {
    'mAP': MeanAveragePrecision(inference)
}


trainer = Trainer(net, criterion, optimizer, lr_scheduler,
                  metrics=metrics, evaluate_metrics=test_metrics,
                  save_path=gpath("models"), name="RefineDet-VOC2012")


In [0]:
trainer.load_state_dict(torch.load(gpath("models/RefineDet-VOC2012_trainer_11.pth")))

In [0]:
summary(net, (3,HEIGHT, WIDTH))

In [0]:
train_loader = DataLoader(
    ds_train, batch_size=32, shuffle=True, num_workers=1, pin_memory=True)
val_loader = DataLoader(
    ds_val, batch_size=48, collate_fn=box_collate_fn)


In [0]:
hist = trainer.fit(train_loader, 10, save_per_epochs=1)
plot_history(hist)

Epoch 29/38
r_loc: 0.0570 | r_cls: 2.2942 | d_loc: 0.0661 | d_cls: 3.0268
r_loc: 0.0609 | r_cls: 2.0948 | d_loc: 0.0716 | d_cls: 2.5243
r_loc: 0.0609 | r_cls: 2.3357 | d_loc: 0.0660 | d_cls: 2.6593
r_loc: 0.0501 | r_cls: 2.1343 | d_loc: 0.0578 | d_cls: 2.7128
r_loc: 0.0497 | r_cls: 2.2253 | d_loc: 0.0561 | d_cls: 2.8168
r_loc: 0.0473 | r_cls: 2.4576 | d_loc: 0.0564 | d_cls: 3.1163
r_loc: 0.0471 | r_cls: 2.3319 | d_loc: 0.0548 | d_cls: 3.2444
r_loc: 0.0486 | r_cls: 2.4970 | d_loc: 0.0533 | d_cls: 3.3894
r_loc: 0.0896 | r_cls: 2.6191 | d_loc: 0.0969 | d_cls: 2.7831
r_loc: 0.0676 | r_cls: 2.3495 | d_loc: 0.0782 | d_cls: 2.9043
r_loc: 0.0466 | r_cls: 2.4651 | d_loc: 0.0537 | d_cls: 2.9282
r_loc: 0.0530 | r_cls: 2.1981 | d_loc: 0.0620 | d_cls: 3.1052
r_loc: 0.0608 | r_cls: 2.6797 | d_loc: 0.0660 | d_cls: 3.2953
r_loc: 0.0581 | r_cls: 2.3235 | d_loc: 0.0666 | d_cls: 2.9431
r_loc: 0.0502 | r_cls: 2.4144 | d_loc: 0.0551 | d_cls: 2.8972
r_loc: 0.0546 | r_cls: 2.7840 | d_loc: 0.0619 | d_cls: 2.9

In [0]:
def plot_history(hist):
    for k, v in hist.items():
        fig, ax = plt.subplots()
        ax.plot(v)
        ax.set_title(k)

In [18]:
%time trainer.evaluate(val_loader)

CPU times: user 1min 3s, sys: 14.9 s, total: 1min 18s
Wall time: 1min 18s


{'mAP': 0.44497637203287527}

In [0]:
plot_history(trainer.metric_history)
trainer.metric_history