In [0]:
# Remove any previously installed copy of pytorch-hrvvi-ext, then install the
# current head of the GitHub repository. The install is not pinned to a commit,
# so the version printed in the next cell can change between runs.
!pip3 uninstall pytorch-hrvvi-ext -y
!pip3 install -U git+https://github.com/sbl1996/pytorch-hrvvi-ext.git


In [0]:
import sys
import os

import torch
import hutil
import matplotlib.pyplot as plt
print(hutil.__version__)

1.4.14


In [0]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [0]:
# Mount Google Drive inside the Colab VM. `force_remount=True` remounts even
# if a previous session left the mount point in place.
gdrive = "/gdrive"
from google.colab import drive
drive.mount(gdrive, force_remount=True)
# Root of the user's own Drive files; `gpath` below resolves paths against it.
mydrive = os.path.join(gdrive, "My Drive")
# List the Drive root as a quick check that the mount worked.
!ls /gdrive/My\ Drive

def gpath(p):
    """Return the path of `p` resolved against the mounted Drive root `mydrive`."""
    resolved = os.path.join(mydrive, p)
    return resolved

Mounted at /gdrive
'Colab Notebooks'   eng-fra.pt	 images   repo	   weixin.pkl
 datasets	    fonts	 models   result


In [0]:
import math
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import LambdaLR, MultiStepLR
from torch.utils.data import DataLoader

from hutil import cuda, one_hot
from hutil.train import init_weights, Trainer
from hutil.data import train_test_split, Fullset
from hutil.transforms.detection import Compose, Resize, CenterCrop, ToTensor, ToPercentCoords
from hutil.ext.captcha import ImageCaptcha
from hutil.datasets import CaptchaDetectionOnline
from hutil.train.metrics import TrainLoss, MeanAveragePrecision
from hutil.detection import BBox, box_collate_fn, transform_bboxes, transform_bbox, iou_1m, non_max_suppression
from hutil.ext.summary import summary
from hutil.inference import freeze


█

In [0]:

def get_anchors(lx, ly, priors):
    """Build the anchor boxes of one feature map in normalised XYWH form.

    Args:
        lx: number of grid cells along x.
        ly: number of grid cells along y.
        priors: tensor of shape (num_priors, 2) with prior (w, h) in grid-cell
            units.

    Returns:
        Tensor of shape (lx, ly, num_priors, 4). Channels 0/1 hold the cell
        centre as a fraction of the grid, channels 2/3 hold the prior size
        divided by (lx, ly).
    """
    grid_shape = (lx, ly, len(priors))
    center_x = torch.arange(lx, dtype=torch.float).view(lx, 1, 1)
    center_y = torch.arange(ly, dtype=torch.float).view(1, ly, 1)

    anchors = torch.zeros(*grid_shape, 4)
    anchors[..., 0] = (center_x.expand(grid_shape) + 0.5) / lx
    anchors[..., 1] = (center_y.expand(grid_shape) + 0.5) / ly
    # Broadcast the per-prior sizes over every grid cell.
    anchors[..., 2:] = priors / torch.FloatTensor([lx, ly])
    return anchors


def inverse_sigmoid(x):
    """Return the logit of `x` after clamping it to [0.01, 0.99].

    The clamp keeps the result finite for inputs at or beyond 0 and 1.
    """
    lower, upper = .01, .99
    clipped = max(x, lower)
    clipped = min(clipped, upper)
    return math.log(clipped / (1 - clipped))


def scale_boxes(boxes, width, height, inplace=False):
    r"""Convert cell-relative XYWH boxes to coordinates on a width x height canvas.

    Args:
        boxes: (lx, ly, num_priors, 4); x/y are offsets inside their grid
            cell, w/h are expressed in grid cells.
        width: target extent along x.
        height: target extent along y.
        inplace: when True `boxes` itself is modified and returned, otherwise
            a clone is scaled.

    Returns:
        Tensor with the same shape as `boxes`.
    """
    scaled = boxes if inplace else boxes.clone()
    lx, ly = scaled.shape[:2]
    cell_w, cell_h = width / lx, height / ly

    def cell_index(count, shape):
        # Per-cell integer offsets, shaped to broadcast over the other axes.
        return torch.arange(
            count, dtype=scaled.dtype, device=scaled.device).view(*shape)

    # Add the index of the owning cell to the in-cell offsets ...
    scaled[..., 0] += cell_index(lx, (-1, 1, 1))
    scaled[..., 1] += cell_index(ly, (1, -1, 1))
    # ... then stretch grid units to the target extent.
    scaled[..., [0, 2]] *= cell_w
    scaled[..., [1, 3]] *= cell_h
    return scaled


def filter_tensors(*tensors, indices):
    """Index every positional argument with the same `indices`.

    Returns:
        A list with one indexed result per input, in input order.
    """
    selected = []
    for tensor in tensors:
        selected.append(tensor[indices])
    return selected

In [0]:

class YOLOTransform:
    """Turn the annotations of one image into per-feature-map training targets.

    Args:
        f_anchors: one anchor tensor per feature map, each shaped
            (lx, ly, num_priors, 4) in XYWH form.
        num_classes: stored on the instance; not read by `__call__`.
        ignore_threshold: anchors whose IoU with any box exceeds this value
            are flagged in the returned IoU masks.
        get_label: callable extracting the class id from an annotation.
        get_bbox: callable extracting (left, top, width, height) from an
            annotation.
        label_offset: value added to every extracted label.

    NOTE(review): the default extractors call `get`, which is not imported in
    the cells visible here - confirm it is in scope before this class runs.
    """

    def __init__(self, f_anchors, num_classes, ignore_threshold=0.5, get_label=get("category_id"), get_bbox=get("bbox"), label_offset=0):
        self.f_anchors = f_anchors
        self.num_classes = num_classes
        self.ignore_threshold = ignore_threshold
        self.get_label = get_label
        self.get_bbox = get_bbox
        self.label_offset = label_offset

    def __call__(self, img, anns):
        """Return `img` unchanged together with [loc_targets, cls_targets, iou_masks].

        Each of the three entries is a list holding one tensor per feature
        map, with the anchors of that map flattened along the first axis.
        """
        grid_sizes = [anchors.size()[:2] for anchors in self.f_anchors]
        flat_anchors = [anchors.view(-1, 4) for anchors in self.f_anchors]
        loc_targets = [torch.zeros(a.size(0), 4) for a in flat_anchors]
        cls_targets = [
            torch.zeros(a.size(0), dtype=torch.long) for a in flat_anchors]
        iou_masks = [
            torch.zeros(a.size(0), dtype=torch.uint8) for a in flat_anchors]

        for ann in anns:
            label = self.get_label(ann) + self.label_offset
            left, top, w, h = self.get_bbox(ann)
            x = left + w / 2
            y = top + h / 2
            bbox = torch.tensor([x, y, w, h])

            # Best (iou, anchor index) pair on every feature map; anchors that
            # overlap the box strongly are also recorded in the IoU masks.
            best_per_map = []
            for level_anchors, level_mask in zip(flat_anchors, iou_masks):
                ious = iou_1m(bbox, level_anchors, BBox.XYWH)
                best_per_map.append(ious.max(dim=0))
                level_mask |= ious > self.ignore_threshold

            # The single best-matching anchor over all maps owns the box.
            f_i, (_, i) = max(
                enumerate(best_per_map), key=lambda entry: entry[1][0])
            lx, ly = grid_sizes[f_i]
            # Centre offset inside the cell, stored in logit space.
            loc_targets[f_i][i, 0] = inverse_sigmoid(x * lx % 1)
            loc_targets[f_i][i, 1] = inverse_sigmoid(y * ly % 1)
            # Log ratio between the box size and the matched anchor size.
            loc_targets[f_i][i, 2:] = (
                bbox[2:] / flat_anchors[f_i][i, 2:]).log()
            cls_targets[f_i][i] = label

        return img, [loc_targets, cls_targets, iou_masks]


class YOLOLoss(nn.Module):
    """Objectness + localisation + classification loss over several feature maps.

    Args:
        num_classes: number of class channels per anchor, including the
            background channel at index 5 of every prediction vector.
        p: probability, per forward call, of printing the individual loss
            terms for debugging.
    """

    def __init__(self, num_classes, p=0.01):
        super().__init__()
        self.num_classes = num_classes
        self.p = p

    def forward(self, ps, loc_target, cls_target, iou_masks):
        """Compute the loss normalised by the number of positive anchors.

        Args:
            ps: list of prediction tensors, one per feature map; each is
                reshaped to (batch, num_anchors, 5 + num_classes).
            loc_target: list of (batch, num_anchors, 4) regression targets.
            cls_target: list of (batch, num_anchors) integer labels where 0
                marks an anchor without an object.
            iou_masks: list of (batch, num_anchors) masks; non-zero entries
                are excluded from the negative objectness term.

        Returns:
            Scalar loss tensor. When the batch contains no positive anchor the
            sum is divided by 1 instead of 0 so the result stays finite.
        """
        total_pos = 0
        obj_loss_pos = 0
        obj_loss_neg = 0
        loc_loss = 0
        cls_loss = 0
        for p, loc_t, cls_t, iou_mask in zip(ps, loc_target, cls_target, iou_masks):
            p = p.view(p.size(0), -1, 5 + self.num_classes)
            obj_p = p[..., 0]
            loc_p = p[..., 1:5]
            # p[..., 5] is the background class
            cls_p = p[..., 6:]

            pos = cls_t != 0
            # Use comparisons rather than `~` so the result does not depend on
            # whether the masks arrive as uint8 or bool tensors.
            neg = (cls_t == 0) & (iou_mask == 0)
            num_pos = int(pos.sum().item())
            total_pos += num_pos
            # assumes every label is < num_classes - TODO confirm against the
            # label offset applied when the targets are built
            cls_t = one_hot(cls_t, self.num_classes)[..., 1:]

            obj_p_pos = obj_p[pos]
            obj_loss_pos += F.binary_cross_entropy_with_logits(
                obj_p_pos, torch.ones_like(obj_p_pos), reduction='sum'
            )
            obj_p_neg = obj_p[neg]
            obj_loss_neg += F.binary_cross_entropy_with_logits(
                obj_p_neg, torch.zeros_like(obj_p_neg), reduction='sum'
            )

            if num_pos == 0:
                continue

            loc_loss += F.mse_loss(
                loc_p[pos], loc_t[pos], reduction='sum')
            cls_loss += F.binary_cross_entropy_with_logits(
                cls_p[pos], cls_t[pos], reduction='sum')

        obj_loss_neg = 0.5 * obj_loss_neg
        # A batch without any positive anchor must not divide by zero.
        denom = max(total_pos, 1)
        loss = (obj_loss_pos + obj_loss_neg + loc_loss + cls_loss) / denom
        if random.random() < self.p:
            def scalar(value):
                # Accumulators stay plain numbers when no term was added.
                return value.item() if torch.is_tensor(value) else float(value)

            print("pos: %.4f | neg: %.4f | loc: %.4f | cls: %.4f" %
                  (scalar(obj_loss_pos) / denom,
                   scalar(obj_loss_neg) / denom,
                   scalar(loc_loss) / denom,
                   scalar(cls_loss) / denom))
        return loss

class YOLOInference:
    """Decode raw network outputs into a flat list of `BBox` detections.

    Args:
        width: image width the boxes are scaled to.
        height: image height the boxes are scaled to.
        f_priors: per-feature-map prior sizes, each (num_priors, 2) in
            grid-cell units, in the same order as the predictions.
        conf_threshold: minimum objectness (after sigmoid) to keep a box.
        iou_threshold: IoU threshold passed to non-maximum suppression.
        topk: maximum number of detections kept per image.
    """

    def __init__(self, width, height, f_priors, conf_threshold=0.5, iou_threshold=0.5, topk=10):
        self.width = width
        self.height = height
        self.f_priors = f_priors
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.topk = topk

    def __call__(self, preds):
        """Return the detections for every image in the batch.

        Args:
            preds: list of tensors, one per feature map, each indexed as
                (batch, lx, ly, channels).

        Returns:
            List of `BBox` objects in LTRB format; `image_name` is the index
            of the image inside the batch. `preds` is left unmodified.
        """
        detections = []
        batch_size = preds[0].size(0)

        for i in range(batch_size):
            confs = []
            boxes = []
            labels = []
            for p, priors in zip(preds, self.f_priors):
                p = p[i]
                lx, ly = p.size()[:2]
                p = p.view(lx, ly, len(priors), -1)
                # Keep the priors on the same device/dtype as the predictions.
                priors = priors.to(device=p.device, dtype=p.dtype)

                # Decode out-of-place: `p` is a view of the caller's tensor, so
                # in-place sigmoid/exp would corrupt the network output.
                conf = p[..., 0].sigmoid()
                centers = p[..., 1:3].sigmoid()
                sizes = p[..., 3:5].exp() * priors
                box = scale_boxes(
                    torch.cat((centers, sizes), dim=-1), self.width, self.height)
                # p[..., 5] is the background class
                label = p[..., 6:].argmax(dim=-1)

                mask = conf > self.conf_threshold
                confs.append(conf[mask])
                boxes.append(box[mask])
                labels.append(label[mask])

            boxes = torch.cat(boxes, dim=0)
            confs = torch.cat(confs, dim=0)
            labels = torch.cat(labels, dim=0)

            # Nothing passed the confidence threshold for this image.
            if confs.numel() == 0:
                continue

            boxes = transform_bboxes(
                boxes, format=BBox.XYWH, to=BBox.LTRB, inplace=True)
            indices = non_max_suppression(
                boxes, confs, self.iou_threshold)

            if len(indices) > self.topk:
                # Restrict to the NMS survivors, then keep the most confident.
                confs, boxes, labels = filter_tensors(
                    confs, boxes, labels, indices=indices
                )
                indices = confs.topk(self.topk)[1]

            dets = [
                BBox(
                    image_name=i,
                    class_id=labels[ind].item(),
                    box=boxes[ind].tolist(),
                    confidence=confs[ind].item(),
                    box_format=BBox.LTRB,
                ) for ind in indices
            ]
            detections += dets
        return detections



In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def conv1x1(in_channels, out_channels):
    """Create a 1x1 (pointwise) convolution with default stride and no padding."""
    pointwise = nn.Conv2d(in_channels, out_channels, 1)
    return pointwise


def conv3x3(in_channels, out_channels, stride=1):
    """Create a 3x3 convolution with padding 1 and the given stride."""
    conv = nn.Conv2d(in_channels, out_channels, 3, stride, 1)
    return conv


class Bottleneck(nn.Module):
    """1x1 squeeze to half the output width followed by a 3x3 expansion.

    Both convolutions are followed by BatchNorm and LeakyReLU. With
    `residual=True` the block input is added to the result, which requires
    `in_channels == out_channels`.
    """

    def __init__(self, in_channels, out_channels, residual=True):
        super().__init__()
        self.residual = residual
        squeezed = out_channels // 2
        self.conv1 = nn.Sequential(
            conv1x1(in_channels, squeezed),
            nn.BatchNorm2d(squeezed),
            nn.LeakyReLU(inplace=True),
        )
        self.conv2 = nn.Sequential(
            conv3x3(squeezed, out_channels),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(inplace=True),
        )

    def forward(self, x):
        out = self.conv2(self.conv1(x))
        if self.residual:
            return out + x
        return out


def _make_layer(num_layers, in_channels, out_channels):
    layers = []
    layers.append(Bottleneck(in_channels, out_channels))
    for _ in range(num_layers - 1):
        layers.append(Bottleneck(out_channels, out_channels))
    return nn.Sequential(*layers)


def _upsample_concat(x, y):
    h, w = y.size()[2:]
    x = F.interpolate(x, size=(h, w), mode='bilinear', align_corners=False)
    return torch.cat((x, y), dim=1)


class Downsample(nn.Module):
    """Stride-2 3x3 convolution followed by BatchNorm and LeakyReLU.

    When only one channel count is given it is taken as the *output* width and
    the input width is assumed to be half of it.
    """

    def __init__(self, in_channels, out_channels=None):
        super().__init__()
        if out_channels is None:
            in_channels, out_channels = in_channels // 2, in_channels
        stages = [
            conv3x3(in_channels, out_channels, stride=2),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(inplace=True),
        ]
        self.down = nn.Sequential(*stages)

    def forward(self, x):
        reduced = self.down(x)
        return reduced


class Darknet(nn.Module):
    """Darknet-style backbone with two detection heads.

    Four stride-2 stages feed a coarse head (`pred1`, on the deepest features)
    and a finer head (`pred2`) that fuses upsampled coarse features with the
    output of the third stage.

    Args:
        out_channels: channels produced by each prediction head.
        layers: number of bottleneck blocks per stage. Only the first four
            entries are used; the fifth is kept for signature compatibility.
        f_channels: base width; the stages use 64, f, 2f and 4f channels.
    """

    def __init__(self, out_channels, layers=(1, 2, 8, 8, 4), f_channels=128):
        super().__init__()
        self.conv0 = nn.Sequential(
            conv3x3(3, 32),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(inplace=True),
        )
        # Backbone: each Downsample halves the spatial resolution.
        self.down1 = Downsample(64)
        self.layer1 = _make_layer(layers[0], 64, 64)

        self.down2 = Downsample(64, f_channels * 1)
        self.layer2 = _make_layer(layers[1], f_channels * 1, f_channels * 1)

        self.down3 = Downsample(f_channels * 2)
        self.layer3 = _make_layer(layers[2], f_channels * 2, f_channels * 2)

        self.down4 = Downsample(f_channels * 4)
        self.layer4 = _make_layer(layers[3], f_channels * 4, f_channels * 4)

        # Coarse head on the deepest feature map.
        self.conv1 = Bottleneck(f_channels * 4,
                                f_channels * 4, residual=False)
        self.conv2 = Bottleneck(f_channels * 4, f_channels * 4, residual=False)
        self.conv3 = Bottleneck(f_channels * 4, f_channels * 4, residual=False)
        self.pred1 = conv1x1(f_channels * 4, out_channels)

        # Lateral projection of the coarse features before upsampling.
        self.lat1 = conv1x1(f_channels * 4, f_channels)

        # Fine head: f (lateral) + 2f (third stage) = 3f input channels.
        self.conv4 = Bottleneck(f_channels * 3,
                                f_channels * 2, residual=False)
        self.conv5 = Bottleneck(f_channels * 2, f_channels * 2, residual=False)
        self.conv6 = Bottleneck(f_channels * 2, f_channels * 2, residual=False)
        self.pred2 = conv1x1(f_channels * 2, out_channels)

    def forward(self, x):
        """Run the network on a batch of 3-channel images.

        Returns:
            A one-element list wrapping `[fine, coarse]`: the two head outputs
            with their dimensions reordered by `permute(0, 3, 2, 1)`, i.e. the
            last input axis becomes axis 1 and the channel axis becomes last.
        """
        x = self.conv0(x)
        c1 = self.down1(x)
        c1 = self.layer1(c1)

        c2 = self.down2(c1)
        c2 = self.layer2(c2)

        c3 = self.down3(c2)
        c3 = self.layer3(c3)

        c4 = self.down4(c3)
        c4 = self.layer4(c4)

        p41 = self.conv1(c4)
        p42 = self.conv2(p41)
        p43 = self.conv3(p42)
        p4 = self.pred1(p43)

        # Fuse the intermediate coarse features with the third-stage output.
        fused3 = _upsample_concat(self.lat1(p42), c3)
        p31 = self.conv4(fused3)
        p32 = self.conv5(p31)
        p33 = self.conv6(p32)
        p3 = self.pred2(p33)

        preds = [p3, p4]
        preds = [p.permute(0, 3, 2, 1).contiguous() for p in preds]

        return [preds]


In [0]:
# letters = "0123456789"
# Characters the captcha generator may draw; one detection class per character.
letters = "0123456789abcdefghijkmnopqrstuvwxyzABDEFGHJKMNRT"
NUM_CLASSES = len(letters)
# Image size in pixels.
WIDTH = 128
HEIGHT = 48
# Grid size (cells along x, cells along y) of each prediction feature map.
LOCATIONS = [
    (16, 6),
    (8, 3),
]
# Prior box sizes (w, h) per feature map, expressed in grid cells of that map.
# The rest of the notebook reads this tensor as F_PRIORS; it was previously
# bound only to the name PRIORS, which raised NameError on a fresh kernel.
F_PRIORS = torch.tensor([
    [
        [3.9506, 5.1250],
        [2.1728, 4.8126],
        [2.8750, 3.7504],
    ],
    [
        [1.9753, 2.5625],
        [1.0864, 2.4063],
        [1.4375, 1.8752],
    ],
])
# Kept so code that still refers to the old name keeps working.
PRIORS = F_PRIORS
# One anchor tensor per feature map, built from its grid size and priors.
F_ANCHORS = [
    get_anchors(lx, ly, priors)
    for (lx, ly), priors in zip(LOCATIONS, F_PRIORS)
]

In [0]:

# Font files read from the mounted Drive; the captcha generator draws with them.
fonts = [
    gpath("fonts/msyh.ttf"),
    gpath("fonts/sfsl0800.pfb.ttf"),
    gpath("fonts/SimHei.ttf"),
    gpath("fonts/Times New Roman.ttf"),
]

font_sizes = (28, 32, 36, 40, 44)
image = ImageCaptcha(WIDTH, HEIGHT, fonts=fonts, font_sizes=font_sizes)

# Training samples get their annotations converted to YOLO targets.
# label_offset=1 shifts every class id by one so that 0 is never a real class.
train_transform = Compose([
    ToPercentCoords(),
    YOLOTransform(F_ANCHORS, NUM_CLASSES, label_offset=1),
    ToTensor(),
])

# Validation samples keep their raw annotations; only the image is converted.
test_transform = Compose([
    ToTensor(),
])

# NOTE(review): `online=False` presumably makes the validation samples a fixed
# set instead of being regenerated on each access - confirm in hutil.
ds_train = CaptchaDetectionOnline(
    image, size=10000, letters=letters, transform=train_transform, rotate=20)
ds_val = CaptchaDetectionOnline(
    image, size=500, letters=letters, transform=test_transform, online=False, rotate=20)


# ds = CaptchaDetectionOnline(
#     image, size=100, letters=letters, rotate=20)
# ds_train = Fullset(ds, train_transform)
# ds_val = Fullset(ds, test_transform)


In [0]:

# Each head predicts (5 + NUM_CLASSES) values for every prior of a grid cell;
# F_PRIORS.size(1) is the number of priors per feature map.
out_channels = (5 + NUM_CLASSES) * F_PRIORS.size(1)
net = Darknet(out_channels, layers=[1, 2, 2, 2, 1], f_channels=32)
# p=0.02: the loss prints its individual terms on roughly 2% of the calls.
criterion = YOLOLoss(NUM_CLASSES, p=0.02)
# optimizer = SGD(filter(lambda x: x.requires_grad, net.parameters()),
#                 lr=1e-2, momentum=0.9, dampening=0.9, weight_decay=5e-4)
optimizer = Adam(filter(lambda x: x.requires_grad,
                        net.parameters()), lr=1e-3, weight_decay=1e-4)
# No milestones are given, so the learning rate is never decayed.
lr_scheduler = MultiStepLR(optimizer, [], gamma=0.1)


metrics = {
    'loss': TrainLoss(),
}
# Keep at most 4 detections per image when decoding predictions.
inference = YOLOInference(WIDTH, HEIGHT, F_PRIORS, topk=4)
test_metrics = {
    'mAP': MeanAveragePrecision(inference)
}

# The trainer is given a save location under "models" on the mounted Drive.
trainer = Trainer(net, criterion, optimizer, lr_scheduler,
                  metrics=metrics, evaluate_metrics=test_metrics,
                  save_path=gpath("models"), name="YOLO-CAPTCHA")


In [0]:
# Summarise the network for one 3-channel HEIGHT x WIDTH input.
summary(net, (3,HEIGHT, WIDTH))

In [0]:
# Training batches use the default collate function.
train_loader = DataLoader(
    ds_train, batch_size=16, shuffle=True, num_workers=1, pin_memory=True)
# NOTE(review): validation uses box_collate_fn, presumably because the raw
# annotation lists differ in length per image - confirm in hutil.
val_loader = DataLoader(
    ds_val, batch_size=64, collate_fn=box_collate_fn)


In [0]:
# Train for 10 epochs, passing the validation loader and save_per_epochs=1.
trainer.fit(train_loader, 10, val_loader=val_loader, save_per_epochs=1)
# trainer.fit(train_loader, 10)


In [0]:
# Plot the last 20 recorded training-loss values.
plt.plot(trainer.metric_history['loss'][-20:])

In [0]:
# Time one full pass of the evaluation metrics over the validation loader.
%time trainer.evaluate(val_loader)


CPU times: user 1.14 s, sys: 208 ms, total: 1.35 s
Wall time: 1.38 s


{'mAP': 0.9085555555555556}