In [0]:
!pip3 uninstall pytorch-hrvvi-ext
!pip3 install -U --no-cache-dir --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple pytorch-hrvvi-ext

In [7]:
import sys
import os

import torch
import hutil
import matplotlib.pyplot as plt
print(hutil.__version__)

1.4.4


In [8]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and library edits take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Mount Google Drive inside the Colab VM; fonts and model checkpoints used
# below are addressed relative to the "My Drive" folder.
gdrive = "/gdrive"
from google.colab import drive
drive.mount(gdrive, force_remount=True)
mydrive = os.path.join(gdrive, "My Drive")
# List the Drive root as a quick check that the mount succeeded.
!ls /gdrive/My\ Drive

def gpath(p):
    """Return ``p`` joined onto the mounted "My Drive" directory (``mydrive``)."""
    segments = (mydrive, p)
    return os.path.join(*segments)

Mounted at /gdrive
'Colab Notebooks'   eng-fra.pt	 images   repo	   weixin.pkl
 datasets	    fonts	 models   result


In [0]:
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import LambdaLR, MultiStepLR
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

from hutil import cuda, one_hot
from hutil.ext.captcha import ImageCaptcha
from hutil.ext.summary import summary
from hutil.datasets import CaptchaDetectionOnline
from hutil.train import init_weights, Trainer, Args
from hutil.data import train_test_split, Fullset
from hutil.transformers import Compose, Resize, ToTensor, ToPercentCoords
from hutil.train.metrics import TrainLoss, MeanAveragePrecision
from hutil.detection import BoundingBox, BoundingBoxFormat, draw_bboxes, iou_11, iou_b11, iou_1m, transform_bboxes, transform_bbox


In [0]:

def scale_boxes(boxes, width, height, inplace=False):
    r"""Convert cell-relative box coordinates into coordinates of a
    ``width`` x ``height`` image.

    For every grid cell ``(i, j)`` the x coordinates (columns 0 and 2 of the
    last axis) become ``(x + i) * width / lx`` and the y coordinates
    (columns 1 and 3) become ``(y + j) * height / ly``.

    Args:
        boxes: tensor of shape (batch_size, lx, ly, num_anchors, 4)
        width: target width covered by the ``lx`` grid columns
        height: target height covered by the ``ly`` grid rows
        inplace: when False (default) a clone is modified and returned,
            otherwise ``boxes`` itself is overwritten.
    Returns:
        The scaled tensor, same shape as ``boxes``.
    """
    scaled = boxes if inplace else boxes.clone()
    n_cols, n_rows = scaled.shape[1:3]
    cell_w = width / n_cols
    cell_h = height / n_rows

    # Cell indices shaped so that they broadcast along their own grid axis.
    col_index = torch.arange(
        n_cols, dtype=scaled.dtype, device=scaled.device).view(1, -1, 1, 1, 1)
    row_index = torch.arange(
        n_rows, dtype=scaled.dtype, device=scaled.device).view(1, 1, -1, 1, 1)

    for columns, cell_index, cell_size in (
            ([0, 2], col_index, cell_w),
            ([1, 3], row_index, cell_h)):
        scaled[..., columns] = (scaled[..., columns] + cell_index) * cell_size
    return scaled


In [0]:
class YOLOTransform:
    """Turn per-object annotations into dense ``lx`` x ``ly`` grid targets.

    Each annotation is assigned to the grid cell that contains its box
    centre. Cells without an object keep class ``num_classes - 1`` and an
    all-zero box. If several annotations fall into one cell, the last one
    wins.

    Args:
        lx: number of grid cells along x
        ly: number of grid cells along y
        num_classes: number of classes including the trailing "no object"
            class
    """

    def __init__(self, lx, ly, num_classes):
        self.lx = lx
        self.ly = ly
        # Cell size as a fraction of the image, so box coordinates are
        # expected in the same fractional units.
        self.sw = 1 / lx
        self.sh = 1 / ly
        self.num_classes = num_classes

    @staticmethod
    def _locate(center, cell_size, num_cells):
        """Return ``(cell_index, offset_in_cells)`` for one centre coordinate.

        The index is clamped to ``[0, num_cells - 1]``. Without the clamp, a
        centre on or past the far edge raises IndexError and a negative
        centre silently wraps around to the last cell. The offset stays
        relative to the returned cell, so for in-range centres the result is
        unchanged.
        """
        raw_index, remainder = divmod(center, cell_size)
        raw_index = int(raw_index)
        index = min(max(raw_index, 0), num_cells - 1)
        return index, remainder / cell_size + (raw_index - index)

    def __call__(self, img, anns):
        """Build the targets for one sample.

        Args:
            img: passed through unchanged
            anns: iterable of dicts with ``"category_id"`` and ``"bbox"``
                (left, top, width, height) entries
        Returns:
            ``(img, (classes, boxes))`` where ``classes`` is a long tensor of
            shape (lx, ly) and ``boxes`` a float tensor of shape (lx, ly, 4)
            holding (x offset, y offset, width, height) in units of one cell.
        """
        classes = torch.full((self.lx, self.ly),
                             self.num_classes - 1, dtype=torch.long)
        boxes = torch.zeros(self.lx, self.ly, 4)
        for ann in anns:
            label = ann["category_id"]
            cx, cy, w, h = transform_bbox(
                ann['bbox'],
                format=BoundingBoxFormat.LTWH,
                to=BoundingBoxFormat.XYWH)

            xi, x_offset = self._locate(cx, self.sw, self.lx)
            yi, y_offset = self._locate(cy, self.sh, self.ly)

            classes[xi, yi] = label

            boxes[xi, yi, 0] = x_offset
            boxes[xi, yi, 1] = y_offset
            boxes[xi, yi, 2] = w / self.sw
            boxes[xi, yi, 3] = h / self.sh
        return img, (classes, boxes)


class YOLOLoss(nn.Module):
    r"""YOLO-style detection loss over a grid of anchor predictions.

    The loss is the sum of four terms, each summed over the batch and
    divided by ``batch_size``:

    * localization: squared error on the box centre offsets (after sigmoid)
      and on the raw width/height predictions against
      ``log(target_wh / anchor)``, for the best-matching anchor of every
      object cell;
    * classification: cross entropy for the best-matching anchor;
    * positive objectness: BCE towards 1 for the best-matching anchor;
    * negative objectness: BCE towards 0 for every anchor of empty cells and
      for the non-best anchors of object cells whose IoU with the target is
      below 0.5.

    Args:
        anchors: tensor of shape (num_anchors, 2) with anchor (w, h)
        num_classes: number of classes; index ``num_classes - 1`` marks a
            cell without an object.
    """

    def __init__(self, anchors, num_classes):
        # Required so the nn.Module machinery (hooks, train/eval, .to) works.
        super().__init__()
        self.anchors = anchors
        self.num_classes = num_classes

    def forward(self, output, classes, boxes):
        r"""
            output:   (batch_size, lx, ly, num_anchors * (5 + num_classes))
            classes:  (batch_size, lx, ly)
            boxes:    (batch_size, lx, ly, 4)
            anchors:  (num_anchors, 2)
        Returns:
            Scalar tensor with the total loss.
        """
        anchors = self.anchors
        num_anchors = len(anchors)
        batch_size, lx, ly = output.size()[:3]
        # Cells whose target class is not the trailing "no object" class.
        obj_mask = classes != (self.num_classes - 1)
        output = output.view(batch_size, lx, ly, num_anchors, -1)
        confidences_pred = output[..., 0]
        boxes_txty_pred = output[..., 1:3]
        boxes_twth_pred = output[..., 3:5]
        classes_pred = output[..., 5:]

        boxes_xy_pred_obj = torch.sigmoid(boxes_txty_pred[obj_mask])
        boxes_wh_pred_obj = torch.exp(boxes_twth_pred[obj_mask]).mul(anchors)
        boxes_pred_obj = torch.cat(
            (boxes_xy_pred_obj, boxes_wh_pred_obj), dim=-1)
        boxes_obj = boxes[obj_mask]

        with torch.no_grad():
            ious = []
            for i in range(num_anchors):
                ious.append(iou_b11(boxes_obj, boxes_pred_obj[..., i, :]))
            ious = torch.stack(ious, dim=-1)
            # The anchor whose prediction overlaps the target most is
            # responsible for the object.
            anchor_indices = torch.argmax(ious, dim=-1)

            # (num_objects, num_anchors)
            # Build the mask with a comparison rather than `.byte()`: on
            # PyTorch >= 1.2 `~` applied to a uint8 tensor is a bitwise NOT
            # (0 -> 255, 1 -> 254), which turns `~anchor_mask` into an
            # all-true mask. A comparison yields a mask type for which `~`
            # is a logical NOT.
            anchor_mask = one_hot(anchor_indices, C=num_anchors) != 0
            negative_anchor_mask = ious[~anchor_mask] < .5

        boxes_xy_obj = boxes_obj[..., :2]
        boxes_twth_obj = torch.log(
            boxes_obj[..., 2:] / anchors[anchor_indices])

        txty_loss = F.mse_loss(
            boxes_xy_pred_obj[anchor_mask],
            boxes_xy_obj,
            reduction='sum') / batch_size
        twth_loss = F.mse_loss(
            boxes_twth_pred[obj_mask][anchor_mask],
            boxes_twth_obj,
            reduction='sum') / batch_size

        localization_loss = txty_loss + twth_loss

        classes_loss = F.cross_entropy(
            classes_pred[obj_mask][anchor_mask],
            classes[obj_mask],
            reduction='sum') / batch_size

        confidences_pred_obj_anchors = confidences_pred[obj_mask][anchor_mask]
        obj_pos_loss = F.binary_cross_entropy_with_logits(
            confidences_pred_obj_anchors,
            torch.ones_like(confidences_pred_obj_anchors),
            reduction='sum') / batch_size
        # Every anchor of a cell without an object is a negative.
        confidences_pred_noobj = confidences_pred[~obj_mask]
        obj_neg_loss1 = F.binary_cross_entropy_with_logits(
            confidences_pred_noobj,
            torch.zeros_like(confidences_pred_noobj),
            reduction='sum') / batch_size

        # Non-responsible anchors of object cells count as negatives only
        # when they overlap the target poorly (IoU < 0.5).
        confidences_pred_obj_negative = confidences_pred[obj_mask][~anchor_mask][negative_anchor_mask]
        obj_neg_loss2 = F.binary_cross_entropy_with_logits(
            confidences_pred_obj_negative,
            torch.zeros_like(confidences_pred_obj_negative),
            reduction='sum') / batch_size

        obj_neg_loss = obj_neg_loss1 + obj_neg_loss2

        # Occasionally (about 2% of calls) print the loss breakdown.
        if random.random() < 0.02:
            print("box: %.4f  class: %.4f  pos: %.4f  neg: %.4f" % (
                localization_loss.item(),
                classes_loss.item(),
                obj_pos_loss.item(),
                obj_neg_loss.item()))

        return localization_loss + classes_loss + obj_pos_loss + obj_neg_loss


def non_max_suppression(boxes, confidences, max_boxes, iou_threshold, inplace=False):
    r"""Greedy non-maximum suppression.

    Repeatedly picks the remaining box with the highest confidence and
    discards every box whose IoU with it exceeds ``iou_threshold``.

    Args:
        boxes:       (N, 4)
        confidences: (N,)
        max_boxes (int): maximum number of boxes to keep
        iou_threshold (float): boxes overlapping a kept box by more than
            this IoU are suppressed
        inplace (bool): when False (default) the inputs are cloned first;
            otherwise suppressed entries of ``boxes`` and ``confidences``
            are zeroed in the caller's tensors.
    Returns:
        indices: list of at most ``max_boxes`` distinct int indices into
            ``boxes``, ordered by decreasing confidence.
    """
    if len(boxes) == 0 or max_boxes <= 0:
        return []
    if not inplace:
        boxes = boxes.clone()
        confidences = confidences.clone()
    boxes = boxes.view(-1, 4)
    confidences = confidences.view(-1)
    indices = []
    while True:
        ind = torch.argmax(confidences)
        indices.append(ind.item())
        suppressed = iou_1m(boxes[ind], boxes) > iou_threshold
        # Always retire the box just selected. Relying on its self-IoU
        # exceeding the threshold fails for zero-area boxes or a threshold
        # >= 1, and would return the same index over and over.
        suppressed[ind] = True
        boxes.masked_fill_(suppressed.unsqueeze(-1), 0)
        confidences.masked_fill_(suppressed, 0)
        if len(indices) >= max_boxes or confidences.sum() == 0:
            return indices


class YOLOInference:
    """Decode raw network output into a list of ``BoundingBox`` detections.

    Args:
        width: image width the decoded boxes are scaled to
        height: image height the decoded boxes are scaled to
        anchors: tensor of shape (num_anchors, 2) with anchor (w, h)
        confidence_threshold: predictions with a sigmoid confidence at or
            below this value are dropped
        max_boxes: maximum number of detections kept per image
        iou_threshold: IoU threshold passed to ``non_max_suppression``
    """

    def __init__(self, width, height, anchors, confidence_threshold, max_boxes, iou_threshold):
        self.width = width
        self.height = height
        self.anchors = anchors
        self.confidence_threshold = confidence_threshold
        self.max_boxes = max_boxes
        self.iou_threshold = iou_threshold

    def __call__(self, output):
        """Return the detections for a batch.

        Args:
            output: tensor of shape
                (batch_size, lx, ly, num_anchors * (5 + num_classes)).
                It is not modified.
        Returns:
            A flat list of ``BoundingBox`` objects (LTRB format) whose
            ``image_name`` is the index of the image within the batch.
        """
        anchors = self.anchors
        num_anchors = len(anchors)
        batch_size, lx, ly = output.size()[:3]
        # Work on detached, out-of-place results. The previous in-place
        # sigmoid_/exp_/mul_ calls overwrote the caller's network output.
        output = output.detach().reshape(batch_size, lx, ly, num_anchors, -1)
        confidences = torch.sigmoid(output[..., 0])
        boxes_xy = torch.sigmoid(output[..., 1:3])
        boxes_wh = torch.exp(output[..., 3:5]) * anchors
        classes = torch.argmax(output[..., 5:], dim=-1)

        # `boxes` is a fresh tensor, so the in-place helpers below cannot
        # touch `output`.
        boxes = torch.cat((boxes_xy, boxes_wh), dim=-1)
        boxes = transform_bboxes(
            boxes, format=BoundingBoxFormat.XYWH, to=BoundingBoxFormat.LTRB, inplace=True)
        boxes = scale_boxes(boxes, self.width, self.height, inplace=True)

        mask = confidences > self.confidence_threshold

        detections = []
        for i in range(batch_size):
            b_confidences = confidences[i][mask[i]]
            b_boxes = boxes[i][mask[i]]
            b_classes = classes[i][mask[i]]
            indices = non_max_suppression(
                b_boxes, b_confidences, self.max_boxes, self.iou_threshold)
            for ind in indices:
                detections.append(
                    BoundingBox(
                        image_name=i,
                        class_id=b_classes[ind].item(),
                        box=b_boxes[ind].tolist(),
                        confidence=b_confidences[ind].item(),
                        box_format=BoundingBoxFormat.LTRB,
                    )
                )
        return detections


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def conv3x3(in_channels, out_channels, stride=1):
    """Build a 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    settings = dict(kernel_size=3, stride=stride, padding=1)
    return nn.Conv2d(in_channels, out_channels, **settings)


class SELayer(nn.Module):
    """Squeeze-and-excitation gate.

    Each channel is globally average-pooled, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights
    rescale the input.

    Args:
        in_channels: number of channels of the input feature map
        reduction: bottleneck reduction factor; the hidden width is
            ``in_channels // reduction`` (at least 1)
    """

    def __init__(self, in_channels, reduction=8):
        super().__init__()
        # Keep at least one hidden unit when in_channels < reduction.
        channels = max(1, in_channels // reduction)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.layers = nn.Sequential(
            nn.Linear(in_channels, channels),
            # Only `nn` is imported in this notebook; a bare `ReLU()` raises
            # NameError.
            nn.ReLU(),
            nn.Linear(channels, in_channels),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` scaled channel-wise; ``x`` is (batch, channels, H, W)."""
        b, c = x.size()[:2]
        s = self.avgpool(x).view(b, c)
        s = self.layers(s).view(b, c, 1, 1)
        return x * s


class WideSEBasicBlock(nn.Module):
    """Pre-activation residual block (BN -> ReLU -> 3x3 conv, twice).

    Args:
        in_channels: channels of the input feature map
        out_channels: channels produced by both convolutions
        stride: stride of the first convolution and of the shortcut
        with_se: when True an ``SELayer`` is applied to the residual branch

    The shortcut is the identity unless the stride or channel count changes,
    in which case a 1x1 convolution is applied to the activated input.
    """

    def __init__(self, in_channels, out_channels, stride=1, with_se=False):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = conv3x3(in_channels, out_channels, stride=stride)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)

        self.se = SELayer(out_channels) if with_se else None

        changes_shape = stride != 1 or in_channels != out_channels
        self.downsample = (
            nn.Conv2d(in_channels, out_channels, 1, stride=stride)
            if changes_shape else None
        )

    def forward(self, x):
        activated = self.relu1(self.bn1(x))
        branch = self.conv1(activated)
        branch = self.conv2(self.relu2(self.bn2(branch)))
        if self.se is not None:
            branch = self.se(branch)
        # The projection shortcut consumes the activated input, not raw x.
        shortcut = x if self.downsample is None else self.downsample(activated)
        return branch + shortcut


class ResNet(nn.Module):
    """Fully convolutional wide ResNet backbone with a per-cell linear head.

    A 3x3 stem is followed by five residual stages. The first stage keeps
    the resolution and the other four halve it, so the output grid is 1/16
    of the input size. No pooling is applied: the final feature map is
    permuted to (batch, W, H, channels) and ``fc`` is applied to every grid
    cell.

    Args:
        in_channels: channels of the input image
        out_channels: size of the prediction vector per grid cell
        block: residual block class, called as
            ``block(in_channels, out_channels, stride=..., **kwargs)``
        layers: number of blocks in each of the five stages
        k: widening factor applied to every stage width except the stem
        **kwargs: forwarded to every ``block``
    """
    stages = [16, 16, 32, 32, 64, 128]

    def __init__(self, in_channels, out_channels, block, layers, k=4, **kwargs):
        super().__init__()
        # Channel width entering stage 1, then the output width of each stage.
        widths = [self.stages[0] * 1] + [base * k for base in self.stages[1:]]

        self.conv = nn.Conv2d(
            in_channels, self.stages[0], kernel_size=3, stride=1, padding=1)

        stage_strides = (1, 2, 2, 2, 2)
        for idx, stride in enumerate(stage_strides):
            stage = self._make_layer(
                block, widths[idx], widths[idx + 1], layers[idx],
                stride=stride, **kwargs)
            setattr(self, "layer%d" % (idx + 1), stage)

        self.bn = nn.BatchNorm2d(widths[-1])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(widths[-1], out_channels)

    def _make_layer(self, block, in_channels, out_channels, blocks, stride=1, **kwargs):
        """Stack ``blocks`` blocks; only the first changes channels and stride."""
        head = block(in_channels, out_channels, stride=stride, **kwargs)
        tail = [block(out_channels, out_channels, **kwargs)
                for _ in range(1, blocks)]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        x = self.conv(x)
        for stage in (self.layer1, self.layer2, self.layer3,
                      self.layer4, self.layer5):
            x = stage(x)
        x = self.relu(self.bn(x))
        # (B, C, H, W) -> (B, W, H, C) so the linear head acts on channels.
        return self.fc(x.permute(0, 3, 2, 1))

In [0]:
def val_collate_fn(batch):
    """Collate validation samples of ``(image, annotations)`` pairs.

    Images are batched with ``default_collate``. Every annotation dict is
    wrapped in a ``BoundingBox`` (LTWH format) whose ``image_name`` is the
    position of its image inside the batch, and the flat list of boxes is
    returned wrapped in ``Args``.
    """
    images, annotations = zip(*batch)
    ground_truths = [
        BoundingBox(
            image_name=image_idx,
            class_id=ann["category_id"],
            box=ann["bbox"],
            box_format=BoundingBoxFormat.LTWH,
        )
        for image_idx, image_anns in enumerate(annotations)
        for ann in image_anns
    ]
    return default_collate(images), Args(ground_truths)

# Alternative configuration for digit-only captchas (disabled).
# letters="0123456789"
# ANCHORS = cuda(torch.tensor([
#     [1.05791213, 2.08511355],
#     [0.81372366, 1.66612853],
#     [1.32061441, 2.48404214],
# ]))
# Character set the captchas are drawn from; the position of a character in
# this string is presumably its class id — confirm against CaptchaDetectionOnline.
letters = "0123456789abcdefghijkmnopqrstuvwxyzABDEFGHJKLMNQRT"
# Anchor (width, height) pairs. YOLOTransform stores box sizes in units of one
# grid cell, and the loss compares them against these anchors.
ANCHORS = cuda(torch.tensor([
    [1.11794557, 2.5000541 ],
    [1.66142818, 2.26188481],
    [0.95299741, 1.86781582],
]))

# One extra class: the last index marks a grid cell without an object.
NUM_CLASSES = len(letters) + 1
NUM_ANCHORS = len(ANCHORS)
# Input image size in pixels.
WIDTH = 128
HEIGHT = 48
# Size of one grid cell in pixels along x and y.
SW = 16
SH = 16
# Grid resolution: 8 cells along x, 3 along y.
LX = WIDTH // SW
LY = HEIGHT // SH

In [0]:

# Font files on Drive that the captcha generator renders with.
fonts = [
    gpath("fonts/msyh.ttf"),
    gpath("fonts/sfsl0800.pfb.ttf"),
    gpath("fonts/SimHei.ttf"),
    gpath("fonts/Times New Roman.ttf"),
]

# Font sizes handed to the captcha generator.
font_sizes = (28, 32, 36, 40, 44)
image = ImageCaptcha(WIDTH, HEIGHT, fonts=fonts, font_sizes=font_sizes)

# Training pipeline: annotations end up as the dense (classes, boxes) grid
# targets built by YOLOTransform.
# NOTE(review): ToPercentCoords presumably rescales boxes to [0, 1] image
# fractions, which YOLOTransform's 1/lx and 1/ly cell sizes rely on — confirm.
train_transform = Compose([
    ToPercentCoords(),
    ToTensor(),
    YOLOTransform(LX, LY, NUM_CLASSES),
])

# Validation pipeline: annotations stay as raw dicts; val_collate_fn turns
# them into BoundingBox ground truths.
test_transform = Compose([
    ToTensor(),
])

# NOTE(review): `online=False` presumably makes the validation set a fixed
# sample rather than regenerated on the fly — verify in hutil.datasets.
ds_train = CaptchaDetectionOnline(
    image, size=10000, letters=letters, transform=train_transform, rotate=20)
ds_val = CaptchaDetectionOnline(
    image, size=500, letters=letters, transform=test_transform, online=False, rotate=20)


In [0]:
# RGB input; every grid cell predicts, per anchor, 1 confidence + 4 box values
# + NUM_CLASSES class scores. Five stages of two blocks each, widening factor 2.
net = ResNet(3, NUM_ANCHORS * (NUM_CLASSES + 5),
             WideSEBasicBlock, [2, 2, 2, 2, 2], k=2)

In [0]:
criterion = YOLOLoss(ANCHORS, NUM_CLASSES)
optimizer = Adam(net.parameters(), lr=1e-3)
# Alternative schedule / optimizer kept for reference (disabled).
# lr_scheduler = LambdaLR(optimizer, lambda x: 0.96 ** x)
# optimizer = SGD(net.parameters(), lr=0.01, momentum=0.9, dampening=0, nesterov=True)
# Multiply the learning rate by 0.2 at epochs 30, 60 and 80.
lr_scheduler = MultiStepLR(optimizer, [30, 60, 80], gamma=0.2)



# Metric tracked during training.
metrics = {
    'loss': TrainLoss(),
}
# Validation metric: mean average precision over the detections that
# YOLOInference decodes from the raw network output.
test_metrics = {
    'mAP': MeanAveragePrecision(
        YOLOInference(WIDTH, HEIGHT, ANCHORS,
                      confidence_threshold=0.2,
                      max_boxes=10,
                      iou_threshold=0.5)
    ),
}
# NOTE(review): save_path/name presumably control where checkpoints are
# written on Drive — confirm in hutil.train.Trainer.
trainer = Trainer(net, criterion, optimizer, lr_scheduler,
                  metrics=metrics, evaluate_metrics=test_metrics,
                  save_path=gpath("models"), name="YOLO-CAPTCHA50")


In [0]:
# Summarise the network for a single (channels, height, width) input.
summary(net, (3,HEIGHT, WIDTH))

In [0]:
# Training batches use the default collation of (image, (classes, boxes)).
train_loader = DataLoader(
    ds_train, batch_size=32, shuffle=True, num_workers=1, pin_memory=True)
# Validation needs a custom collate_fn because each image carries a
# variable-length list of annotation dicts.
val_loader = DataLoader(
    ds_val, batch_size=64, collate_fn=val_collate_fn)


In [0]:
# Override the LR decay epochs of the existing scheduler. Rebuild the value
# with the scheduler's own container type: older PyTorch stores milestones as
# a list, newer releases as a collections.Counter that is indexed by epoch, so
# assigning a plain list there breaks the scheduler when a milestone is reached.
lr_scheduler.milestones = type(lr_scheduler.milestones)([20, 30, 40])

In [60]:
# Train for 10 more epochs, validating with val_loader.
# NOTE(review): save_by_metric/patience presumably checkpoint on the best
# validation mAP and stop early after 20 epochs without improvement — confirm
# in hutil.train.Trainer.fit.
trainer.fit(train_loader, 10, val_loader=val_loader, save_by_metric='val_mAP', patience=20)


Epoch 31/40
box: 0.0417  class: 0.0253  pos: 0.9239  neg: 0.8166
box: 0.0348  class: 0.2057  pos: 0.7852  neg: 0.8697
box: 0.0387  class: 0.2922  pos: 1.4968  neg: 1.6805
box: 0.0512  class: 0.4254  pos: 0.7699  neg: 1.0135
box: 0.0321  class: 0.0366  pos: 0.8954  neg: 0.9045
box: 0.0339  class: 0.0573  pos: 1.2197  neg: 1.1045
elapsed: 113s	loss: 2.0908	
validate ------	mAP: 0.9678	
Epoch 32/40
box: 0.0311  class: 0.1225  pos: 1.1222  neg: 1.1724
box: 0.0443  class: 0.2102  pos: 0.9665  neg: 0.9713
box: 0.0354  class: 0.0136  pos: 1.0133  neg: 0.9948
box: 0.0474  class: 0.4412  pos: 0.9570  neg: 1.0948
box: 0.0334  class: 0.1191  pos: 0.7249  neg: 0.6220
elapsed: 114s	loss: 2.0875	
validate ------	mAP: 0.9675	
Epoch 33/40
box: 0.0338  class: 0.0161  pos: 1.0343  neg: 0.7320
box: 0.0332  class: 0.2960  pos: 0.8277  neg: 1.1003
box: 0.0434  class: 0.2310  pos: 1.0274  neg: 0.6409
box: 0.0453  class: 0.4024  pos: 1.0596  neg: 1.5107
box: 0.0395  class: 0.0156  pos: 0.6969  neg: 0.9573
bo

{'loss': [2.090819596481323,
  2.087469929122925,
  2.0391499767303465,
  1.9553374420166016,
  1.9604283769607544,
  1.964926697921753,
  1.9456579275131225,
  1.8864387670516969,
  1.8929335653305053,
  1.8857514793395995],
 'val_mAP': [0.967790639974085,
  0.9675129632653061,
  0.9687172068675088,
  0.9674890432488146,
  0.9676341135435993,
  0.9695655965367966,
  0.9664015927437644,
  0.9677751945578231,
  0.9701188136054423,
  0.9682015782312925]}

In [34]:
# Evaluate the current model on the validation loader.
# NOTE(review): this cell's execution count (34) is lower than the training
# cell's (60), so the recorded output predates the latest training run —
# re-run it to get an up-to-date figure.
trainer.evaluate(val_loader)


{'mAP': 0.908185574070025}