In [1]:
import numpy as np

import torch
from torch import Tensor

import torch.nn as nn
from torch.autograd import Variable

import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import datasets, models, transforms

import os
import time

# Dataset

In [2]:
import cv2
from PIL import Image
import os
import os.path as osp
import json
from easydict import EasyDict
import random
import torch.utils.data as data


def is_image_file(filename):
    """Return True when ``filename`` carries a ``.png`` extension.

    The comparison is case-insensitive and requires the leading dot, so a name
    that merely ends in the letters "png" (for example ``"notapng"``) is
    rejected.  The dataset class in this file derives the annotation path by
    dropping the last four characters of the image path, which is only correct
    for a real four-character ``.png`` suffix.

    Args:
        filename (str): file name or path.

    Returns:
        bool: whether the name ends in ``.png``.
    """
    return filename.lower().endswith('.png')


def find_classes(dir):
    """List the class folders directly under ``dir``.

    Args:
        dir (str): directory whose immediate sub-directories are the classes.

    Returns:
        tuple: ``(classes, class_to_idx)`` where ``classes`` is the sorted list
        of sub-directory names and ``class_to_idx`` maps each name to its
        position in that list.  Plain files in ``dir`` are ignored.
    """
    names = sorted(
        entry for entry in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, entry))
    )
    index_of = dict((name, position) for position, name in enumerate(names))

    return names, index_of

def make_dataset(dir, class_to_idx):
    """Collect ``(image_path, class_index)`` pairs for every image under ``dir``.

    Each immediate sub-directory of ``dir`` is treated as one class and is
    walked recursively; files accepted by ``is_image_file`` are recorded with
    the index that ``class_to_idx`` assigns to the sub-directory name.

    Args:
        dir (str): dataset root; a leading ``~`` is expanded.
        class_to_idx (dict): sub-directory name -> integer class index.

    Returns:
        list: ``(path, class_index)`` tuples, ordered by class folder, then by
        walked directory, then by file name.
    """
    dir = os.path.expanduser(dir)
    samples = []
    for target in sorted(os.listdir(dir)):
        class_dir = os.path.join(dir, target)
        if os.path.isdir(class_dir):
            for root, _, fnames in sorted(os.walk(class_dir)):
                samples.extend(
                    (os.path.join(root, fname), class_to_idx[target])
                    for fname in sorted(fnames)
                    if is_image_file(fname)
                )

    return samples


def json_load(fn):
    """Parse the JSON file at ``fn`` and wrap the result in an ``EasyDict``.

    Args:
        fn (str): path of the JSON file.

    Returns:
        EasyDict: the parsed content, with keys reachable as attributes.
    """
    with open(fn) as handle:
        parsed = json.load(handle)
    return EasyDict(parsed)

    
class MyImageFolderAndBbox(data.Dataset):
    """Folder-per-class PNG dataset whose samples also carry bounding boxes.

    ``root`` holds one sub-directory per class.  Every image is expected to
    have a JSON file next to it (same path with the 4-character extension
    replaced by ``.json``) providing two boxes, ``l`` and ``r``, each
    ``[x0, y0, x1, y1]``.  Following the inline labels in ``__getitem__``,
    ``l`` is handled as the face/head box and ``r`` as the hand box.
    NOTE(review): the boxes are presumably integer pixel coordinates in a
    426x240 frame (they are used directly as slice bounds) -- confirm against
    the annotation files.

    Every access applies a random horizontal flip and a random 192x144 crop,
    regardless of ``mode`` (``mode`` is stored but not otherwise used).

    Args:
        root (str): dataset root directory.
        transform (callable, optional): applied to the cropped PIL image.
        mode (str): free-form tag kept on the instance.

    Raises:
        RuntimeError: when no image is found under ``root``.
    """

    def __init__(self, root, transform=None,  mode='train'):
        classes, class_to_idx = find_classes(root)
        imgs = make_dataset(root, class_to_idx)
        if len(imgs) == 0:
            # The original message joined an undefined IMG_EXTENSIONS name,
            # which raised NameError instead of this RuntimeError.
            raise RuntimeError("Found 0 images in subfolders of: " + root + "\n"
                               "Supported image extensions are: png")

        self.root = root
        self.imgs = imgs
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform

        self.mode = mode

    @staticmethod
    def _clip_or_erase(frame, box, crop_size):
        """Keep ``box`` as ground truth or paint it out of ``frame``.

        When less than 80% of ``box`` lies inside the crop, the visible part of
        the box is overwritten with 255 in ``frame`` and False is returned.
        Otherwise ``box`` is clamped in place to the crop and True is returned.
        """
        if percent_of_a_in_b(box, [0, 0, crop_size[0], crop_size[1]]) < 0.8:
            x0, y0, x1, y1 = box
            if x1 > 0 and y1 > 0:
                frame[max(0, y0): y1, max(0, x0): x1] = 255
            return False

        box[0] = max(0, box[0])
        box[1] = max(0, box[1])
        box[2] = min(crop_size[0], box[2])
        box[3] = min(crop_size[1], box[3])
        return True

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, targets, bboxs).  ``targets`` holds one class index
            per kept box: ``len(self.classes)`` for the face/head box and the
            folder's class index for the hand box.  ``bboxs`` holds the kept
            boxes in crop coordinates, in the same order (face first).

        Raises:
            IOError: when the image file cannot be decoded.
        """
        path, target = self.imgs[index]

        # Annotation sidecar: image path with its 4-char extension swapped.
        hands = json_load(path[:-4]+'.json')

        cimg = cv2.imread(path)
        if cimg is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise IOError("Could not read image: " + path)
        cimg = cv2.cvtColor(cimg, cv2.COLOR_BGR2RGB)
        block_size = (426, 240)  # (width, height) used for flipping / crop range
        crop_size = (192, 144)   # (width, height) of the returned crop

        # Random horizontal flip of the image together with both boxes.
        if random.random() >= 0.5:
            cimg = cv2.flip(cimg, 1)
            # flip() returns a tuple; store lists because the boxes are
            # modified element-wise below.
            hands.l = list(flip(hands.l, block_size))
            hands.r = list(flip(hands.r, block_size))

        # Random crop window, re-drawn until more than 80% of the hand box is
        # inside it.  NOTE: this loop does not terminate if no window can
        # satisfy that condition for a given annotation.
        while True:
            x0 = random.randrange(0, block_size[0] - crop_size[0])
            y0 = random.randrange(0, block_size[1] - crop_size[1])
            window = [x0, y0, x0 + crop_size[0], y0 + crop_size[1]]
            if percent_of_a_in_b(hands.r, window) > 0.8:
                break

        frame = cimg[y0: y0 + crop_size[1], x0: x0 + crop_size[0]]

        # Express both boxes in crop coordinates.
        for box in (hands.l, hands.r):
            box[0] -= x0
            box[2] -= x0
            box[1] -= y0
            box[3] -= y0

        gt_boxes = []
        gt_target = []
        # face: labelled with the extra class index that follows the folders
        if self._clip_or_erase(frame, hands.l, crop_size):
            gt_boxes.append(hands.l)
            gt_target.append(len(self.classes))

        # hand: labelled with the class of the folder the image came from
        if self._clip_or_erase(frame, hands.r, crop_size):
            gt_boxes.append(hands.r)
            gt_target.append(target)

        img = Image.fromarray(frame)
        if self.transform is not None:
            img = self.transform(img)

        return img, np.array(gt_target), np.array(gt_boxes)

    def __len__(self):
        """Number of images found under ``root``."""
        return len(self.imgs)

def percent_of_a_in_b(a, b):
    """Fraction of box ``a``'s area that is covered by box ``b``.

    Both boxes are ``(x0, y0, x1, y1)``.  Boxes that do not overlap, or that
    only touch along an edge, give ``0.``.

    Args:
        a: box whose coverage is measured.
        b: box doing the covering.

    Returns:
        float: overlap area divided by the area of ``a``.
    """
    ax0, ay0, ax1, ay1 = a
    bx0, by0, bx1, by1 = b

    # a lies entirely left of / above / right of / below b
    separated = (ax1 <= bx0, ay1 <= by0, ax0 >= bx1, ay0 >= by1)
    if any(separated):
        return 0.

    # Once the boxes overlap, the two middle values of the four sorted edges
    # bound the overlap along that axis.
    _, x_lo, x_hi, _ = sorted((ax0, ax1, bx0, bx1))
    _, y_lo, y_hi, _ = sorted((ay0, ay1, by0, by1))
    overlap_area = (x_hi - x_lo) * (y_hi - y_lo)

    area_a = (ax1 - ax0) * (ay1 - ay0)

    return float(overlap_area) / area_a


def flip(hand, size):
    """Mirror a box left-to-right inside a frame.

    Args:
        hand: box as ``(x0, y0, x1, y1)``.
        size: frame size as ``(width, height)``.

    Returns:
        tuple: the mirrored box ``(width - x1, y0, width - x0, y1)``.
    """
    left, top, right, bottom = hand
    width, _height = size
    return width - right, top, width - left, bottom

In [3]:
import collections
def my_collate(batch):
    """Collate samples: stack tensors, keep numpy fields as per-sample groups.

    Tensor fields are stacked along a new leading batch dimension.  numpy
    fields (class labels, ground-truth boxes) can differ in length from sample
    to sample, so they are returned untouched as the group of per-sample
    arrays.  A batch of sequences (one tuple per sample) is transposed and each
    field is collated recursively.

    Args:
        batch: sequence of samples as produced by the dataset.

    Returns:
        A stacked tensor, the unchanged group of numpy objects, or a list with
        one collated entry per sample field.

    Raises:
        TypeError: when the first element is not a tensor, a numpy object or a
            sequence.
    """
    try:  # location of the ABC on Python 3.3+
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence

    first = batch[0]
    if torch.is_tensor(first):
        out = None
        # `_use_shared_memory` is a private flag that not every PyTorch release
        # provides; when it is missing, fall back to a plain stack.
        if getattr(torch.utils.data.dataloader, '_use_shared_memory', False):
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = first.storage()._new_shared(numel)
            out = first.new(storage)
        return torch.stack(batch, 0, out=out)
    elif type(first).__module__ == 'numpy':
        return batch
    elif isinstance(first, Sequence):
        transposed = zip(*batch)
        return [my_collate(samples) for samples in transposed]

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(first))))

In [5]:
# Root folder; each split ('val', 'test') is a sub-folder of it.
DatasetDir = 'Datasets/'
# Per-channel constants handed to transforms.Normalize below.
mean, std = [0.5, 0.5, 0.5],[0.25, 0.25, 0.25]

# One preprocessing pipeline per split.  Only 'val' adds brightness/contrast
# jitter; both convert to a tensor and normalise with the constants above.
data_transforms = {
    'val': transforms.Compose([
        transforms.ColorJitter(brightness=0.3, contrast=0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
    'test': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
}

# The split name doubles as the folder name and as the dataset's `mode` tag.
image_datasets = {x: MyImageFolderAndBbox(os.path.join(DatasetDir, x),
                                          data_transforms[x], x)
                  for x in ['val', 'test']}

# my_collate stacks the images and leaves labels/boxes as per-sample arrays.
# The name `dataloders` (sic) is read as a global by finetue_model.
dataloders = {x: torch.utils.data.DataLoader(
                image_datasets[x], 
                batch_size=8,                            
                shuffle=True, 
                num_workers=4,
                collate_fn=my_collate
)
              for x in ['val', 'test']}

# Number of images per split; finetue_model uses it to average the losses.
dataset_sizes = {x: len(image_datasets[x]) for x in ['val', 'test']}
class_names = image_datasets['val'].classes


# Under Python 2 this prints the arguments as one tuple (see the cell output).
print('dataset_sizes :', dataset_sizes, 'class names :', class_names)

('dataset_sizes :', {'test': 250, 'val': 500}, 'class names :', ['five', 'l', 'one', 'seeyou', 'zero'])


# Model
## keep shallow but deeper
## leaky ReLU

In [6]:
#model
import torch.nn.functional as F
from yolo.utils.cython_bbox import bbox_ious, bbox_intersections, bbox_overlaps, anchor_intersections
from yolo.utils.cython_yolo import yolo_to_bbox
from multiprocessing import Pool
from pyinn.modules import Conv2dDepthwise


import yolo.config as cfg

# Show the configured network input and output sizes.  The call form prints
# the same text as the Python 2 print statement and matches the print(...)
# calls used elsewhere in this notebook.
print(cfg.inp_size)
print(cfg.out_size)


class YoloHand(nn.Module):
    """Small YOLO-style detector built from depthwise-separable convolutions.

    Layout: ``feature`` (one 3x3 conv followed by ten depthwise-separable
    blocks, four of them with stride 2, i.e. a total stride of 16),
    ``transfer`` (two 3x3 conv blocks) and ``final_conv`` (1x1 conv producing
    ``cfg.num_anchors * (cfg.num_classes + 5)`` channels per cell).  All
    activations are LeakyReLU.

    Args:
        width_mul (float): multiplier applied to the base channel counts
            (64/128/256/512); each product is truncated with ``int``.
        use_init (bool): accepted but not used.

    Attributes:
        bbox_loss, iou_loss, cls_loss: the individual loss terms computed by
            the most recent ``get_loss`` call (``None`` before the first call).
        pool: ``multiprocessing.Pool`` with 8 workers used to build the
            training targets on the CPU.
    """

    def __init__(self, width_mul=0.125, use_init=False):
        super(YoloHand, self).__init__()

        self.width_mul = width_mul

        def conv_bn(inp, oup, stride):
            # plain 3x3 convolution + batch norm + LeakyReLU
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.LeakyReLU(inplace=True),
            )

        def conv_dw(inp, oup, stride):
            # depthwise 3x3 (carries the stride) followed by a pointwise 1x1
            return nn.Sequential(
                Conv2dDepthwise(inp, 3, padding=1, stride=stride, bias=False),
                nn.BatchNorm2d(inp),
                nn.LeakyReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.LeakyReLU(inplace=True),
            )

        # Scaled channel counts.  Layer order and sizes must stay exactly as
        # they are: saved checkpoints address the layers by position.
        c64 = int(self.width_mul * 64)
        c128 = int(self.width_mul * 128)
        c256 = int(self.width_mul * 256)
        c512 = int(self.width_mul * 512)

        self.feature = nn.Sequential(  # feature of hand
            conv_bn(3, 10, 1),  # low level, keeps the input resolution
            conv_dw(10, c64, 2),
            conv_dw(c64, c64, 1),
            conv_dw(c64, c128, 2),
            conv_dw(c128, c128, 1),
            conv_dw(c128, c256, 2),
            conv_dw(c256, c256, 1),
            conv_dw(c256, c512, 2),

            conv_dw(c512, c512, 1),
            conv_dw(c512, c512, 1),
            conv_dw(c512, c512, 1),
        )

        # transfer
        inp = c512
        oup = c512
        self.transfer = nn.Sequential(
            nn.Conv2d(inp, oup, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(oup),
            nn.LeakyReLU(inplace=True),

            nn.Conv2d(oup, oup, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(oup),
            nn.LeakyReLU(inplace=True),
        )

        # linear
        out_channels = cfg.num_anchors * (cfg.num_classes + 5)
        self.final_conv = nn.Conv2d(oup, out_channels, 1, 1, padding=0, bias=True)

        # train
        self.bbox_loss = None
        self.iou_loss = None
        self.cls_loss = None
        self.pool = Pool(processes=8)

    def forward(self, im_data):
        """Run the detector on a batch of images.

        Args:
            im_data: image batch of shape (bsize, 3, H, W).

        Returns:
            tuple: ``(bbox_pred, iou_pred, prob_pred)`` with shapes
            (bsize, cells, num_anchors, 4), (bsize, cells, num_anchors, 1) and
            (bsize, cells, num_anchors, num_classes), where ``cells`` is the
            number of output grid cells.  ``bbox_pred`` holds
            ``[sig(tx), sig(ty), exp(tw), exp(th)]``.
        """
        # total stride 16: e.g. the 192x144 input configured here -> 12x9 cells
        feature_map = self.feature(im_data)
        hidden = self.transfer(feature_map)
        y = self.final_conv(hidden)

        # for detection
        bsize = y.size(0)
        # channels last, then split channels into (anchor, value):
        # shape=(bsize, wxh, num_a, num_c+5)
        y_reshaped = y.permute(0, 2, 3, 1).contiguous().view(
            bsize, -1, cfg.num_anchors, cfg.num_classes + 5)
        # bbox related 0~4
        xy_pred = F.sigmoid(y_reshaped[:, :, :, 0:2])
        wh_pred = torch.exp(y_reshaped[:, :, :, 2:4])
        bbox_pred = torch.cat([xy_pred, wh_pred], 3)  # (bsize, wxh, num_a, 4)
        iou_pred = F.sigmoid(y_reshaped[:, :, :, 4:5])  # (bsize, wxh, num_a, 1)
        # cls related 5~end; softmax over the class scores of each anchor
        score_pred = y_reshaped[:, :, :, 5:].contiguous()
        prob_pred = F.softmax(
            score_pred.view(-1, score_pred.size()[-1])
        ).view_as(score_pred)  # (bsize, wxh, num_a, num_cls)

        return bbox_pred, iou_pred, prob_pred

    def get_loss(self, preds, gt_boxes=None, gt_classes=None, dontcare=None):
        """Compute the detection loss for one batch.

        The three terms are masked sums of squared errors divided by the total
        number of ground-truth boxes in the batch; each term is also stored on
        the instance (``bbox_loss``, ``iou_loss``, ``cls_loss``).

        Args:
            preds: the tuple returned by ``forward``.
            gt_boxes: per-sample arrays of ground-truth boxes.
            gt_classes: per-sample arrays of ground-truth class indices.
            dontcare: passed through to the target builder, which ignores it.

        Returns:
            The sum of the three loss terms.
        """
        bbox_pred, iou_pred, prob_pred = preds
        bbox_pred_np = bbox_pred.data.cpu().numpy()
        iou_pred_np = iou_pred.data.cpu().numpy()

        gt_boxes_np = np.array(gt_boxes)
        gt_classes_np = np.array(gt_classes)

        # build detection target
        _boxes, _ious, _classes, _box_mask, _iou_mask, _class_mask = self._build_target_on_cpu(
            bbox_pred_np, gt_boxes_np, gt_classes_np, dontcare, iou_pred_np
        )

        num_boxes = sum((len(boxes) for boxes in gt_boxes))

        # Put the targets on the same device as the predictions.  The helper's
        # default always moves to CUDA, which fails for a model kept on the CPU.
        on_gpu = bbox_pred.data.is_cuda

        box_mask = np_to_variable(_box_mask, is_cuda=on_gpu, dtype=torch.FloatTensor)
        boxes = np_to_variable(_boxes, is_cuda=on_gpu)
        box_mask = box_mask.expand_as(boxes)
        self.bbox_loss = nn.MSELoss(size_average=False)(bbox_pred * box_mask, boxes * box_mask) / num_boxes

        iou_mask = np_to_variable(_iou_mask, is_cuda=on_gpu, dtype=torch.FloatTensor)
        ious = np_to_variable(_ious, is_cuda=on_gpu)
        self.iou_loss = nn.MSELoss(size_average=False)(iou_pred * iou_mask, ious * iou_mask) / num_boxes

        class_mask = np_to_variable(_class_mask, is_cuda=on_gpu, dtype=torch.FloatTensor)
        classes = np_to_variable(_classes, is_cuda=on_gpu)
        class_mask = class_mask.expand_as(prob_pred)
        self.cls_loss = nn.MSELoss(size_average=False)(prob_pred * class_mask, classes * class_mask) / num_boxes

        return self.bbox_loss + self.iou_loss + self.cls_loss

    def _build_target_on_cpu(self, bbox_pred_np, gt_boxes, gt_classes, dontcare, iou_pred_np):
        """Build the regression targets and masks for every sample of a batch.

        Each sample is handed to ``_process_batch`` through the worker pool and
        the per-sample results are stacked along a new batch axis.

        :param bbox_pred_np: shape: (bsize, h x w, num_anchors, 4) : (sig(tx), sig(ty), exp(tw), exp(th))
        :param gt_boxes: per-sample ground-truth boxes
        :param gt_classes: per-sample ground-truth class indices
        :param dontcare: unused
        :param iou_pred_np: shape: (bsize, h x w, num_anchors, 1)
        :return: (_boxes, _ious, _classes, _box_mask, _iou_mask, _class_mask)
        """

        bsize = bbox_pred_np.shape[0]

        try:
            targets = self.pool.map(
                _process_batch,
                ((bbox_pred_np[b], gt_boxes[b], gt_classes[b], iou_pred_np[b]) for b in range(bsize))
            )
        except Exception:
            # Shut the pool down and drop it, then re-raise.  The bare `raise`
            # keeps the original traceback.
            self.pool.close()
            del self.pool
            raise

        # targets is one 6-tuple per sample; regroup into six stacked arrays.
        return tuple(np.stack(column) for column in zip(*targets))
    
def _process_batch(data):
    """Build the training targets and loss masks for a single sample.

    Args:
        data: tuple ``(bbox_pred_np, gt_boxes, gt_classes, iou_pred_np)`` where
            ``bbox_pred_np`` is (cells, num_anchors, 4), ``iou_pred_np`` is
            (cells, num_anchors, 1), ``gt_boxes`` is (num_gt, 4) as
            ``[x0, y0, x1, y1]`` in input pixels and ``gt_classes`` holds one
            class index per ground-truth box.

    Returns:
        tuple: ``(_boxes, _ious, _classes, _box_mask, _iou_mask, _class_mask)``,
        all float64 arrays whose leading dimensions are (cells, num_anchors).
    """
    bbox_pred_np, gt_boxes, gt_classes, iou_pred_np = data

    # known cfg
    W, H = cfg.out_size
    inp_size = cfg.inp_size
    out_size = cfg.out_size

    # net output params
    wxh, num_anchors, _ = bbox_pred_np.shape

    # ground truth
    # np.float64 / int replace the np.float / np.int aliases, which newer
    # NumPy releases no longer provide; the resulting dtypes are the same.
    _classes = np.zeros([wxh, num_anchors, cfg.num_classes], dtype=np.float64)
    _class_mask = np.zeros([wxh, num_anchors, 1], dtype=np.float64)

    _ious = np.zeros([wxh, num_anchors, 1], dtype=np.float64)
    _iou_mask = np.zeros([wxh, num_anchors, 1], dtype=np.float64)

    # Default box target (0.5, 0.5, 1, 1) with a small weight for every anchor
    # that is not matched to a ground-truth box below.
    _boxes = np.zeros([wxh, num_anchors, 4], dtype=np.float64)
    _boxes[:, :, 0:2] = 0.5
    _boxes[:, :, 2:4] = 1.0
    _box_mask = np.zeros([wxh, num_anchors, 1], dtype=np.float64) + 0.01

    # scale pred_bbox
    anchors = np.ascontiguousarray(cfg.anchors, dtype=np.float64)
    bbox_pred_np = np.expand_dims(bbox_pred_np, 0)
    bbox_np = yolo_to_bbox(
        np.ascontiguousarray(bbox_pred_np, dtype=np.float64),
        anchors,
        H, W)
    bbox_np = bbox_np[0]  # (wxh, num_anchors, (x1, y1, x2, y2))   range: 0 ~ 1
    bbox_np[:, :, 0::2] *= float(inp_size[0])  # rescale x
    bbox_np[:, :, 1::2] *= float(inp_size[1])  # rescale y

    gt_boxes = np.asarray(gt_boxes, dtype=np.float64)

    # for each cell, compare predicted_bbox and gt_bbox
    bbox_np_b = np.reshape(bbox_np, [-1, 4])  # (wxhxnum_anchors, 4)
    ious = bbox_ious(
        np.ascontiguousarray(bbox_np_b, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64)
    )  # (wxhxnum_anchors, num_gt_boxes)
    best_ious = np.max(ious, axis=1).reshape(_iou_mask.shape)
    # No-object penalty.  One mask is used for both reading the predictions and
    # writing the penalty; the original read with `<` and wrote with `<=`,
    # which fails with a shape mismatch when an IoU equals the threshold.
    noobj = best_ious <= cfg.iou_thresh
    _iou_mask[noobj] = cfg.noobject_scale * (0 - iou_pred_np[noobj])

    # locate the * cell location * of each gt_boxes
    cell_w = float(inp_size[0]) / W  # stride
    cell_h = float(inp_size[1]) / H
    cx = (gt_boxes[:, 0] + gt_boxes[:, 2]) * 0.5 / cell_w
    cy = (gt_boxes[:, 1] + gt_boxes[:, 3]) * 0.5 / cell_h
    cell_inds = np.floor(cy) * W + np.floor(cx)  # row-major cell index
    cell_inds = cell_inds.astype(int)
    # get each gt_box's feature map bbox
    target_boxes = np.empty(gt_boxes.shape, dtype=np.float64)
    target_boxes[:, 0] = cx - np.floor(cx)  # cx
    target_boxes[:, 1] = cy - np.floor(cy)  # cy
    target_boxes[:, 2] = (gt_boxes[:, 2] - gt_boxes[:, 0]) / inp_size[0] * out_size[0]  # tw
    target_boxes[:, 3] = (gt_boxes[:, 3] - gt_boxes[:, 1]) / inp_size[1] * out_size[1]  # th

    # for each gt boxes, match the * best match anchor type*
    gt_boxes_resize = np.copy(gt_boxes)
    gt_boxes_resize[:, 0::2] *= (out_size[0] / float(inp_size[0]))
    gt_boxes_resize[:, 1::2] *= (out_size[1] / float(inp_size[1]))
    anchor_ious = anchor_intersections(
        anchors,
        np.ascontiguousarray(gt_boxes_resize, dtype=np.float64)
    )
    anchor_inds = np.argmax(anchor_ious, axis=0)

    ious_reshaped = np.reshape(ious, [wxh, num_anchors, len(cell_inds)])  # len(cell_inds) == num_gt_boxes
    for i, gt_box_cell_ind in enumerate(cell_inds):
        if gt_box_cell_ind >= wxh or gt_box_cell_ind < 0:
            # box centre falls outside the grid: report the index and skip it
            print(gt_box_cell_ind)
            continue

        a = anchor_inds[i]  # best match anchor index

        iou_pred_of_best_anchor_cell = iou_pred_np[gt_box_cell_ind, a, :]  # 0 ~ 1, should be close to 1
        _iou_mask[gt_box_cell_ind, a, :] = cfg.object_scale * (1 - iou_pred_of_best_anchor_cell)
        _ious[gt_box_cell_ind, a, :] = ious_reshaped[gt_box_cell_ind, a, i]

        _box_mask[gt_box_cell_ind, a, :] = cfg.coord_scale
        # width/height target expressed relative to the matched anchor
        target_boxes[i, 2:4] /= anchors[a]
        _boxes[gt_box_cell_ind, a, :] = target_boxes[i]

        _class_mask[gt_box_cell_ind, a, :] = cfg.class_scale
        _classes[gt_box_cell_ind, a, gt_classes[i]] = 1.

    return _boxes, _ious, _classes, _box_mask, _iou_mask, _class_mask
    
def np_to_variable(x, is_cuda=True, dtype=torch.FloatTensor, volatile=False):
    """Wrap a numpy array in an autograd ``Variable``.

    Args:
        x: numpy array to convert.
        is_cuda (bool): move the result to the GPU when True (the default).
        dtype: tensor type the data is cast to.
        volatile (bool): forwarded to the ``Variable`` constructor.

    Returns:
        Variable holding the data of ``x`` cast to ``dtype``.
    """
    tensor = torch.from_numpy(x).type(dtype)
    wrapped = Variable(tensor, volatile=volatile)
    return wrapped.cuda() if is_cuda else wrapped

(192, 144)
(12, 9)


In [7]:
# Build the detector and restore pretrained weights.  width_mul sets every
# layer's channel count, so it has to match the value the checkpoint was
# trained with (presumably the 0_158 in the file name -- confirm).
yolohandnet = YoloHand(width_mul=0.158)
yolohandnet.load_state_dict(torch.load('models/yolohanddetect-crop-5-face-lowres-deeper-leaky-0_158-0.0187'))

# Finetuning

In [15]:
def finetue_model(model, optimizer, scheduler, num_epochs=25):
    """Fine-tune ``model`` and return it loaded with its best weights.

    Every epoch runs two phases over the global ``dataloders``:

    * ``'test'`` -- the update phase: the scheduler is stepped, the loss is
      back-propagated and the optimizer is stepped.  The whole model is put in
      evaluation mode first and only ``final_conv`` is switched back to
      training mode.
    * ``'val'`` -- evaluation only; the loss is computed and reported.

    The weights giving the lowest mean ``'test'``-phase loss (when below the
    initial bound of 2) are snapshotted and loaded back at the end.

    Reads the globals ``dataloders``, ``dataset_sizes`` and ``use_gpu``.

    Args:
        model: network exposing ``get_loss``, ``bbox_loss``, ``iou_loss``,
            ``cls_loss`` and ``final_conv``.
        optimizer: optimizer over the parameters to fine-tune.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs (int): number of epochs to run.

    Returns:
        ``model`` with the best recorded weights loaded.
    """
    import copy  # local import so the cell does not depend on another cell

    def _scalar(value):
        # Python float from a single-element loss, whether it has shape (1,)
        # or is 0-dimensional.
        return float(value.data.cpu().numpy().reshape(-1)[0])

    since = time.time()

    # state_dict() hands out references to the live parameters, so the snapshot
    # must be a deep copy or it silently tracks later updates.
    best_model_wts = copy.deepcopy(model.state_dict())
    min_loss = 2

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has an update phase ('test') and an evaluation phase ('val')
        for phase in ['test', 'val']:
            if phase == 'test':
                scheduler.step()
                model.train(False)
                model.final_conv.train(True)  # only the final conv in training mode
            else:
                model.train(False)  # Set model to evaluate mode

            train_loss = 0.0
            bbox_loss, iou_loss, cls_loss = 0., 0., 0.

            for data in dataloders[phase]:
                # get the inputs
                im, gt_classes, gt_boxes = data
                # wrap them in Variable
                if use_gpu:
                    im = Variable(im.cuda())
                else:
                    im = Variable(im)

                # forward
                preds = model(im)

                # loss
                loss = model.get_loss(preds, gt_boxes, gt_classes)
                bbox_loss += _scalar(model.bbox_loss)
                iou_loss += _scalar(model.iou_loss)
                cls_loss += _scalar(model.cls_loss)
                train_loss += _scalar(loss)

                # zero the parameter gradients
                optimizer.zero_grad()
                # backward + optimize only in the update phase
                if phase == 'test':
                    loss.backward()
                    optimizer.step()

            # Mean loss per batch, estimated as
            # (sum of batch losses) / (dataset size / batch size).
            batch_size = dataloders[phase].batch_sampler.batch_size
            size = dataset_sizes[phase]

            print(phase)
            # analysis
            print('train_loss  {}'.format(train_loss / size * batch_size))
            print('bbox_loss  {}'.format(bbox_loss / size * batch_size))
            print('iou_loss  {}'.format(iou_loss / size * batch_size))
            print('cls_loss  {}'.format(cls_loss / size * batch_size))

            # save best model
            epoch_loss = train_loss / size * batch_size
            if phase == 'test' and epoch_loss < min_loss:
                min_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        # blank separator line (a bare print() shows "()" under Python 2)
        print('')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best finetune Loss: {:4f}'.format(min_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [16]:
## Set the data loaders' batch size.
# The loaders were built with batch_size=8; this overrides the value on their
# batch samplers.  finetue_model reads the same attribute when it averages the
# losses.
dataloders['val'].batch_sampler.batch_size=16
dataloders['test'].batch_sampler.batch_size=16

In [17]:
# Load the pretrained checkpoint into the existing model again, overwriting
# whatever weights it currently holds.
yolohandnet.load_state_dict(torch.load('models/yolohanddetect-crop-5-face-lowres-deeper-leaky-0_158-0.0187'))

In [18]:
## Training setup: fine-tune with Adadelta (lr=1) for 4 epochs.
# `model` is the same object as `yolohandnet`, not a copy.
model = yolohandnet
use_gpu = True
if use_gpu:
    model = model.cuda()

import itertools    
# Only the parameters of final_conv and transfer are handed to the optimizer;
# the feature extractor is therefore never updated by it.
finetune_params = itertools.chain(model.final_conv.parameters(), model.transfer.parameters()) 
optimizer = torch.optim.Adadelta(finetune_params, lr=1) # !!! transfer learn final conv param
# step_size=15 exceeds the 4 epochs run below, so the rate is not decayed here.
step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

model = finetue_model(model, optimizer, step_lr_scheduler, 4)

Epoch 0/3
----------
test
train_loss  0.797084196091
bbox_loss  0.0393536375761
iou_loss  0.560372293949
cls_loss  0.197358255863
val
train_loss  0.908167345047
bbox_loss  0.0654243674278
iou_loss  0.491021844625
cls_loss  0.351721132755
()
Epoch 1/3
----------
test
train_loss  0.420025598526
bbox_loss  0.0322082184553
iou_loss  0.300335801125
cls_loss  0.0874815744162
val
train_loss  0.867137432098
bbox_loss  0.0689067374468
iou_loss  0.499203795433
cls_loss  0.299026900768
()
Epoch 2/3
----------
test
train_loss  0.271883547783
bbox_loss  0.0317906343937
iou_loss  0.163116732121
cls_loss  0.0769761806428
val
train_loss  1.11156804657
bbox_loss  0.0673407982588
iou_loss  0.749857230425
cls_loss  0.294370014191
()
Epoch 3/3
----------
test
train_loss  0.25831751442
bbox_loss  0.0272524838448
iou_loss  0.159776792526
cls_loss  0.0712882406116
val
train_loss  1.06182183456
bbox_loss  0.073970068574
iou_loss  0.742362075806
cls_loss  0.245489682734
()
Training complete in 0m 6s
Best finet

In [None]:
## Training setup: same as the previous run but with Adadelta lr=0.3.
# `model` is the same object as `yolohandnet`, so this run starts from
# whatever weights that object holds when the cell is executed.
model = yolohandnet
use_gpu = True
if use_gpu:
    model = model.cuda()

import itertools    
# Only the parameters of final_conv and transfer are handed to the optimizer.
finetune_params = itertools.chain(model.final_conv.parameters(), model.transfer.parameters()) 
optimizer = torch.optim.Adadelta(finetune_params, lr=0.3) # !!! transfer learn final conv param
# step_size=15 exceeds the 4 epochs run below, so the rate is not decayed here.
step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

model = finetue_model(model, optimizer, step_lr_scheduler, 4)

In [None]:
# Persist the fine-tuned weights (state dict only, not the model object).
torch.save(model.state_dict(), 'models/yolohanddetect-crop-5-face-lowres-deeper-leaky-finetue')

# Practice

In [None]:
# Restore the fine-tuned weights saved above and switch every layer to
# evaluation mode for inference.
yolohandnet.load_state_dict(torch.load('models/yolohanddetect-crop-5-face-lowres-deeper-leaky-finetue'))
yolohandnet.eval()

In [None]:
import cv2
from collections import deque
from utils import postprocess, my_draw_detection

# Live demo: read frames from the camera, run the detector on each one and
# show the detections with a smoothed FPS figure.  Press 'q' in the window to
# stop.
cam = cv2.VideoCapture('/dev/video0')

means, stds = [0.5]*3, [0.25]*3
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(means, stds)
])

use_gpu = False
if use_gpu:
    yolohandnet.cuda()
else:
    yolohandnet.cpu()

fpss = deque(maxlen=10)  # FPS of the last 10 frames, averaged for display
try:
    while True:

        ret, frame = cam.read()
        if not ret:
            break

        # inference start
        since = time.time()

        # transform: BGR -> RGB, resize to the inference size, add batch dim
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, cfg.infer_inp_size)
        timg = trans(img)
        timg = timg.view(1, *timg.shape)
        if use_gpu:
            cimg = Variable(timg.cuda())
        else:
            cimg = Variable(timg)

        # forward
        net_output = yolohandnet(cimg)

        # post process; .cpu() makes the conversion valid when use_gpu is True
        bbox_pred, iou_pred, prob_pred = (
            out.data.cpu().numpy() for out in net_output
        )
        post_output = postprocess(bbox_pred, iou_pred, prob_pred, cfg, 0.6)
        bboxes, scores, cls_inds = post_output

        # inference end
        now = time.time()
        t_frame = now - since
        # guard against a zero interval on coarse clocks
        fps = 1.0 / max(t_frame, 1e-6)
        fpss.append(fps)
        fps = np.mean(np.array(fpss))

        # draw rect and msg; boxes are scaled back to the camera frame size
        frame = my_draw_detection(frame,
                                  bboxes, scores, cls_inds,
                                  cfg,
                                  scale=1.0 * frame.shape[0] / img.shape[0],
                                  thr=0,
                                  fps=fps)

        cv2.imshow('', frame)
        key = cv2.waitKey(1)

        # Compare by value, not identity; mask to the low byte because some
        # platforms set higher bits in the returned key code.
        if (key & 0xFF) == ord('q'):
            break
finally:
    # Release the camera and close the window even if a frame fails.
    cam.release()
    cv2.destroyAllWindows()