In [None]:
import numpy as np

import torch
from torch import Tensor

import torch.nn as nn
from torch.autograd import Variable

import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import datasets, models, transforms

import time
import os

# PyTorch Models

## Combine BN and conv: fold each BatchNorm layer into its preceding convolution

In [None]:
import torch.nn.functional as F

import yolo.config as cfg

In [None]:
class YoloHandComb(nn.Module):
    """BatchNorm-free variant of the hand-detection network.

    Every convolution carries its own bias and is followed directly by a
    LeakyReLU. The module tree (``feature`` / ``transfer`` / ``final_conv`` and
    the position of each conv inside its ``nn.Sequential``) is relied upon by
    the weight-folding code in this notebook and by saved checkpoints, so it
    must not be rearranged.

    Args:
        width_mul: multiplier applied to the base channel counts
            (64/128/256/512); each scaled width is truncated with ``int``.
    """

    def __init__(self, width_mul=0.125):
        super(YoloHandComb, self).__init__()

        self.width_mul = width_mul

        def scaled(channels):
            # Channel count after applying the width multiplier.
            return int(self.width_mul * channels)

        def conv_act(inp, oup, stride):
            # Plain 3x3 convolution + LeakyReLU.
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=True),
                nn.LeakyReLU(inplace=True),
            )

        def conv_dw(inp, oup, stride):
            # Depthwise 3x3 (groups=inp) followed by pointwise 1x1, each with LeakyReLU.
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride=stride, padding=1, groups=inp, bias=True),
                nn.LeakyReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=True),
                nn.LeakyReLU(inplace=True),
            )

        # (base output channels, stride) of every depthwise-separable block.
        # The trailing numbers are carried over from the original comments;
        # presumably the author's receptive-field estimate -- TODO confirm.
        dw_plan = (
            (64, 2),   # 7
            (64, 1),   # 11
            (128, 2),  # 19
            (128, 1),  # 27
            (256, 2),  # 43
            (256, 1),  # 59
            (512, 2),  # 91
            (512, 1),
            (512, 1),
            (512, 1),
        )

        # First layer keeps full resolution (stride 1) on the low-level features.
        blocks = [conv_act(3, 10, 1)]  # 3
        in_ch = 10
        for base_ch, stride in dw_plan:
            out_ch = scaled(base_ch)
            blocks.append(conv_dw(in_ch, out_ch, stride))
            in_ch = out_ch
        self.feature = nn.Sequential(*blocks)  # feature of hand

        # transfer: two 3x3 convolutions at constant width
        inp = scaled(512)
        oup = scaled(512)
        self.transfer = nn.Sequential(
            nn.Conv2d(inp, oup, kernel_size=3, stride=1, padding=1, bias=True),
            nn.LeakyReLU(inplace=True),

            nn.Conv2d(oup, oup, kernel_size=3, stride=1, padding=1, bias=True),
            nn.LeakyReLU(inplace=True),
        )

        # linear 1x1 head: one (num_classes + 5)-vector per anchor
        out_channels = cfg.num_anchors * (cfg.num_classes + 5)
        self.final_conv = nn.Conv2d(oup, out_channels, 1, 1, padding=0, bias=True)

    def forward(self, im_data):
        """Return the raw head output of shape (batch, num_anchors*(num_classes+5), H', W').

        The backbone contains four stride-2 blocks, so H' and W' are the input
        size divided by 16 (e.g. 320x240 --> 20x15).
        """
        feature_map = self.feature(im_data)
        h = self.transfer(feature_map)
        y = self.final_conv(h)

        return y

    def post_process(self, y):
        """Turn the raw output of ``forward`` into detection quantities.

        Returns:
            bbox_pred: (bsize, w*h, num_anchors, 4) as
                [sigmoid(tx), sigmoid(ty), exp(tw), exp(th)]
            iou_pred: (bsize, w*h, num_anchors, 1), sigmoid of channel 4
            prob_pred: (bsize, w*h, num_anchors, num_classes), softmax over classes
        """
        bsize = y.size(0)
        # channels-last, then split the channel axis into (anchor, value)
        y_reshaped = y.permute(0, 2, 3, 1).contiguous().view(
            bsize, -1, cfg.num_anchors, cfg.num_classes + 5)

        # bbox related 0~4
        xy_pred = torch.sigmoid(y_reshaped[:, :, :, 0:2])
        wh_pred = torch.exp(y_reshaped[:, :, :, 2:4])
        bbox_pred = torch.cat([xy_pred, wh_pred], 3)

        iou_pred = torch.sigmoid(y_reshaped[:, :, :, 4:5])

        # cls related 5~end; explicit dim replaces the deprecated implicit
        # choice (which, on the flattened 2-D view, was this same class axis)
        score_pred = y_reshaped[:, :, :, 5:].contiguous()
        prob_pred = F.softmax(score_pred, dim=3)

        return bbox_pred, iou_pred, prob_pred

In [None]:
class YoloHand(nn.Module):
    """Hand-detection network with BatchNorm after every backbone convolution.

    Each bias-free convolution is followed by ``BatchNorm2d`` and a LeakyReLU.
    The module tree (``feature`` / ``transfer`` / ``final_conv`` and the
    position of each layer inside its ``nn.Sequential``) is relied upon by
    saved checkpoints and by the BN-folding code in this notebook, so it must
    not be rearranged.

    Args:
        width_mul: multiplier applied to the base channel counts
            (64/128/256/512); each scaled width is truncated with ``int``.
    """

    def __init__(self, width_mul=0.125):
        super(YoloHand, self).__init__()

        self.width_mul = width_mul

        def scaled(channels):
            # Channel count after applying the width multiplier.
            return int(self.width_mul * channels)

        def conv_bn(inp, oup, stride):
            # 3x3 convolution -> BatchNorm -> LeakyReLU.
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.LeakyReLU(inplace=True),
            )

        def conv_dw(inp, oup, stride):
            # Depthwise 3x3 (groups=inp) then pointwise 1x1, each with BN + LeakyReLU.
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride=stride, padding=1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.LeakyReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.LeakyReLU(inplace=True),
            )

        # (base output channels, stride) of every depthwise-separable block.
        # The trailing numbers are carried over from the original comments;
        # presumably the author's receptive-field estimate -- TODO confirm.
        dw_plan = (
            (64, 2),   # 7
            (64, 1),   # 11
            (128, 2),  # 19
            (128, 1),  # 27
            (256, 2),  # 43
            (256, 1),  # 59
            (512, 2),  # 91
            (512, 1),
            (512, 1),
            (512, 1),
        )

        # First layer keeps full resolution (stride 1) on the low-level features.
        blocks = [conv_bn(3, 10, 1)]  # 3
        in_ch = 10
        for base_ch, stride in dw_plan:
            out_ch = scaled(base_ch)
            blocks.append(conv_dw(in_ch, out_ch, stride))
            in_ch = out_ch
        self.feature = nn.Sequential(*blocks)  # feature of hand

        # transfer: two 3x3 conv + BN + LeakyReLU stages at constant width
        inp = scaled(512)
        oup = scaled(512)
        self.transfer = nn.Sequential(
            nn.Conv2d(inp, oup, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(oup),
            nn.LeakyReLU(inplace=True),

            nn.Conv2d(oup, oup, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(oup),
            nn.LeakyReLU(inplace=True),
        )

        # linear 1x1 head: one (num_classes + 5)-vector per anchor
        out_channels = cfg.num_anchors * (cfg.num_classes + 5)
        self.final_conv = nn.Conv2d(oup, out_channels, 1, 1, padding=0, bias=True)

    def forward(self, im_data):
        """Return the raw head output of shape (batch, num_anchors*(num_classes+5), H', W').

        The backbone contains four stride-2 blocks, so H' and W' are the input
        size divided by 16 (e.g. 320x240 --> 20x15).
        """
        feature_map = self.feature(im_data)
        h = self.transfer(feature_map)
        y = self.final_conv(h)

        return y

    def post_process(self, y):
        """Turn the raw output of ``forward`` into detection quantities.

        Returns:
            bbox_pred: (bsize, w*h, num_anchors, 4) as
                [sigmoid(tx), sigmoid(ty), exp(tw), exp(th)]
            iou_pred: (bsize, w*h, num_anchors, 1), sigmoid of channel 4
            prob_pred: (bsize, w*h, num_anchors, num_classes), softmax over classes
        """
        bsize = y.size(0)
        # channels-last, then split the channel axis into (anchor, value)
        y_reshaped = y.permute(0, 2, 3, 1).contiguous().view(
            bsize, -1, cfg.num_anchors, cfg.num_classes + 5)

        # bbox related 0~4
        xy_pred = torch.sigmoid(y_reshaped[:, :, :, 0:2])
        wh_pred = torch.exp(y_reshaped[:, :, :, 2:4])
        bbox_pred = torch.cat([xy_pred, wh_pred], 3)

        iou_pred = torch.sigmoid(y_reshaped[:, :, :, 4:5])

        # cls related 5~end; explicit dim replaces the deprecated implicit
        # choice (which, on the flattened 2-D view, was this same class axis)
        score_pred = y_reshaped[:, :, :, 5:].contiguous()
        prob_pred = F.softmax(score_pred, dim=3)

        return bbox_pred, iou_pred, prob_pred

In [None]:
# Rebuild the BatchNorm network and restore its trained parameters.
# NOTE(review): the checkpoint name ends in "finetue" while the combined model
# saved later in this notebook ends in "finetune" -- confirm the file name on disk.
net = YoloHand(width_mul=0.158)
trained_state = torch.load('models/yolohanddetect-crop-5-face-lowres-deeper-leaky-finetue')
net.load_state_dict(trained_state)
# eval() makes BatchNorm use its running statistics.
net.eval()

In [None]:
# BatchNorm-free twin of `net`; width_mul must match the one used for `net`
# so that every convolution has the same shape when weights are transferred.
netc = YoloHandComb(width_mul=0.158)
netc.eval()

In [None]:
def fuse_conv_bn(conv, bn, fused_conv):
    """Fold an eval-mode BatchNorm2d into the convolution that precedes it.

    With gamma/beta the BN affine parameters and mean/var its running
    statistics,

        BN(conv(x)) = gamma * (W*x + b - mean) / sqrt(var + eps) + beta

    which is a single convolution with

        W' = W * gamma / sqrt(var + eps)          (scaled per output channel)
        b' = beta + (b - mean) * gamma / sqrt(var + eps)

    Args:
        conv: source convolution (its bias may be ``None``).
        bn: BatchNorm2d applied to ``conv``'s output.
        fused_conv: destination convolution of the same shape, created with
            ``bias=True``; its weight and bias data are overwritten.
    """
    W = conv.weight.data
    gamma = bn.weight.data
    beta = bn.bias.data
    svar = torch.sqrt(bn.running_var + bn.eps)
    mean = bn.running_mean

    fused_bias = beta - gamma * mean / svar
    if conv.bias is not None:
        # Not needed for YoloHand (all BN'd convs are bias-free), kept for generality.
        fused_bias = fused_bias + conv.bias.data * gamma / svar

    fused_conv.weight.data = W * (gamma / svar).view(-1, 1, 1, 1)
    fused_conv.bias.data = fused_bias


# first block: [conv, bn, leaky] in net -> [conv, leaky] in netc
fuse_conv_bn(net.feature[0][0], net.feature[0][1], netc.feature[0][0])

# depthwise-separable blocks:
#   net : [dw conv, bn, leaky, pw conv, bn, leaky]   (indices 0,1 and 3,4)
#   netc: [dw conv, leaky, pw conv, leaky]           (indices 0 and 2)
for i in range(1, len(net.feature)):
    src_block, dst_block = net.feature[i], netc.feature[i]
    fuse_conv_bn(src_block[0], src_block[1], dst_block[0])
    fuse_conv_bn(src_block[3], src_block[4], dst_block[2])

# transfer: (conv, bn, leaky) triples in net -> (conv, leaky) pairs in netc.
# Iterate once per triple instead of once per layer.
for k in range(len(net.transfer) // 3):
    fuse_conv_bn(net.transfer[3 * k], net.transfer[3 * k + 1], netc.transfer[2 * k])

# final conv has no BatchNorm: copy it verbatim. The bias must be copied too,
# otherwise netc keeps its random initial bias and its output differs from net.
# clone() keeps the two networks from sharing parameter storage.
netc.final_conv.weight.data = net.final_conv.weight.data.clone()
netc.final_conv.bias.data = net.final_conv.bias.data.clone()

# Sanity check: both networks are in eval mode, so they should now agree up to
# floating-point rounding on any input.
probe = Variable(torch.rand(1, 3, 240, 320))
max_diff = (net(probe).data - netc(probe).data).abs().max()
print('max |net - netc| on a random input: %s' % float(max_diff))

In [None]:
# Persist the BN-folded weights so the combined model can be reloaded without
# re-running the folding cell.
torch.save(netc.state_dict(), 'models/combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune')

In [None]:
# Reload the combined weights from disk into the existing `netc`
# (requires `netc` to have been constructed by an earlier cell).
netc.load_state_dict(torch.load('models/combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune'))
netc.eval()

## Try out the combined model and measure its speed

In [None]:
import cv2
from torchvision import transforms
from torch.autograd import Variable
import time
from collections import deque
from utils import postprocess, my_draw_detection


# Live demo: run the combined network on webcam frames and overlay the
# detections together with a 10-frame moving average of the inference FPS.
cam = cv2.VideoCapture('/dev/video0')

# ToTensor scales pixels to [0, 1]; Normalize then applies (x - 0.5) / 0.25 per channel.
means, stds = [0.5]*3, [0.25]*3
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(means, stds)
])

use_gpu = False
yolohandnet = netc
if use_gpu:
    yolohandnet.cuda()
else:
    yolohandnet.cpu()

fpss = deque(maxlen=10)
try:
    while True:

        ret, frame = cam.read()
        if not ret:
            break

        # inference start
        since = time.time()

        # transform: OpenCV delivers BGR, the network input is built from RGB
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, tuple(cfg.infer_inp_size))
        timg = trans(img).unsqueeze(0)  # add the batch dimension
        if use_gpu:
            cimg = Variable(timg.cuda())
        else:
            cimg = Variable(timg)

        # forward
        net_output = yolohandnet.post_process(yolohandnet(cimg))

        # post process; .cpu() is a no-op on CPU and makes .numpy() valid
        # when use_gpu is True
        bbox_pred, iou_pred, prob_pred = net_output
        bbox_pred = bbox_pred.data.cpu().numpy()
        iou_pred = iou_pred.data.cpu().numpy()
        prob_pred = prob_pred.data.cpu().numpy()
        post_output = postprocess(bbox_pred, iou_pred, prob_pred, cfg, 0.6)
        bboxes, scores, cls_inds = post_output

        # inference end
        now = time.time()
        t_frame = now - since
        fps = 1.0 / max(t_frame, 1e-6)  # guard against a zero-length interval
        fpss.append(fps)
        fps = np.mean(np.array(fpss))

        # draw rect and msg; boxes are in resized-image coordinates, so scale
        # them back to the captured frame
        frame = my_draw_detection(frame,
                                  bboxes, scores, cls_inds,
                                  cfg,
                                  scale=1.0 * frame.shape[0] / img.shape[0],
                                  thr=0,
                                  fps=fps)

        cv2.imshow('', frame)
        key = cv2.waitKey(1)

        # Compare by value (not identity) and mask to the low byte, since
        # waitKey may return extra high bits on some platforms.
        if (key & 0xFF) == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on error or interrupt.
    cam.release()
    cv2.destroyAllWindows()

## Convert to a Caffe model

In [None]:
from pytorch2caffe.pytorch2caffe import pytorch2caffe, plot_graph

In [None]:
# Trace the combined network once on a dummy 1x3x240x320 input; the resulting
# output variable is what plot_graph / pytorch2caffe consume.
input_var = Variable(torch.rand(1, 3, 240, 320))
output_var = netc(input_var)

output_dir = 'models'
dot_path = os.path.join(output_dir, 'yolov2.dot')
prototxt_path = os.path.join(output_dir, 'combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune.prototxt')
caffemodel_path = os.path.join(output_dir, 'combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune.caffemodel')

# plot graph to png
plot_graph(output_var, dot_path)

# write the network definition and weights for Caffe
pytorch2caffe(input_var, output_var, prototxt_path, caffemodel_path)

## Load the Caffe model and test it

## load model

In [None]:
import os, caffe

# Locate the exported network definition and weights.
output_dir = 'models'
model_def = os.path.join(
    output_dir, 'combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune.prototxt')
model_weights = os.path.join(
    output_dir, 'combined-yolohanddetect-crop-5-face-lowres-deeper-leaky-finetune.caffemodel')

# Arguments: structure of the model, trained weights, and test mode
# (e.g., don't perform dropout).
caffe_net = caffe.Net(model_def, model_weights, caffe.TEST)

In [None]:
import numpy as np
import cv2, time
from collections import deque
from utils import net_postprocess, postprocess, my_draw_detection
from yolo import config as cfg

# Live demo with the converted Caffe network: same pipeline as the PyTorch
# demo, with preprocessing done by a caffe.io.Transformer.
cam = cv2.VideoCapture('/dev/video0')

mu = np.asarray([0.5, 0.5, 0.5])
std = np.asarray([0.25, 0.25, 0.25])
# Single-argument print(...) is valid on both Python 2 and Python 3.
print('mean-subtracted values: %s' % (list(zip('RGB', mu)),))

# create transformer for the input called 'data'; together the settings
# below reproduce (x / 255 - 0.5) / 0.25
transformer = caffe.io.Transformer({'data': caffe_net.blobs['data'].data.shape})
transformer.set_transpose('data', (2,0,1))  # move image channels to outermost dimension
transformer.set_raw_scale('data', 1/255.)      # rescale to [0, 1] from [0, 255]
transformer.set_mean('data', mu)            # subtract the dataset-mean value in each channel
transformer.set_input_scale('data', 1/0.25)  # divide by the per-channel std (0.25, cf. `std`)

yolohandnet = caffe_net

fpss = deque(maxlen=10)
try:
    while True:

        ret, frame = cam.read()
        if not ret:
            break

        # inference start
        since = time.time()

        # transform: OpenCV delivers BGR, the network input is built from RGB
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, tuple(cfg.infer_inp_size))
        transformed_image = transformer.preprocess('data', img)

        # forward
        yolohandnet.blobs['data'].data[...] = transformed_image
        # NOTE(review): 'ConvNdBackward47' is the output blob name in the
        # exported prototxt -- it changes if the network is re-exported with a
        # different layer count; confirm against the .prototxt.
        net_output = yolohandnet.forward()['ConvNdBackward47']
        net_output = net_postprocess(net_output, cfg)

        # post process
        bbox_pred, iou_pred, prob_pred = net_output
        post_output = postprocess(bbox_pred, iou_pred, prob_pred, cfg, 0.6)
        bboxes, scores, cls_inds = post_output

        # inference end
        now = time.time()
        t_frame = now - since
        fps = 1.0 / max(t_frame, 1e-6)  # guard against a zero-length interval
        fpss.append(fps)
        fps = np.mean(np.array(fpss))

        # draw rect and msg; boxes are in resized-image coordinates, so scale
        # them back to the captured frame
        frame = my_draw_detection(frame,
                                  bboxes, scores, cls_inds,
                                  cfg,
                                  scale=1.0 * frame.shape[0] / img.shape[0],
                                  thr=0,
                                  fps=fps)

        cv2.imshow('', frame)
        key = cv2.waitKey(1)

        # Compare by value (not identity) and mask to the low byte, since
        # waitKey may return extra high bits on some platforms.
        if (key & 0xFF) == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on error or interrupt.
    cam.release()
    cv2.destroyAllWindows()