In [52]:
import torchvision.models
import torchvision.models._utils as _utils

# Build torchvision's MobileNetV3-Small; no arguments are passed, so the library defaults apply.
mov3sm = torchvision.models.mobilenet_v3_small()

In [53]:
print(mov3sm)

MobileNetV3(
  (features): Sequential(
    (0): ConvBNActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
          (relu): ReLU(inplace=True)
          (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_r

In [54]:
# Scratch notes -- presumably candidate per-stage channel widths
# (TODO confirm which backbone each row refers to):
# 64 128 256
# 64 128 256 512
# 16 24 40 48 96

# 40 48 96  <- channel counts of the three tapped outputs (see the .shape cells further down)
# Tap mov3sm.features at children '4', '7' and '11' and expose them under keys 0, 1 and 2.
body5 = _utils.IntermediateLayerGetter(mov3sm.features, {'4':0, '7':1, '11':2})

In [55]:
body5

IntermediateLayerGetter(
  (0): ConvBNActivation(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
    (2): Hardswish()
  )
  (1): InvertedResidual(
    (block): Sequential(
      (0): ConvBNActivation(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
        (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (1): SqueezeExcitation(
        (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
        (relu): ReLU(inplace=True)
        (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
      )
      (2): ConvBNActivation(
        (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): Identity()
      )
   

In [56]:
import numpy as np
import cv2
import torch


In [57]:
from data import WiderFaceDetection, detection_collate, preproc, cfg_mnet, cfg_re50

# Image size handed to preproc; the batch pulled further down has 840x840 images.
img_dim = 840
# Per-channel mean handed to preproc.
rgb_mean = (104, 117, 123) # bgr order

# preproc(img_dim, rgb_mean) is passed as the dataset's second positional argument --
# presumably the per-sample transform (TODO confirm against data.WiderFaceDetection).
dataset = WiderFaceDetection( './data/widerface/train/label.txt',preproc(img_dim, rgb_mean))

In [58]:
import torch.utils.data as data

# Shuffled loader with batch size 2 and 4 worker processes, batching via detection_collate;
# wrapped in iter() so individual batches can be pulled with next().
# NOTE(review): shuffle=True with no seed set in this notebook, so the batch differs between runs.
batch_iterator = iter(data.DataLoader(dataset, 2, shuffle=True, num_workers=4, collate_fn=detection_collate))

In [59]:
# Pull a single batch; every model below is fed this same `images` tensor.
images, targets = next(batch_iterator)

In [60]:
images.shape

torch.Size([2, 3, 840, 840])

In [61]:
targets

[tensor([[0.6300, 0.0920, 0.6540, 0.1420, 0.6380, 0.1080, 0.6400, 0.1100, 0.6300,
          0.1140, 0.6320, 0.1260, 0.6340, 0.1260, 1.0000],
         [0.5120, 0.0360, 0.5660, 0.1280, 0.5329, 0.0688, 0.5353, 0.0706, 0.5115,
          0.0851, 0.5173, 0.1031, 0.5225, 0.1042, 1.0000],
         [0.1480, 0.1320, 0.1700, 0.1720, 0.1560, 0.1440, 0.1580, 0.1440, 0.1460,
          0.1500, 0.1540, 0.1580, 0.1520, 0.1580, 1.0000],
         [0.0020, 0.0460, 0.0520, 0.1060, 0.0214, 0.0642, 0.0302, 0.0566, 0.0364,
          0.0689, 0.0352, 0.0857, 0.0414, 0.0827, 1.0000]]),
 tensor([[0.7122, 0.0203, 0.8983, 0.2645, 0.7449, 0.1163, 0.8283, 0.1225, 0.7681,
          0.1657, 0.7495, 0.1950, 0.8206, 0.2012, 1.0000],
         [0.4215, 0.0378, 0.6134, 0.1948, 0.5008, 0.0641, 0.5732, 0.1052, 0.5153,
          0.0979, 0.4718, 0.1257, 0.5092, 0.1474, 1.0000],
         [0.0262, 0.0203, 0.1831, 0.2471, 0.0571, 0.1193, 0.1306, 0.1193, 0.0888,
          0.1726, 0.0557, 0.1856, 0.1378, 0.1884, 1.0000]])]

In [62]:
# Run the batch through the tapped backbone; the result is an OrderedDict keyed 0, 1, 2.
body5outputs = body5(images)
# NOTE(review): displaying the dict prints the full tensors; the .shape cells below are the compact view.
body5outputs

OrderedDict([(0,
              tensor([[[[-1.0287, -0.2955, -0.8011,  ..., -0.6917, -0.3188, -0.1718],
                        [-0.8955, -0.4953, -0.2492,  ..., -1.3434, -0.6797, -0.5948],
                        [ 0.1749,  0.8077,  0.0800,  ..., -0.4142, -1.0549, -0.2239],
                        ...,
                        [-0.0838, -0.6984,  0.2127,  ...,  0.8075,  0.8470, -0.3681],
                        [ 0.0345, -0.4816,  0.2770,  ...,  0.3087,  0.5350,  0.1047],
                        [ 0.0804, -0.0887,  0.6788,  ...,  0.2320,  0.6985, -0.1012]],
              
                       [[-0.8315,  0.6266, -0.2660,  ..., -0.7062,  0.7801,  0.5138],
                        [ 0.4540, -1.8000, -0.7581,  ..., -1.2792,  0.1847, -1.1135],
                        [-0.1133, -1.6506, -0.6727,  ...,  1.6151,  0.3576,  0.6341],
                        ...,
                        [-0.3389,  0.7374,  0.2495,  ...,  0.4880,  0.3129,  0.1078],
                        [-0.3701,  1.2801, -0.079

In [63]:
body5outputs.keys()

odict_keys([0, 1, 2])

In [64]:
body5outputs.values()

odict_values([tensor([[[[-1.0287, -0.2955, -0.8011,  ..., -0.6917, -0.3188, -0.1718],
          [-0.8955, -0.4953, -0.2492,  ..., -1.3434, -0.6797, -0.5948],
          [ 0.1749,  0.8077,  0.0800,  ..., -0.4142, -1.0549, -0.2239],
          ...,
          [-0.0838, -0.6984,  0.2127,  ...,  0.8075,  0.8470, -0.3681],
          [ 0.0345, -0.4816,  0.2770,  ...,  0.3087,  0.5350,  0.1047],
          [ 0.0804, -0.0887,  0.6788,  ...,  0.2320,  0.6985, -0.1012]],

         [[-0.8315,  0.6266, -0.2660,  ..., -0.7062,  0.7801,  0.5138],
          [ 0.4540, -1.8000, -0.7581,  ..., -1.2792,  0.1847, -1.1135],
          [-0.1133, -1.6506, -0.6727,  ...,  1.6151,  0.3576,  0.6341],
          ...,
          [-0.3389,  0.7374,  0.2495,  ...,  0.4880,  0.3129,  0.1078],
          [-0.3701,  1.2801, -0.0799,  ...,  0.4561,  0.3373,  0.3979],
          [-0.4876,  0.8539,  0.1361,  ...,  0.3005,  0.3353,  0.2256]],

         [[-0.2215,  0.2319,  0.1879,  ...,  1.5161, -0.6371,  0.1451],
          [-0.29

In [65]:
# Materialise the feature maps as a list so they can be indexed by position.
body5outputsvalues = list(body5outputs.values())

In [66]:
body5outputsvalues[0].shape

torch.Size([2, 40, 53, 53])

In [67]:
body5outputsvalues[1].shape

torch.Size([2, 48, 53, 53])

In [68]:
body5outputsvalues[2].shape

torch.Size([2, 96, 27, 27])

In [69]:
from models.retinaface import RetinaFace

# Reference model: the project's RetinaFace built from the cfg_mnet settings imported from `data`.
cfg = cfg_mnet

mnet1 = RetinaFace(cfg)


In [70]:
import torch

# Forward pass with autograd disabled; only the output shapes are inspected below.
with torch.no_grad():
    mnet1outputs = mnet1(images)

In [71]:
import numpy as np

len(mnet1outputs)

3

In [72]:
mnet1outputs[0].shape

torch.Size([2, 29126, 4])

In [73]:
mnet1outputs[1].shape

torch.Size([2, 29126, 2])

In [74]:
mnet1outputs[2].shape

torch.Size([2, 29126, 10])

In [199]:
import torch
import torch.nn as nn
import torchvision.models.detection.backbone_utils as backbone_utils
import torchvision.models._utils as _utils
import torch.nn.functional as F
from collections import OrderedDict

from models.net import MobileNetV1 as MobileNetV1
from models.net import FPN as FPN
from models.net import SSH as SSH

def conv_bn(inp, oup, stride = 1, leaky = 0):
    """Assemble a 3x3 convolution -> BatchNorm2d -> LeakyReLU stack.

    Args:
        inp: Input channel count of the convolution.
        oup: Output channel count (also the BatchNorm2d width).
        stride: Stride of the convolution.
        leaky: Negative slope given to LeakyReLU; 0 makes it act as a plain ReLU.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    # padding=1 with a 3x3 kernel; the conv has no bias.
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.LeakyReLU(negative_slope=leaky, inplace=True)
    return nn.Sequential(conv, norm, activation)

def conv_bn_no_relu(inp, oup, stride):
    """Assemble a 3x3 convolution followed by BatchNorm2d, with no activation.

    Args:
        inp: Input channel count of the convolution.
        oup: Output channel count (also the BatchNorm2d width).
        stride: Stride of the convolution.

    Returns:
        An ``nn.Sequential`` of the convolution and the batch norm.
    """
    layers = [
        nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False),
        nn.BatchNorm2d(oup),
    ]
    return nn.Sequential(*layers)

def conv_bn1X1(inp, oup, stride, leaky=0):
    """Assemble a 1x1 convolution -> BatchNorm2d -> LeakyReLU stack.

    Args:
        inp: Input channel count of the convolution.
        oup: Output channel count (also the BatchNorm2d width).
        stride: Stride of the convolution.
        leaky: Negative slope given to LeakyReLU; 0 makes it act as a plain ReLU.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=stride, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.LeakyReLU(negative_slope=leaky, inplace=True)
    return nn.Sequential(pointwise, norm, activation)

def conv_dw(inp, oup, stride, leaky=0.1):
    """Assemble a depthwise 3x3 stage followed by a pointwise 1x1 stage.

    Each stage is convolution -> BatchNorm2d -> LeakyReLU. The first
    convolution uses ``groups=inp`` and keeps the channel count; the second
    maps ``inp`` channels to ``oup``.

    Args:
        inp: Input channel count.
        oup: Output channel count.
        stride: Stride of the depthwise convolution (the 1x1 conv uses stride 1).
        leaky: Negative slope given to both LeakyReLU layers.

    Returns:
        An ``nn.Sequential`` of the six layers.
    """
    depthwise_stage = [
        nn.Conv2d(inp, inp, kernel_size=3, stride=stride, padding=1, groups=inp, bias=False),
        nn.BatchNorm2d(inp),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    pointwise_stage = [
        nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(oup),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    return nn.Sequential(*depthwise_stage, *pointwise_stage)

class SSH_mv3(nn.Module):
    """Context module with three parallel branches built from stacked 3x3 convolutions.

    The branches produce ``out_channel//2``, ``out_channel//4`` and
    ``out_channel//4`` channels; they are concatenated along the channel axis
    (``out_channel`` channels in total) and passed through a ReLU.
    """

    def __init__(self, in_channel, out_channel):
        """
        :param in_channel: channel count of the incoming feature map.
        :param out_channel: channel count of the output; must be divisible by 4.
        :raises AssertionError: if ``out_channel`` is not a multiple of 4.
        """
        # Fix: this used to be super(SSH, self), which names the imported,
        # unrelated SSH class; SSH_mv3 is not a subclass of it, so the
        # constructor failed with a TypeError before building any layer.
        super(SSH_mv3, self).__init__()
        assert out_channel % 4 == 0
        # Narrow modules (<= 64 output channels) use a leaky slope of 0.1,
        # wider ones a slope of 0.
        leaky = 0.1 if out_channel <= 64 else 0

        # Branch 1: a single 3x3 conv.
        self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1)

        # Branch 2: two stacked 3x3 convs.
        self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky)
        self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1)

        # Branch 3: three stacked 3x3 convs, sharing conv5X5_1 with branch 2.
        # The attribute names (including the lower-case x in conv7x7_3) are
        # kept as they are because they become state_dict keys.
        self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky)
        self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1)

    def forward(self, input):
        """Apply the three branches to ``input`` and return ReLU(concat) along dim 1."""
        conv3X3 = self.conv3X3(input)

        # The output of conv5X5_1 feeds both the second and the third branch.
        conv5X5_1 = self.conv5X5_1(input)
        conv5X5 = self.conv5X5_2(conv5X5_1)

        conv7X7_2 = self.conv7X7_2(conv5X5_1)
        conv7X7 = self.conv7x7_3(conv7X7_2)

        out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1)
        out = F.relu(out)
        return out

class ClassHead(nn.Module):
    """1x1-convolution head producing two values per anchor at each spatial location."""

    def __init__(self,inchannels=512,num_anchors=3):
        """
        :param inchannels: channel count of the incoming feature map.
        :param num_anchors: anchors per spatial location; the conv emits 2 channels per anchor.
        """
        super().__init__()
        self.num_anchors = num_anchors
        self.conv1x1 = nn.Conv2d(
            inchannels,
            2 * self.num_anchors,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self,x):
        """Return a tensor of shape (batch, -1, 2) built from the channels-last conv output."""
        scores = self.conv1x1(x)
        # Move channels to the last axis so each row of the final view holds
        # the 2 values belonging to one anchor.
        channels_last = scores.permute(0, 2, 3, 1).contiguous()
        batch = channels_last.shape[0]
        return channels_last.view(batch, -1, 2)

class BboxHead(nn.Module):
    """1x1-convolution head producing four values per anchor at each spatial location."""

    def __init__(self,inchannels=512,num_anchors=3):
        """
        :param inchannels: channel count of the incoming feature map.
        :param num_anchors: anchors per spatial location; the conv emits 4 channels per anchor.
        """
        super().__init__()
        self.conv1x1 = nn.Conv2d(
            inchannels,
            4 * num_anchors,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self,x):
        """Return a tensor of shape (batch, -1, 4) built from the channels-last conv output."""
        deltas = self.conv1x1(x)
        # Channels go last so that each row of the final view holds the
        # 4 values belonging to one anchor.
        channels_last = deltas.permute(0, 2, 3, 1).contiguous()
        batch = channels_last.shape[0]
        return channels_last.view(batch, -1, 4)

class LandmarkHead(nn.Module):
    """1x1-convolution head producing ten values per anchor at each spatial location."""

    def __init__(self,inchannels=512,num_anchors=3):
        """
        :param inchannels: channel count of the incoming feature map.
        :param num_anchors: anchors per spatial location; the conv emits 10 channels per anchor.
        """
        super().__init__()
        self.conv1x1 = nn.Conv2d(
            inchannels,
            10 * num_anchors,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self,x):
        """Return a tensor of shape (batch, -1, 10) built from the channels-last conv output."""
        points = self.conv1x1(x)
        # Channels go last so that each row of the final view holds the
        # 10 values belonging to one anchor.
        channels_last = points.permute(0, 2, 3, 1).contiguous()
        batch = channels_last.shape[0]
        return channels_last.view(batch, -1, 10)


class RetinaFace_mobilenetv3sm(nn.Module):
    """Detector wiring a torchvision MobileNetV3-Small backbone into FPN, SSH and three heads.

    Three intermediate backbone outputs go through ``FPN``; each pyramid level
    gets its own ``SSH`` block and its own box / class / landmark head. The
    per-level head outputs are concatenated along dim 1.

    NOTE(review): in this notebook taps '4' and '7' yield feature maps of the
    same spatial size (53x53 for an 840x840 input), and the model emits 12694
    anchors where the reference RetinaFace models emit 29126 -- confirm the
    tap choice matches the anchor layout used for training.
    """

    def __init__(self, cfg = None, phase = 'train'):
        """
        :param cfg:  Network related settings. Not read by this class: the
                     cfg['in_channel'] / cfg['out_channel'] lookups are commented out.
        :param phase: train or test.
        """
        super().__init__()
        self.phase = phase

        backbone = torchvision.models.mobilenet_v3_small()
        # backbone.features[4].block[3][0].stride=(1,1)
        # backbone.features[9].block[3][0].stride=(1,1)
        print(backbone)

        # Children '4', '7' and '11' of backbone.features are exposed as outputs 0, 1, 2.
        tapped_layers = {'4':0, '7':1, '11':2}
        self.body = _utils.IntermediateLayerGetter(backbone.features, tapped_layers)

        # Channel counts of the three tapped outputs, and the width used from FPN onwards.
        # in_channels_stage2 = cfg['in_channel']
        # out_channels = cfg['out_channel']
        in_channels_list = [40, 48, 96]
        out_channels = 64

        self.fpn = FPN(in_channels_list,out_channels)
        self.ssh1 = SSH(out_channels, out_channels)
        self.ssh2 = SSH(out_channels, out_channels)
        self.ssh3 = SSH(out_channels, out_channels)

        self.ClassHead = self._make_class_head(fpn_num=3, inchannels=out_channels)
        self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=out_channels)
        self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=out_channels)

    def _make_class_head(self,fpn_num=3,inchannels=64,anchor_num=2):
        """Return a ModuleList with one ClassHead per pyramid level."""
        return nn.ModuleList([ClassHead(inchannels,anchor_num) for _ in range(fpn_num)])

    def _make_bbox_head(self,fpn_num=3,inchannels=64,anchor_num=2):
        """Return a ModuleList with one BboxHead per pyramid level."""
        return nn.ModuleList([BboxHead(inchannels,anchor_num) for _ in range(fpn_num)])

    def _make_landmark_head(self,fpn_num=3,inchannels=64,anchor_num=2):
        """Return a ModuleList with one LandmarkHead per pyramid level."""
        return nn.ModuleList([LandmarkHead(inchannels,anchor_num) for _ in range(fpn_num)])

    def forward(self,inputs):
        """Run backbone -> FPN -> SSH -> heads.

        Returns a tuple ``(bbox_regressions, classifications, ldm_regressions)``;
        outside the 'train' phase the classifications are passed through a
        softmax over the last dimension.
        """
        backbone_maps = self.body(inputs)

        # FPN
        pyramid = self.fpn(backbone_maps)

        # SSH: one block per pyramid level
        ssh_blocks = (self.ssh1, self.ssh2, self.ssh3)
        features = [block(pyramid[level]) for level, block in enumerate(ssh_blocks)]
        feature1, feature2, feature3 = features
        print("feature1.shape: ", feature1.shape)
        print("feature2.shape: ", feature2.shape)
        print("feature3.shape: ", feature3.shape)

        def run_heads(heads):
            # Apply the i-th head to the i-th feature map and join the results along dim 1.
            return torch.cat([head(feature) for head, feature in zip(heads, features)], dim=1)

        bbox_regressions = run_heads(self.BboxHead)
        classifications = run_heads(self.ClassHead)
        ldm_regressions = run_heads(self.LandmarkHead)

        if self.phase != 'train':
            classifications = F.softmax(classifications, dim=-1)
        return (bbox_regressions, classifications, ldm_regressions)

In [200]:
# Instantiating the model prints the backbone, because the constructor calls print(backbone).
mnetv3 = RetinaFace_mobilenetv3sm()

MobileNetV3(
  (features): Sequential(
    (0): ConvBNActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
          (relu): ReLU(inplace=True)
          (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_r

In [201]:
# Forward pass with autograd disabled; forward() prints the three SSH feature shapes.
with torch.no_grad():
    mnetv3outputs = mnetv3(images)

feature1.shape:  torch.Size([2, 64, 53, 53])
feature2.shape:  torch.Size([2, 64, 53, 53])
feature3.shape:  torch.Size([2, 64, 27, 27])


In [202]:
len(mnetv3outputs)

3

In [203]:
mnetv3outputs[0].shape

torch.Size([2, 12694, 4])

In [204]:
mnetv3outputs[1].shape

torch.Size([2, 12694, 2])

In [137]:
mnetv3outputs[2].shape

torch.Size([2, 66150, 10])

In [31]:
# Second reference model: the project's RetinaFace built from the cfg_re50 settings.
cfgres = cfg_re50

res50fnet = RetinaFace(cfgres)

In [32]:
# Forward pass with autograd disabled, on the same `images` batch as the other models.
with torch.no_grad():
    res50fnetoutputs = res50fnet(images)

feature1.shape:  torch.Size([2, 256, 105, 105])
feature2.shape:  torch.Size([2, 256, 53, 53])
feature3.shape:  torch.Size([2, 256, 27, 27])


In [33]:
len(res50fnetoutputs)

3

In [34]:
res50fnetoutputs[0].shape

torch.Size([2, 29126, 4])

In [35]:
res50fnetoutputs[1].shape

torch.Size([2, 29126, 2])

In [36]:
res50fnetoutputs[2].shape

torch.Size([2, 29126, 10])

In [37]:
len(targets)

2

In [38]:
targets[0].shape

torch.Size([9, 15])

In [39]:
targets[1].shape

torch.Size([1, 15])