In [21]:
%run lib.ipynb
%run l2_norm.ipynb
%run default_box.ipynb

             0         1         2         3
0     0.013333  0.013333  0.100000  0.100000
1     0.013333  0.013333  0.141421  0.141421
2     0.013333  0.013333  0.200000  0.100000
3     0.013333  0.013333  0.100000  0.200000
4     0.013333  0.040000  0.100000  0.100000
...        ...       ...       ...       ...
8727  0.833333  0.833333  0.558928  1.000000
8728  0.500000  0.500000  0.880000  0.880000
8729  0.500000  0.500000  0.961249  0.961249
8730  0.500000  0.500000  1.000000  0.679706
8731  0.500000  0.500000  0.679706  1.000000

[8732 rows x 4 columns]


In [22]:
def create_vgg():
    """Assemble the VGG-style backbone as a flat list of layers.

    The first 30 modules come from a layer plan of 3x3 conv + ReLU pairs
    separated by 2x2 max-pools; they are followed by a 3x3 stride-1 pool,
    a dilated 3x3 conv (512 -> 1024), a 1x1 conv (1024 -> 1024) and their
    ReLUs.

    Returns:
        torch.nn.ModuleList: the 35 modules in execution order.
    """
    # Integers are the output width of a 3x3 conv (followed by ReLU).
    # "M" is a 2x2 max-pool that floors odd sizes, "MC" one that ceils them.
    plan = (
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, "MC",
        512, 512, 512, "M",
        512, 512, 512,
    )

    modules = []
    channels = 3  # the first conv takes 3 input channels
    for entry in plan:
        if entry in ("M", "MC"):
            modules.append(
                torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=(entry == "MC"))
            )
            continue
        modules.append(torch.nn.Conv2d(channels, entry, kernel_size=3, padding=1))
        modules.append(torch.nn.ReLU(inplace=True))
        channels = entry

    # Tail: size-preserving pool, dilated conv6, 1x1 conv7.
    modules.append(torch.nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
    modules.append(torch.nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
    modules.append(torch.nn.ReLU(inplace=True))
    modules.append(torch.nn.Conv2d(1024, 1024, kernel_size=1))
    modules.append(torch.nn.ReLU(inplace=True))

    return torch.nn.ModuleList(modules)

In [23]:
def create_extras():
    """Build the eight extra convolutions applied after the backbone.

    The layers alternate between a 1x1 conv and a 3x3 conv. The first two
    3x3 convs use stride 2 with padding 1; the last two use the default
    stride and no padding. No activation modules are included here.

    Returns:
        torch.nn.ModuleList: the eight ``Conv2d`` layers in execution order.
    """
    # Channel chain: layer k maps widths[k] -> widths[k + 1].
    widths = (1024, 256, 512, 128, 256, 128, 256, 128, 256)

    convs = []
    for position in range(len(widths) - 1):
        n_in, n_out = widths[position], widths[position + 1]
        if position % 2 == 0:
            conv = torch.nn.Conv2d(n_in, n_out, kernel_size=1)
        elif position < 4:
            conv = torch.nn.Conv2d(n_in, n_out, kernel_size=3, stride=2, padding=1)
        else:
            conv = torch.nn.Conv2d(n_in, n_out, kernel_size=3)
        convs.append(conv)

    return torch.nn.ModuleList(convs)

In [24]:
def create_loc_conf(num_classes=21, bbox_ratio_num=[4, 6, 6, 6, 4, 4]):
    """Build the localisation and confidence heads for the six sources.

    Every head is a 3x3 conv with padding 1. For source ``k`` the
    localisation head outputs ``bbox_ratio_num[k] * 4`` channels and the
    confidence head outputs ``bbox_ratio_num[k] * num_classes`` channels.

    Args:
        num_classes: number of class scores predicted per box.
        bbox_ratio_num: number of boxes predicted per feature-map cell,
            one entry for each of the six sources.

    Returns:
        tuple: ``(loc_layers, conf_layers)``, two ``torch.nn.ModuleList``
        objects with six ``Conv2d`` layers each.
    """
    # Input channel count of source1 .. source6.
    source_channels = (512, 1024, 512, 256, 256, 256)

    loc_heads = []
    conf_heads = []
    for k, channels in enumerate(source_channels):
        boxes_per_cell = bbox_ratio_num[k]
        loc_heads.append(
            torch.nn.Conv2d(channels, boxes_per_cell * 4, kernel_size=3, padding=1)
        )
        conf_heads.append(
            torch.nn.Conv2d(channels, boxes_per_cell * num_classes, kernel_size=3, padding=1)
        )

    return torch.nn.ModuleList(loc_heads), torch.nn.ModuleList(conf_heads)

In [25]:
# Settings dictionary handed to SSD(phase, config). SSD.__init__ reads
# "num_classes" and "bbox_aspect_num" itself and passes the whole dict on to
# DefaultBox(config).
# NOTE(review): the other keys are not read anywhere in this section; presumably
# they are consumed by DefaultBox (loaded via default_box.ipynb) — confirm there.
config = {
    # number of class scores per box; also the last dimension of SSD's conf output
    "num_classes": 21,
    "input_size": 300,
    # boxes predicted per feature-map cell, one entry per source (source1..source6)
    "bbox_aspect_num": [4, 6, 6, 6, 4, 4],
    "feature_maps": [38, 19, 10, 5, 3, 1],
    # original note: "size of default box" — NOTE(review): verify against DefaultBox
    "steps": [8, 16, 32, 64, 100, 300], # size of default box
    "min_size": [30, 60, 111, 162, 213, 264],
    "max_size": [60, 111, 162, 213, 264, 315],
    "aspect_ratios": [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
}

class SSD(torch.nn.Module):
    """SSD detector built from the backbone, extra layers and prediction heads.

    Args:
        phase: ``"inference"`` makes ``forward`` run the ``Detect``
            post-processing; any other value (e.g. ``"train"``) makes it
            return the raw predictions.
        config: settings dict; ``"num_classes"`` and ``"bbox_aspect_num"``
            are read here and the whole dict is passed to ``DefaultBox``.
    """

    def __init__(self, phase, config):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = config["num_classes"]

        # create main module
        self.vgg = create_vgg()
        self.extras = create_extras()
        self.loc, self.conf = create_loc_conf(self.num_classes, config["bbox_aspect_num"])
        self.L2Norm = L2Norm()

        # create default box
        dbox = DefaultBox(config)
        self.dbox_list = dbox.create_defbox()

        if phase == "inference":
            self.detect = Detect()

    def forward(self, x):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the first backbone layer.

        Returns:
            In the ``"inference"`` phase, whatever ``Detect.forward`` returns.
            Otherwise the tuple ``(loc, conf, dbox_list)`` where ``loc`` is
            ``(batch_size, num_boxes, 4)``, ``conf`` is
            ``(batch_size, num_boxes, num_classes)`` and ``dbox_list`` is the
            default-box table created in ``__init__``.
        """
        sources = list()
        loc = list()
        conf = list()

        # source1: output of the first 23 backbone modules, passed through L2Norm
        for k in range(23):
            x = self.vgg[k](x)
        sources.append(self.L2Norm(x))

        # source2: output of the remaining backbone modules
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # source3 -> 6: every second extra layer (odd index) yields a source
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        for (source, loc_head, conf_head) in zip(sources, self.loc, self.conf):
            # (batch_size, 4 * aspect_ratio_num, featuremap_height, featuremap_width)
            # --> (batch_size, featuremap_height, featuremap_width, 4 * aspect_ratio_num)
            loc.append(loc_head(source).permute(0, 2, 3, 1).contiguous())
            conf.append(conf_head(source).permute(0, 2, 3, 1).contiguous())

        # Flatten each source's predictions and concatenate them per image.
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        loc = loc.view(loc.size(0), -1, 4)   # (batch_size, num_boxes, 4)
        conf = conf.view(conf.size(0), -1, self.num_classes)    # (batch_size, num_boxes, num_classes)

        output = (loc, conf, self.dbox_list)

        if self.phase == "inference":
            # Call forward() explicitly: Detect is a torch.autograd.Function
            # with a non-static forward, and current PyTorch raises a
            # RuntimeError when such an instance is called directly.
            return self.detect.forward(output[0], output[1], output[2])
        return output

In [26]:
def decode(loc, defbox_list):
    """
    Turn predicted offsets plus default boxes into corner-format boxes.

    parameters:
        loc: [8732, 4] predicted offsets (center x/y, then width/height)
        defbox_list: [8732, 4] default boxes (center x/y, then width/height)

    returns:
        boxes [xmin, ymin, xmax, ymax]
    """
    prior_center = defbox_list[:, :2]
    prior_size = defbox_list[:, 2:]

    # Center offsets are scaled by 0.1 and by the default-box size;
    # size offsets are scaled by 0.2 and applied through exp().
    center = prior_center + 0.1 * loc[:, :2] * prior_size
    size = prior_size * torch.exp(loc[:, 2:] * 0.2)

    top_left = center - size / 2        # xmin, ymin
    bottom_right = size + top_left      # xmax, ymax

    return torch.cat((top_left, bottom_right), dim=1)

In [27]:
# non-maximum suppression
def nms(boxes, scores, overlap=0.45, top_k=200):
    '''
    Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other candidate whose IoU with it is greater than ``overlap``.

    boxes: [numbox, 4(xmin, ymin, xmax, ymax)]
    scores: [numbox]
    overlap: IoU threshold; candidates with IoU <= overlap survive a round
    top_k: only the top_k highest-scoring boxes are considered

    returns: (keep, count) -- ``keep`` is a long tensor shaped like
        ``scores`` whose first ``count`` entries are the indices of the
        kept boxes in descending score order; the rest are zero padding.
    '''

    count = 0
    keep = torch.zeros_like(scores, dtype=torch.long)

    # boxes coordinates
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    # area of boxes
    area = torch.mul(x2 - x1, y2 - y1)

    # Ascending sort, so the best candidate is always at the end of idx.
    _, idx = scores.sort(0)
    idx = idx[-top_k:]  # keep the last top_k entries (highest scores)

    while idx.numel() > 0:
        i = idx[-1]
        keep[count] = i
        count += 1

        if idx.size(0) == 1:
            break

        idx = idx[:-1]  # drop the box that was just kept

        # Corners of the intersection between box i and each remaining box.
        inter_x1 = torch.clamp(x1[idx], min=x1[i])
        inter_y1 = torch.clamp(y1[idx], min=y1[i])
        inter_x2 = torch.clamp(x2[idx], max=x2[i])
        inter_y2 = torch.clamp(y2[idx], max=y2[i])

        # Width/height of the intersection; negative means no overlap, so
        # clamp to zero. (The original never computed these differences and
        # used uninitialised buffers instead.)
        inter_w = torch.clamp(inter_x2 - inter_x1, min=0.0)
        inter_h = torch.clamp(inter_y2 - inter_y1, min=0.0)

        # compute intersect area
        inter = inter_w * inter_h

        union = area[i] + area[idx] - inter
        iou = inter / union

        # Only candidates that do not overlap box i too much stay in play.
        idx = idx[iou.le(overlap)]

    return keep, count

In [28]:
class Detect(Function):
    """Post-processing for inference: softmax, box decoding and per-class NMS.

    Args:
        conf_thres: boxes whose class score is not greater than this are
            dropped before NMS.
        top_k: maximum number of detections stored per class and image
            (also passed to ``nms``).
        nms_thresh: IoU threshold passed to ``nms``.
    """

    def __init__(self, conf_thres=0.01, top_k=200, nms_thresh=0.45):
        self.softmax = torch.nn.Softmax(dim=-1)
        self.conf_thres = conf_thres
        self.top_k = top_k
        self.nms_thresh = nms_thresh

    def forward(self, loc_data, conf_data, dbox_list):
        """Convert raw predictions into detections.

        Args:
            loc_data: (batch_size, num_dbox, 4) predicted offsets.
            conf_data: (batch_size, num_dbox, num_classes) raw class scores.
            dbox_list: (num_dbox, 4) default boxes shared by every image;
                a (batch_size, num_dbox, 4) tensor is also accepted and is
                then indexed per image.

        Returns:
            Tensor of shape (batch_size, num_classes, top_k, 5). Each row is
            ``[score, xmin, ymin, xmax, ymax]``; unused rows are zero.
            Class 0 is skipped and therefore stays all zero.
        """
        num_batch = loc_data.size(0)  # batch size
        num_classes = conf_data.size(2)

        conf_data = self.softmax(conf_data)
        # (batch_num, num_dbox, num_classes) --> (batch_num, num_classes, num_dbox)
        conf_preds = conf_data.transpose(2, 1)

        # last dimension: score, xmin, ymin, xmax, ymax
        output = torch.zeros(num_batch, num_classes, self.top_k, 5)

        # process each image of the batch separately
        for i in range(num_batch):
            # The default boxes are normally one 2-D table shared by all
            # images; indexing it with i would pick a single box.
            priors = dbox_list[i] if dbox_list.dim() == 3 else dbox_list

            # compute boxes from the offset information and the default boxes
            decode_boxes = decode(loc_data[i], priors)

            # copy of the confidence scores of image i: (num_classes, num_dbox)
            conf_scores = conf_preds[i].clone()

            for cl in range(1, num_classes):
                # keep only boxes of THIS image whose score for class cl
                # exceeds conf_thres (mask is True where it does)
                c_mask = conf_scores[cl].gt(self.conf_thres)
                scores = conf_scores[cl][c_mask]

                if scores.nelement() == 0:
                    continue

                # expand the mask to the shape of decode_boxes
                l_mask = c_mask.unsqueeze(1).expand_as(decode_boxes)

                boxes = decode_boxes[l_mask].view(-1, 4)

                ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)

                output[i, cl, :count] = torch.cat(
                    (scores[ids[:count]].unsqueeze(1), boxes[ids[:count]]), 1
                )
        return output

In [29]:
if __name__ == "__main__":
    # Smoke test: build the training-phase network and print its layer structure.
    ssd = SSD(phase="train", config=config)
    print(ssd)

SSD(
  (vgg): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, cei