# Backbone
YOLO V3는 백본으로 DarkNet-53을 사용한다.

In [2]:
# 필요한 라이브러리
import torch
from torch import nn


In [3]:
# Random dummy batch used to exercise the networks below:
# one image, 3 channels, 416x416 pixels.
input_image = torch.randn((1, 3, 416, 416))
print(input_image.size())

torch.Size([1, 3, 416, 416])


In [4]:
def DBL(in_c,out_c,kernel_size,stride,padding):
    """Build a Conv2d -> BatchNorm2d -> LeakyReLU block ("DBL").

    Args:
        in_c: Number of input channels of the convolution.
        out_c: Number of output channels (also the BatchNorm width).
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution zero-padding.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    conv = nn.Conv2d(in_c, out_c, kernel_size=kernel_size,
                     padding=padding, stride=stride)
    norm = nn.BatchNorm2d(out_c)
    # LeakyReLU is used with PyTorch's default negative slope.
    activation = nn.LeakyReLU()
    return nn.Sequential(conv, norm, activation)

class Res_unit(nn.Module):
    """Residual unit: a 1x1 bottleneck DBL and a 3x3 DBL plus a skip connection.

    The first block halves the channel count, the second restores it, so the
    output has the same shape as the input and can be added to it.

    Args:
        in_c: Number of input (and output) channels.
    """

    def __init__(self,in_c):
        super().__init__()

        bottleneck_c = int(in_c / 2)
        self.layer1 = DBL(in_c, bottleneck_c, 1, 1, 0)   # 1x1: squeeze channels
        self.layer2 = DBL(bottleneck_c, in_c, 3, 1, 1)   # 3x3: restore channels

    def forward(self,x):
        """Return ``layer2(layer1(x)) + x``."""
        transformed = self.layer2(self.layer1(x))
        return transformed + x
    

In [10]:
# Res_unit(3).forward(input_image)

In [33]:
class Darknet53(nn.Module):
    """DarkNet-53 style backbone that returns three feature maps.

    The network alternates stride-2 DBL convolutions (which halve the spatial
    size) with stacks of residual blocks (1, 2, 8, 8 and 4 blocks).

    Args:
        block: Callable ``block(in_c)`` returning a shape-preserving
            ``nn.Module``; it is used to build every residual stage
            (e.g. ``Res_unit``).
    """

    def __init__(self, block):
        super(Darknet53, self).__init__()

        self.conv1 = DBL(3, 32, 3, 1, 1)
        self.conv2 = DBL(32, 64, 3, 2, 1)

        self.res_block1 = self.num_block(block, 64, num=1)
        self.conv3 = DBL(64, 128, 3, 2, 1)

        self.res_block2 = self.num_block(block, 128, 2)
        self.conv4 = DBL(128, 256, 3, 2, 1)

        self.res_block3 = self.num_block(block, 256, 8)
        # A 3x3 convolution loses a 1-pixel margin; padding=1 fills it back in.
        self.conv5 = DBL(256, 512, 3, 2, 1)

        self.res_block4 = self.num_block(block, 512, 8)
        self.conv6 = DBL(512, 1024, 3, 2, 1)

        self.res_block5 = self.num_block(block, 1024, 4)

    def forward(self, x):
        """Run the backbone.

        Returns:
            Tuple ``(feature1, feature2, feature3)``:
            ``feature1`` is the final 1024-channel map (five stride-2 convs),
            ``feature2`` the 512-channel map after ``res_block4`` and
            ``feature3`` the 256-channel map after ``res_block3``.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.res_block1(x)
        x = self.conv3(x)
        x = self.res_block2(x)
        x = self.conv4(x)
        x = self.res_block3(x)
        feature3 = x
        x = self.conv5(x)
        x = self.res_block4(x)
        feature2 = x
        x = self.conv6(x)
        x = self.res_block5(x)
        feature1 = x

        return feature1, feature2, feature3

    def num_block(self, block, in_c, num):
        """Stack ``num`` instances of ``block(in_c)`` into an ``nn.Sequential``.

        The ``block`` factory passed by the caller is honoured (previously it
        was ignored and ``Res_unit`` was always instantiated).
        """
        return nn.Sequential(*(block(in_c) for _ in range(num)))
        

In [9]:
# Darknet53(Res_unit).forward(input_image)

# YOLOv3 Architecture

In [36]:
class Yolo_v3(nn.Module):
    """YOLOv3 detector: DarkNet-53 backbone plus three detection heads.

    The heads operate on the three backbone feature maps (coarsest first).
    Each head output is decoded by a ``YoloDetection`` module and the decoded
    predictions of all heads are concatenated along dimension 1.

    Args:
        class_num: Number of object classes (default 80).
        img_size: Side length in pixels of the square network input
            (default 416). It is handed to ``YoloDetection`` to compute the
            stride of each head, so it must match the images actually fed in.
    """

    def __init__(self, class_num=80, img_size=416):
        super(Yolo_v3, self).__init__()
        self.class_num = class_num
        self.img_size = img_size
        # Anchor (width, height) pairs in input-image pixels, keyed by the grid
        # size each head produces for a 416x416 input.
        anchor = {'grid52': [(10, 13), (16, 30), (33, 23)],
                  'grid26': [(30, 61), (62, 45), (59, 119)],
                  'grid13': [(116, 90), (156, 198), (373, 326)]}

        self.darknet53 = Darknet53(Res_unit)

        # Head 1: coarsest grid, fed by the 1024-channel backbone output.
        self.conv_set1 = self.conv_set(1024, 512)
        self.conv_final1 = self.conv_final(512, self._head_channels(anchor['grid13']))
        self.anchor_box1 = YoloDetection(anchor['grid13'], self.img_size, self.class_num)

        self.conv_layer1 = DBL(512, 256, 1, 1, 0)
        self.upsampling1 = nn.Upsample(scale_factor=2, mode='nearest')

        # Head 2: 768 input channels = 256 upsampled channels concatenated with
        # the 512-channel backbone feature map.
        self.conv_set2 = self.conv_set(768, 256)
        self.conv_final2 = self.conv_final(256, self._head_channels(anchor['grid26']))
        self.anchor_box2 = YoloDetection(anchor['grid26'], self.img_size, self.class_num)

        self.conv_layer2 = DBL(256, 128, 1, 1, 0)
        self.upsampling2 = nn.Upsample(scale_factor=2, mode='nearest')

        # Head 3: 384 input channels = 128 upsampled channels concatenated with
        # the 256-channel backbone feature map.
        self.conv_set3 = self.conv_set(384, 128)
        self.conv_final3 = self.conv_final(128, self._head_channels(anchor['grid52']))
        self.anchor_box3 = YoloDetection(anchor['grid52'], self.img_size, self.class_num)

    def _head_channels(self, anchors):
        """Channels a head must emit: (x, y, w, h, conf + classes) per anchor."""
        return len(anchors) * (5 + self.class_num)

    def forward(self, x):
        """Detect objects in ``x``.

        Returns:
            Tensor with the decoded predictions of the three heads concatenated
            along dimension 1. For a ``[1, 3, 416, 416]`` input and the default
            80 classes this is ``[1, 10647, 85]`` (507 + 2028 + 8112 boxes).
        """
        print("darknet53 ")
        res5, res4, res3 = self.darknet53(x)

        print("=========================================")
        print("yolov3")

        # First (coarsest) feature map.
        out1 = self.conv_set1(res5)
        first = self.conv_final1(out1)
        anchor13 = self.anchor_box1(first)      # [1, 507, 85] for the defaults

        # Second feature map: upsample and fuse with the middle backbone feature.
        out2 = self.conv_layer1(out1)
        out2 = self.upsampling1(out2)
        out2 = torch.cat((out2, res4), dim=1)
        out2 = self.conv_set2(out2)
        second = self.conv_final2(out2)
        anchor26 = self.anchor_box2(second)     # [1, 2028, 85] for the defaults

        # Third (finest) feature map: upsample and fuse with the earliest feature.
        out3 = self.conv_layer2(out2)
        out3 = self.upsampling2(out3)
        out3 = torch.cat((out3, res3), dim=1)
        out3 = self.conv_set3(out3)
        third = self.conv_final3(out3)
        anchor52 = self.anchor_box3(third)      # [1, 8112, 85] for the defaults

        # Report the raw head feature-map sizes.
        print(">>>>> featuremap extract <<<<<")
        print("first_feature:", first.shape)
        print("second_feature:", second.shape)
        print("thrid_feature:", third.shape)

        # Merge the per-head predictions along the box dimension.
        print(">>>>> anchor box prediction <<<<<")
        pred_BBOX = torch.cat([anchor13, anchor26, anchor52], 1)

        # Previously the result was computed but never returned.
        return pred_BBOX

    def conv_set(self, in_c, out_c):
        """Five DBL blocks alternating 1x1 (to ``out_c``) and 3x3 (to ``2*out_c``)."""
        increase_c = out_c * 2
        result = nn.Sequential(DBL(in_c, out_c, 1, 1, 0),
                               DBL(out_c, increase_c, 3, 1, 1),
                               DBL(increase_c, out_c, 1, 1, 0),
                               DBL(out_c, increase_c, 3, 1, 1),
                               DBL(increase_c, out_c, 1, 1, 0))
        return result

    def conv_final(self, in_c, out_c):
        """3x3 DBL doubling the channels, then a plain 1x1 conv down to ``out_c``."""
        result = nn.Sequential(DBL(in_c, in_c * 2, 3, 1, 1),
                               nn.Conv2d(in_c * 2, out_c, 1, 1, 0))
        return result
    


In [35]:
# Smoke-test the full model on the random input. The module instance is called
# (rather than .forward() directly) so that nn.Module.__call__ runs any
# registered hooks.
Yolo_v3()(input_image)

darknet53 layer size
torch.Size([1, 3, 13, 13, 85])
torch.Size([1, 3, 26, 26, 85])
torch.Size([1, 3, 52, 52, 85])
first_feature: torch.Size([1, 255, 13, 13])
second_feature: torch.Size([1, 255, 26, 26])
thrid_feature: torch.Size([1, 255, 52, 52])


In [25]:
class YoloDetection(nn.Module):
    """Decode one raw YOLOv3 head output into box, confidence and class scores.

    Args:
        anchor: Sequence of ``(width, height)`` anchor sizes, in input-image
            pixels, for this head.
        img_size: Side length in pixels of the square network input.
        classnum: Number of object classes.
    """

    def __init__(self, anchor, img_size, classnum):
        super(YoloDetection, self).__init__()
        self.anchor = anchor
        self.img_size = img_size
        self.numclass = classnum
        # Loss modules are created here but are not used by forward().
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()

    def forward(self, x):
        """Convert a head output to absolute-pixel predictions.

        Args:
            x: Tensor of shape ``(batch, num_anchor * (5 + numclass), G, G)``.

        Returns:
            Tensor of shape ``(batch, num_anchor * G * G, 5 + numclass)`` whose
            last dimension is ``(x, y, w, h, confidence, class scores...)``,
            with the box values scaled to input-image pixels.
        """
        batch_size = x.size(0)
        num_anchor = len(self.anchor)
        grid_size = x.size(2)  # e.g. 13, 26 or 52 for a 416x416 input

        # (batch, anchor, grid_y, grid_x, 5 + numclass)
        pred = (x.view(batch_size, num_anchor, 5 + self.numclass, grid_size, grid_size)
                .permute(0, 1, 3, 4, 2).contiguous())
        print(pred.shape)

        # Sigmoid keeps the centre offsets inside their grid cell (0..1).
        tx = torch.sigmoid(pred[..., 0])
        ty = torch.sigmoid(pred[..., 1])
        w = pred[..., 2]
        h = pred[..., 3]
        pred_conf = torch.sigmoid(pred[..., 4])   # objectness confidence
        pred_cls = torch.sigmoid(pred[..., 5:])   # per-class scores

        stride = self.img_size / grid_size  # e.g. 416 / 13 = 32

        # Cell indices, created with the input's dtype and device so the module
        # also works for inputs that are not float32 CPU tensors.
        cells = torch.arange(grid_size, dtype=pred.dtype, device=pred.device)
        left_x = cells.view(1, 1, 1, grid_size)   # column index, broadcast over rows
        left_y = cells.view(1, 1, grid_size, 1)   # row index, broadcast over columns

        # Anchors rescaled from pixels to grid-cell units.
        grid_anchor = torch.tensor(self.anchor, dtype=pred.dtype, device=pred.device) / stride
        # Shaped (1, num_anchor, 1, 1) so they broadcast over batch and grid.
        anchor_w = grid_anchor[:, 0].view(1, num_anchor, 1, 1)
        anchor_h = grid_anchor[:, 1].view(1, num_anchor, 1, 1)

        # Boxes in grid units: cell corner + offset for the centre, and the
        # anchor size scaled by exp() of the raw prediction for width/height.
        pred_bbox = torch.stack((left_x + tx,
                                 left_y + ty,
                                 torch.exp(w) * anchor_w,
                                 torch.exp(h) * anchor_h), dim=-1)

        # Scale boxes to absolute pixels and append confidence and class scores.
        output = torch.cat((pred_bbox.reshape(batch_size, -1, 4) * stride,
                            pred_conf.reshape(batch_size, -1, 1),
                            pred_cls.reshape(batch_size, -1, self.numclass)), -1)

        return output