# LET'S START! 
참고: https://deep-learning-study.tistory.com/568

**1. Importing packages**

In [None]:
# import packages
import torch
from torch import nn
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import utils
# from torchsummary import summary
import torchvision.transforms.functional as TF
from torchvision.transforms.functional import to_pil_image
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import cv2
import os
import copy
import numpy as np
import pandas as pd
import random
import albumentations as A
import ast
from albumentations.pytorch import ToTensorV2
%matplotlib inline

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

In [None]:
# Scratch cell: show the first two items of a throwaway list.
b = [1, 2, 3, 45623, 3, 13]

print(b[0:2])

**2. Class Names**

In [None]:
# Class names indexed by class id (a single class in this notebook).
classes = ["STARFISH"]

**3. CUSTOM DATASET**

In [None]:
class GBRDataset(Dataset):
    """Great-Barrier-Reef detection dataset backed by a CSV annotation file.

    Column 4 of each row holds the image id (``"<video>-<frame>"``) and
    column 5 holds a string literal with a list of box dicts.  The dict values
    are read in order and treated as x, y, width, height of the box in pixels
    (top-left corner) -- this relies on the key order of the annotations.

    Args:
        csv_file: Path of the annotation CSV.
        img_dir: Directory containing ``video_<video>/<frame>.jpg`` images.
        transform: Optional callable invoked as
            ``transform(image=..., bboxes=...)`` that returns a mapping with
            ``'image'`` and ``'bboxes'`` (albumentations style, YOLO format).
        trans_params: Stored on the instance; not used by this class.
        is_test: When true, all CSV rows are used and ``idx`` is ignored.
        is_train: Selects rows ``[:idx]`` (true) or ``[idx:]`` (false).
        idx: Row index of the train/validation boundary.
    """

    def __init__(self, csv_file, img_dir, transform=None, trans_params=None, is_test = False, is_train=True, idx=0):
        self.img_dir = img_dir
        self.transform = transform
        self.trans_params = trans_params
        self.is_train = is_train

        csv_dataset = pd.read_csv(csv_file)
        if is_test:
            self.annotations = csv_dataset
        elif is_train:
            self.annotations = csv_dataset[:idx]
        else:
            self.annotations = csv_dataset[idx:]

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.annotations)

    @staticmethod
    def _to_yolo_boxes(coordinates, img_w, img_h):
        """Convert pixel box dicts to rows of (cx, cy, w, h, class).

        Boxes are clipped to the image and normalised by the image size, so
        every returned coordinate lies in [0, 1].  Boxes that end up with no
        area after clipping are dropped.  The class id is always 0.
        """
        if len(coordinates) == 0:
            return np.zeros((0, 5), dtype='float32')

        raw = np.array([list(coord.values()) for coord in coordinates], dtype='float32')
        x_min = np.clip(raw[:, 0], 0, img_w)
        y_min = np.clip(raw[:, 1], 0, img_h)
        x_max = np.clip(raw[:, 0] + raw[:, 2], 0, img_w)
        y_max = np.clip(raw[:, 1] + raw[:, 3], 0, img_h)

        keep = (x_max > x_min) & (y_max > y_min)
        x_min, y_min, x_max, y_max = x_min[keep], y_min[keep], x_max[keep], y_max[keep]

        boxes = np.zeros((int(keep.sum()), 5), dtype='float32')
        boxes[:, 0] = (x_min + x_max) / (2 * img_w)
        boxes[:, 1] = (y_min + y_max) / (2 * img_h)
        boxes[:, 2] = (x_max - x_min) / img_w
        boxes[:, 3] = (y_max - y_min) / img_h
        return boxes

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            ``(image, targets)``.  Without a transform, ``image`` is an
            HxWx3 uint8 array and ``targets`` an ``(n, 5)`` float32 array of
            (cx, cy, w, h, class) normalised to that image.  With a transform,
            ``image`` is whatever the transform returns and ``targets`` is an
            ``(n, 6)`` float tensor whose column 0 is left at zero for the
            batch index and whose columns 1-5 are the transformed boxes.
        """
        img_id = self.annotations.iloc[index, 4]  # e.g. "0-24"
        video_id, frame_id = img_id.split('-', 1)
        coordinates = ast.literal_eval(self.annotations.iloc[index, 5])

        img_path = self.img_dir + '/video_' + video_id + '/' + frame_id + '.jpg'
        # albumentations works on numpy arrays, so convert the PIL image.
        image = np.array(Image.open(img_path).convert("RGB"))
        img_h, img_w = image.shape[:2]

        # Normalise against the real image size; any padding/resizing is then
        # applied to the boxes by the transform itself.
        label_data = self._to_yolo_boxes(coordinates, img_w, img_h)

        if not self.transform:
            return image, label_data

        augmentations = self.transform(image=image, bboxes=label_data.tolist())
        image = augmentations['image']
        # Use the boxes returned by the transform so that flips, crops and
        # padding stay in sync with the image (boxes may also be dropped).
        boxes = np.asarray(augmentations['bboxes'], dtype='float32').reshape(-1, 5)

        # Layout expected by the collate function: [batch, cx, cy, w, h, class]
        targets = torch.zeros((len(boxes), 6))
        targets[:, 1:] = torch.from_numpy(boxes)
        return image, targets

**Test**

In [None]:

# Peek at the first five annotation rows of the training CSV.
idx = 11
anno = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv').head(5)
print(anno)
                                     

**4. Generating the Training and Validation Datasets**

In [None]:
# Build the datasets: one CSV provides both the training and validation rows.
dataset_csv_file = '../input/tensorflow-great-barrier-reef/train.csv'
img_dir = '../input/tensorflow-great-barrier-reef/train_images'

# The first `train_ratio` share of the rows is used for training.
train_ratio = 0.9
idx = int(len(pd.read_csv(dataset_csv_file)) * train_ratio)

train_ds, val_ds = (
    GBRDataset(dataset_csv_file, img_dir, is_train=train_flag, idx=idx)
    for train_flag in (True, False)
)

# Fetch one raw (untransformed) sample as a smoke test.
img, labels = train_ds[31]

print(
    'number of train data : {}, val data : {}, total data : {}'.format(
        len(train_ds), len(val_ds), len(train_ds) + len(val_ds)
    )
)
print('image size:', img.shape, type(img))
print('labels shape:', labels.shape, type(labels))
print('lables \n', labels)

**val dataset 생성하기 -> 안 해도 될 듯**

**5. Data Augmentation Using Albumentation**

In [None]:
# Transform definitions
IMAGE_COL, IMAGE_ROW = 1280, 1280
IMAGE_SIZE = max(IMAGE_COL, IMAGE_ROW)
scale = 1.0


def _resize_and_pad_steps():
    """Return fresh resize + pad steps shared by every pipeline."""
    side = int(IMAGE_SIZE * scale)
    return [
        # Rescale so the longest side equals `side`, keeping the aspect ratio.
        A.LongestMaxSize(max_size=side),
        # Pad whatever is still smaller than `side`.
        A.PadIfNeeded(min_height=side, min_width=side, border_mode=cv2.BORDER_CONSTANT),
    ]


def _to_tensor_steps():
    """Return fresh normalisation + tensor conversion steps."""
    return [
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
        ToTensorV2(),
    ]


def _yolo_bbox_params():
    """Return the bounding-box settings used by every pipeline."""
    return A.BboxParams(format='yolo', min_visibility=0.4, label_fields=[])


# Training pipeline: resize/pad, photometric and flip augmentation, tensor.
train_transforms = A.Compose(
    _resize_and_pad_steps()
    + [
        # Random crop
        A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
        # Randomly change brightness, contrast, saturation and hue
        A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
        # Horizontal flip
        A.HorizontalFlip(p=0.5),
        # Blur
        A.Blur(p=0.1),
        # Contrast Limited Adaptive Histogram Equalization
        A.CLAHE(p=0.1),
        # Reduce the number of bits per channel
        A.Posterize(p=0.1),
        # Convert to grayscale
        A.ToGray(p=0.1),
        # Randomly shuffle the channels
        A.ChannelShuffle(p=0.05),
    ]
    + _to_tensor_steps(),
    bbox_params=_yolo_bbox_params(),
)

# Augmentation-free pipeline used to check the boxes against the image.
train_transforms_check = A.Compose(
    _resize_and_pad_steps() + _to_tensor_steps(),
    bbox_params=_yolo_bbox_params(),
)

'''
생략
     # transforms 중 하나를 선택해 적용합니다.
        A.OneOf([
                 # shift, scale, rotate 를 무작위로 적용합니다.
                 A.ShiftScaleRotate(rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT),
                 # affine 변환
                 A.IAAAffine(shear=15, p=0.5, mode='constant')
        ], p=1.0),
'''
# Validation pipeline: resize/pad and tensor conversion only.
val_transforms = A.Compose(
    _resize_and_pad_steps() + _to_tensor_steps(),
    bbox_params=_yolo_bbox_params(),
)

In [None]:
# Attach the transform pipelines to the datasets.
train_ds.transform, val_ds.transform = train_transforms, val_transforms

**6. Applying Transforms into Images**

In [None]:
def rescale_bbox(bb, W, H):
    """Scale a normalised (x, y, w, h) box to a W-by-H image, as a list."""
    cx, cy, box_w, box_h = bb
    scaled = [cx * W, cy * H]
    scaled.extend((box_w * W, box_h * H))
    return scaled

# Bounding-box colours: one random RGB triple per class id.
COLORS = np.random.randint(0, 255, size=(80,3), dtype='uint8')


def show_img_bbox(img, targets, classes=classes):
    """Draw normalised YOLO boxes on an image and show it with matplotlib.

    Args:
        img: Image tensor (converted with ``to_pil_image``), numpy array
            (converted with ``Image.fromarray``) or PIL image.
        targets: Box rows.  A tensor is expected to carry a leading batch
            column, which is dropped; other inputs are used as they are.
            Each row is read as (cx, cy, w, h, ..., class) with the box
            normalised to the image size.
        classes: Class names indexed by class id.
    """
    if torch.is_tensor(img):
        img = to_pil_image(img)
    elif isinstance(img, np.ndarray):
        img = Image.fromarray(img)
    if torch.is_tensor(targets):
        # detach/cpu so tensors that live on the GPU or track gradients work
        targets = targets.detach().cpu().numpy()[:, 1:]

    # Scale by the actual canvas size instead of assuming 1280x1280.
    W, H = img.size
    draw = ImageDraw.Draw(img)

    for tg in targets:
        id_ = int(tg[-1])
        xc, yc, w, h = rescale_bbox(tg[:4], W, H)

        color = tuple(int(c) for c in COLORS[id_])
        name = classes[id_]

        top_left = (xc - w / 2, yc - h / 2)
        bottom_right = (xc + w / 2, yc + h / 2)
        draw.rectangle((top_left, bottom_right), outline=color, width=3)
        draw.text(top_left, name, fill=(255,255,255,0))
    plt.imshow(np.array(img))

In [None]:
# Inspect sample images after the transforms have been applied.
np.random.seed(25)

grid_size = 2
rnd_ind = np.random.randint(0, len(train_ds), grid_size)
print('image indices:', rnd_ind)

# First figure: samples drawn through the transform currently set on train_ds.
plt.figure(figsize=(20, 20))
for slot, sample_idx in enumerate(rnd_ind, start=1):
    sample = train_ds[sample_idx]
    plt.subplot(1, grid_size, slot)
    show_img_bbox(*sample)


# Second figure: the same indices through the augmentation-free pipeline,
# used to check the boxes.
# NOTE(review): train_ds.transform stays set to train_transforms_check after
# this cell -- confirm that is intended before building the training loader.
train_ds.transform = train_transforms_check
plt.figure(figsize=(20, 20))
for slot, sample_idx in enumerate(rnd_ind, start=1):
    sample = train_ds[sample_idx]
    plt.subplot(2, grid_size, slot)
    show_img_bbox(*sample)


**7. Collate_fn**

In [None]:
def collate_fn(batch):
    """Collate ``(image, targets)`` samples into batched tensors.

    Used as the ``collate_fn`` argument of ``DataLoader``.

    Args:
        batch: Sequence of ``(image, targets)`` pairs.  ``targets`` is an
            ``(n, 6)`` tensor, or ``None`` for a sample without boxes.

    Returns:
        ``(imgs, targets)``: the stacked images and all target rows
        concatenated, with column 0 set to the sample's position in the batch.
        ``targets`` has shape ``(0, 6)`` when no sample provides boxes.
    """
    imgs, targets = zip(*batch)

    indexed = []
    for b_i, boxes in enumerate(targets):
        if boxes is None:
            continue
        # Number by position in the batch *before* dropping empty samples so
        # the index always matches the image's row in `imgs`.
        boxes[:, 0] = b_i
        indexed.append(boxes)

    if indexed:
        targets = torch.cat(indexed, 0)
    else:
        targets = torch.zeros((0, 6))

    imgs = torch.stack(imgs)
    return imgs, targets

In [None]:
# Build the data loaders (one sample per batch, shuffled).
train_dl = DataLoader(
    dataset=train_ds,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dl = DataLoader(
    dataset=val_ds,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

**8. YOLO Model**

In [None]:
class BasicConv(nn.Module):
    """Bias-free convolution followed by batch norm and LeakyReLU(0.1)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()

        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply conv -> batch norm -> activation to ``x``."""
        out = self.conv(x)
        return out

In [None]:
class ResidualBlock(nn.Module):
    """Residual block: 1x1 squeeze, 3x3 expand, plus an identity shortcut.

    All convolutions use stride 1, so the spatial size is unchanged.
    """

    def __init__(self, channels):
        super().__init__()

        half = channels // 2
        squeeze = BasicConv(channels, half, 1, stride=1, padding=0)
        expand = BasicConv(half, channels, 3, stride=1, padding=1)
        self.residual = nn.Sequential(squeeze, expand)

        # An empty Sequential acts as the identity mapping.
        self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``shortcut(x) + residual(x)``."""
        return self.shortcut(x) + self.residual(x)

In [None]:
class Top_down(nn.Module):
    """FPN top-down stage: five convolutions alternating 1x1 and 3x3.

    In this file it is applied to the top feature map and to the
    concatenation of an upsampled map with a lateral connection.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        wide = out_channels * 2
        # (input channels, output channels, kernel size) of each stage
        specs = [
            (in_channels, out_channels, 1),
            (out_channels, wide, 3),
            (wide, out_channels, 1),
            (out_channels, wide, 3),
            (wide, out_channels, 1),
        ]
        # kernel // 2 gives padding 0 for 1x1 and 1 for 3x3 kernels.
        self.conv = nn.Sequential(*[
            BasicConv(c_in, c_out, kernel, stride=1, padding=kernel // 2)
            for c_in, c_out, kernel in specs
        ])

    def forward(self, x):
        """Run ``x`` through the convolution stack."""
        return self.conv(x)

In [None]:
class YOLOLayer(nn.Module):
    """Prediction head for one feature-map scale.

    Args:
        channels: Number of channels of the incoming feature map.
        anchors: ``(width, height)`` anchor pairs for this scale, in input
            image pixels.
        num_classes: Number of object classes.
        img_dim: Side length of the (square) network input in pixels.
    """

    def __init__(self, channels, anchors, num_classes=1, img_dim=1280):
        super().__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.img_dim = img_dim
        # 0 never matches a real grid, so offsets are built on the first pass.
        self.grid_size = 0

        # Smoothing conv followed by the prediction conv.  Every anchor
        # predicts 4 box values, 1 objectness score and the class scores.
        self.conv = nn.Sequential(
            BasicConv(channels, channels*2, 3, stride=1, padding=1),
            nn.Conv2d(channels*2, self.num_anchors * (self.num_classes + 5), 1, stride=1, padding=0)
        )

    def forward(self, x):
        """Return predictions of shape ``(batch, num_anchors*S*S, 5 + num_classes)``.

        The last axis holds (cx, cy, w, h) scaled by the stride, then the
        objectness score and the class scores (both after a sigmoid).
        The feature map is assumed to be square (S x S).
        """
        x = self.conv(x)

        batch_size = x.size(0)
        grid_size = x.size(2)

        # (batch, A*(5+C), S, S) -> (batch, A, 5+C, S, S) -> (batch, A, S, S, 5+C)
        prediction = x.view(batch_size, self.num_anchors, self.num_classes + 5,
                            grid_size, grid_size)
        prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

        obj_score = torch.sigmoid(prediction[..., 4])  # objectness
        pred_cls = torch.sigmoid(prediction[..., 5:])  # class scores

        # Rebuild the cached offsets when the grid size or the device changes.
        if grid_size != self.grid_size or self.grid_x.device != x.device:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda, device=x.device)

        pred_boxes = self.transform_outputs(prediction)

        output = torch.cat((pred_boxes.view(batch_size, -1, 4),
                    obj_score.view(batch_size, -1, 1),
                    pred_cls.view(batch_size, -1, self.num_classes)), -1)
        return output

    def compute_grid_offsets(self, grid_size, cuda=True, device=None):
        """Cache the cell offsets and scaled anchors for ``grid_size``.

        Args:
            grid_size: Side length S of the feature map.
            cuda: Unused; kept so existing calls keep working.
            device: Device for the cached tensors.  Defaults to the device of
                this layer's parameters.
        """
        if device is None:
            device = next(self.parameters()).device

        self.grid_size = grid_size
        self.stride = self.img_dim / self.grid_size  # input pixels per cell

        # Cell indices, both of shape (1, 1, S, S).
        self.grid_x = torch.arange(grid_size, device=device).repeat(1, 1, grid_size, 1).type(torch.float32)
        self.grid_y = torch.arange(grid_size, device=device).repeat(1, 1, grid_size, 1).transpose(3,2).type(torch.float32)

        # Anchors expressed in grid cells instead of pixels.
        scaled_anchors = [(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]
        self.scaled_anchors = torch.tensor(scaled_anchors, device=device)

        # (A, 2) -> two tensors of shape (1, A, 1, 1) for broadcasting.
        self.anchor_w = self.scaled_anchors[:, 0:1].view(1, self.num_anchors, 1, 1)
        self.anchor_h = self.scaled_anchors[:, 1:2].view(1, self.num_anchors, 1, 1)

    def transform_outputs(self, prediction):
        """Decode raw predictions ``(batch, A, S, S, 5+C)`` into boxes.

        Returns (cx, cy, w, h) of shape ``(batch, A, S, S, 4)``, multiplied by
        the stride.  The result stays attached to the autograd graph so a loss
        computed on it can train the box outputs.
        """
        x = torch.sigmoid(prediction[..., 0])  # offset inside the cell, (0, 1)
        y = torch.sigmoid(prediction[..., 1])
        w = prediction[..., 2]
        h = prediction[..., 3]

        pred_boxes = torch.stack((
            x + self.grid_x,                 # cell offset + cell column
            y + self.grid_y,                 # cell offset + cell row
            torch.exp(w) * self.anchor_w,
            torch.exp(h) * self.anchor_h,
        ), dim=-1)

        return pred_boxes * self.stride

In [None]:
class DarkNet(nn.Module):
    """Residual backbone with an FPN-style neck and three YOLO heads.

    Args:
        anchors: Three anchor groups.  ``anchors[2]`` is used by the coarsest
            head, ``anchors[1]`` by the middle head and ``anchors[0]`` by the
            finest head.
        num_blocks: Number of residual blocks in each of the five stages.
        num_classes: Currently unused; the heads are built with their own
            default class count.
    """

    def __init__(self, anchors, num_blocks=(1, 2, 8, 8, 4), num_classes=20):
        super().__init__()

        # Feature extractor.  Every stage starts with a stride-2 convolution;
        # sizes below are for a 1280x1280 input.
        self.conv1 = BasicConv(3, 32, 3, stride=1, padding=1)
        self.res_block_1 = self._make_residual_block(64, num_blocks[0]) # 640x640
        self.res_block_2 = self._make_residual_block(128, num_blocks[1]) # 320x320
        self.res_block_3 = self._make_residual_block(256, num_blocks[2]) # 160x160, FPN lateral connection
        self.res_block_4 = self._make_residual_block(512, num_blocks[3]) # 80x80, FPN lateral connection
        self.res_block_5 = self._make_residual_block(1024, num_blocks[4]) # 40x40, top layer

        # FPN top-down convolutions.  Inputs of stages 2 and 3 are the
        # upsampled previous stage concatenated with a lateral connection
        # (512 + 256 = 768 and 256 + 128 = 384 channels).
        self.topdown_1 = Top_down(1024, 512)
        self.topdown_2 = Top_down(768, 256)
        self.topdown_3 = Top_down(384, 128)

        # FPN lateral connections (1x1 convs that reduce the channel count).
        self.lateral_1 = BasicConv(512, 256, 1, stride=1, padding=0)
        self.lateral_2 = BasicConv(256, 128, 1, stride=1, padding=0)

        # Prediction heads, coarsest to finest.
        self.yolo_1 = YOLOLayer(512, anchors=anchors[2]) # 40x40
        self.yolo_2 = YOLOLayer(256, anchors=anchors[1]) # 80x80
        self.yolo_3 = YOLOLayer(128, anchors=anchors[0]) # 160x160

        self.upsample = nn.Upsample(scale_factor=2)

    def forward(self, x):
        """Return ``(all_predictions, [yolo_1, yolo_2, yolo_3])``.

        The first item is the three head outputs concatenated along axis 1.
        """
        # Feature extractor
        x = self.conv1(x)
        c1 = self.res_block_1(x)
        c2 = self.res_block_2(c1)
        c3 = self.res_block_3(c2)
        c4 = self.res_block_4(c3)
        c5 = self.res_block_5(c4)

        # FPN top-down path: upsample and merge with the lateral connections
        p5 = self.topdown_1(c5)
        p4 = self.topdown_2(torch.cat((self.upsample(p5), self.lateral_1(c4)), 1))
        p3 = self.topdown_3(torch.cat((self.upsample(p4), self.lateral_2(c3)), 1))

        # Prediction
        yolo_1 = self.yolo_1(p5)
        yolo_2 = self.yolo_2(p4)
        yolo_3 = self.yolo_3(p3)

        return torch.cat((yolo_1, yolo_2, yolo_3), 1), [yolo_1, yolo_2, yolo_3]

    def _make_residual_block(self,in_channels, num_block):
        """Build a stride-2 downsampling conv followed by ``num_block`` residual blocks.

        ``in_channels`` is the stage's output width; the downsampling conv
        takes half as many channels as input.
        """
        blocks = [BasicConv(in_channels//2, in_channels, 3, stride=2, padding=1)]
        blocks.extend(ResidualBlock(in_channels) for _ in range(num_block))
        return nn.Sequential(*blocks)

**9. Checking Model**

In [None]:
# Smoke test: push one random 1280x1280 image through a fresh model and
# print the output shapes.
anchors = [[(10,13),(16,30),(33,23)],[(30,61),(62,45),(59,119)],[(116,90),(156,198),(373,326)]]
x = torch.randn(1, 3, 1280, 1280).to(device)
with torch.no_grad():
    model = DarkNet(anchors).to(device)
    output_cat, output = model(x)
    print(output_cat.size())
    print(*(scale_out.size() for scale_out in output[:3]))


**10. Loss Function**

In [None]:
def get_loss_batch(output,targets, params_loss, opt=None):
    """Compute the YOLO loss of one batch and optionally take an optimizer step.

    Args:
        output: Per-scale predictions, each of shape
            ``(batch, num_anchors*S*S, 5 + num_classes)``.
        targets: Target rows ``[batch index, cx, cy, w, h, class]``.
        params_loss: Dict with the keys ``ignore_thres``, ``scaled_anchors``,
            ``mse_loss``, ``bce_loss``, ``num_yolos``, ``num_anchors``,
            ``obj_scale`` and ``noobj_scale``.  ``scaled_anchors`` holds one
            anchor group per scale, as tensors or nested lists.
        opt: Optimizer; when given, gradients are reset, the loss is
            back-propagated and one step is taken.

    Returns:
        The summed loss over all scales as a Python float.
    """
    ignore_thres = params_loss["ignore_thres"]
    scaled_anchors = params_loss["scaled_anchors"]
    mse_loss = params_loss["mse_loss"]
    bce_loss = params_loss["bce_loss"]

    num_yolos = params_loss["num_yolos"]
    num_anchors = params_loss["num_anchors"]
    obj_scale = params_loss["obj_scale"]
    noobj_scale = params_loss["noobj_scale"]

    loss = 0.0

    for yolo_ind in range(num_yolos):
        yolo_out = output[yolo_ind]
        batch_size, num_bbxs, _ = yolo_out.shape

        # Recover the grid side S from num_boxes = num_anchors * S * S.
        grid_size = int(round(np.sqrt(num_bbxs / num_anchors)))

        # (batch, num_boxes, 5+C) -> (batch, num_anchors, S, S, 5+C)
        yolo_out = yolo_out.view(batch_size, num_anchors, grid_size, grid_size, -1)

        # The helpers index the anchors like a tensor, so convert nested
        # lists here; tensors pass through unchanged.
        anchors = torch.as_tensor(scaled_anchors[yolo_ind], dtype=yolo_out.dtype,
                                  device=yolo_out.device)

        pred_boxes = yolo_out[:,:,:,:,:4]
        x, y, w, h = transform_bbox(pred_boxes, anchors)
        pred_conf = yolo_out[:,:,:,:,4]
        pred_cls_prob = yolo_out[:,:,:,:,5:]

        yolo_targets = get_yolo_targets({
            'pred_cls_prob':pred_cls_prob,
            'pred_boxes':pred_boxes,
            'targets':targets,
            'anchors':anchors,
            'ignore_thres':ignore_thres,
        })

        # Boolean masks are the supported dtype for mask indexing.
        obj_mask = yolo_targets["obj_mask"].bool()
        noobj_mask = yolo_targets["noobj_mask"].bool()
        tx = yolo_targets["tx"]
        ty = yolo_targets["ty"]
        tw = yolo_targets["tw"]
        th = yolo_targets["th"]
        tcls = yolo_targets["tcls"]
        t_conf = yolo_targets["t_conf"]

        loss_x = mse_loss(x[obj_mask], tx[obj_mask])
        loss_y = mse_loss(y[obj_mask], ty[obj_mask])
        loss_w = mse_loss(w[obj_mask], tw[obj_mask])
        loss_h = mse_loss(h[obj_mask], th[obj_mask])

        loss_conf_obj = bce_loss(pred_conf[obj_mask], t_conf[obj_mask])
        loss_conf_noobj = bce_loss(pred_conf[noobj_mask], t_conf[noobj_mask])
        loss_conf = obj_scale * loss_conf_obj + noobj_scale * loss_conf_noobj
        loss_cls = bce_loss(pred_cls_prob[obj_mask], tcls[obj_mask])
        loss += loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item()

In [None]:
def transform_bbox(bbox, anchors):
    """Map predicted boxes back to the values the loss regresses.

    Args:
        bbox: Predicted boxes of shape ``(batch, num_anchors, S, S, 4)``
            holding (x, y, w, h).
        anchors: ``(num_anchors, 2)`` anchor widths and heights, as a tensor
            or nested list.

    Returns:
        ``(x, y, w, h)``, each of shape ``(batch, num_anchors, S, S)``:
        the fractional part of x and y, and the log of w and h relative to
        the matching anchor.
    """
    anchors = torch.as_tensor(anchors, dtype=bbox.dtype, device=bbox.device)

    x = bbox[..., 0]
    y = bbox[..., 1]
    w = bbox[..., 2]
    h = bbox[..., 3]
    # (A,) -> (1, A, 1, 1) so each anchor lines up with its own axis entry.
    anchor_w = anchors[:, 0].view(1, -1, 1, 1)
    anchor_h = anchors[:, 1].view(1, -1, 1, 1)

    x = x - x.floor()  # keep only the offset inside the cell
    y = y - y.floor()
    w = torch.log(w / anchor_w + 1e-16)  # epsilon guards against log(0)
    h = torch.log(h / anchor_h + 1e-16)
    return x, y, w, h

In [None]:
def get_yolo_targets(params):
    """Build the per-cell training targets for one YOLO scale.

    Args:
        params: Dict with the keys
            ``pred_boxes`` -- ``(batch, num_anchors, S, S, 4)`` predictions
            (only the shape and device are used),
            ``pred_cls_prob`` -- class predictions (only the last dimension
            is used),
            ``targets`` -- rows ``[batch index, cx, cy, w, h, class]`` with
            the box normalised to [0, 1],
            ``anchors`` -- ``(num_anchors, 2)`` anchors in grid units,
            ``ignore_thres`` -- anchor IoU above which a cell is not
            penalised as background.

    Returns:
        Dict with ``obj_mask``, ``noobj_mask`` (bool), ``tx``, ``ty``, ``tw``,
        ``th``, ``tcls`` and ``t_conf`` (float).
    """
    pred_boxes = params['pred_boxes']
    pred_cls_prob = params['pred_cls_prob']
    target = params['targets']
    ignore_thres = params['ignore_thres']

    # Build everything on the predictions' device instead of a global one.
    device = pred_boxes.device
    anchors = torch.as_tensor(params['anchors'], dtype=torch.float32, device=device)

    batch_size = pred_boxes.size(0)
    num_anchors = pred_boxes.size(1)
    grid_size = pred_boxes.size(2)
    num_cls = pred_cls_prob.size(-1)

    sizeT = batch_size, num_anchors, grid_size, grid_size
    obj_mask = torch.zeros(sizeT, device=device, dtype=torch.bool)
    noobj_mask = torch.ones(sizeT, device=device, dtype=torch.bool)
    tx = torch.zeros(sizeT, device=device, dtype=torch.float32)
    ty = torch.zeros(sizeT, device=device, dtype=torch.float32)
    tw = torch.zeros(sizeT, device=device, dtype=torch.float32)
    th = torch.zeros(sizeT, device=device, dtype=torch.float32)

    sizeT = batch_size, num_anchors, grid_size, grid_size, num_cls
    tcls = torch.zeros(sizeT, device=device, dtype=torch.float32)

    # Convert the normalised boxes to grid units.
    target_bboxes = target[:, 1:5] * grid_size
    t_xy = target_bboxes[:, :2]
    t_wh = target_bboxes[:, 2:]
    t_x, t_y = t_xy.t()
    t_w, t_h = t_wh.t()

    # Cell that contains each box centre.  Clamp so a centre lying exactly
    # on the right/bottom edge still maps to a valid cell.
    grid_i, grid_j = t_xy.long().t()
    grid_i = grid_i.clamp(0, grid_size - 1)
    grid_j = grid_j.clamp(0, grid_size - 1)

    # IoU between every anchor and every target, shape (num_anchors, n).
    iou_with_anchors = [get_iou_WH(anchor, t_wh) for anchor in anchors]
    iou_with_anchors = torch.stack(iou_with_anchors)
    best_iou_wa, best_anchor_ind = iou_with_anchors.max(0)

    batch_inds, target_labels = target[:, 0].long(), target[:, 5].long()
    # The best-matching anchor is responsible for the object.
    obj_mask[batch_inds, best_anchor_ind, grid_j, grid_i] = True
    noobj_mask[batch_inds, best_anchor_ind, grid_j, grid_i] = False

    # Anchors that also overlap well are not the match, but should not be
    # penalised as background either.
    for ind, iou_wa in enumerate(iou_with_anchors.t()):
        noobj_mask[batch_inds[ind], iou_wa > ignore_thres, grid_j[ind], grid_i[ind]] = False

    # Offset of the centre inside its cell.
    tx[batch_inds, best_anchor_ind, grid_j, grid_i] = t_x - t_x.floor()
    ty[batch_inds, best_anchor_ind, grid_j, grid_i] = t_y - t_y.floor()

    # Log size relative to the matched anchor (epsilon guards against log(0)).
    anchor_w = anchors[best_anchor_ind][:, 0]
    tw[batch_inds, best_anchor_ind, grid_j, grid_i] = torch.log(t_w / anchor_w + 1e-16)

    anchor_h = anchors[best_anchor_ind][:, 1]
    th[batch_inds, best_anchor_ind, grid_j, grid_i] = torch.log(t_h / anchor_h + 1e-16)

    tcls[batch_inds, best_anchor_ind, grid_j, grid_i, target_labels] = 1

    output = {
        'obj_mask': obj_mask,
        'noobj_mask': noobj_mask,
        'tx': tx,
        'ty': ty,
        'tw': tw,
        'th': th,
        'tcls': tcls,
        't_conf': obj_mask.float(),
    }
    return output

In [None]:
def get_iou_WH(wh1, wh2):
    """Return the IoU of one anchor with each target box, using sizes only.

    ``wh1`` is a single (w, h) pair and ``wh2`` an ``(n, 2)`` tensor of
    (w, h) rows; the boxes are compared as if they shared a corner.
    """
    anchor_w, anchor_h = wh1[0], wh1[1]
    box_w, box_h = wh2.t()[:2]

    overlap = torch.min(anchor_w, box_w) * torch.min(anchor_h, box_h)
    # The epsilon keeps the denominator non-zero for degenerate boxes.
    anchor_area = anchor_w * anchor_h + 1e-16
    total_area = anchor_area + box_w * box_h - overlap
    return overlap / total_area

**11. Model Training**

In [None]:
def get_lr(opt):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in opt.param_groups), None)

In [None]:
def loss_epoch(model,params_loss,dataset_dl,sanity_check=False,opt=None):
    """Run one pass over ``dataset_dl`` and return the mean loss per sample.

    Args:
        model: Network returning ``(concatenated, per_scale_outputs)``.
        params_loss: Loss settings forwarded to ``get_loss_batch``.
        dataset_dl: Loader yielding ``(images, targets)`` batches.
        sanity_check: When true, stop after the first batch.
        opt: Optimizer forwarded to ``get_loss_batch``; ``None`` evaluates
            without updating the weights.

    Returns:
        Accumulated batch loss divided by the number of samples that were
        actually processed, so a one-batch sanity check is not diluted by the
        size of the whole dataset.

    Raises:
        ValueError: If the loader yields no samples.
    """
    running_loss = 0.0
    samples_seen = 0

    # `device` is the module-level device the model is moved to.
    for img, target in dataset_dl:
        target = target.to(device)
        _, output = model(img.to(device))
        running_loss += get_loss_batch(output, target, params_loss, opt)
        samples_seen += img.size(0)
        if sanity_check is True:
            break

    if samples_seen == 0:
        raise ValueError("dataset_dl yielded no samples")
    return running_loss / float(samples_seen)

In [None]:
import time
def train_val(model, params):
    """Train and validate ``model``, keeping the weights with the best validation loss.

    ``params`` must provide ``num_epochs``, ``params_loss``, ``optimizer``,
    ``train_dl``, ``val_dl``, ``sanity_check``, ``lr_scheduler`` and
    ``path2weights``.

    Returns:
        ``(model, loss_history)``: the model loaded with the best weights and
        a dict with the per-epoch ``"train"`` and ``"val"`` losses.
    """
    setting_keys = (
        "num_epochs", "params_loss", "optimizer", "train_dl",
        "val_dl", "sanity_check", "lr_scheduler", "path2weights",
    )
    (num_epochs, params_loss, opt, train_dl,
     val_dl, sanity_check, lr_scheduler, path2weights) = (params[key] for key in setting_keys)

    loss_history = {"train": [], "val": []}

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    start_time = time.time()
    for epoch in range(num_epochs):
        lr_before = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs - 1, lr_before))

        # Training pass
        model.to(device)
        model.train()
        train_loss = loss_epoch(model, params_loss, train_dl, sanity_check, opt)
        loss_history["train"].append(train_loss)

        # Validation pass (no gradients, no optimizer)
        model.eval()
        with torch.no_grad():
            val_loss = loss_epoch(model, params_loss, val_dl, sanity_check)
        loss_history["val"].append(val_loss)

        # Keep and save the weights whenever validation improves.
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), path2weights)
            print("Copied best model weights!")
            print('Get best val loss')

        # When the scheduler changes the learning rate, restart from the
        # best weights seen so far.
        lr_scheduler.step(val_loss)
        if get_lr(opt) != lr_before:
            print("Loading best model weights!")
            model.load_state_dict(best_model_wts)

        elapsed_min = (time.time() - start_time) / 60
        print("train loss: %.6f, val loss: %.6f, time: %.4f min" %(train_loss, val_loss, elapsed_min))
        print("-"*10)

    model.load_state_dict(best_model_wts)
    return model, loss_history

In [None]:
# Directory for the saved weights.
path2models= "./models/"
os.makedirs(path2models, exist_ok=True)

# Anchor (width, height) pairs in pixels, three per scale.  The last pair is
# (373, 326), matching the anchors used in the model check cell.
anchors = [[[10,13],  [16,30],  [33,23]], [[30,61],  [62,45],  [59,119]], [[116,90],  [156,198],  [373,326]]]

# Derive the scaled anchors from `anchors` so the two can never disagree.
# Group i is divided by _anchor_divisors[i].
# NOTE(review): get_loss_batch pairs scaled_anchors[i] with output[i], while
# DarkNet's first output (yolo_1) is built from anchors[2], and YOLOLayer
# scales anchors by stride (img_dim / grid_size) rather than by grid size --
# confirm the intended order and divisor.
_anchor_divisors = (40, 80, 160)
scaled_anchors = [
    [[a_w / divisor, a_h / divisor] for a_w, a_h in group]
    for group, divisor in zip(anchors, _anchor_divisors)
]
model = DarkNet(anchors)

'''
scaled_anchors=[model.module_list[82][0].scaled_anchors,
                model.module_list[94][0].scaled_anchors,
                model.module_list[106][0].scaled_anchors]
'''

# Summed (not averaged) losses; the epoch loop divides by the sample count.
mse_loss = nn.MSELoss(reduction="sum")
bce_loss = nn.BCELoss(reduction="sum")
params_loss={
    "scaled_anchors" : scaled_anchors,
    "ignore_thres": 0.5,
    "mse_loss": mse_loss,
    "bce_loss": bce_loss,
    "num_yolos": 3,
    "num_anchors": 3,
    "obj_scale": 1,
    "noobj_scale": 100,
}

In [None]:
# Adam optimizer; the learning rate is halved after 20 epochs without improvement.
opt = optim.Adam(params=model.parameters(), lr=1e-3)
lr_scheduler = ReduceLROnPlateau(
    optimizer=opt,
    mode='min',
    factor=0.5,
    patience=20,
    verbose=1,
)

In [None]:
# Training settings; sanity_check=True stops every epoch after one batch.
params_train = dict(
    num_epochs=3,
    optimizer=opt,
    params_loss=params_loss,
    train_dl=train_dl,
    val_dl=val_dl,
    sanity_check=True,
    lr_scheduler=lr_scheduler,
    path2weights=path2models+"weights.pt",
)

model, loss_hist = train_val(model, params_train)

**Prediction through Model**

In [None]:
# Submission loop: fetch each test frame from the competition environment
# and register a prediction for it.
# NOTE(review): `pred_output` is never used -- every frame is submitted with
# the constant placeholder string below instead of the model's detections.
# NOTE(review): `pixel_array` is passed to the model as-is; presumably it
# still needs the validation transform and a batch dimension -- confirm.
import greatbarrierreef
env = greatbarrierreef.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (pixel_array, sample_prediction_df) in iter_test:
    pred_output = model(pixel_array)
    sample_prediction_df['annotations'] = '0.2 0 0 100 100'  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions