In [2]:
# This notebook uses the VOC2012 dataset: download it and place it under ../data.
# Label files are written to and read from a folder called 'labels' in the working directory.
DATA_PATH = "../data/VOC2012/"

In [3]:
# First, transform the dataset annotations.
# Each object becomes one record of the form (cls, x, y, w, h).
# The names must match the <name> tags in the VOC annotation XML exactly
# ('diningtable', 'pottedplant' are single words there); any object whose name
# is not in this list is skipped by convert_annotation.
CLASSES = ['person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
           'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
           'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']

In [4]:
import xml.etree.ElementTree as ET
import os
import cv2

def convert(size, box):
    """Turn a pixel-space corner box into a normalised centre/size box.

    Args:
        size: (image_width, image_height) in pixels.
        box: (xmin, xmax, ymin, ymax) in pixels.

    Returns:
        Tuple (x_center, y_center, width, height), each scaled by the
        reciprocal of the matching image dimension.
    """
    inv_width = 1. / size[0]
    inv_height = 1. / size[1]

    x_min, x_max = box[0], box[1]
    y_min, y_max = box[2], box[3]

    center_x = (x_min + x_max) / 2.0 * inv_width
    center_y = (y_min + y_max) / 2.0 * inv_height
    box_w = (x_max - x_min) * inv_width
    box_h = (y_max - y_min) * inv_height

    return (center_x, center_y, box_w, box_h)

def convert_annotation(image_id):
    """Convert one VOC annotation XML file into a './labels/<stem>.txt' file.

    Every kept object is written as one line: "cls_id x y w h", where the four
    box values come from convert() and cls_id is the index into CLASSES.
    Objects whose class is not in CLASSES, or that are flagged difficult, are skipped.

    Args:
        image_id: annotation file name inside DATA_PATH + 'Annotations/'
            (for example '2007_000027.xml').
    """
    annotation_path = DATA_PATH + 'Annotations/%s' % image_id
    # Strip only the final extension so the label file shares the image's stem.
    stem = os.path.splitext(image_id)[0]

    root = ET.parse(annotation_path).getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    lines = []
    for obj in root.iter('object'):
        difficult_node = obj.find('difficult')
        # Treat a missing <difficult> tag as "not difficult" instead of crashing.
        difficult = int(difficult_node.text) if difficult_node is not None else 0
        cls = obj.find('name').text

        if cls not in CLASSES or difficult == 1:
            continue

        cls_id = CLASSES.index(cls)
        xmlbox = obj.find('bndbox')
        # convert() expects the order (xmin, xmax, ymin, ymax).
        points = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
                  float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        convert_value = convert((w, h), points)
        lines.append(str(cls_id) + " " + " ".join([str(a) for a in convert_value]) + '\n')

    # The context manager guarantees the label file is flushed and closed.
    with open('./labels/%s.txt' % stem, 'w') as out_file:
        out_file.writelines(lines)

def make_label_txt():
    """Write a label file for every XML annotation under DATA_PATH + 'Annotations'.

    Creates the './labels' output folder if it does not exist yet and ignores
    directory entries that are not '.xml' files.
    """
    os.makedirs('./labels', exist_ok=True)
    for file in os.listdir(DATA_PATH + 'Annotations'):
        if not file.lower().endswith('.xml'):
            continue
        convert_annotation(file)

In [None]:
# Check the conversion by drawing the converted labels back onto an image
def show_labels_img(img_name):
    """Draw the boxes stored in './labels/<img_name>.txt' on the image and display it.

    Each label line is "cls_id x y w h" with the box given as a normalised
    centre and size; it is scaled back to pixels using the image's own size.
    The class name is printed and drawn at the box's top-left corner.

    Args:
        img_name: image stem without extension (for example '2007_000027').

    Raises:
        FileNotFoundError: if OpenCV cannot read the image file.
    """
    img_file = DATA_PATH + 'JPEGImages/' + img_name + '.jpg'
    img = cv2.imread(img_file)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError("Cannot read image: " + img_file)
    h, w = img.shape[:2]

    with open('./labels/' + img_name + '.txt', 'r') as labels:
        for line in labels:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            label = [float(x) for x in fields]
            cls_name = CLASSES[int(label[0])]
            print(cls_name)
            pt1 = (int(label[1] * w - label[3] * w / 2), int(label[2] * h - label[4] * h / 2))
            pt2 = (int(label[1] * w + label[3] * w / 2), int(label[2] * h + label[4] * h / 2))
            cv2.putText(img, cls_name, pt1, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
            # Colour and thickness are separate arguments: red (BGR) outline, 2 px thick.
            cv2.rectangle(img, pt1, pt2, (0, 0, 255), 2)

    cv2.imshow('img', img)
    cv2.waitKey(0)
    # Close the preview window once a key has been pressed.
    cv2.destroyAllWindows()

# Generate the label files for all annotations, then display one sample image with its boxes.
make_label_txt()
show_labels_img('2007_000027')

In [5]:
#Prepare Pytorch Dataset.
import numpy as np
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchvision.models as tvmodel
import visdom as visdom

# Number of boxes per grid cell; the label/prediction depth is 5 * NUM_BBOX + len(CLASSES).
NUM_BBOX = 2

class VOC2012(Dataset):
    """VOC2012 detection dataset yielding (image, grid label) pairs.

    Image stems are read from ImageSets/Main/train.txt or val.txt under DATA_PATH.
    Each item is the image zero-padded to a square and resized to 448x448,
    together with the label grid built by convert_bbox2labels from
    './labels/<stem>.txt'.

    Args:
        is_train: read train.txt when True, val.txt otherwise.
        is_aug: when True the image is passed through transforms.ToTensor();
            when False the resized numpy array is returned unchanged.
    """

    def __init__(self, is_train=True, is_aug=True):
        split_file = 'train.txt' if is_train else 'val.txt'
        with open(DATA_PATH + 'ImageSets/Main/' + split_file, 'r') as files:
            # Skip blank lines so they do not become empty file stems.
            self.filenames = [file.strip() for file in files if file.strip()]

        self.img_path = DATA_PATH + 'JPEGImages/'
        self.label_path = './labels/'
        self.is_aug = is_aug

    def __len__(self):
        """Return the number of images in the selected split."""
        return len(self.filenames)

    def __getitem__(self, item):
        """Load one sample.

        Returns:
            (img, labels): the 448x448 image and the label grid converted with
            transforms.ToTensor().

        Raises:
            FileNotFoundError: if the image cannot be read.
            ValueError: if the label file does not hold a multiple of 5 values.
        """
        name = self.filenames[item]
        img_file = self.img_path + name + '.jpg'
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread returns None on failure instead of raising.
            raise FileNotFoundError("Cannot read image: " + img_file)
        h, w = img.shape[0: 2]
        input_size = 448

        # Pad the shorter side with zeros so the image becomes exactly square.
        # The remainder of an odd difference goes to the right/bottom, which keeps
        # the padded side equal to the longer side and the box maths below exact.
        pad_left, pad_top = 0, 0
        if h > w:
            pad_left = (h - w) // 2
            pad_right = (h - w) - pad_left
            img = np.pad(img, ((0, 0), (pad_left, pad_right), (0, 0)), 'constant', constant_values=0)
        elif w > h:
            pad_top = (w - h) // 2
            pad_bottom = (w - h) - pad_top
            img = np.pad(img, ((pad_top, pad_bottom), (0, 0), (0, 0)), 'constant', constant_values=0)

        img = cv2.resize(img, (input_size, input_size))

        if self.is_aug:
            aug = transforms.Compose([
                transforms.ToTensor()
            ])
            img = aug(img)

        # The label file is a whitespace-separated stream of (cls, x, y, w, h) records.
        with open(self.label_path + name + ".txt") as file:
            bbox = [float(x) for x in file.read().split()]
        if len(bbox) % 5 != 0:
            raise ValueError("File:" + self.label_path + self.filenames[item] + ".txt" + "——bbox Extraction Error!")

        # Re-normalise the boxes from the original image to the padded square.
        for i in range(len(bbox) // 5):
            if h > w:
                bbox[i * 5 + 1] = (bbox[i * 5 + 1] * w + pad_left) / h
                bbox[i * 5 + 3] = (bbox[i * 5 + 3] * w) / h
            elif w > h:
                bbox[i * 5 + 2] = (bbox[i * 5 + 2] * h + pad_top) / w
                bbox[i * 5 + 4] = (bbox[i * 5 + 4] * h) / w

        labels = convert_bbox2labels(bbox)
        labels = transforms.ToTensor()(labels)

        return img, labels

def convert_bbox2labels(bbox):
    """Build the 7x7 grid training target from a flat list of boxes.

    Args:
        bbox: flat list of (cls, x, y, w, h) records, with x, y, w, h
            normalised to the image.

    Returns:
        numpy array of shape (7, 7, 5 * NUM_BBOX + len(CLASSES)). For the cell
        containing a box centre, every one of the NUM_BBOX slots holds
        (px, py, w, h, 1), where px/py are the centre offsets inside the cell in
        units of the cell size, and the class entry after the box slots is set to 1.
    """
    grid_num = 7
    gridsize = 1.0 / grid_num
    box_depth = 5 * NUM_BBOX
    labels = np.zeros((grid_num, grid_num, box_depth + len(CLASSES)))
    for i in range(len(bbox) // 5):
        cls_id, cx, cy, bw, bh = bbox[i * 5: i * 5 + 5]
        # Column / row of the cell holding the box centre. Clamp to the grid so a
        # centre on the far edge (or float rounding) cannot index outside it.
        gridx = min(max(int(cx // gridsize), 0), grid_num - 1)
        gridy = min(max(int(cy // gridsize), 0), grid_num - 1)
        # (centre - cell top-left corner) / cell size => position relative to the cell.
        gridpx = cx / gridsize - gridx
        gridpy = cy / gridsize - gridy
        # The cell at row gridy, column gridx is responsible for this ground truth:
        # confidence and the matching class probability are both set to 1.
        box_target = np.array([gridpx, gridpy, bw, bh, 1])
        for b in range(NUM_BBOX):
            labels[gridy, gridx, 5 * b: 5 * b + 5] = box_target
        labels[gridy, gridx, box_depth + int(cls_id)] = 1
    return labels

In [6]:
#define loss function
class Loss_yolov1(nn.Module):
    """YOLOv1-style sum-of-squares loss over a grid of two-box predictions.

    forward() reads both tensors as (batch, channel, grid_row, grid_col) with
    channels 0:5 = box 1 (px, py, w, h, conf), 5:10 = box 2 and 10: = class scores.
    """

    def __init__(self):
        super(Loss_yolov1, self).__init__()

    @staticmethod
    def _cell_box_to_xyxy(box, row, col, num_rows, num_cols):
        """Convert a cell-relative (px, py, w, h) box to image-normalised (x1, y1, x2, y2)."""
        # Cell-relative offsets -> normalised centre: x uses the column, y uses the row.
        cx = (box[0] + col) / num_cols
        cy = (box[1] + row) / num_rows
        return (cx - box[2] / 2, cy - box[3] / 2, cx + box[2] / 2, cy + box[3] / 2)

    def forward(self, pred, labels):
        """Compute the batch-averaged loss.

        Args:
            pred: network output, indexed as pred[batch, channel, row, col].
            labels: ground truth with the same layout; channel 4 equal to 1
                marks a cell that contains an object.

        Returns:
            (coordinate + object-confidence + no-object-confidence + class loss) / batch size.
        """
        num_gridy, num_gridx = labels.size()[-2:]  # grid rows, grid columns
        noobj_confi_loss = 0.  # confidence loss of boxes not responsible for an object
        coor_loss = 0.  # coordinate loss of the responsible boxes
        obj_confi_loss = 0.  # confidence loss of the responsible boxes
        class_loss = 0.  # class loss of cells containing an object
        n_batch = labels.size()[0]  # batch size

        # Could be vectorised for speed; explicit loops are kept for clarity.
        for i in range(n_batch):
            for n in range(num_gridx):  # grid column (x direction)
                for m in range(num_gridy):  # grid row (y direction)
                    if labels[i, 4, m, n] == 1:  # the cell contains an object
                        # Convert (px, py, w, h) to (x1, y1, x2, y2) so the IoU can be computed.
                        bbox1_pred_xyxy = self._cell_box_to_xyxy(pred[i, 0:4, m, n], m, n, num_gridy, num_gridx)
                        bbox2_pred_xyxy = self._cell_box_to_xyxy(pred[i, 5:9, m, n], m, n, num_gridy, num_gridx)
                        bbox_gt_xyxy = self._cell_box_to_xyxy(labels[i, 0:4, m, n], m, n, num_gridy, num_gridx)
                        # NOTE(review): the IoU targets below stay attached to the autograd
                        # graph, exactly as before — confirm whether they should be constants.
                        iou1 = calculate_iou(bbox1_pred_xyxy, bbox_gt_xyxy)
                        iou2 = calculate_iou(bbox2_pred_xyxy, bbox_gt_xyxy)
                        # The box with the larger IoU is responsible for the object.
                        if iou1 >= iou2:
                            coor_loss = coor_loss + 5 * (torch.sum((pred[i, 0:2, m, n] - labels[i, 0:2, m, n]) ** 2)
                                                         + torch.sum((pred[i, 2:4, m, n].sqrt() - labels[i, 2:4, m, n].sqrt()) ** 2))
                            obj_confi_loss = obj_confi_loss + (pred[i, 4, m, n] - iou1) ** 2
                            # The other box is not responsible, so its confidence error goes to
                            # the no-object term; its confidence target is iou2.
                            noobj_confi_loss = noobj_confi_loss + 0.5 * ((pred[i, 9, m, n] - iou2) ** 2)
                        else:
                            coor_loss = coor_loss + 5 * (torch.sum((pred[i, 5:7, m, n] - labels[i, 5:7, m, n]) ** 2)
                                                         + torch.sum((pred[i, 7:9, m, n].sqrt() - labels[i, 7:9, m, n].sqrt()) ** 2))
                            obj_confi_loss = obj_confi_loss + (pred[i, 9, m, n] - iou2) ** 2
                            # The other box is not responsible; its confidence target is iou1.
                            noobj_confi_loss = noobj_confi_loss + 0.5 * ((pred[i, 4, m, n] - iou1) ** 2)
                        class_loss = class_loss + torch.sum((pred[i, 10:, m, n] - labels[i, 10:, m, n]) ** 2)
                    else:  # the cell contains no object: only both confidences are penalised
                        noobj_confi_loss = noobj_confi_loss + 0.5 * torch.sum(pred[i, [4, 9], m, n] ** 2)

        loss = coor_loss + obj_confi_loss + noobj_confi_loss + class_loss
        # A simple sanity check is to feed an all-ones pred and inspect the resulting terms.
        return loss / n_batch

def calculate_iou(bbox1,bbox2):
    """Return the IoU of bbox1=(x1,y1,x2,y2) and bbox2=(x3,y3,x4,y4), or 0 without positive overlap."""
    a_left, a_top, a_right, a_bottom = bbox1[0], bbox1[1], bbox1[2], bbox1[3]
    b_left, b_top, b_right, b_bottom = bbox2[0], bbox2[1], bbox2[2], bbox2[3]

    # Boxes that are strictly apart on either axis cannot intersect.
    if a_right < b_left or a_left > b_right or a_bottom < b_top or a_top > b_bottom:
        return 0

    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)

    # Intersection rectangle: innermost edges of the two boxes.
    overlap_w = min(a_right, b_right) - max(a_left, b_left)
    overlap_h = min(a_bottom, b_bottom) - max(a_top, b_top)
    overlap_area = overlap_w * overlap_h

    if overlap_area > 0:
        return overlap_area / (area_a + area_b - overlap_area)
    return 0


In [7]:
#define model
class Yolov1_resnet(nn.Module):
    """YOLOv1-style detector with a pretrained ResNet-34 feature extractor.

    The torchvision ResNet-34 is used without its last two child modules,
    followed by four conv/batch-norm/LeakyReLU stages (the second has stride 2)
    and two fully connected layers ending in a sigmoid. The first Linear layer
    expects a 7x7x1024 feature map, which is what a 448x448 input produces.
    forward() returns a tensor of shape (batch, 5 * NUM_BBOX + len(CLASSES), 7, 7).
    """

    def __init__(self):
        super(Yolov1_resnet, self).__init__()
        # NOTE(review): `pretrained=True` is kept as written; newer torchvision
        # releases prefer the `weights=` argument — confirm against the installed version.
        resnet = tvmodel.resnet34(pretrained=True)
        resnet_out_channel = resnet.fc.in_features
        # Keep every child except the last two, so the backbone outputs a feature map.
        self.resnet = nn.Sequential(*list(resnet.children())[: -2])

        self.Conv_layers = nn.Sequential(
            nn.Conv2d(resnet_out_channel, 1024, 3, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            # Stride 2 halves the spatial size of the feature map.
            nn.Conv2d(1024, 1024, 3, stride=2, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            nn.Conv2d(1024, 1024, 3, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            nn.Conv2d(1024, 1024, 3, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),
        )

        # Output depth per cell; derived the same way as the reshape in forward()
        # so the two cannot drift apart when NUM_BBOX or CLASSES changes.
        out_depth = 5 * NUM_BBOX + len(CLASSES)
        self.Fc_layers = nn.Sequential(
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(),

            nn.Linear(4096, 7 * 7 * out_depth),
            nn.Sigmoid()  # squashes every output into (0, 1)
        )

    def forward(self, out):
        """Run the network on an image batch and return the (batch, depth, 7, 7) prediction."""
        out = self.resnet(out)
        out = self.Conv_layers(out)
        out = out.view(out.size()[0], -1)  # flatten for the fully connected layers
        out = self.Fc_layers(out)
        return out.reshape(-1, (5 * NUM_BBOX + len(CLASSES)), 7, 7)

In [21]:
#test model
# Smoke test: pass one random 3-channel 448x448 input through the network
# and print the architecture and the output size.
x = torch.randn((1, 3, 448, 448))
net = Yolov1_resnet()
print(net)
y = net(x)
print(y.size())

Yolov1_resnet(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=

torch.Size([1, 30, 7, 7])


In [8]:
#start training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

epoch = 50
# NOTE: the recorded run of this cell ended with "CUDA out of memory" on a 4 GiB GPU;
# lower this value if the same happens on your hardware.
batchsize = 100
lr = 0.01

# Build the dataset once and share it between the loader and the progress maths.
train_data = VOC2012()
train_dataloader = DataLoader(train_data, batch_size=batchsize, shuffle=True)
steps_per_epoch = len(train_dataloader)

model = Yolov1_resnet().to(device)

# Freeze the ResNet backbone (the model's first child). requires_grad must be
# set on the parameters; assigning it on the Module object has no effect.
for param in model.resnet.parameters():
    param.requires_grad = False

criterion = Loss_yolov1()
# Only hand the optimizer the parameters that are still trainable.
optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
                            lr=lr, momentum=0.9, weight_decay=0.0005)

is_vis = False

if is_vis:
    vis = visdom.Visdom()
    viswin1 = vis.line(np.array([0.]),np.array([0.]),opts=dict(title="Loss/Step",xlabel="100*step",ylabel="Loss"))

# torch.save does not create missing directories.
os.makedirs("./models_pkl", exist_ok=True)

for e in range(epoch):
    model.train()
    # Running sum of the epoch's losses, kept as a plain float: accumulating the
    # loss tensor itself would keep every step's autograd graph alive.
    yl = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.float().to(device)
        pred = model(inputs)
        loss = criterion(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        print("Epoch %d/%d| Step %d/%d| Loss: %.2f"%(e,epoch,i,steps_per_epoch,loss_value))
        yl = yl + loss_value
        if is_vis and (i + 1) % 100 == 0:
            # Plot the mean loss of the epoch so far against the global step index.
            vis.line(np.array([yl/(i+1)]),np.array([i+e*steps_per_epoch]),win=viswin1,update='append')
    if (e + 1) % 10 == 0:
        torch.save(model, "./models_pkl/YOLOv1_epoch" + str(e + 1) + ".pkl")
        # compute_val_map(model)

RuntimeError: CUDA out of memory. Tried to allocate 1.20 GiB (GPU 0; 4.00 GiB total capacity; 2.41 GiB already allocated; 604.14 MiB free; 2.44 GiB reserved in total by PyTorch)