In [23]:
# Colab-only setup: mount Google Drive under /content/gdrive/ so the files
# stored there can be accessed from this runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [24]:
import os
# Switch the working directory to the lab workspace on the mounted drive.
# The relative paths used further down ("data/...", "checkpoints/...",
# "results/...", "predicted_boxes/...") are resolved against this folder.
os.chdir("/content/gdrive/MyDrive/CMPT733-Lab3-Workspace")

Dataset.py

In [25]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import math
from random import randint

#generate default bounding boxes
def default_box_generator(layers, large_scale, small_scale):
    """Generate the default (anchor) bounding boxes for every cell of every output layer.

    input:
    layers      -- a list of grid sizes of the output layers, e.g. [10,5,3,1].
    large_scale -- a list of sizes for the larger bounding boxes, one per layer, e.g. [0.2,0.4,0.6,0.8].
    small_scale -- a list of sizes for the smaller bounding boxes, one per layer, e.g. [0.1,0.3,0.5,0.7].

    output:
    boxes -- default bounding boxes, shape=[box_num,8], box_num=4*sum(n*n for n in layers).
             Rows are ordered layer by layer, cell by cell (x varies fastest), and the
             4 boxes of one cell are consecutive.
             Each row is [x_center, y_center, box_width, box_height, x_min, y_min, x_max, y_max],
             all relative to an image of size 1x1.

    Every cell carries 4 boxes with [width, height] equal to
    [ssize,ssize], [lsize,lsize], [lsize*sqrt(2),lsize/sqrt(2)], [lsize/sqrt(2),lsize*sqrt(2)],
    where ssize=small_scale[i] and lsize=large_scale[i] for a cell in layers[i].
    The elongated side is capped at 1 (the image size) and the corners are clipped to [0,1].
    """
    #each cell always has 4 default boxes, independently of how many layers there are
    boxes_per_cell = 4
    sqrt2 = math.sqrt(2)
    cell_total = sum(int(grid) * int(grid) for grid in layers)
    boxes = np.zeros((cell_total, boxes_per_cell, 8))

    cell = 0
    for layer_index, grid in enumerate(layers):
        grid = int(grid)
        size = 1.0 / grid
        ssize = small_scale[layer_index]
        lsize = large_scale[layer_index]
        #(width, height) of the 4 default boxes of this layer
        shapes = [
            (ssize, ssize),
            (lsize, lsize),
            (min(lsize * sqrt2, 1), lsize / sqrt2),
            (lsize / sqrt2, min(lsize * sqrt2, 1)),
        ]
        for iy in range(grid):
            for ix in range(grid):
                x_center = (ix * size) + size / 2
                y_center = (iy * size) + size / 2
                for j, (width, height) in enumerate(shapes):
                    boxes[cell, j] = [
                        x_center,
                        y_center,
                        width,
                        height,
                        max(x_center - width / 2, 0),
                        max(y_center - height / 2, 0),
                        min(x_center + width / 2, 1),
                        min(y_center + height / 2, 1),
                    ]
                cell += 1

    return boxes.reshape((cell_total * boxes_per_cell, 8))

#this is an example implementation of IOU.
#It is different from the one used in YOLO, please pay attention.
#you can define your own iou function if you are not used to the inputs of this one.
def iou(boxs_default, x_min,y_min,x_max,y_max):
    #input:
    #boxs_default -- [num_of_boxes, 8], boxes stored as [x_center, y_center, width, height, x_min, y_min, x_max, y_max].
    #x_min,y_min,x_max,y_max -- the corners of another box (box_r)

    #output:
    #ious between every row of "boxs_default" and box_r, shape = [num_of_boxes]

    #corners of the intersection rectangle
    left = np.maximum(boxs_default[:,4], x_min)
    right = np.minimum(boxs_default[:,6], x_max)
    top = np.maximum(boxs_default[:,5], y_min)
    bottom = np.minimum(boxs_default[:,7], y_max)

    #a negative extent means the boxes do not overlap along that axis
    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)
    inter = overlap_w * overlap_h

    area_a = (boxs_default[:,6]-boxs_default[:,4])*(boxs_default[:,7]-boxs_default[:,5])
    area_b = (x_max-x_min)*(y_max-y_min)
    union = area_a + area_b - inter

    #the lower bound on the union avoids a division by zero
    return inter / np.maximum(union, 1e-8)

def match(ann_box,ann_confidence,boxs_default,threshold,cat_id,x_min,y_min,x_max,y_max):
    """Assign one ground truth box to the default boxes that should predict it.

    input:
    ann_box                 -- [num_of_boxes,4], ground truth bounding boxes to be updated in place
    ann_confidence          -- [num_of_boxes,number_of_classes], ground truth class labels to be updated in place
    boxs_default            -- [num_of_boxes,8], default bounding boxes
    threshold               -- if a default bounding box and the ground truth bounding box have iou>threshold,
                               then this default bounding box will be used as an anchor
    cat_id                  -- class id (column of ann_confidence to set)
    x_min,y_min,x_max,y_max -- ground truth bounding box

    Every default box with iou>threshold is marked as carrying the object; when no
    box passes the threshold, the single best-overlapping box is used instead, so
    each object is assigned to at least one default box.
    For each selected box, ann_box receives the offsets [tx,ty,tw,th] relative to
    that default box and ann_confidence receives the one-hot vector of cat_id.
    """
    gw=x_max-x_min
    gh=y_max-y_min
    #a degenerate ground truth box has no valid log-scale encoding; leave the annotations untouched
    if gw<=0 or gh<=0:
        return
    gx=x_min+gw/2
    gy=y_min+gh/2

    #compute iou between the default bounding boxes and the ground truth bounding box
    ious = iou(boxs_default, x_min,y_min,x_max,y_max)

    #all default boxes above the threshold carry the object
    idx = np.flatnonzero(ious>threshold)
    #make sure at least one default bounding box is used
    if len(idx)==0:
        idx = np.array([np.argmax(ious)])

    px=boxs_default[idx,0]
    py=boxs_default[idx,1]
    pw=boxs_default[idx,2]
    ph=boxs_default[idx,3]

    ann_box[idx,0]=(gx-px)/pw
    ann_box[idx,1]=(gy-py)/ph
    ann_box[idx,2]=np.log(gw/pw)
    ann_box[idx,3]=np.log(gh/ph)

    #reset the rows first so that a box claimed earlier by another object stays one-hot
    ann_confidence[idx,:] = 0
    ann_confidence[idx,cat_id] = 1

class COCO(torch.utils.data.Dataset):
    """Image + annotation dataset producing SSD training targets.

    Each item is (image, ann_box, ann_confidence, shape, filename):
    image          -- float32 array [3, image_size, image_size], scaled to [0,1]
    ann_box        -- float32 array [box_num,4], box offsets relative to the default boxes
    ann_confidence -- float32 array [box_num,class_num], one-hot labels (last column = background)
    shape          -- torch.Tensor holding the shape of the image the annotations refer to
                      (the cropped image for augmented samples)
    filename       -- image file name without its extension

    When train and augmentation are both set, the dataset is twice as long:
    even indices return the plain image, odd indices return a randomly cropped and
    horizontally flipped copy of the same image.
    """

    def __init__(self, imgdir, anndir, class_num, boxs_default, train = True, augmentation = True, split = True, image_size=320):
        self.train = train
        self.augmentation = augmentation
        self.split = split
        self.imgdir = imgdir
        self.anndir = anndir
        self.class_num = class_num

        #overlap threshold for deciding whether a bounding box carries an object or no
        self.threshold = 0.5
        self.boxs_default = boxs_default
        self.box_num = len(self.boxs_default)

        self.img_names = os.listdir(self.imgdir)
        self.img_names.sort()
        self.image_size = image_size

        #split the sorted file list into 90% training and 10% validation
        ratio = 0.9
        ttl = len(self.img_names)
        if self.train == True and self.split == True:
            self.img_names = self.img_names[0:round(ttl*ratio)]
        elif self.train == False and self.split == True:
            self.img_names = self.img_names[round(ttl*ratio):ttl]

    def __len__(self):
        if self.train == True and self.augmentation == True:
            return len(self.img_names)*2
        else:
            return len(self.img_names)

    @staticmethod
    def _read_annotations(ann_name):
        """Parse an annotation file with lines "class x_min y_min width height".

        Returns (cat_ids, x_min, y_min, x_max, y_max) as numpy arrays with one entry
        per object; blank lines are ignored.
        """
        cat_ids, x_min, y_min, x_max, y_max = [], [], [], [], []
        with open(ann_name) as f:
            for line in f:
                data = line.split()
                if not data:
                    continue
                left = float(data[1])
                top = float(data[2])
                cat_ids.append(int(data[0]))
                x_min.append(left)
                y_min.append(top)
                x_max.append(left + float(data[3]))
                y_max.append(top + float(data[4]))
        return (np.array(cat_ids, dtype=int),
                np.array(x_min, dtype=float),
                np.array(y_min, dtype=float),
                np.array(x_max, dtype=float),
                np.array(y_max, dtype=float))

    @staticmethod
    def _crop_bounds(obj_lo, obj_hi, extent):
        """Draw a random [start, stop) crop range along one axis that contains [obj_lo, obj_hi].

        The limits are clamped to the image extent so annotations that touch or
        exceed the border cannot produce an invalid randint range or an empty crop.
        """
        start_hi = min(max(0, math.floor(obj_lo)), extent - 1)
        start = randint(0, start_hi)
        stop_lo = max(min(math.ceil(obj_hi), extent), start_hi + 1)
        stop = randint(stop_lo, extent)
        return start, stop

    def _to_network_input(self, im):
        """Resize an HxWx3 image to image_size and return it as float32 [3,H,W] in [0,1]."""
        im = cv2.resize(im, (self.image_size, self.image_size), interpolation = cv2.INTER_AREA)
        im = im.transpose(2,0,1)
        return (im/255.0).astype('float32')

    def __getitem__(self, index):
        ann_box = np.zeros([self.box_num,4], np.float32) #bounding boxes
        ann_confidence = np.zeros([self.box_num,self.class_num], np.float32) #one-hot vectors
        #one-hot vectors with four classes
        #[1,0,0,0] -> cat
        #[0,1,0,0] -> dog
        #[0,0,1,0] -> person
        #[0,0,0,1] -> background

        ann_confidence[:,-1] = 1 #the default class for all cells is set to "background"

        #with augmentation, indices 2k and 2k+1 both refer to image k
        augmenting = self.train == True and self.augmentation == True
        img_index = index // 2 if augmenting else index
        img_file = self.img_names[img_index]
        filename = os.path.splitext(img_file)[0]

        img_path = os.path.join(self.imgdir, img_file)
        im = cv2.imread(img_path)
        if im is None:
            raise IOError("could not read image: " + img_path)
        shape = im.shape

        #no annotations available (test set): return the image with empty targets
        if self.anndir is None:
            return self._to_network_input(im), ann_box, ann_confidence, torch.Tensor(shape), filename

        ann_name = os.path.join(self.anndir, filename + ".txt")
        cat_ids, x_min, y_min, x_max, y_max = self._read_annotations(ann_name)

        if augmenting and index % 2 == 1:
            # Random crop that keeps every object inside the image
            if len(cat_ids) > 0:
                left, right = self._crop_bounds(x_min.min(), x_max.max(), shape[1])
                top, bottom = self._crop_bounds(y_min.min(), y_max.max(), shape[0])
            else:
                left, right, top, bottom = 0, shape[1], 0, shape[0]
            aug = im[top:bottom,left:right]
            x_min = x_min - left
            x_max = x_max - left
            y_min = y_min - top
            y_max = y_max - top
            shape = aug.shape

            # Horizontal flip: mirror the image and swap/mirror the x coordinates
            aug = np.flip(aug, axis=1)
            x_min, x_max = shape[1] - x_max, shape[1] - x_min
            #np.flip returns a view with negative strides; hand OpenCV a regular array
            im = np.ascontiguousarray(aug)

        im = self._to_network_input(im)

        #normalize the boxes by the size of the image they refer to
        x_min = x_min / shape[1]
        x_max = x_max / shape[1]
        y_min = y_min / shape[0]
        y_max = y_max / shape[0]
        for k in range(len(cat_ids)):
            #each object is matched with its own class id
            match(ann_box,ann_confidence,self.boxs_default,self.threshold,int(cat_ids[k]),x_min[k],y_min[k],x_max[k],y_max[k])

        return im, ann_box, ann_confidence, torch.Tensor(shape), filename

Model.py

In [26]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import torch.nn.functional as F

def SSD_loss(pred_confidence, pred_box, ann_confidence, ann_box):
    """Loss for SSD: classification on all boxes plus box regression on object boxes.

    input:
    pred_confidence -- the predicted class probabilities from SSD, [batch_size, num_of_boxes, num_of_classes]
    pred_box        -- the predicted bounding boxes from SSD, [batch_size, num_of_boxes, 4]
    ann_confidence  -- the ground truth class labels, [batch_size, num_of_boxes, num_of_classes]
    ann_box         -- the ground truth bounding boxes, [batch_size, num_of_boxes, 4]

    output:
    loss -- a scalar tensor:
            BCE(confidence of object boxes) + 3*BCE(confidence of background boxes)
            + smooth L1(boxes of object boxes)

    The background class is the last column of ann_confidence. A term whose
    selection is empty (no object boxes, or no background boxes) contributes 0
    instead of the NaN a mean over an empty tensor would give.
    """
    num_classes = ann_confidence.shape[-1]
    pred_confidence = pred_confidence.reshape(-1, num_classes)
    pred_box = pred_box.reshape(-1, 4)
    ann_confidence = ann_confidence.reshape(-1, num_classes)
    ann_box = ann_box.reshape(-1, 4)

    #a box carries an object when its ground truth class is not the background (last) class
    background = num_classes - 1
    obj = torch.argmax(ann_confidence, dim=1) != background
    noobj = ~obj

    l_cls_obj = pred_confidence.new_zeros(())
    l_cls_noobj = pred_confidence.new_zeros(())
    l_box = pred_box.new_zeros(())

    if obj.any():
        l_cls_obj = F.binary_cross_entropy(pred_confidence[obj], ann_confidence[obj])
        l_box = F.smooth_l1_loss(pred_box[obj], ann_box[obj])
    if noobj.any():
        #background boxes vastly outnumber object boxes; their term is weighted by 3
        l_cls_noobj = 3*F.binary_cross_entropy(pred_confidence[noobj], ann_confidence[noobj])

    loss = l_cls_obj + l_cls_noobj + l_box

    return loss


def conv_layer(in_channel, out_channel, ker_sz, s, p=1):
    """Build a Conv2d -> BatchNorm2d -> ReLU stage.

    in_channel, out_channel -- channels of the convolution
    ker_sz                  -- kernel size
    s                       -- stride
    p                       -- padding (default 1)
    """
    convolution = nn.Conv2d(in_channel, out_channel, kernel_size=ker_sz, stride=s, padding=p)
    normalization = nn.BatchNorm2d(out_channel)
    activation = nn.ReLU()
    return nn.Sequential(convolution, normalization, activation)

class conv_reshape(nn.Module):
    """Prediction head: a 3x3 convolution (256 -> 16 channels) whose spatial grid is flattened.

    shape -- number of cells of the incoming feature map (height*width);
             the output is reshaped to [batch_size, 16, shape].
    """

    def __init__(self, shape):
        super().__init__()
        self.conv = nn.Conv2d(256, 16, kernel_size=3, stride=1, padding=1)
        self.shape = shape

    def forward(self, x):
        batch = len(x)
        features = self.conv(x)
        return features.reshape(batch, 16, self.shape)

def permute(x):
    """Swap the last two axes of a 3-D tensor: [batch, a, b] -> [batch, b, a]."""
    return x.permute(0, 2, 1)

class SSD(nn.Module):
    """Single-shot detector with four prediction scales (10x10, 5x5, 3x3 and 1x1 cells).

    The prediction heads output 16 channels per cell, i.e. 4 default boxes times
    4 values, so the confidence reshape only works out for class_num == 4.
    """

    def __init__(self, class_num):
        super(SSD, self).__init__()

        self.class_num = class_num #num_of_classes, in this assignment, 4: cat, dog, person, background

        #(in_channel, out_channel, kernel size, stride[, padding]) of every conv_layer stage
        stage_specs = [
            (3, 64, 3, 2),
            (64, 64, 3, 1),
            (64, 64, 3, 1),
            (64, 128, 3, 2),
            (128, 128, 3, 1),
            (128, 128, 3, 1),
            (128, 256, 3, 2),
            (256, 256, 3, 1),
            (256, 256, 3, 1),
            (256, 512, 3, 2),
            (512, 512, 3, 1),
            (512, 512, 3, 1),
            (512, 256, 3, 2),     #index 12: end of the backbone, 10x10 grid for a 320x320 input
            (256, 256, 1, 1, 0),
            (256, 256, 3, 2),     #index 14: 5x5 grid
            (256, 256, 1, 1, 0),
            (256, 256, 3, 1, 0),  #index 16: 3x3 grid
            (256, 256, 1, 1, 0),
            (256, 256, 3, 1, 0),  #index 18: 1x1 grid
        ]
        self.layer = nn.ModuleList()
        for spec in stage_specs:
            self.layer.append(conv_layer(*spec))

        #"l" heads predict boxes, "r" heads predict class confidences
        self.path1l = conv_reshape(100)
        self.path1r = conv_reshape(100)
        self.path2l = conv_reshape(25)
        self.path2r = conv_reshape(25)
        self.path3l = conv_reshape(9)
        self.path3r = conv_reshape(9)
        self.path4l = nn.Conv2d(256, 16, kernel_size=1, stride=1, padding=0)
        self.path4r = nn.Conv2d(256, 16, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        #input:
        #x -- images, [batch_size, 3, 320, 320], already scaled to [0,1] by the dataset
        #
        #output:
        #confidence -- [batch_size,4*(10*10+5*5+3*3+1*1),num_of_classes], softmax over the classes
        #bboxes     -- [batch_size,4*(10*10+5*5+3*3+1*1),4]

        #note: the COCO dataset already divides the image by 255, so the input is
        #not normalized a second time here.

        box_num = 540 #4*(10*10+5*5+3*3+1*1)

        #backbone, down to the 10x10 grid
        for i in range(13):
            x = self.layer[i](x)
        path1l = self.path1l(x)
        path1r = self.path1r(x)

        #5x5 grid
        x = self.layer[13](x)
        x = self.layer[14](x)
        path2l = self.path2l(x)
        path2r = self.path2r(x)

        #3x3 grid
        x = self.layer[15](x)
        x = self.layer[16](x)
        path3l = self.path3l(x)
        path3r = self.path3r(x)

        #1x1 grid
        x = self.layer[17](x)
        x = self.layer[18](x)
        path4l = self.path4l(x).reshape(len(x),16,1)
        path4r = self.path4r(x).reshape(len(x),16,1)

        #concatenate the cells of all scales, then split the 16 channels of a cell into its 4 boxes
        left = permute(torch.cat([path1l,path2l,path3l,path4l],dim=2)).reshape(len(x), box_num, 4)
        right = permute(torch.cat([path1r,path2r,path3r,path4r], dim=2)).reshape(len(x), box_num, self.class_num)
        right = torch.softmax(right,dim=2)

        bboxes = left
        confidence = right

        return confidence,bboxes

Utils.py

In [27]:
import numpy as np
import cv2

import matplotlib.pyplot as plt
import math
import os
import torch

#one drawing color per object class, indexed by class id.
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
#use [blue green red] to represent different classes

def visualize_pred(windowname, pred_confidence, pred_box, ann_confidence, ann_box, image_, boxs_default):
    #input:
    #windowname      -- the name of the window to display the images (not used: the image is returned instead of shown)
    #pred_confidence -- the predicted class labels from SSD, [num_of_boxes, num_of_classes]
    #pred_box        -- the predicted bounding boxes from SSD, [num_of_boxes, 4]
    #ann_confidence  -- the ground truth class labels, [num_of_boxes, num_of_classes]
    #ann_box         -- the ground truth bounding boxes, [num_of_boxes, 4]
    #image_          -- the input image to the network
    #boxs_default    -- default bounding boxes, [num_of_boxes, 8]
    #
    #output:
    #a 2x2 mosaic [[gt_box,gt_dft],[pd_box,pd_dft]] as one uint8 image

    _, class_num = pred_confidence.shape
    #the last class (background) is never drawn
    class_num = class_num-1

    image_ = (image_*255).astype(np.uint8)
    image = np.transpose(image_, (1,2,0)).astype(np.uint8)
    shape = image.shape

    #image1: ground truth bounding boxes
    #image2: ground truth "default" boxes (the cells the object was assigned to)
    #image3: network-predicted bounding boxes
    #image4: network-predicted "default" boxes (the cells the network thinks contain an object)
    image1 = image.copy()
    image2 = image.copy()
    image3 = image.copy()
    image4 = image.copy()

    thickness = 2

    def draw(confidence, box, decoded_canvas, default_canvas):
        #for every (box, class) with confidence above 0.5, draw the decoded box on
        #decoded_canvas and the default box it belongs to on default_canvas
        for i in range(len(confidence)):
            for j in range(class_num):
                if confidence[i,j]>0.5:
                    color = colors[j]
                    x1,y1,x2,y2,_,_ = box_coordination(box[i], shape, boxs_default[i])
                    cv2.rectangle(decoded_canvas, (x1, y1), (x2, y2), color, thickness)
                    corner_min = (int(boxs_default[i,4] * shape[1]), int(boxs_default[i,5] * shape[0]))
                    corner_max = (int(boxs_default[i,6] * shape[1]), int(boxs_default[i,7] * shape[0]))
                    cv2.rectangle(default_canvas, corner_min, corner_max, color, thickness)

    #draw ground truth
    draw(ann_confidence, ann_box, image1, image2)
    #draw predictions
    draw(pred_confidence, pred_box, image3, image4)

    #combine four images into one
    h,w,_ = image1.shape
    image = np.zeros([h*2,w*2,3], np.uint8)
    for row, col, tile in ((0, 0, image1), (0, 1, image2), (1, 0, image3), (1, 1, image4)):
        image[row*h:(row+1)*h, col*w:(col+1)*w] = tile
    #the mosaic is returned rather than displayed, so the caller can plot or save it
    return image

def non_maximum_suppression(confidence, box, boxs_default, overlap=0.2, threshold=0.5):
    """Greedy non maximum suppression over the predictions of one image.

    input:
    confidence   -- the predicted class labels from SSD, [num_of_boxes, num_of_classes] (last class = background)
    box          -- the predicted bounding boxes from SSD, [num_of_boxes, 4]
    boxs_default -- default bounding boxes, [num_of_boxes, 8]
    overlap      -- a remaining box whose iou with a selected box is > overlap is suppressed
    threshold    -- if one class in one cell has confidence > threshold, then consider this cell carrying a bounding box with this class.

    output:
    (b_conf, b_box) -- "suppressed" versions of confidence and box with the input shapes:
                       selected boxes keep their row, every other row is background
                       with a zero box, so the result can be passed to visualize_pred.

    Note: suppression is applied across classes (the class of the overlapping box is not compared).
    """
    N = len(boxs_default)
    cls_num = confidence.shape[1]
    fg_num = cls_num - 1 #number of object classes; the last column is background

    #row written wherever a box is removed
    background_row = np.zeros(cls_num)
    background_row[-1] = 1

    #a_*: candidates still to be processed, b_*: boxes kept so far
    a_box = np.zeros([N,4])
    a_conf = np.zeros([N,cls_num])
    a_conf[:,-1] = 1
    b_box = np.zeros([N,4])
    b_conf = np.zeros([N,cls_num])
    b_conf[:,-1] = 1

    candidates = np.where(confidence[:,:fg_num] > threshold)[0]
    a_conf[candidates] = confidence[candidates]
    a_box[candidates] = box[candidates]

    #decode all predicted offsets into absolute boxes (same layout as boxs_default)
    x0 = boxs_default[:,2] * box[:,0] + boxs_default[:,0]
    y0 = boxs_default[:,3] * box[:,1] + boxs_default[:,1]
    width = boxs_default[:,2] * np.exp(box[:,2])
    height = boxs_default[:,3] * np.exp(box[:,3])
    loc = np.zeros((N,8))
    loc[:,0] = x0
    loc[:,1] = y0
    loc[:,2] = width
    loc[:,3] = height
    loc[:,4] = x0 - (width/2)
    loc[:,5] = y0 - (height/2)
    loc[:,6] = x0 + (width/2)
    loc[:,7] = y0 + (height/2)

    while (np.max(a_conf[:,:fg_num]) > threshold):
        #move the most confident remaining box from the candidates to the result
        best = np.argmax(a_conf[:,:fg_num])
        idx, _ = np.unravel_index(best, [N, fg_num])
        b_conf[idx,:] = a_conf[idx,:]
        b_box[idx,:] = a_box[idx,:]
        a_conf[idx,:] = background_row
        a_box[idx,:] = 0

        #drop every remaining candidate that overlaps the selected box too much
        remaining = np.where(a_conf[:,:fg_num] > threshold)[0]
        iou_val = iou(loc[remaining], loc[idx,4], loc[idx,5], loc[idx,6], loc[idx,7])
        idx_remove = remaining[np.where(iou_val > overlap)[0]]
        a_conf[idx_remove,:] = background_row
        a_box[idx_remove,:] = 0

    return b_conf, b_box


def box_coordination(box, shape, boxs_default):
    """Decode one box from offsets relative to a default box into pixel coordinates.

    box          -- [dx, dy, dw, dh], offsets relative to the default box
    shape        -- image shape, (height, width, ...)
    boxs_default -- the default box, whose first four entries are [x_center, y_center, width, height]

    returns x1,y1,x2,y2 (integer corners, truncated) and width,height (float, in pixels).
    """
    dx, dy, dw, dh = box[0], box[1], box[2], box[3]
    px, py, pw, ph = boxs_default[0], boxs_default[1], boxs_default[2], boxs_default[3]

    #relative center and size of the decoded box
    gx = pw*dx+px
    gy = ph*dy+py
    gw = pw*math.exp(dw)
    gh = ph*math.exp(dh)

    #scale to pixels: x uses the image width, y the image height
    img_h, img_w = shape[0], shape[1]
    x0 = gx*img_w
    y0 = gy*img_h
    width = gw*img_w
    height = gh*img_h

    half_w = width/2
    half_h = height/2
    return int(x0-half_w), int(y0-half_h), int(x0+half_w), int(y0+half_h), width, height
    
def visualize(idx, windowname, pred_confidence, pred_box, ann_confidence, ann_box, im, boxs_default, toFile=False, pathname=""):
    """Render the visualize_pred mosaic for one sample and either save or show it.

    Tensor arguments are treated as batches: sample idx is taken and converted to
    numpy. Non-tensor arguments are used as they are.
    With toFile == True the mosaic is written to pathname + ".jpeg"; otherwise it is
    shown with matplotlib.
    """
    def pick(value):
        #select the requested sample of a batched tensor; leave anything else untouched
        if torch.is_tensor(value):
            return value[idx].detach().cpu().numpy()
        return value

    pred_confidence = pick(pred_confidence)
    pred_box = pick(pred_box)
    ann_confidence = pick(ann_confidence)
    ann_box = pick(ann_box)
    im = pick(im)

    im = visualize_pred(windowname, pred_confidence, pred_box, ann_confidence, ann_box, im, boxs_default)
    if toFile == True:
        cv2.imwrite(pathname + ".jpeg", im)
        return
    #the mosaic is drawn in OpenCV's BGR order, matplotlib expects RGB
    plt.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
    plt.show()


def toTxt(train, name, iteration, pred_confidence, pred_box, shape, batch_size, boxs_default):
    """Write the predicted boxes of a batch to text files, one file per image.

    train           -- True: write to "predicted_boxes/train/", otherwise "predicted_boxes/test/"
    name            -- per-image names; name[i] becomes the file name of image i
    iteration       -- not used
    pred_confidence -- [batch_size, num_of_boxes, num_of_classes] or a single image [num_of_boxes, num_of_classes]
    pred_box        -- [batch_size, num_of_boxes, 4] or a single image [num_of_boxes, 4]
    shape           -- per-image shapes; shape[i].numpy() is passed to box_coordination
    batch_size      -- used to restore the batch axis of 2-D inputs
    boxs_default    -- default bounding boxes, [num_of_boxes, 8]

    Every line is "class x_min y_min width height" for a box whose confidence for
    one of the first three classes is above 0.5.
    """
    path = "predicted_boxes/train/" if train == True else "predicted_boxes/test/"
    if not os.path.exists(path):
        os.makedirs(path)

    if torch.is_tensor(pred_box):
        pred_box = pred_box.detach().cpu().numpy()
    if torch.is_tensor(pred_confidence):
        pred_confidence = pred_confidence.detach().cpu().numpy()

    #a single image comes without the batch axis: add it back
    if pred_box.ndim == 2:
        pred_box = np.reshape(pred_box, (batch_size, pred_box.shape[0], pred_box.shape[1]))
    if pred_confidence.ndim == 2:
        pred_confidence = np.reshape(pred_confidence, (batch_size, pred_confidence.shape[0], pred_confidence.shape[1]))

    for i in range(len(pred_box)):
        filename = os.path.join(path, str(name[i])+'.txt')
        with open(filename,"w") as f:
            rows, classes = np.where(pred_confidence[i,:,0:3] > 0.5)
            for index, cat_id in zip(rows, classes):
                x1,y1,_,_,width,height = box_coordination(pred_box[i,index], shape[i].numpy(), boxs_default[index])
                fields = [str(cat_id), str(float(x1)), str(float(y1)), str(float(width)), str(float(height))]
                f.write(' '.join(fields) + '\n')

main.py

In [28]:
import os
import numpy as np
import time
import cv2

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import torch.nn.functional as F

import matplotlib.pyplot as plt
import sys

#number of classes, including the background class
class_num = 4

num_epochs = 2
batch_size = 32

#default boxes for output grids of 10x10, 5x5, 3x3 and 1x1 cells
boxs_default = default_box_generator([10,5,3,1], [0.2,0.4,0.6,0.8], [0.1,0.3,0.5,0.7])

network = SSD(class_num)

#use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

network.to(device)

if torch.cuda.is_available():
    cudnn.benchmark = True

#"saved" selects the checkpoint to load: checkpoints/network<saved>.pth
saved = 0
cpts = 'checkpoints/network'
ext = '.pth'
_checkpoint = cpts+str(saved)+ext
#folder for the visualizations written in test mode
_result = "results/"
#data split evaluated in test mode: 'train' (with annotations) or 'test' (images only)
_dir = 'train'
#False: train and validate; True: run inference with the checkpoint selected above
_TEST = False

if not _TEST:
    #---------------------------- training + validation ----------------------------
    dataset = COCO("data/train/images/", "data/train/annotations/", class_num, boxs_default, train = True, augmentation = True, split = True, image_size=320)
    dataset_test = COCO("data/train/images/", "data/train/annotations/", class_num, boxs_default, train = False, augmentation = False, split = True, image_size=320)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=True, num_workers=0)

    optimizer = optim.Adam(network.parameters(), lr = 1e-4)

    start_time = time.time()

    if os.path.exists(_checkpoint):
        network.load_state_dict(torch.load(_checkpoint,map_location=torch.device(device)))
        print("Loaded model "+_checkpoint+" to resume training")

    #torch.save does not create missing folders
    checkpoint_dir = os.path.dirname(cpts)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    train_loss=[]
    validation_loss=[]
    for epoch in range(num_epochs):
        #TRAINING
        network.train()
        ttl_loss = 0.0
        item_count = 0
        for i, data in enumerate(dataloader, 0):
            images_, ann_box_, ann_confidence_, shape, _ = data
            images = images_.to(device)
            ann_box = ann_box_.to(device)
            ann_confidence = ann_confidence_.to(device)

            optimizer.zero_grad()
            pred_confidence, pred_box = network(images)

            loss_net = SSD_loss(pred_confidence, pred_box, ann_confidence, ann_box)
            loss_net.backward()
            optimizer.step()

            #accumulate a python float so no tensor (or graph) is kept alive between batches
            ttl_loss += loss_net.item()
            item_count += 1
            print('\rTraining: %d\t' % (i+1), end="")
            print(ttl_loss / item_count, end="")

        avg_train_loss = ttl_loss / max(item_count, 1)
        print('\r[%d] time: %f \ttrain loss: %f\t\t\t' % (saved+epoch+1, time.time()-start_time, avg_train_loss))
        train_loss=np.append(train_loss, avg_train_loss)

        #visualize the first sample of the last training batch, before and after NMS
        if item_count > 0:
            nms_confidence, nms_box = non_maximum_suppression(pred_confidence[0].detach().cpu().numpy(), pred_box[0].detach().cpu().numpy(), boxs_default)
            visualize(0,"train", pred_confidence, pred_box, ann_confidence_, ann_box_, images_, boxs_default)
            visualize(0,"train", nms_confidence, nms_box, ann_confidence_, ann_box_, images_, boxs_default)

        #save a checkpoint every second epoch
        if epoch%2==1:
            torch.save(network.state_dict(), cpts + str(saved+epoch+1)+ext)

        #VALIDATION
        network.eval()
        ttl_loss = 0.0
        item_count = 0

        #no gradients are needed for evaluation
        with torch.no_grad():
            for i, data in enumerate(dataloader_test, 0):
                images_, ann_box_, ann_confidence_, shape, _ = data
                images = images_.to(device)
                ann_box = ann_box_.to(device)
                ann_confidence = ann_confidence_.to(device)
                pred_confidence, pred_box = network(images)
                loss_net=SSD_loss(pred_confidence,pred_box,ann_confidence,ann_box)
                ttl_loss += loss_net.item()
                item_count += 1
                print("\rTesting: %d\t\t\t" % (i+1), end='')

        avg_validation_loss = ttl_loss / max(item_count, 1)
        print('\r[%d] time: %f \tvalidation loss: %f\t\t\t' % (saved+epoch+1, time.time()-start_time, avg_validation_loss))
        validation_loss=np.append(validation_loss, avg_validation_loss)

        #visualize the first sample of the last validation batch, before and after NMS
        if item_count > 0:
            nms_confidence, nms_box = non_maximum_suppression(pred_confidence[0].detach().cpu().numpy(), pred_box[0].detach().cpu().numpy(), boxs_default)
            visualize(0, "test", pred_confidence, pred_box, ann_confidence_, ann_box_, images_, boxs_default)
            visualize(0, "test", nms_confidence, nms_box, ann_confidence_, ann_box_, images_, boxs_default)

    #loss curves over the epochs
    x_axis = np.arange(num_epochs) + 1
    plt.plot(x_axis,train_loss,'b.-')
    plt.plot(x_axis,validation_loss,'r.-')
    plt.ylabel('average error')
    plt.legend(['Training loss','Validation loss'])
    plt.xlabel('Epoch')
    plt.show()

else:
    #---------------------------------- inference ----------------------------------
    test_batch_size = 1
    if _dir=='test':
        #the test images come without annotations
        dataset_test = COCO("data/"+_dir+"/images/", None, class_num, boxs_default, train = False, augmentation = False, split = False, image_size=320)
    else:
        dataset_test = COCO("data/"+_dir+"/images/", "data/"+_dir+"/annotations/", class_num, boxs_default, train = False, augmentation = False, split = False, image_size=320)

    dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=test_batch_size, shuffle=False, num_workers=0)

    if os.path.exists(_checkpoint):
        network.load_state_dict(torch.load(_checkpoint,map_location=torch.device(device)))
        print("Loading "+_checkpoint)
    else:
        print("Error: Loading failed.")
        sys.exit()

    network.eval()

    #build the output folder in a local name so re-running the cell does not keep extending _result
    result_dir = _result+_dir+'/'
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    with torch.no_grad():
        for i, data in enumerate(dataloader_test, 0):
            images_, ann_box_, ann_confidence_, shape, filename = data
            images = images_.to(device)
            ann_box = ann_box_.to(device)
            ann_confidence = ann_confidence_.to(device)

            pred_confidence, pred_box = network(images)

            nms_confidence, nms_box = non_maximum_suppression(pred_confidence[0].detach().cpu().numpy(), pred_box[0].detach().cpu().numpy(), boxs_default)

            #write the surviving boxes to predicted_boxes/<train|test>/<filename>.txt
            toTxt(_dir!='test', filename, i, nms_confidence, nms_box, shape, test_batch_size, boxs_default)

            visualize(0, _dir, pred_confidence, pred_box, ann_confidence_, ann_box_, images_, boxs_default, toFile=True, pathname=result_dir+str(i))
            visualize(0, _dir, nms_confidence, nms_box, ann_confidence_, ann_box_, images_, boxs_default, toFile=True, pathname=result_dir+str(i)+'nms')

            print('\rTesting: %d\t' % (i), end="")

AttributeError: ignored

F1 Score

In [16]:
# Counters accumulated over the validation files by the evaluation loop below.
num_gt =0    # number of ground truth boxes
num_pred = 0 # number of predicted boxes
TP = 0       # predictions that match a ground truth box
FP = 0       # predictions that match no ground truth box
FN = 0       # ground truth boxes matched by no prediction
def line2data(line):
    """Parse one whitespace-separated box record.

    input:
    line -- a text line whose first five fields are "cls x y w h".

    output:
    (cls, x, y, w, h) -- cls converted with int(), the other four with float().
    Fields after the fifth are ignored.
    """
    fields = line.split()
    cls = int(fields[0])
    # Index each field explicitly so a short line raises IndexError.
    x, y, w, h = (float(fields[idx]) for idx in range(1, 5))
    return cls, x, y, w, h

def IoU(x1,y1,w1,h1,x2,y2,w2,h2):
    """Intersection-over-union of two axis-aligned boxes.

    input:
    x1,y1,w1,h1 -- first box: corner (x1,y1) and extent (w1,h1); the opposite
                   corner is taken to be (x1+w1, y1+h1).
    x2,y2,w2,h2 -- second box, same convention.

    output:
    0 when the boxes do not overlap (or only touch along an edge), otherwise
    intersection area divided by union area.
    """
    left, top = max(x1,x2), max(y1,y2)
    right, bottom = min(x1+w1,x2+w2), min(y1+h1,y2+h2)

    # Guard clause: an empty or degenerate overlap region means no intersection.
    if left>=right or top>=bottom:
        return 0

    overlap_area = (right-left)*(bottom-top)
    union_area = w1*h1 + w2*h2 - overlap_area
    return overlap_area/union_area

# ---------------------------------------------------------------------------
# F1 evaluation of predicted boxes against ground-truth annotations.
#
# Uses the counters (num_gt, num_pred, TP, FP, FN) and the helpers
# line2data / IoU defined earlier in this cell.
#
# NOTE: matching is not one-to-one. A prediction is a TP if it matches ANY
# annotation, and an annotation is a FN only if NO prediction matches it, so
# TP+FN may differ from num_gt. That is why two F1 variants are printed.
# ---------------------------------------------------------------------------

def _read_boxes(path):
    """Parse a box file into a list of (cls, x, y, w, h) tuples.

    Each file is parsed once instead of re-parsing every line inside the
    nested matching loops. Blank lines are skipped, since line2data cannot
    parse them.
    """
    with open(path) as box_file:
        return [line2data(line) for line in box_file if line.strip()]


def _boxes_match(ann_box, pred_box):
    """True when both boxes share a class and their IoU is at least 0.5."""
    cls_ann, x_ann, y_ann, w_ann, h_ann = ann_box
    cls_pred, x_pred, y_pred, w_pred, h_pred = pred_box
    return cls_ann==cls_pred and IoU(x_ann,y_ann,w_ann,h_ann,x_pred,y_pred,w_pred,h_pred)>=0.5


def _ratio(numerator, denominator):
    """numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator/denominator if denominator else 0.0


# os.listdir returns names in arbitrary order; sorting makes the positional
# pairing of annotation and prediction files deterministic.
ann = sorted(os.listdir('data/train/annotations'))
pred = sorted(os.listdir('predicted_boxes_old_150ep/train'))
# Evaluate only the last ~10% of each sorted listing
# (presumably the held-out validation split -- TODO confirm against the dataset split).
val_ann = ann[round(len(ann)*0.9):len(ann)]
val_pred = pred[round(len(pred)*0.9):len(pred)]

# Files are paired by position, so every annotation file needs a counterpart.
if len(val_pred) < len(val_ann):
    raise ValueError(
        "Found %d prediction files for %d annotation files; cannot pair them by position."
        % (len(val_pred), len(val_ann)))

for ann_name, pred_name in zip(val_ann, val_pred):
    boxes_ann = _read_boxes(os.path.join('data/train/annotations', ann_name))
    boxes_pred = _read_boxes(os.path.join('predicted_boxes_old_150ep/train', pred_name))
    num_gt+=len(boxes_ann)
    num_pred+=len(boxes_pred)

    # A prediction is a true positive if it matches at least one annotation.
    for pred_box in boxes_pred:
        if any(_boxes_match(ann_box, pred_box) for ann_box in boxes_ann):
            TP+=1
        else:
            FP+=1

    # An annotation is a false negative if no prediction matches it.
    for ann_box in boxes_ann:
        if not any(_boxes_match(ann_box, pred_box) for pred_box in boxes_pred):
            FN+=1

# Variant 1: precision/recall from the TP/FP/FN tallies.
precision=_ratio(TP, TP+FP)
recall=_ratio(TP, TP+FN)
F1_score=_ratio(2*precision*recall, precision+recall)
print(F1_score)
# Variant 2: precision/recall from the raw box counts.
precision=_ratio(TP, num_pred)
recall=_ratio(TP, num_gt)
F1_score=_ratio(2*precision*recall, precision+recall)
print(F1_score)
# Sanity check: compare the tallies with the raw counts.
print(TP+FN)
print(num_gt)
print(TP+FP)
print(num_pred)

0.6903553299492386
0.6888567293777135
692
695
687
687
