# Import Libraries 

# Dataset 

loadImage() is used to load a single image from the COCO dataset and returns the original image, its height and width, as well as its resized height and width. 

In [460]:
def loadImage(self, index):
    """Load one image from disk and resize it so its longest side equals ``self.imageSize``.

    ``self`` is expected to be a dataset object exposing ``imgFiles`` (list of
    image paths), ``imageSize`` (target size of the longest side) and
    ``isAugment`` (whether augmentation is enabled), i.e. the attribute names
    set by ``LoadImagesAndLabels.__init__``.

    Args:
        self: dataset instance providing the attributes listed above.
        index: position of the image in ``self.imgFiles``.

    Returns:
        Tuple ``(img, (h0, w0), (h, w))``: the (possibly resized) BGR image as
        read by OpenCV, its original height/width and its resized height/width.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image file.
    """
    path = self.imgFiles[index]
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise FileNotFoundError('Image not found or unreadable: ' + str(path))

    originalHeight, originalWidth = img.shape[:2]
    r = self.imageSize / max(originalHeight, originalWidth)  # resize ratio for the longest side
    if r != 1:
        # INTER_AREA when shrinking without augmentation, INTER_LINEAR otherwise
        interp = cv2.INTER_AREA if r < 1 and not self.isAugment else cv2.INTER_LINEAR
        img = cv2.resize(img, (int(originalWidth * r), int(originalHeight * r)), interpolation = interp)
    return img, (originalHeight, originalWidth), img.shape[:2]  # img, hw_original, hw_resized


augmentHSV() is used to modify an input image in place by randomly scaling its hue, saturation, and value. 

Hue, saturation, and value are the main color properties that allow us to distinguish between different colors. 

1. Hues are the three primary colors (red, blue, and yellow) and the three secondary colors (orange, green, and violet) that appear in the color wheel or color circle. When you refer to hue, you are referring to pure color, or the visible spectrum of basic colors that can be seen in a rainbow. 

2. Color saturation is the purity and intensity of a color as displayed in an image. The higher the saturation of a color, the more vivid and intense it is. The lower a color’s saturation, or chroma, the closer it is to pure gray on the grayscale.

3. Color value refers to the relative lightness or darkness of a color. We perceive color value based on the quantity of light reflected off of a surface and absorbed by the human eye. We refer to the intensity of the light that reaches the eye as “luminance.”


Modifying these values allows us to augment our input images, expand our dataset, and improve our training results.

The OpenCV LUT function applies a lookup-table transformation using the calculated values for hue, saturation, and value.

In [461]:
def augmentHSV(img, hgain = 0.5, sgain = 0.5, vgain = 0.5):
    """Randomly scale the hue, saturation and value of a BGR image, in place.

    One random gain per channel is drawn uniformly from ``[1 - gain, 1 + gain]``.
    Each gain is turned into a 256-entry lookup table (hue wraps modulo 180,
    saturation and value are clipped to 0..255) which is applied to the
    corresponding HSV channel. The result is converted back to BGR and written
    into ``img`` itself; nothing is returned.

    Args:
        img: BGR image array, modified in place.
        hgain: maximum relative change of the hue channel.
        sgain: maximum relative change of the saturation channel.
        vgain: maximum relative change of the value channel.
    """
    pixelType = img.dtype

    # a single draw of three gains: hue, saturation, value
    hueGain, satGain, valGain = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1

    channels = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))

    levels = np.arange(0, 256, dtype = np.int16)
    tables = (
        ((levels * hueGain) % 180).astype(pixelType),          # hue wraps around
        np.clip(levels * satGain, 0, 255).astype(pixelType),   # saturation saturates
        np.clip(levels * valGain, 0, 255).astype(pixelType),   # value saturates
    )

    remapped = tuple(cv2.LUT(channel, table) for channel, table in zip(channels, tables))
    hsv = cv2.merge(remapped).astype(pixelType)

    # dst = img writes the converted pixels straight into the caller's array
    cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR, dst = img)

load_mosaic() is used to load images into a mosaic of four. It is a form of augmentation that is used only during training. It works by taking a total of four images, creating a base image with the corresponding number of tiles, and then calculating the position of each image on the base image. It also calculates the required padding, converts the normalized image labels to pixel coordinates, and then concatenates/clips the labels and applies a random affine augmentation to both the image and the labels. It returns the mosaic image and its labels.

In [462]:
def load_mosaic(self, index):
    """Build a 4-image mosaic (training-time augmentation) and its labels.

    The requested image plus three randomly chosen ones are pasted around a
    random centre point on a ``2s x 2s`` grey canvas (``s = self.imageSize``).
    Labels, stored as normalized ``xywh`` in columns 1-4, are converted to
    pixel ``xyxy`` on the canvas, concatenated, clipped to the canvas and then
    passed together with the canvas through ``random_affine``, which crops the
    result back with ``border = -s // 2``.

    ``self`` must provide ``imageSize``, ``labels`` and ``hyperparameters``
    (a dict with the keys ``degrees``, ``translate``, ``scale``, ``shear``),
    plus whatever ``loadImage`` reads.

    Args:
        self: dataset instance.
        index: index of the first image of the mosaic.

    Returns:
        Tuple ``(img4, labels4)`` as returned by ``random_affine``.
    """
    s = self.imageSize
    hyp = self.hyperparameters  # the dataset stores its hyper-parameters under this name

    # random mosaic centre inside the middle half of the 2s x 2s canvas
    centerX, centerY = [int(random.uniform(s * 0.5, s * 1.5)) for _ in range(2)]
    indices = [index] + [random.randint(0, len(self.labels) - 1) for _ in range(3)]  # 3 additional image indices

    labels4 = []
    for i, imgIndex in enumerate(indices):
        # Load image
        img, _, (h, w) = loadImage(self, imgIndex)

        # "a" coordinates address the large canvas, "b" coordinates the source image
        if i == 0:  # top left
            # base image with 4 tiles
            img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype = np.uint8)
            x1a, y1a, x2a, y2a = max(centerX - w, 0), max(centerY - h, 0), centerX, centerY
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h

        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = centerX, max(centerY - h, 0), min(centerX + w, s * 2), centerY
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h

        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(centerX - w, 0), centerY, centerX, min(s * 2, centerY + h)
            # right edge of the source crop is the image width; the former
            # max(centerX, w) only worked because slicing clips to w
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)

        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = centerX, centerY, min(centerX + w, s * 2), min(s * 2, centerY + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]

        # offset of the source image origin on the canvas
        widthPadding = x1a - x1b
        heightPadding = y1a - y1b

        # labels: normalized xywh -> pixel xyxy on the canvas
        x = self.labels[imgIndex]
        labels = x.copy()
        if x.size > 0:
            labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + widthPadding
            labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + heightPadding
            labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + widthPadding
            labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + heightPadding

        labels4.append(labels)

    # concatenate and clip labels (always four arrays, possibly empty)
    labels4 = np.concatenate(labels4, 0)
    np.clip(labels4[:, 1:], 0, 2 * s, out = labels4[:, 1:])  # use with random_affine

    # augment images+labels; the negative border crops the canvas back
    img4, labels4 = random_affine(img4, labels4, degrees = hyp['degrees'], translate = hyp['translate'], scale = hyp['scale'], shear = hyp['shear'], border =-s // 2)  # border to remove

    return img4, labels4

letterbox() is used to resize an input image into a 32-pixel-multiple rectangle. This reduces inference time proportionally to the amount of letterboxed area padded onto a square image. It works by extracting the current shape, calculating the necessary padding, resizing the image if necessary, and then creating and adding a border. It returns the letterboxed image, the scaling ratio, as well as the padding used.

In [463]:
def letterbox(img, newShape = (416, 416), color =(114, 114, 114), auto = True, scaleFill = False, scaleup = True):
    """Resize an image keeping its aspect ratio and pad it with a constant border.

    Args:
        img: input image array (height, width, channels).
        newShape: target ``(height, width)``; an int means a square.
        color: border colour passed to ``cv2.copyMakeBorder``.
        auto: if true, pad only up to the next multiple of 32 instead of
            padding all the way to ``newShape``.
        scaleFill: accepted for interface compatibility; not used by this
            implementation.
        scaleup: if false, the image is never enlarged (ratio capped at 1.0).

    Returns:
        Tuple ``(img, ratio, (widthPadding, heightPadding))`` where ``ratio``
        is ``(scale, scale)`` and the paddings are the per-side (halved)
        amounts, which may be fractional.
    """
    srcHeight, srcWidth = img.shape[:2]

    if isinstance(newShape, int):
        newShape = (newShape, newShape)
    dstHeight, dstWidth = newShape[0], newShape[1]

    # scale ratio (new / old), limited by the tighter dimension
    scale = min(dstHeight / srcHeight, dstWidth / srcWidth)
    if not scaleup:
        scale = min(scale, 1.0)
    ratio = (scale, scale)

    # size after scaling, in (width, height) order as expected by cv2.resize
    resizedWH = (int(round(srcWidth * scale)), int(round(srcHeight * scale)))
    padW = dstWidth - resizedWH[0]
    padH = dstHeight - resizedWH[1]

    if auto:
        # keep only the remainder so the output becomes a 32-multiple rectangle
        padW, padH = np.mod(padW, 32), np.mod(padH, 32)

    # split the padding between the two sides
    padW = padW / 2
    padH = padH / 2

    if (srcWidth, srcHeight) != resizedWH:
        img = cv2.resize(img, resizedWH, interpolation = cv2.INTER_LINEAR)

    # the +/-0.1 nudges send an odd leftover pixel to the bottom/right side
    top = int(round(padH - 0.1))
    bottom = int(round(padH + 0.1))
    left = int(round(padW - 0.1))
    right = int(round(padW + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value = color)

    return img, ratio, (padW, padH)

random_affine() is another form of dataset augmentation used to apply rotate, scale, translate, and shear transforms to an input image. It also transforms the label coordinates and returns the image and label coordinates. 

In [464]:
def random_affine(img, targets =(), degrees = 10, translate =.1, scale =.1, shear = 10, border = 0):
    """Apply a random rotation, scale, translation and shear to an image and its boxes.

    Args:
        img: input image array (height, width, channels).
        targets: label array whose columns 1-4 hold pixel ``x1, y1, x2, y2``;
            an empty sequence means "no labels".
        degrees: maximum absolute rotation in degrees.
        translate: maximum translation as a fraction of the image size.
        scale: maximum relative scale change (scale is drawn from ``1 +/- scale``).
        shear: maximum absolute shear angle in degrees (x and y drawn separately).
        border: pixels added to (or, if negative, removed from) each side of
            the output canvas.

    Returns:
        Tuple ``(img, targets)``: the warped image and the surviving targets
        with columns 1-4 replaced by the warped, clipped boxes. Boxes that end
        up smaller than 4 px, with less than 20 % of their scaled area, or
        with an aspect ratio of 10 or more are dropped.
    """
    srcHeight, srcWidth = img.shape[0], img.shape[1]
    height = srcHeight + border * 2
    width = srcWidth + border * 2

    # rotate and scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    s = random.uniform(1 - scale, 1 + scale)
    R[:2] = cv2.getRotationMatrix2D(angle = a, center =(srcWidth / 2, srcHeight / 2), scale = s)

    # translate: x is scaled by the image width and y by the height (the two
    # were previously swapped, which only went unnoticed on square images)
    T = np.eye(3)
    T[0, 2] = random.uniform(-translate, translate) * srcWidth + border  # x translation (pixels)
    T[1, 2] = random.uniform(-translate, translate) * srcHeight + border  # y translation (pixels)

    # shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # combined transform; order of operations (right to left) matters
    M = S @ T @ R
    if (border != 0) or (M != np.eye(3)).any():  # image changed
        img = cv2.warpAffine(img, M[:2], dsize =(width, height), flags = cv2.INTER_LINEAR, borderValue =(114, 114, 114))

    # transform label coordinates
    if len(targets):
        numTargets = len(targets)

        # warp the four corners of every box
        xy = np.ones((numTargets * 4, 3))
        xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(numTargets * 4, 2)  # x1y1, x2y2, x1y2, x2y1
        xy = (xy @ M.T)[:, :2].reshape(numTargets, 8)

        # axis-aligned boxes enclosing the warped corners
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, numTargets).T

        # clip boxes to the output canvas
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)

        w = xy[:, 2] - xy[:, 0]
        h = xy[:, 3] - xy[:, 1]

        area = w * h
        area0 = (targets[:, 3] - targets[:, 1]) * (targets[:, 4] - targets[:, 2])

        aspectRatio = np.maximum(w / (h + 1e-16), h / (w + 1e-16))  # aspect ratio

        # keep boxes that are large enough, retain enough area and are not too thin
        i = (w > 4) & (h > 4) & (area / (area0 * s + 1e-16) > 0.2) & (aspectRatio < 10)

        targets = targets[i]
        targets[:, 1:5] = xy[i]

    return img, targets

    

The following class defines a set of functions used to generate a list of images and their corresponding labels when training and/or testing the network. The image(s) and their corresponding label(s) can be found in data/coco/trainvalno5k.txt and data/coco/5k.txt. 


In [466]:
class LoadImagesAndLabels(Dataset):
    """Dataset yielding images and their labels for training / testing.

    ``path`` points to a text file listing one image path per line. Label files
    are located by replacing ``images`` with ``labels`` in each image path and
    swapping the extension for ``.txt``; each label row is parsed as floats
    (class followed by normalized ``x, y, w, h``, judging from how the rows are
    used in ``__getitem__``).

    Attributes set by ``__init__``:
        imgFiles, labelFiles: image and label paths.
        numImages: number of images.
        batch: batch index of every image.
        imageSize: target image size.
        isAugment / isMosaic: augmentation switches (mosaic follows augment).
        hyperparameters: augmentation hyper-parameter dict (may be None).
        isImageWeights: stored flag, not used inside this class.
        shapes: image sizes as returned by ``getEXIFsize``.
        imgs: per-image cache slots (all None here).
        labels: per-image label arrays, shape ``(n, 5)``.
    """

    def __init__(self, path, img_size=416, batch_size=16, augment=False, hyp=None, image_weights=False, cache_images=False, single_cls=False, pad=0.0):
        """Read the image list and load every label file.

        ``cache_images``, ``single_cls`` and ``pad`` are accepted for interface
        compatibility but are not used by this implementation.
        """
        path = str(Path(path))
        parent = str(Path(path).parent) + os.sep

        with open(path, 'r') as listFile:
            entries = listFile.read().splitlines()
        # paths starting with './' are relative to the list file's directory
        entries = [x.replace('./', parent) if x.startswith('./') else x for x in entries]

        self.imgFiles = [x.replace('/', os.sep) for x in entries if os.path.splitext(x)[-1].lower() in img_formats]

        numFiles = len(self.imgFiles)
        # builtin int: the np.int alias was removed from NumPy
        batchIndex = np.floor(np.arange(numFiles) / batch_size).astype(int)

        self.numImages = numFiles
        self.batch = batchIndex
        self.imageSize = img_size
        self.isAugment = augment
        self.hyperparameters = hyp
        self.isImageWeights = image_weights
        self.isMosaic = self.isAugment

        # define labels
        self.labelFiles = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt')
                            for x in self.imgFiles]

        widthHeight = [getEXIFsize(Image.open(f)) for f in self.imgFiles]

        self.shapes = np.array(widthHeight, dtype=np.float64)

        # cache labels; images without a usable label file keep the empty default
        self.imgs = [None] * numFiles
        self.labels = [np.zeros((0, 5), dtype=np.float32)] * numFiles

        for i, file in enumerate(self.labelFiles):
            try:
                with open(file, 'r') as labelFile:
                    l = np.array([x.split() for x in labelFile.read().splitlines()], dtype=np.float32)
            except (OSError, ValueError):
                # missing / unreadable / malformed label file: best effort, skip it
                continue

            if l.shape[0]:
                self.labels[i] = l

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.imgFiles)

    def __getitem__(self, index):
        """Load, augment and format one sample.

        Returns:
            Tuple ``(img, labels, path, shapes)``: the image as a
            channels-first RGB tensor, an ``(n, 6)`` label tensor whose column
            0 is left at zero (filled in by ``collate_fn``) and whose columns
            1-5 are class plus normalized ``xywh``, the image path, and the
            rescaling info (``None`` for mosaics).
        """
        hyp = self.hyperparameters

        # create mosaic
        if self.isMosaic:
            img, labels = load_mosaic(self, index)
            shapes = None

        else:

            # load image
            img, (h0, w0), (h, w) = loadImage(self, index)

            # letterbox to the dataset image size
            shape = self.imageSize
            img, ratio, pad = letterbox(img, shape, auto = False, scaleup = self.isAugment)
            shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling

            # load labels
            labels = []
            x = self.labels[index]

            # normalized xywh -> pixel xyxy on the letterboxed image
            if x.size > 0:
                labels = x.copy()
                labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0]
                labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1]
                labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0]
                labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1]

        # augment image/color space
        if self.isAugment:
            if not self.isMosaic:
                img, labels = random_affine(img, labels, degrees = hyp['degrees'], translate = hyp['translate'], scale = hyp['scale'], shear = hyp['shear'])
            augmentHSV(img, hgain = hyp['hsv_h'], sgain = hyp['hsv_s'], vgain = hyp['hsv_v'])

        # convert xyxy to xywh and normalize coordinates
        if len(labels):
            labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])

            # normalize height
            labels[:, [2, 4]] /= img.shape[0]

            # normalize width
            labels[:, [1, 3]] /= img.shape[1]

        # random left-right flip
        if self.isAugment:
            if random.random() < 0.5:
                img = np.fliplr(img)
                if len(labels):
                    labels[:, 1] = 1 - labels[:, 1]

        outputLabels = torch.zeros((len(labels), 6))
        if len(labels):
            outputLabels[:, 1:] = torch.from_numpy(labels)

        # convert from BGR to RGB and to channels-first layout
        img = img[:, :, ::-1].transpose(2, 0, 1)
        img = np.ascontiguousarray(img)

        return torch.from_numpy(img), outputLabels, self.imgFiles[index], shapes

    @staticmethod
    def collate_fn(batch):
        """Merge samples into a batch, tagging every label row with its image index."""
        img, label, path, shapes = zip(*batch)

        # add target image index for buildTargets()
        for i, l in enumerate(label):
            l[:, 0] = i

        return torch.stack(img, 0), torch.cat(label, 0), path, shapes


NameError: name 'Dataset' is not defined

In [465]:
class LoadImages:
    """Iterator over the images of a directory (or a single image file) for inference.

    Each iteration yields ``(path, img, img0)``: the file path, the letterboxed
    image as a contiguous channels-first RGB array, and the original image as
    read by OpenCV.
    """

    def __init__(self, path, img_size = 416):
        """Collect the image files found at ``path`` (directory or single file)."""
        path = str(Path(path))

        if os.path.isdir(path):
            candidates = sorted(glob.glob(os.path.join(path, '*.*')))
        elif os.path.isfile(path):
            candidates = [path]
        else:
            candidates = []

        # keep only files whose extension is a known image format
        self.files = [c for c in candidates if os.path.splitext(c)[-1].lower() in img_formats]
        self.numFiles = len(self.files)
        self.imgSize = img_size

    def __iter__(self):
        """Restart the iteration from the first file."""
        self.count = 0
        return self

    def __next__(self):
        """Read, letterbox and reformat the next image."""
        if self.count == self.numFiles:
            raise StopIteration

        # take the current path and advance the cursor
        path = self.files[self.count]
        self.count += 1

        # original image as read by OpenCV (BGR)
        img0 = cv2.imread(path)

        # resize by adding padding; only the image part of the result is needed
        letterboxed = letterbox(img0, newShape = self.imgSize)
        img = letterboxed[0]

        # BGR -> RGB, then HWC -> CHW, stored contiguously
        img = np.ascontiguousarray(img[:, :, ::-1].transpose(2, 0, 1))

        return path, img, img0

    def __len__(self):
        """Return the number of image files."""
        return self.numFiles


# Network

The idea here is to parse the cfg, and store every block as a dict. The attributes of the blocks and their values are stored as key-value pairs in the dictionary. As we parse through the cfg, we keep appending these dicts, denoted by the variable block in our code, to a list blocks. Our function will return this list of blocks.

We begin by saving the content of the cfg file in a list of strings. Then, we loop over the resultant list to get blocks.

In [467]:
def _parseCfgScalar(text):
    """Convert a cfg value to int or float when it is numeric, else return it unchanged."""
    try:
        return int(text)
    except ValueError:
        pass
    # only try float when a digit is present, so words such as 'inf' stay strings
    if any(ch.isdigit() for ch in text):
        try:
            return float(text)
        except ValueError:
            pass
    return text


def parseModel(path):
    """Parse a Darknet ``.cfg`` file into a list of block dictionaries.

    Every ``[section]`` starts a new dict whose ``'type'`` is the section name;
    the following ``key=value`` lines become its entries. Blank lines,
    whitespace-only lines and comment lines (``#``, optionally indented) are
    ignored.

    Value conversion:
        * ``anchors``: float array of shape ``(-1, 2)``.
        * ``from``, ``layers``, ``mask`` and comma-separated ``size``: list of ints.
        * anything else: int or float when numeric, otherwise the stripped string.

    ``convolutional`` blocks get ``batch_normalize = 0`` pre-populated so the
    key always exists.

    Args:
        path: path of the cfg file.

    Returns:
        List of dicts, one per block, in file order.
    """
    with open(path, 'r') as cfgFile:
        rawLines = cfgFile.read().split('\n')

    moduleDefinitions = []
    for rawLine in rawLines:
        # strip first so '\r', indentation and whitespace-only lines are handled
        line = rawLine.strip()
        if not line or line.startswith('#'):
            continue

        if line.startswith('['):
            # start of a new block
            block = {'type': line[1:-1].rstrip()}
            if block['type'] == 'convolutional':
                block['batch_normalize'] = 0  # pre-populate with zero (may be overwritten later)
            moduleDefinitions.append(block)
            continue

        # key = value; split once so values may themselves contain '='
        key, val = line.split('=', 1)
        key = key.rstrip()
        block = moduleDefinitions[-1]

        if key == 'anchors':
            block[key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2))
        elif key in ('from', 'layers', 'mask') or (key == 'size' and ',' in val):
            block[key] = [int(x) for x in val.split(',')]
        else:
            block[key] = _parseCfgScalar(val.strip())

    return moduleDefinitions


In [468]:
def parseData(path):
    """Parse a ``key = value`` data configuration file into a dict of strings.

    Blank lines and lines starting with ``#`` are skipped. Keys and values are
    stripped of surrounding whitespace; values are kept as strings. Only the
    first ``=`` separates key from value, so values may contain ``=``.

    Args:
        path: path of the data file.

    Returns:
        Dict mapping option names to their (string) values.

    Raises:
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    options = {}

    with open(path, 'r') as dataFile:
        for line in dataFile:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, val = line.split('=', 1)
            options[key.strip()] = val.strip()

    return options


The following function allows us to parse and load weights into our model. The first 20 bytes (160 bits) of the weights file form the header: three int32 values holding the version (major, minor, revision), followed by one int64 value holding the number of images seen during training. The remaining bytes are the weights, stored as 32-bit floats (float32). They are loaded into an np.ndarray, and we then iterate over the modules of our network and copy the corresponding slice of that array into each one.

Inside the loop, we first check whether the convolutional block has batch_normalize set. Based on that, we load the weights. We keep a variable called ptr to keep track of where we are in the weights array. If batch_normalize is true, we load the batch-norm layer's biases, weights, running mean, and running variance. If batch_normalize is not true, we simply load the biases of the convolutional layer. Finally, we load the convolutional layer's weights.


In [469]:
def loadDarkNetWeights(self, weights, cutoff=-1):
    """Parse a Darknet weights file and copy its values into the model's layers.

    File layout as read here: three int32 (version major, minor, revision), one
    int64 (images seen during training), then float32 weights. For every
    ``convolutional`` block the values are consumed in this order: batch-norm
    bias, weight, running mean, running variance (or just the conv bias when
    the block has no batch norm), followed by the conv weights.

    Args:
        self: model exposing ``moduleDefinitions`` and ``module_list`` in
            matching order; ``version`` and ``seen`` are set on it.
        weights: path of the weights file.
        cutoff: number of leading modules to load; a negative value (default)
            loads all of them. For the ``darknet53.conv.74`` backbone file the
            cutoff is forced to 75 because the file only holds those layers.
    """
    # NOTE(review): this function reads ``self.module_list`` while getLayers()
    # reads ``model.moduleList`` - confirm the attribute name on the model class.

    # Establish cutoff from the file name; otherwise honour the argument
    # (it used to be overwritten with 75 unconditionally).
    file = Path(weights).name
    if file == 'darknet53.conv.74':
        cutoff = 75
    stop = None if cutoff < 0 else cutoff

    # Read weights file
    with open(weights, 'rb') as f:
        self.version = np.fromfile(f, dtype=np.int32, count=3)  # (int32) version info: major, minor, revision
        self.seen = np.fromfile(f, dtype=np.int64, count=1)  # (int64) number of images seen during training
        weights = np.fromfile(f, dtype=np.float32)  # the rest are weights

    ptr = 0  # read position inside the flat weights array

    def fill(tensor):
        """Copy the next ``tensor.numel()`` values into ``tensor`` and advance ``ptr``."""
        nonlocal ptr
        count = tensor.numel()
        tensor.data.copy_(torch.from_numpy(weights[ptr:ptr + count]).view_as(tensor))
        ptr += count

    for moduleDef, module in zip(self.moduleDefinitions[:stop], self.module_list[:stop]):
        if moduleDef['type'] != 'convolutional':
            continue

        conv = module[0]
        if moduleDef['batch_normalize']:
            # Load BN bias, weights, running mean and running variance
            bn = module[1]
            fill(bn.bias)
            fill(bn.weight)
            fill(bn.running_mean)
            fill(bn.running_var)
        else:
            # Load conv. bias
            fill(conv.bias)
        # Load conv. weights
        fill(conv.weight)




In [470]:
def getLayers(model):
    """Return the indices of the modules in ``model.moduleList`` whose class is named 'YOLOLayer'."""
    yoloIndices = []
    for position, module in enumerate(model.moduleList):
        # match on the class name only, e.g. indices such as [89, 101, 113]
        if module.__class__.__name__ == 'YOLOLayer':
            yoloIndices.append(position)
    return yoloIndices


In order to create the building blocks of this network, we utilise the output of the parsing function to construct a number of PyTorch modules as dictated by the cfg list. This function essentially iterates over the list of blocks and creates a PyTorch module for each block as we go. The output of this function will return a list that contains an nn.Module object called an nn.ModuleList. The nn.Sequential class is also used to sequentially execute a number of nn.Module objects. This is useful as some blocks may contain more than one layer. nn.Sequential allows us to attach these layers together.  

In constructing the network, there are six main module types to consider. They are as follows:

1. **Convolutional Layer**

    The convolutional layer is a layer that contains units whose receptive fields cover a patch of the previous layer. The weight vector (the set of adaptive parameters) of such a unit is often called a filter.

    In defining the convolutional layer(s), it is important to define the kernel dimensions. This is pretty much taken care of by the parameters included in the cfg file, however we must also keep track of the number of filters present in each previous layer, therefore providing us with the depth of the feature map. It is also important to add the batch normalize layer, pad layer, as well as the leaky activation function. 

        
        
2. **Maxpool Layer**

    The Max Pooling layer is a pooling operation that extracts the maximum value in each patch of a feature map, and then down-samples it to highlight the most present feature in the patch. It is essentially a sample-based discretization process that aims to reduce dimensionality and allow for assumptions to be made about features contained in the sub-regions. In YOLOv3, however, max pooling is not used for downsampling. Instead, a 3X3 convolutional kernel is used with a step size of 2. This takes place a total of five times. 


3. **Upsample Layer**

    The upsample layer upsamples the feature map in the previous layer by a factor of stride. The code below uses nn.Upsample with its default mode, which is nearest-neighbour interpolation.
    
   

4. **Route Layer**

    The route layer plays a very important role in this network. It performs the equivalent role of fusing together previous feature maps. If its attribute layers has only one value, then the route layer will output the feature maps of the layer indexed by that value. If, however, the attribute layers has two values, then it will output the concatenated feature maps of the layers indexed by its values along the depth dimension. It is also important to note that if there is a convolutional layer present right in front of a route layer, then the kernel is applied on the feature maps of previous layers, precisely the ones the route layer brings. Therefore, we need to keep track of the number of filters in not only the previous layer, but each one of the preceding layers. As we iterate, we append the number of output filters of each block to the output_filters variable.



5. **Shortcut Layer**

    The shortcut layer is used to optimise the large network structure in order to provide faster training times and better convergence scores. It is essentially a skip connection that superimposes the value of the network without changing the size of the feature map, and so you will find that the input and output sizes have not changed before and after the shortcut layer. The output of the shortcut layer is found by adding feature maps.


6. **YOLO Layer**
    
   The YOLO layer corresponds to the detection layer. YOLOv3 makes predictions across 3 different scales. The detection layer is used to make detections at feature maps of three different sizes, having strides 32, 16, 8 respectively. This means, with an input of 416 x 416, we make detections on scales 13 x 13, 26 x 26 and 52 x 52. Generally, the stride of any layer in the network is equal to the factor by which the output of the layer is smaller than the input image to the network.

    The network downsamples the input image until the first detection layer, where a detection is made using feature maps of a layer with stride 32. Further, layers are upsampled by a factor of 2 and concatenated with feature maps of a previous layers having identical feature map sizes. Another detection is now made at layer with stride 16. The same upsampling procedure is repeated, and a final detection is made at the layer of stride 8. At each scale, each cell predicts 3 bounding boxes using 3 anchors, making the total number of anchors used 9. (The anchors are different for different scales). This is at the heart of YOLOv3's multi-scale detection idea. The use of 3 scales is to strengthen the detection of small targets. A relatively large feature map is used to detect relatively small targets, and a small feature map is responsible for detecting large targets.
    
    

In [471]:
def createModules(moduleDefinitions, imgSize, cfg):
    """Construct the list of layer modules described by ``moduleDefinitions``.

    Args:
        moduleDefinitions: list of block dicts as produced by the cfg parser.
            The first entry (training hyper-parameters) is popped, so the
            caller's list is modified and afterwards lines up one-to-one with
            the returned module list.
        imgSize: network input size, an int or a pair.
        cfg: not used by this function; kept for interface compatibility.

    Returns:
        Tuple ``(moduleList, binaryRoutingLayers)``: an ``nn.ModuleList`` with
        one module per block, and a list of booleans marking the layers whose
        output is needed by a later layer (route / shortcut sources and
        convolutions without batch norm, which feed the YOLO layers).
    """
    imgSize = [imgSize] * 2 if isinstance(imgSize, int) else imgSize  # expand if necessary
    moduleDefinitions.pop(0)  # cfg training hyperparams (unused); mutation is intentional
    outputFilters = [3]  # input channels
    moduleList = nn.ModuleList()
    routingLayers = []  # list of layers which route to deeper layers
    yoloIndex = -1
    # upsample / yolo blocks do not set ``filters`` and simply carry the last value
    filters = outputFilters[-1]

    for idx, currModule in enumerate(moduleDefinitions):
        modules = nn.Sequential()

        if currModule['type'] == 'convolutional':
            isBatchNormalize = currModule['batch_normalize']
            filters = currModule['filters']
            kernelSize = currModule['size']  # kernel size
            stride = currModule['stride'] if 'stride' in currModule else (currModule['stride_y'], currModule['stride_x'])

            # the conv bias is dropped when batch norm follows (BN has its own bias)
            modules.add_module('Conv2d', nn.Conv2d(in_channels = outputFilters[-1], out_channels = filters, kernel_size = kernelSize, stride = stride, padding = kernelSize // 2 if currModule['pad'] else 0, groups = currModule['groups'] if 'groups' in currModule else 1, bias = not isBatchNormalize))

            if isBatchNormalize:
                modules.add_module('BatchNorm2d', nn.BatchNorm2d(filters, momentum = 0.03, eps = 1E-4))
            else:
                routingLayers.append(idx)  # detection output (goes into yolo layer)

            if currModule['activation'] == 'leaky':
                modules.add_module('activation', nn.LeakyReLU(0.1, inplace = True))

        elif currModule['type'] == 'upsample':
            modules = nn.Upsample(scale_factor = currModule['stride'])

        elif currModule['type'] == 'route':
            layers = currModule['layers']
            # outputFilters has the input channels at position 0, hence the +1 for absolute indices
            filters = sum([outputFilters[l + 1 if l > 0 else l] for l in layers])
            routingLayers.extend([idx + l if l < 0 else l for l in layers])
            modules = FeatureConcat(layers = layers)

        elif currModule['type'] == 'shortcut':
            layers = currModule['from']
            filters = outputFilters[-1]
            routingLayers.extend([idx + l if l < 0 else l for l in layers])
            modules = WeightedFeatureFusion(layers = layers, weight ='weights_type' in currModule)

        elif currModule['type'] == 'yolo':
            yoloIndex += 1
            stride = [32, 16, 8]  # P5, P4, P3 strides
            layers = currModule['from'] if 'from' in currModule else []
            modules = YOLOLayer(anchors = currModule['anchors'][currModule['mask']],  # anchor list
                                numClasses = currModule['classes'],  # number of classes
                                img_size = imgSize,  # (416, 416)
                                yoloLayerIndex = yoloIndex,  # 0, 1, 2...
                                layers = layers,  # output layers
                                stride = stride[yoloIndex])

            # Initialize preceding Conv2d() bias (https://arxiv.org/pdf/1708.02002.pdf section 3.3)
            j = layers[yoloIndex] if 'from' in currModule else -1
            conv = moduleList[j][0]
            bias_ = conv.bias  # shape(255,)
            # in-place edits on a view of a leaf parameter that requires grad
            # raise a RuntimeError under autograd, so do them with grad disabled
            with torch.no_grad():
                bias = bias_[:modules.numOutputs * modules.numAnchors].view(modules.numAnchors, -1)  # shape(3,85)
                bias[:, 4] += -4.5  # obj
                bias[:, 5:] += math.log(0.6 / (modules.numClasses - 0.99))  # cls (sigmoid(p) = 1/numClasses)
            conv.bias = torch.nn.Parameter(bias_, requires_grad = bias_.requires_grad)

        # Register module list and number of output filters
        moduleList.append(modules)
        outputFilters.append(filters)

    # boolean mask over all layers: True where the output must be kept for routing
    binaryRoutingLayers = [False] * len(moduleDefinitions)
    for routedIdx in routingLayers:
        binaryRoutingLayers[routedIdx] = True
    return moduleList, binaryRoutingLayers

The FeatureConcat class allows us to concatenate multiple feature maps. It is used as an alternative to XX. 

In [472]:
class FeatureConcat(nn.Module):
    """Route layer: forwards one earlier feature map or concatenates several along dim 1."""

    def __init__(self, layers):
        """Store the indices of the source layers."""
        super().__init__()
        self.layerIndices = layers
        self.isMultipleLayers = len(layers) > 1

    def forward(self, x, outputs):
        """Return the selected entries of ``outputs``; ``x`` itself is not used."""
        if not self.isMultipleLayers:
            # single source: pass it through untouched
            return outputs[self.layerIndices[0]]

        selected = [outputs[layerIndex] for layerIndex in self.layerIndices]
        return torch.cat(selected, 1)

NameError: name 'nn' is not defined

The WeightedFeatureFusion class allows us to produce a weighted sum of two or more layers. It is used as an alternative to XX.

In [473]:
class WeightedFeatureFusion(nn.Module):  # weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070
    """Sum the running feature map with the outputs of earlier layers.

    Args:
        layers: indices into the list of saved layer outputs to add.
        weight: when truthy, a learnable weight per term (input + each layer)
            is created and applied through a sigmoid.
    """

    def __init__(self, layers, weight = False):
        super().__init__()
        self.layerIndices = layers
        self.isApplyWeights = weight
        self.numLayers = len(layers) + 1  # the input itself counts as one term

        if weight:
            self.layerWeights = nn.Parameter(torch.zeros(self.numLayers), requires_grad = True)

    def forward(self, x, outputs):
        """Return `x` fused with `outputs[i]` for every configured index `i`."""
        weights = None
        if self.isApplyWeights:
            # sigmoid keeps each weight in (0, 1); the 2 / n factor rescales the sum
            weights = torch.sigmoid(self.layerWeights) * (2 / self.numLayers)
            x = x * weights[0]

        baseChannels = x.shape[1]

        for position, layerIdx in enumerate(self.layerIndices):
            extra = outputs[layerIdx]
            if weights is not None:
                extra = extra * weights[position + 1]
            extraChannels = extra.shape[1]

            if baseChannels == extraChannels:
                x = x + extra
            elif baseChannels > extraChannels:
                # fewer channels to add: update only the leading channels, in place
                x[:, :extraChannels] = x[:, :extraChannels] + extra
            else:
                # more channels to add than available: drop the surplus
                x = x + extra[:, :baseChannels]

        return x

NameError: name 'nn' is not defined

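This class implements the YOLO detection layer. Its forward pass is important because it essentially does the job of predict_transform: it reshapes the raw prediction into a single tensor of shape (batch, anchors, grid_y, grid_x, outputs) and, at inference time, applies the YOLO box equations provided here and earlier. This can be seen in the block of code where we pass the values through the sigmoid: the box centre is a sigmoid plus the grid-cell offset, the width and height are an exponential scaled by the anchor size, the box is multiplied by the layer stride to return to pixel units, and the objectness and class scores go through a sigmoid. The grid offsets are computed in creatGrids() whenever the feature-map size changes.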

In [474]:
class YOLOLayer(nn.Module):
    """YOLO detection head for a single output scale.

    Args:
        anchors: anchor sizes (width, height) in pixels, one row per anchor.
        numClasses: number of object classes.
        img_size: network input size (stored by callers; unused here).
        yoloLayerIndex: position of this head among the YOLO layers.
        layers: indices of the layers feeding this head.
        stride: down-sampling factor between the input image and this grid.
    """

    def __init__(self, anchors, numClasses, img_size, yoloLayerIndex, layers, stride):
        super(YOLOLayer, self).__init__()
        self.anchors = torch.Tensor(anchors)
        self.layerIndex = yoloLayerIndex
        self.layerIndices = layers
        self.layerStride = stride
        self.numOutputLayers = len(layers)
        self.numAnchors = len(anchors)
        self.numClasses = numClasses
        self.numOutputs = numClasses + 5  # x, y, w, h, objectness + classes
        self.numX, self.numY, self.numGridpoints = 0, 0, 0
        self.anchorVector = self.anchors / self.layerStride  # anchors in grid units
        self.anchorWH = self.anchorVector.view(1, self.numAnchors, 1, 1, 2)
        self.grid = None  # built lazily by creatGrids()

    def creatGrids(self, ng =(13, 13), device ='cpu'):
        """Build the per-cell (x, y) offsets for a grid of size ng = (nx, ny).

        The grid is built in both training and evaluation mode. Building it
        only in evaluation mode left `self.grid` undefined when the model was
        first run in training mode and then evaluated at the same size.
        """
        self.numX, self.numY = ng
        self.numGridpoints = torch.tensor(ng, dtype = torch.float)

        # grid[0, 0, y, x] == (x, y); built by broadcasting so it does not
        # depend on the torch.meshgrid indexing default
        ys = torch.arange(self.numY, device = device).view(self.numY, 1).expand(self.numY, self.numX)
        xs = torch.arange(self.numX, device = device).view(1, self.numX).expand(self.numY, self.numX)
        self.grid = torch.stack((xs, ys), 2).view((1, 1, self.numY, self.numX, 2)).float()

        if self.anchorVector.device != device:
            self.anchorVector = self.anchorVector.to(device)
            self.anchorWH = self.anchorWH.to(device)

    def forward(self, prediction, out):
        """Reshape raw predictions and, in eval mode, decode them to boxes.

        Args:
            prediction: tensor of shape (bs, numAnchors * numOutputs, ny, nx).
            out: list of saved layer outputs (unused by this layer).

        Returns:
            Training mode: tensor (bs, numAnchors, ny, nx, numOutputs).
            Eval mode: tuple (decoded, raw) where decoded has shape
            (bs, numAnchors * ny * nx, numOutputs) and raw is the reshaped
            prediction.
        """
        bs, _, ny, nx = prediction.shape  # bs, 255, 13, 13
        grid = getattr(self, 'grid', None)
        if (self.numX, self.numY) != (nx, ny) or grid is None or grid.device != prediction.device:
            self.creatGrids((nx, ny), prediction.device)

        prediction = prediction.view(bs, self.numAnchors, self.numOutputs, self.numY, self.numX).permute(0, 1, 3, 4, 2).contiguous()

        if self.training:
            return prediction

        inferenceOutput = prediction.clone()
        inferenceOutput[..., :2] = torch.sigmoid(inferenceOutput[..., :2]) + self.grid  # xy
        inferenceOutput[..., 2:4] = torch.exp(inferenceOutput[..., 2:4]) * self.anchorWH  # wh yolo method
        inferenceOutput[..., :4] *= self.layerStride  # grid units -> pixels
        torch.sigmoid_(inferenceOutput[..., 4:])  # objectness and class scores
        return inferenceOutput.view(bs, -1, self.numOutputs), prediction  # view [1, 3, 13, 13, 85] as [1, 507, 85]

NameError: name 'nn' is not defined

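In the Darknet class, we leverage a number of methods in order to construct the network's architecture block by block. In the init() function, we parse the configuration file and generate the relevant network architecture according to the content and order of the file.

forward() runs a single pass through forwardOnce() by default. When augment is set, it runs three passes (the original image, a flipped and rescaled copy, and a rescaled copy), undoes the scaling and flipping on the predicted boxes, and concatenates the results. forwardOnce() feeds the input through every module in order, keeps the outputs of the layers that later layers route from, and collects the outputs of the YOLO layers.

The fuse function's purpose is to fuse together all the Conv2d and BatchNorm2d layers throughout the model.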


In [475]:
class Darknet(nn.Module):
    """YOLOv3 object detection model built from a Darknet cfg file.

    Args:
        cfg: path of the model configuration, passed to parseModel().
        img_size: network input size, forwarded to createModules().
        verbose: accepted for interface compatibility; not used.
    """

    def __init__(self, cfg, img_size =(416, 416), verbose = False):
        super(Darknet, self).__init__()

        self.moduleDefinitions = parseModel(cfg)
        self.moduleList, self.routs = createModules(self.moduleDefinitions, img_size, cfg)
        self.yoloLayers = getLayers(self)
        self.version = np.array([0, 2, 5], dtype = np.int32)
        self.numImageSeen = np.array([0], dtype = np.int64)

    def forward(self, x, augment = False, verbose = False):
        """Run the model, optionally with test-time augmentation.

        Without `augment` this is a single forwardOnce() pass. With `augment`
        the image is also evaluated flipped+scaled and scaled; the boxes of
        the extra passes are mapped back to the original image and all
        predictions are concatenated. Returns (predictions, None) in that
        case.
        """
        if not augment:
            return self.forwardOnce(x)

        imageSize = x.shape[-2:]
        scales = [0.83, 0.67]  # scales
        augmented = (
            x,
            torch_utils.scaleImage(x.flip(3), scales[0], same_shape = False),
            torch_utils.scaleImage(x, scales[1], same_shape = False),
        )
        y = [self.forwardOnce(xi)[0] for xi in augmented]

        y[1][..., :4] /= scales[0]  # scale
        y[1][..., 0] = imageSize[1] - y[1][..., 0]  # flip lr
        y[2][..., :4] /= scales[1]  # scale

        return torch.cat(y, 1), None

    def forwardOnce(self, inferenceOutput, augment = False):
        """Run one pass through every module.

        Args:
            inferenceOutput: input image batch.
            augment: kept for backward compatibility and ignored. The
                previous de-augmentation branch referenced undefined names
                and raised NameError; augmentation is handled in forward().

        Returns:
            Training mode: list with the output of every YOLO layer.
            Eval mode: tuple (decoded predictions concatenated over all YOLO
            layers on dim 1, tuple of raw per-layer predictions).
        """
        yoloLayerOutput, output = [], []

        for i, module in enumerate(self.moduleList):
            # dispatch on the class name so this class does not need the
            # layer classes in scope
            name = module.__class__.__name__
            if name in ['WeightedFeatureFusion', 'FeatureConcat']:
                inferenceOutput = module(inferenceOutput, output)
            elif name == 'YOLOLayer':
                yoloLayerOutput.append(module(inferenceOutput, output))
            else:
                inferenceOutput = module(inferenceOutput)

            # keep only outputs that a later layer routes from
            output.append(inferenceOutput if self.routs[i] else [])

        if self.training:
            return yoloLayerOutput

        inferenceOutput, trainingOutput = zip(*yoloLayerOutput)
        return torch.cat(inferenceOutput, 1), trainingOutput

    def fuse(self):
        """Fuse each Conv2d + BatchNorm2d pair into a single Conv2d."""
        fuseList = nn.ModuleList()
        for a in list(self.children())[0]:
            if isinstance(a, nn.Sequential):
                for i, b in enumerate(a):
                    if isinstance(b, nn.modules.batchnorm.BatchNorm2d):
                        # fuse this bn layer with the previous conv2d layer
                        conv = a[i - 1]
                        fused = torch_utils.fuseConvBnLayers(conv, b)
                        a = nn.Sequential(fused, *list(a.children())[i + 1:])
                        break
            fuseList.append(a)
        self.moduleList = fuseList

NameError: name 'nn' is not defined

# Utils

In [476]:
def fuseConvBnLayers(conv, bn):
    """Return a single Conv2d equivalent to `bn(conv(x))` in eval mode.

    The fused layer keeps the source convolution's kernel size, stride,
    padding, dilation and groups, and is created on the same device as
    `conv`, so grouped/dilated convolutions and GPU models fuse correctly.

    Args:
        conv: the torch.nn.Conv2d that precedes the batch-norm layer.
        bn: the torch.nn.BatchNorm2d whose running statistics are folded in.

    Returns:
        A new torch.nn.Conv2d with bias.
    """
    with torch.no_grad():
        device = conv.weight.device
        fusedconv = torch.nn.Conv2d(conv.in_channels, conv.out_channels,
                                    kernel_size = conv.kernel_size,
                                    stride = conv.stride,
                                    padding = conv.padding,
                                    dilation = conv.dilation,
                                    groups = conv.groups,
                                    bias = True).to(device)

        # per-output-channel scale: gamma / sqrt(var + eps)
        scale = bn.weight.div(torch.sqrt(bn.eps + bn.running_var))

        convolutionalWeights = conv.weight.clone().view(conv.out_channels, -1)
        fusedconv.weight.copy_((scale.view(-1, 1) * convolutionalWeights).view(fusedconv.weight.size()))

        if conv.bias is not None:
            convolutionalBias = conv.bias
        else:
            convolutionalBias = torch.zeros(conv.weight.size(0), device = device, dtype = conv.weight.dtype)

        # beta - gamma * mean / sqrt(var + eps)
        batchNormBias = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
        fusedconv.bias.copy_(scale * convolutionalBias + batchNormBias)

        return fusedconv



scaleImage() is used to scale and resize a batch of input images. It works by extracting the images' height and width, calculating the new size, resizing with bilinear interpolation, and then adding the required padding to the tensor using torch.nn.functional.pad().

In [477]:
def scaleImage(img, ratio = 1.0, same_shape = True, gridSize = 64):  # img(16,3,256,416), r = ratio
    """Rescale a batch of images by `ratio` and pad the result.

    Args:
        img: tensor of shape (batch, channels, height, width).
        ratio: scale factor applied to height and width.
        same_shape: when True the result is padded back to the original
            height and width. When False it is padded only up to the scaled
            size rounded up to a multiple of `gridSize`. This flag used to be
            ignored.
        gridSize: pixel multiple used when `same_shape` is False.

    Returns:
        The resized tensor, padded on the right and bottom with the constant
        0.447 (presumably a normalised mean grey - confirm against the
        training pipeline).
    """
    import math

    # extract width and height
    height, width = img.shape[2:]

    # calculate new size
    newSize = (int(height * ratio), int(width * ratio))

    # resize image
    img = F.interpolate(img, size = newSize, mode ='bilinear', align_corners = False)

    if not same_shape:
        # pad to the nearest grid multiple of the scaled size instead
        height, width = (math.ceil(dim * ratio / gridSize) * gridSize for dim in (height, width))

    return F.pad(img, [0, width - newSize[1], 0, height - newSize[0]], value = 0.447)


xyxy2xywh() converts boxes from (x1, y1, x2, y2) to (x, y, w, h), where x1, y1 represent the top-left coordinates, x2, y2 represent the bottom-right coordinates, and x, y is the box centre.


In [478]:
def xyxy2xywh(x):
    """Convert boxes from (x1, y1, x2, y2) to (x_center, y_center, w, h).

    Args:
        x: torch tensor or numpy array whose last dimension holds the four
            corner coordinates. Any number of leading dimensions is
            accepted, including a single 1-D box.

    Returns:
        A new tensor/array of the same type and shape as `x`.
    """
    y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
    y[..., 0] = (x[..., 0] + x[..., 2]) / 2  # x center
    y[..., 1] = (x[..., 1] + x[..., 3]) / 2  # y center
    y[..., 2] = x[..., 2] - x[..., 0]  # width
    y[..., 3] = x[..., 3] - x[..., 1]  # height
    return y

xywh2xyxy() converts boxes from (x, y, w, h) to (x1, y1, x2, y2) where x1, y1 represent the top-left coordinates and x2, y2 represent the bottom-right coordinates.


In [479]:
def xywh2xyxy(x):
    """Convert boxes from (x_center, y_center, w, h) to (x1, y1, x2, y2).

    Args:
        x: torch tensor or numpy array whose last dimension holds the box
            centre and size. Any number of leading dimensions is accepted,
            including a single 1-D box.

    Returns:
        A new tensor/array of the same type and shape as `x`.
    """
    y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top left x
    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top left y
    y[..., 2] = x[..., 0] + x[..., 2] / 2  # bottom right x
    y[..., 3] = x[..., 1] + x[..., 3] / 2  # bottom right y
    return y

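scaleCoordinates() rescales (x1, y1, x2, y2) box coordinates from the shape of img1 (the resized, padded network input) to the shape of img0 (the original image). It removes the padding, divides by the resize gain, and clips the boxes to the bounds of img0.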

In [480]:
def scaleCoordinates(img1_shape, coords, img0_shape, ratio_pad = None):
    """Map xyxy boxes from the resized image back to the original image.

    Args:
        img1_shape: (height, width) of the resized/padded image.
        coords: torch tensor (n, 4+) of x1, y1, x2, y2 boxes; modified in
            place.
        img0_shape: (height, width) of the original image.
        ratio_pad: optional pre-computed ((gain, ...), (pad_x, pad_y)). When
            given it is used instead of deriving gain and padding from the
            two shapes (it used to be silently ignored). Layout assumed from
            the usual letterbox helper - confirm against the caller.

    Returns:
        `coords`, rescaled and clipped to the original image.
    """
    if ratio_pad is None:
        gain = max(img1_shape) / max(img0_shape)  # gain  = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain

    # Clip xyxy bounding boxes to image shape (height, width)
    coords[:, 0].clamp_(0, img0_shape[1])  # x1
    coords[:, 1].clamp_(0, img0_shape[0])  # y1
    coords[:, 2].clamp_(0, img0_shape[1])  # x2
    coords[:, 3].clamp_(0, img0_shape[0])  # y2

    return coords


getAPClass() computes the average precision per class from the recall and precision curves. It takes numpy arrays of true positives, objectness values, predicted object classes, and true object classes, and returns the precision, recall, average precision, and F1 score on a per-class basis. More information about this function can be found here: https://github.com/rafaelpadilla/Object-Detection-Metrics.

It works by first sorting the predictions by objectness, extracting the sorted indices, and finding the unique classes. It then creates the precision-recall curves and computes the AP for each class.

In order to calculate the precision-recall curve, the function iterates through the unique classes and calculates the number of ground truth objects and the number of predicted objects. If these values are larger than 0, we accumulate the False/True positives and use them to calculate the recall and precision. 

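The equations for this calculation are shown below, where $TP$ and $FP$ are the cumulative true and false positives over the predictions sorted by objectness, and $n_{gt}$ is the number of ground truth objects of the class:

$$\text{Recall} = \frac{TP}{n_{gt}}, \qquad \text{Precision} = \frac{TP}{TP + FP}$$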

In order to calculate the average precision, please see the next block. 

In order to calculate the F1 score (the harmonic mean of precision $P$ and recall $R$), the following equation is used:

$$F_1 = \frac{2 \, P \, R}{P + R}$$



In [481]:
def getAPClass(tp, conf, pred_cls, target_cls):
    """Compute per-class precision, recall, AP and F1.

    Args:
        tp: array (n_pred, n_iou) flagging true positives per IoU threshold.
        conf: array (n_pred,) of objectness scores.
        pred_cls: array (n_pred,) of predicted classes.
        target_cls: array of ground-truth classes.

    Returns:
        (p, r, ap, f1, classes): the first four have shape
        (n_classes, n_iou); `classes` holds the unique target classes as
        int32. Classes with no predictions or no targets keep zeros.
    """
    # Highest objectness first
    order = np.argsort(-conf)
    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]

    unique_classes = np.unique(target_cls)

    pr_score = 0.1  # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898
    resultShape = [unique_classes.shape[0], tp.shape[1]]  # number class, number iou thresholds
    ap = np.zeros(resultShape)
    p = np.zeros(resultShape)
    r = np.zeros(resultShape)

    for row, cls in enumerate(unique_classes):
        mask = pred_cls == cls
        n_gt = (target_cls == cls).sum()  # ground truth objects of this class
        n_p = mask.sum()  # predictions of this class
        if n_p == 0 or n_gt == 0:
            continue

        classTp = tp[mask]
        fpc = (1 - classTp).cumsum(0)  # cumulative false positives
        tpc = classTp.cumsum(0)  # cumulative true positives
        negConf = -conf[mask]  # negated so the interpolation x-axis increases

        recall = tpc / (n_gt + 1e-16)
        r[row] = np.interp(-pr_score, negConf, recall[:, 0])  # recall at pr_score

        precision = tpc / (tpc + fpc)
        p[row] = np.interp(-pr_score, negConf, precision[:, 0])  # precision at pr_score

        # AP from the recall-precision curve, one value per IoU threshold
        for j in range(tp.shape[1]):
            ap[row, j] = getAP(recall[:, j], precision[:, j])

    # F1 score (harmonic mean of precision and recall)
    f1 = 2 * p * r / (p + r + 1e-16)

    return p, r, ap, f1, unique_classes.astype('int32')

getAP() works alongside the function above. It takes in the recall and precision curves in order to calculate and return the average precision. More information about this function can be found here: https://github.com/rbgirshick/py-faster-rcnn.

It works by appending sentinel values to the beginning and end of the recall and precision curves, then computing the precision envelope and integrating the area under the curve. The result of this integration is the average precision, which is returned.

In [482]:
def getAP(recall, precision):
    """Return the average precision for one recall/precision curve.

    Args:
        recall: 1-D sequence of recall values (non-decreasing).
        precision: 1-D sequence of precision values, same length.

    Returns:
        Area under the precision envelope, integrated with the trapezoidal
        rule on a 101-point recall grid.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.], recall, [min(recall[-1] + 1E-3, 1.)]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # Precision envelope: running maximum taken from the right
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
    ap = integrate(np.interp(x, mrec, mpre), x)

    return ap


In [483]:
def boundingBoxIOU(firstBox, secondBox, x1y1x2y2 = True, GIoU = False):
    """Return the IoU (or GIoU) between `firstBox` and each box in `secondBox`.

    Args:
        firstBox: box coordinates indexed on dim 0 (length 4, or 4 x n).
        secondBox: n x 4 tensor of boxes; transposed internally.
        x1y1x2y2: True when boxes are corner coordinates, False when they are
            (x_center, y_center, width, height).
        GIoU: return the generalised IoU instead of the plain IoU.
    """
    other = secondBox.t()

    if x1y1x2y2:
        # already corner coordinates
        ax1, ay1, ax2, ay2 = firstBox[0], firstBox[1], firstBox[2], firstBox[3]
        bx1, by1, bx2, by2 = other[0], other[1], other[2], other[3]
    else:
        # centre/size -> corner coordinates
        aHalfW, aHalfH = firstBox[2] / 2, firstBox[3] / 2
        bHalfW, bHalfH = other[2] / 2, other[3] / 2
        ax1, ax2 = firstBox[0] - aHalfW, firstBox[0] + aHalfW
        ay1, ay2 = firstBox[1] - aHalfH, firstBox[1] + aHalfH
        bx1, bx2 = other[0] - bHalfW, other[0] + bHalfW
        by1, by2 = other[1] - bHalfH, other[1] + bHalfH

    # Overlap rectangle; negative extents mean no overlap and clamp to zero
    overlapW = (torch.min(ax2, bx2) - torch.max(ax1, bx1)).clamp(0)
    overlapH = (torch.min(ay2, by2) - torch.max(ay1, by1)).clamp(0)
    overlap = overlapW * overlapH

    # Union; the epsilon guards against division by zero
    areaA = (ax2 - ax1) * (ay2 - ay1)
    areaB = (bx2 - bx1) * (by2 - by1)
    union = (areaA + 1e-16) + areaB - overlap

    iou = overlap / union
    if not GIoU:
        return iou

    # Smallest box enclosing both inputs
    hullW = torch.max(ax2, bx2) - torch.min(ax1, bx1)
    hullH = torch.max(ay2, by2) - torch.min(ay1, by1)
    hullArea = hullW * hullH + 1e-16
    return iou - (hullArea - union) / hullArea

boxIOU() returns the pairwise intersection-over-union of two sets of boxes. It is important to note that the two sets of boxes are expected to be in (x1, y1, x2, y2) format.

In [484]:
def boxIOU(box1, box2):
    """
    Pairwise intersection-over-union (Jaccard index) of two box sets.
    Boxes must be given as (x1, y1, x2, y2).
    Arguments:
        box1 (Tensor[N, 4])
        box2 (Tensor[M, 4])
    Returns:
        iou (Tensor[N, M]): IoU of every box in box1 with every box in box2
    """
    areaFirst = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    areaSecond = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])

    # Broadcast box1 as (N, 1, 2) against box2 as (M, 2) -> (N, M, 2)
    bottomRight = torch.min(box1[:, None, 2:], box2[:, 2:])
    topLeft = torch.max(box1[:, None, :2], box2[:, :2])
    inter = (bottomRight - topLeft).clamp(0).prod(2)

    union = areaFirst[:, None] + areaSecond - inter
    return inter / union


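widthHeightIOU() returns the N×M IoU matrix between two sets of box sizes. Only widths and heights are compared, i.e. the boxes are treated as if they shared the same corner.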

In [485]:
def widthHeightIOU(wh1, wh2):
    """Return the n x m IoU matrix of box sizes; wh1 is n x 2, wh2 is m x 2.

    Only width and height are used, so the overlap of two sizes is the
    product of their element-wise minimum.
    """
    first = wh1[:, None]  # [N,1,2]
    second = wh2[None]  # [1,M,2]
    overlap = torch.min(first, second).prod(2)  # [N,M]
    union = first.prod(2) + second.prod(2) - overlap
    return overlap / union


In [486]:
def getLosses(p, targets, model):  # predictions, targets, model
    """Compute the box, objectness and classification losses.

    Args:
        p: list of raw predictions, one tensor per YOLO layer.
        targets: ground-truth targets, passed on to buildTargets().
        model: provides `hyp` (loss gains and positive weights) and `gr`
            (GIoU ratio used for the objectness target).

    Returns:
        (total loss, detached tensor [box, objectness, class, total]).
    """
    makeTensor = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
    classLoss = makeTensor([0])
    boxLoss = makeTensor([0])
    objectLoss = makeTensor([0])
    tcls, tbox, indices, anchors = buildTargets(p, targets, model)  # targets

    # Define criteria
    hyp = model.hyp
    BCEcls = nn.BCEWithLogitsLoss(pos_weight = makeTensor([hyp['cls_pw']]), reduction = 'mean')
    BCEobj = nn.BCEWithLogitsLoss(pos_weight = makeTensor([hyp['obj_pw']]), reduction = 'mean')

    # class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
    smoothing = 0.0  # disabled: positive label 1.0, negative label 0.0
    positive, negative = 1.0 - 0.5 * smoothing, 0.5 * smoothing

    for layerIdx, layerPrediction in enumerate(p):
        imageIdx, anchorIdx, gridY, gridX = indices[layerIdx]
        objTarget = torch.zeros_like(layerPrediction[..., 0])

        matched = imageIdx.shape[0]  # number of targets on this layer
        if matched:
            # predictions at the cells/anchors that own a target
            subset = layerPrediction[imageIdx, anchorIdx, gridY, gridX]

            # GIoU between decoded predictions and targets
            centre = subset[:, :2].sigmoid()
            size = subset[:, 2:4].exp().clamp(max = 1E3) * anchors[layerIdx]
            predictedBox = torch.cat((centre, size), 1)
            giou = boundingBoxIOU(predictedBox.t(), tbox[layerIdx], x1y1x2y2 = False, GIoU = True)
            boxLoss += (1.0 - giou).mean()

            # Objectness target blends 1.0 with the (clamped) GIoU
            objTarget[imageIdx, anchorIdx, gridY, gridX] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(objTarget.dtype)

            # Classification against one-hot targets
            classLogits = subset[:, 5:]
            classTarget = torch.full_like(classLogits, negative)
            classTarget[range(matched), tcls[layerIdx]] = positive
            classLoss += BCEcls(classLogits, classTarget)

        objectLoss += BCEobj(layerPrediction[..., 4], objTarget)

    boxLoss *= hyp['giou']
    objectLoss *= hyp['obj']
    classLoss *= hyp['cls']

    totLoss = boxLoss + objectLoss + classLoss

    return totLoss, torch.cat((boxLoss, objectLoss, classLoss, totLoss)).detach()


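buildTargets is called when computing the loss. It works by iterating through the YOLO layers, scaling the targets to each layer's grid, matching the targets to the anchors whose width-height IoU exceeds the threshold, and collecting, for every layer, the target classes, the target boxes, the image/anchor/grid indices, and the matched anchors.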

In [487]:
def buildTargets(p, targets, model):
    """Build per-layer training targets for getLosses().

    Args:
        p: list of raw predictions, one (bs, anchors, ny, nx, outputs) tensor
            per YOLO layer.
        targets: tensor (n, 6) of (image, class, x, y, w, h) rows with
            normalised box coordinates.
        model: exposes the YOLO layer indices and module list
            (`yoloLayers` / `moduleList` as defined by Darknet; the older
            `yolo_layers` / `module_list` names are used as a fallback) and
            `hyp['iou_t']`.

    Returns:
        (tcls, tbox, indices, anch): one entry per YOLO layer holding the
        target classes, the boxes (cell-relative xy + grid wh), the tuple
        (image, anchor, grid_y, grid_x) and the matched anchors.
    """
    numTargets = targets.shape[0]
    tcls, tbox, indices, anch = [], [], [], []
    gain = torch.ones(6, device = targets.device)  # normalized to gridspace gain

    layerIds = getattr(model, 'yoloLayers', None)
    if layerIds is None:
        layerIds = model.yolo_layers
    modules = getattr(model, 'moduleList', None)
    if modules is None:
        modules = model.module_list

    for idx, layerId in enumerate(layerIds):
        anchors = modules[layerId].anchorVector
        ny, nx = int(p[idx].shape[2]), int(p[idx].shape[3])
        gain[2:] = torch.tensor([nx, ny, nx, ny])  # xyxy gain
        numAnchors = anchors.shape[0]  # number of anchors

        # anchor index for every (anchor, target) pair, on the targets' device
        # so it can be indexed with the match mask
        anchorTensor = torch.arange(numAnchors, device = targets.device).view(numAnchors, 1).repeat(1, numTargets)

        # Match targets to anchors
        a = torch.zeros(0, dtype = torch.long, device = targets.device)
        t, offsets = targets * gain, 0
        if numTargets:
            matches = widthHeightIOU(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n) = widthHeightIOU(anchors(3,2), gwh(n,2))
            a, t = anchorTensor[matches], t.repeat(numAnchors, 1, 1)[matches]  # filter

        # Define
        b, c = t[:, :2].long().T  # image, class
        gxy = t[:, 2:4]  # grid xy
        gwh = t[:, 4:6]  # grid wh
        gij = (gxy - offsets).long()
        gi, gj = gij.T  # grid xy indices

        # Append; clamp with plain ints so integer indices are not compared
        # against float tensor bounds
        indices.append((b, a, gj.clamp_(0, ny - 1), gi.clamp_(0, nx - 1)))  # image, anchor, grid indices
        tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
        anch.append(anchors[a])  # anchors
        tcls.append(c)  # class

    return tcls, tbox, indices, anch

In [488]:
def NMS(prediction, conf_thres = 0.1, iou_thres = 0.6, multi_label = True, classes = None, agnostic = False):
    """
    Performs Non-Maximum Suppression on inference results.

    Arguments:
        prediction: tensor (batch, n, 5 + numClasses) of
            (x, y, w, h, objectness, class scores...) rows.
        conf_thres: minimum objectness and minimum class confidence.
        iou_thres: IoU above which overlapping boxes are suppressed.
        multi_label: keep every class above conf_thres for a box; when False
            (or with a single class) only the best class is kept.
        classes: optional list of class ids to keep.
        agnostic: suppress across classes instead of per class.

    Returns a list with one entry per image: detections with shape
        nx6 (x1, y1, x2, y2, conf, cls), or None when nothing was kept or
        the time limit was hit before the image was processed.
    """

    # Settings
    merge = True  # merge for best mAP
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    time_limit = 10.0  # seconds to quit after

    t = time.time()
    numClasses = prediction[0].shape[1] - 5  # number of classes
    multi_label &= numClasses > 1  # multiple labels per box
    output = [None] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints (boolean indexing copies, so the input is not modified below)
        x = x[x[:, 4] > conf_thres]  # confidence
        x = x[((x[:, 2:4] > min_wh) & (x[:, 2:4] < max_wh)).all(1)]  # width-height

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[..., 5:] *= x[..., 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero().t()
            x = torch.cat((box[i], x[i, j + 5].unsqueeze(1), j.float().unsqueeze(1)), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1)
            x = torch.cat((box, conf.unsqueeze(1), j.float().unsqueeze(1)), 1)[conf > conf_thres]

        # Filter by class, using the class column of the detections themselves
        if classes:
            wanted = torch.tensor(classes, device = x.device, dtype = x.dtype)
            x = x[(x[:, 5:6] == wanted).any(1)]

        # If none remain process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue

        # Batched NMS
        c = x[:, 5] * 0 if agnostic else x[:, 5]  # classes
        boxes, scores = x[:, :4].clone() + c.view(-1, 1) * max_wh, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            try:  # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = boxIOU(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim = True)  # merged boxes
            except Exception as err:  # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
                # merging is best-effort: keep the unmerged boxes and report
                print(x, i, x.shape, i.shape)
                print('NMS merge skipped: %s' % err)

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            break  # time limit exceeded

    return output

In [489]:
def convertToTarget(output, width, height):
    """
    Convert a YOLO model output to target format
    [batch_id, class_id, x, y, w, h, conf]

    Arguments:
        output: per-image detections (tensor, array or None), each n x 6+
            with rows (x1, y1, x2, y2, conf, cls); a batched tensor is also
            accepted.
        width, height: image size used to normalise the coordinates.

    Returns:
        numpy array of shape (total_detections, 7); shape (0, 7) when there
        are no detections.
    """
    if isinstance(output, torch.Tensor):
        output = output.cpu().numpy()

    rows = []
    for imageIdx, detections in enumerate(output):
        if detections is None:
            continue
        # move each image's detections to host memory once, instead of
        # building the array from individual (possibly CUDA) tensor elements
        if isinstance(detections, torch.Tensor):
            detections = detections.detach().cpu().numpy()
        detections = np.asarray(detections, dtype = np.float64)
        if detections.size == 0:
            continue

        w = (detections[:, 2] - detections[:, 0]) / width
        h = (detections[:, 3] - detections[:, 1]) / height
        x = detections[:, 0] / width + w / 2
        y = detections[:, 1] / height + h / 2
        conf = detections[:, 4]
        cls = np.trunc(detections[:, 5])  # same truncation as int()
        batchId = np.full(detections.shape[0], imageIdx, dtype = np.float64)

        rows.append(np.stack((batchId, cls, x, y, w, h, conf), axis = 1))

    if not rows:
        return np.zeros((0, 7))
    return np.concatenate(rows, axis = 0)


In [490]:
def plotBox(x, img, color = None, label = None, line_thickness = None):
    """Draw one bounding box, and optionally its label, on `img` in place.

    Args:
        x: box as (x1, y1, x2, y2).
        img: image array drawn on with OpenCV.
        color: box colour; a random colour is chosen when omitted.
        label: optional text drawn on a filled background above the box.
        line_thickness: line width; derived from the image size when omitted.
    """
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    topLeft = (int(x[0]), int(x[1]))
    bottomRight = (int(x[2]), int(x[3]))
    cv2.rectangle(img, topLeft, bottomRight, color, thickness = thickness, lineType = cv2.LINE_AA)

    if not label:
        return

    fontThickness = max(thickness - 1, 1)
    fontScale = thickness / 3
    textSize = cv2.getTextSize(label, 0, fontScale = fontScale, thickness = fontThickness)[0]
    # the label background extends upwards from the box's top-left corner
    labelCorner = topLeft[0] + textSize[0], topLeft[1] - textSize[1] - 3
    cv2.rectangle(img, topLeft, labelCorner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(img, label, (topLeft[0], topLeft[1] - 2), 0, fontScale, [225, 255, 255], thickness = fontThickness, lineType = cv2.LINE_AA)


def plotImages(images, targets, paths = None, fname ='images.jpg', names = None, max_size = 640, max_subplots = 16):
    """Tile a batch of images with their boxes into one mosaic image.

    Args:
        images: batch (bs, channels, h, w) as a tensor or numpy array.
        targets: rows of (batch_id, class, x, y, w, h[, conf]) with
            normalised boxes; a 7th column marks predictions.
        paths: optional image paths whose basenames are drawn on each tile.
        fname: output file. If it already exists nothing is drawn and None is
            returned; pass None to skip writing altogether.
        names: optional class-name lookup.
        max_size: tiles larger than this are scaled down.
        max_subplots: maximum number of images in the mosaic.

    Returns:
        The mosaic array, or None when `fname` already exists.
    """
    tl = 3  # line thickness
    tf = max(tl - 1, 1)  # font thickness
    # fname may be None (no file output); os.path.isfile(None) would raise
    if fname is not None and os.path.isfile(fname):  # do not overwrite
        return None

    if isinstance(images, torch.Tensor):
        images = images.cpu().numpy()

    if isinstance(targets, torch.Tensor):
        targets = targets.cpu().numpy()

    # un-normalise (out of place: the array may share memory with the caller's tensor)
    if np.max(images[0]) <= 1:
        images = images * 255

    bs, _, h, w = images.shape  # batch size, _, height, width
    bs = min(bs, max_subplots)  # limit plot images
    ns = np.ceil(bs ** 0.5)  # number of subplots (square)

    # Check if we should resize
    scale_factor = max_size / max(h, w)
    if scale_factor < 1:
        h = math.ceil(scale_factor * h)
        w = math.ceil(scale_factor * w)

    # Empty array for output
    mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype = np.uint8)

    # Fix class - colour map
    prop_cycle = plt.rcParams['axes.prop_cycle']
    hex2rgb = lambda h: tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
    color_lut = [hex2rgb(h) for h in prop_cycle.by_key()['color']]

    for i, img in enumerate(images):
        if i == max_subplots:  # if last batch has fewer images than we expect
            break

        # tiles fill the mosaic column by column
        block_x = int(w * (i // ns))
        block_y = int(h * (i % ns))

        img = img.transpose(1, 2, 0)
        if scale_factor < 1:
            img = cv2.resize(img, (w, h))

        mosaic[block_y:block_y + h, block_x:block_x + w, :] = img
        if len(targets) > 0:
            image_targets = targets[targets[:, 0] == i]
            boxes = xywh2xyxy(image_targets[:, 2:6]).T
            classes = image_targets[:, 1].astype('int')
            gt = image_targets.shape[1] == 6  # ground truth if no conf column
            conf = None if gt else image_targets[:, 6]  # check for confidence presence (gt vs pred)

            # normalised coordinates -> pixel position inside the mosaic
            boxes[[0, 2]] *= w
            boxes[[0, 2]] += block_x
            boxes[[1, 3]] *= h
            boxes[[1, 3]] += block_y
            for j, box in enumerate(boxes.T):
                cls = int(classes[j])
                color = color_lut[cls % len(color_lut)]
                cls = names[cls] if names else cls
                if gt or conf[j] > 0.3:  # 0.3 conf thresh
                    label = '%s' % cls if gt else '%s %.1f' % (cls, conf[j])
                    plotBox(box, mosaic, label = label, color = color, line_thickness = tl)

        # Draw image filename labels
        if paths is not None:
            label = os.path.basename(paths[i])[:40]  # trim to 40 char
            t_size = cv2.getTextSize(label, 0, fontScale = tl / 3, thickness = tf)[0]
            cv2.putText(mosaic, label, (block_x + 5, block_y + t_size[1] + 5), 0, tl / 3, [220, 220, 220], thickness = tf,
                        lineType = cv2.LINE_AA)

        # Image border
        cv2.rectangle(mosaic, (block_x, block_y), (block_x + w, block_y + h), (255, 255, 255), thickness = 3)

    if fname is not None:
        mosaic = cv2.resize(mosaic, (int(ns * w * 0.5), int(ns * h * 0.5)), interpolation = cv2.INTER_AREA)
        cv2.imwrite(fname, cv2.cvtColor(mosaic, cv2.COLOR_BGR2RGB))

    return mosaic

def plotResults(start = 0, stop = 0, bucket ='', id =()):
    """Plot the training curves found in results*.txt and save results.png.

    Args:
        start: first row (epoch) to plot.
        stop: row to stop at; 0 plots through the last row.
        bucket, id: accepted for interface compatibility; not used.
    """
    fig, axes = plt.subplots(2, 5, figsize =(12, 6), tight_layout = True)
    axes = axes.ravel()
    titles = ['GIoU', 'Objectness', 'Classification', 'Precision', 'Recall',
              'val GIoU', 'val Objectness', 'val Classification', 'mAP@0.5', 'F1']
    lossPanels = (0, 1, 2, 5, 6, 7)  # panels showing loss values

    resultFiles = glob.glob('results*.txt') + glob.glob('../../Downloads/results*.txt')
    for resultFile in sorted(resultFiles):
        results = np.loadtxt(resultFile, usecols =[2, 3, 4, 8, 9, 12, 13, 14, 10, 11], ndmin = 2).T
        numRows = results.shape[1]  # number of rows
        lastRow = min(stop, numRows) if stop else numRows
        rows = range(start, lastRow)
        for panel, title in enumerate(titles):
            series = results[panel, rows]
            if panel in lossPanels:
                series[series == 0] = np.nan  # dont show zero loss values
            axes[panel].plot(rows, series, marker ='.', label = Path(resultFile).stem, linewidth = 2, markersize = 8)
            axes[panel].set_title(title)

    axes[1].legend()
    fig.savefig('results.png', dpi = 200)


# Detection 

# Testing

# Training