In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# ConvBNAct Block
A Conv2d → BatchNorm → SiLU block. Used as the stem (3×3 kernel, stride 2), it converts our input image (B, 3, 640, 640) into (B, 64, 320, 320).

In [2]:
class ConvBNAct(nn.Module):
    """Convolution followed by batch normalisation and a SiLU activation.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size (default 3).
        stride: convolution stride (default 1).
        padding: zero padding added on each side (default 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size = 3, stride = 1, padding = 1):
        super().__init__()
        # The conv bias is disabled; BatchNorm2d supplies its own shift term.
        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.SiLU()
        self.block = nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """Run conv -> batch norm -> SiLU on ``x`` and return the result."""
        features = self.block(x)
        return features

# BottleNeck Block

Extracts important features from the image

Input: (B,64,320,320) --> 1×1 conv -> (B,32,320,320) --> 3×3 conv -> (B,64,320,320) --> Residual (only if enabled and input/output channels match): out = out + input

In [3]:
class Bottleneck(nn.Module):
    """1x1 squeeze conv followed by a 3x3 conv, with an optional skip connection.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        shortcut: request a residual connection; it is only used when
            ``in_channels == out_channels``.
        expansion: ratio of the intermediate channel count to ``out_channels``.
    """

    def __init__(self,in_channels,out_channels,shortcut = True,expansion =0.5):
        super().__init__()
        mid_channels = int(out_channels * expansion)
        self.conv1 = ConvBNAct(in_channels, mid_channels, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvBNAct(mid_channels, out_channels, kernel_size=3, stride=1, padding=1)
        # The addition in forward() needs matching channel counts.
        self.residual = shortcut and (in_channels == out_channels)

    def forward(self,x):
        """Apply both convolutions and add the input back when the skip is active."""
        squeezed = self.conv1(x)
        out = self.conv2(squeezed)
        return out + x if self.residual else out

 

# C2f Block

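C2f: a 1×1 convolution whose output is split channel-wise into two halves; the first half is passed through a chain of Bottleneck blocks, then both halves and every Bottleneck output are concatenated and fused by a final 1×1 convolution.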

In [4]:
class c2f(nn.Module):
    """Split / bottleneck-chain / concatenate block.

    A 1x1 convolution produces ``2 * hidden_channels`` channels that are split
    into two halves. The first half is fed through a chain of ``Bottleneck``
    blocks; both halves and every bottleneck output are concatenated and fused
    by a final 1x1 convolution.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        n: base number of bottlenecks.
        d: depth multiplier; ``max(round(n * d), 1)`` bottlenecks are created.
        expansion: ``hidden_channels = int(out_channels * expansion)``.
        shortcut: whether the inner bottlenecks use their residual connection.
    """

    def __init__(self, in_channels, out_channels,n,d,expansion=0.5,shortcut=True):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        num_bnecks = max(round(n*d),1)

        # 1. First 1x1 conv. It must emit exactly two groups of hidden_channels so
        #    the channel split in forward() matches what the bottlenecks and conv2
        #    expect. For expansion=0.5 and an even out_channels this equals
        #    out_channels, i.e. the previous layout.
        self.conv1 = ConvBNAct(in_channels, 2 * hidden_channels, kernel_size=1, stride=1, padding=0)

        # 2. Bottleneck chain applied to one half of the split. `shortcut` is
        #    forwarded so that shortcut=False really disables the residual adds.
        self.layers = nn.ModuleList([
            Bottleneck(hidden_channels, hidden_channels, shortcut=shortcut)
            for _ in range(num_bnecks)
        ])

        # 3. Fuse the two halves plus one output per bottleneck.
        self.conv2 = ConvBNAct((num_bnecks + 2) * hidden_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self,x):
        """Return the fused feature map; spatial size is unchanged."""
        x = self.conv1(x)
        # Channel-wise split into two equal halves of hidden_channels each.
        x1, x2 = x.chunk(2, dim=1)

        outputs = [x1, x2]
        for layer in self.layers:
            x1 = layer(x1)
            outputs.append(x1)

        return self.conv2(torch.cat(outputs, dim=1))
         

# Spatial Pyramid Pooling Fast (SPPF) Block

Takes features from different scales and combines them, so that the network becomes better at capturing both local and global features.

In [10]:
class sppf(nn.Module):
    """Spatial Pyramid Pooling - Fast.

    A 1x1 convolution reduces the input to ``hidden_channels``. A chain of
    5x5, stride-1 max-pools is then applied, each pool consuming the output
    of the previous one so that successive maps cover growing receptive
    fields. The reduced map and every pooled map are concatenated and fused
    by a final 1x1 convolution.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        expansion: ``hidden_channels = int(out_channels * expansion)``.
        num_pool: number of chained max-pool layers.
    """

    def __init__(self,in_channels,out_channels,expansion=0.5,num_pool=3):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = ConvBNAct(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0)

        # kernel 5 / stride 1 / padding 2 keeps the spatial size unchanged.
        self.poolLayer = nn.ModuleList([
            nn.MaxPool2d(kernel_size=5, stride=1, padding=2) for _ in range(num_pool)
        ])

        self.conv2 = ConvBNAct((num_pool + 1) * hidden_channels, out_channels, stride=1, kernel_size=1, padding=0)

    def forward(self,x):
        """Return the fused multi-scale feature map (same spatial size as ``x``)."""
        x = self.conv1(x)
        outs = [x]

        # Each pool is applied to the previous pooled map, not to `x` again;
        # pooling `x` every time would just yield identical copies.
        for layer in self.poolLayer:
            outs.append(layer(outs[-1]))

        return self.conv2(torch.cat(outs, dim=1))

# Putting it Together - The Backbone

The backbone is the feature extractor of the model - it takes in the input image and extracts feature maps at multiple scales (from low-level to high-level). These features are then passed to the neck and head for detection.



In [11]:
class Backbone(nn.Module):
    """Feature extractor producing three feature maps at increasing stride.

    Every ``ConvBNAct`` below uses stride 2, so the spatial size is halved five
    times in total (stem, conv1 .. conv4).

    Args:
        depth_mul: multiplier applied to the number of bottlenecks per stage.
        width_mul: multiplier applied to every channel count.
    """

    def __init__(self,depth_mul=1,width_mul=1):
        super().__init__()

        def ch(c):
            # Scaled channel count, never below 1.
            return max(int(c*float(width_mul)),1)

        def d(n):
            # Scaled bottleneck count, never below 1.
            return max(round(n*float(depth_mul)),1)

        self.stem = ConvBNAct(3,ch(64),3,2,1)

        self.conv1 = ConvBNAct(ch(64),ch(128),3,2,1)
        self.stage1=c2f(ch(128),ch(128),n=d(3),d=1)

        self.conv2 = ConvBNAct(ch(128), ch(256), 3, 2, 1)
        self.stage2 = c2f(ch(256), ch(256), n=d(6), d=1)

        self.conv3 = ConvBNAct(ch(256), ch(512), 3, 2, 1)
        self.stage3 = c2f(ch(512), ch(512), n=d(6), d=1)

        self.conv4 = ConvBNAct(ch(512), ch(512), 3, 2, 1)
        self.stage4 = c2f(ch(512), ch(512), n=d(3), d=1)

        self.sppf = sppf(in_channels=ch(512),out_channels=ch(512),num_pool=3)

        # These containers only re-reference the layers above (no extra weights).
        # forward() no longer calls them; they are kept so the attribute names
        # and state_dict keys stay the same for existing checkpoints.
        self.FinalLayer=nn.Sequential( self.stem,self.conv1, self.stage1,self.conv2, self.stage2,self.conv3, self.stage3,self.conv4, self.stage4,
            self.sppf
        )

        self.x1_layer=nn.Sequential(
            self.stem,self.conv1,self.stage1,self.conv2,self.stage2
        )

        self.x2_layer = nn.Sequential(
            self.stem,self.conv1, self.stage1,self.conv2, self.stage2,self.conv3, self.stage3
        )
        self.x3_layer = nn.Sequential(
            self.stem,self.conv1, self.stage1,self.conv2, self.stage2,self.conv3, self.stage3,self.conv4, self.stage4,
            self.sppf
        )

    def forward(self,x):
        """Return ``(out, x1, x2, x3)``.

        ``x1`` is the output of stage2 (stride 8), ``x2`` of stage3 (stride 16)
        and ``x3`` of the SPPF block (stride 32). ``out`` is the final feature
        map, which is the same tensor as ``x3``.

        The shared layers are evaluated once. Running the four Sequential
        containers separately recomputed the early layers up to four times and
        updated their BatchNorm running statistics once per repeat.
        """
        x = self.stem(x)
        x = self.stage1(self.conv1(x))
        x1 = self.stage2(self.conv2(x))
        x2 = self.stage3(self.conv3(x1))
        x3 = self.sppf(self.stage4(self.conv4(x2)))
        return x3, x1, x2, x3

     
        

        
        

# NECK Architecture

In [12]:
class upsample(nn.Module):
    """Module wrapper around ``nn.Upsample``.

    Args:
        scale_factor: spatial scaling factor handed to ``nn.Upsample``.
        mode: interpolation mode handed to ``nn.Upsample``.
    """

    def __init__(self,scale_factor = 2, mode="nearest"):
        super().__init__()
        resizer = nn.Upsample(scale_factor=scale_factor, mode=mode)
        self.Upsample = resizer

    def forward(self,x):
        """Return ``x`` resized by the configured ``nn.Upsample``."""
        resized = self.Upsample(x)
        return resized
    
    

In [13]:
class Neck(nn.Module):
    """Top-down then bottom-up feature fusion over three backbone maps.

    The channel counts are hard-coded: the layer widths below only line up
    when ``x1`` has 256 channels and ``x2`` / ``x3`` have 512 channels each,
    with ``x2`` at twice the resolution of ``x3`` and ``x1`` at twice the
    resolution of ``x2``.
    """

    def __init__(self):
        super().__init__()
        self.upsample=upsample(scale_factor=2,mode="nearest")
        self.c2f12 = c2f(in_channels =1024,out_channels=1024,n=3,d=1,shortcut=False)
        self.c2f15 = c2f(in_channels = 768, out_channels = 768, n = 3, d = 1, shortcut = False)
        self.down1 = ConvBNAct(in_channels = 768, out_channels = 256, kernel_size = 1, stride = 1, padding = 0)
        self.conv16 = ConvBNAct(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 2, padding = 1)
        self.c2f18 = c2f(in_channels = 768, out_channels = 768, n = 3, d = 1, shortcut = False)
        self.down2 = ConvBNAct(in_channels = 768, out_channels = 512, kernel_size = 1, stride = 1, padding = 0)
        self.conv19 = ConvBNAct(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 2, padding = 1)
        self.c2f21 = c2f(in_channels = 1024, out_channels = 1024, n = 3, d = 1, shortcut = False)
        self.down3 = ConvBNAct(in_channels = 1024, out_channels = 512, kernel_size = 1, stride = 1, padding = 0)
        self.down4 = ConvBNAct(in_channels = 1024, out_channels = 512, kernel_size = 1, stride = 1, padding = 0)

    def forward(self,x1,x2,x3):
        """Fuse the three backbone maps.

        Args:
            x1: highest-resolution backbone map.
            x2: middle-resolution backbone map.
            x3: lowest-resolution backbone map.

        Returns:
            ``(detect_1, detect_2, detect_3)`` with 256, 512 and 512 channels,
            at the resolutions of ``x1``, ``x2`` and ``x3`` respectively.
        """
        # Top-down path: upsample the coarse map and merge with the finer one.
        x_10=self.upsample(x3)
        x_11=torch.concat([x_10,x2],dim=1)  # was `x_2`, an undefined name
        x_12=self.down4(self.c2f12(x_11))

        x_13=self.upsample(x_12)
        x_14=torch.concat([x_13,x1],dim=1)
        x_15=self.c2f15(x_14)
        detect_1=self.down1(x_15)

        # Bottom-up path: stride-2 convs bring the fine maps back down.
        x_16=self.conv16(detect_1)
        x_17=torch.concat([x_12,x_16],dim=1)
        x_18=self.c2f18(x_17)
        detect_2=self.down2(x_18)

        x_19=self.conv19(detect_2)
        x_20=torch.concat([x3,x_19],dim=1)
        x_21=self.c2f21(x_20)
        detect_3=self.down3(x_21)

        return detect_1,detect_2,detect_3

        
        
        

# The Detect Box

In [14]:
class Detect(nn.Module):
    """Detection head for one feature map: a box branch and a class branch.

    Args:
        in_channels: channels of the incoming feature map.
        num_classes: number of output channels of the class branch.
        reg_max: the box branch emits ``4 * (reg_max + 1)`` channels.
    """

    def __init__(self,in_channels,num_classes=1,reg_max=15):
        super().__init__()
        self.reg_max=reg_max
        self.num_classes=num_classes

        # Both branches share one layout and differ only in output width.
        self.bbox_layer = self._make_branch(in_channels, 4 * (reg_max + 1))
        self.class_layer = self._make_branch(in_channels, num_classes)

    @staticmethod
    def _make_branch(channels, out_channels):
        """Two 3x3 conv + SiLU stages followed by a 1x1 projection."""
        return nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(channels, out_channels, kernel_size=1, padding=0),
        )

    def forward(self,x):
        """Return ``(box_logits, class_logits)`` computed from ``x``."""
        return self.bbox_layer(x), self.class_layer(x)
        

In [15]:
class Head(nn.Module):
    """One ``Detect`` module per input feature map.

    Args:
        in_channels_list: channel count of each feature map, in order.
        num_classes: forwarded to every ``Detect``.
        reg_max: forwarded to every ``Detect``.
    """

    def __init__(self,in_channels_list,num_classes=1,reg_max=15):
        super().__init__()
        detectors = [Detect(channels, num_classes, reg_max) for channels in in_channels_list]
        self.head_layer = nn.ModuleList(detectors)

    def forward(self,features):
        """Return a list of ``(bbox_pred, cls_pred)`` tuples, one per feature map."""
        # The i-th feature map is handled by the i-th Detect module.
        per_scale = (self.head_layer[idx](fmap) for idx, fmap in enumerate(features))
        return [(bbox_pred, cls_pred) for bbox_pred, cls_pred in per_scale]

# The YOLOv8 Architecture

In [17]:
class YOLOv8(nn.Module):
    """Chains a backbone, a neck and a head into one model.

    Args:
        backbone: callable returning four values; the last three are passed on.
        neck: callable taking three feature maps and returning three.
        head: callable taking a list of three feature maps.
    """

    def __init__(self,backbone,neck,head):
        super().__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    def forward(self,x):
        """Return whatever the head produces for input ``x``."""
        # The first backbone output is not used by the neck.
        _, feat_a, feat_b, feat_c = self.backbone(x)
        fused_a, fused_b, fused_c = self.neck(feat_a, feat_b, feat_c)
        return self.head([fused_a, fused_b, fused_c])

# Helper Functions 

In [18]:
def decode_bboxes(bbox_pred, stride, reg_max=15):
    """Decode per-cell distance distributions into (xc, yc, w, h) boxes.

    Args:
        bbox_pred: tensor of shape [B, 4*(reg_max+1), H, W]; the channel axis
            holds ``reg_max + 1`` bin logits for each of the four sides in the
            order left, top, right, bottom.
        stride: scale factor from feature-map cells to input-image pixels.
        reg_max: largest bin index of each distribution.

    Returns:
        Tensor of shape [B, H*W, 4] in (xc, yc, w, h) format.
    """
    batch, _, height, width = bbox_pred.shape
    n_bins = reg_max + 1

    # [B, 4*(reg_max+1), H, W] -> [B, 4, reg_max+1, H, W]
    logits = bbox_pred.view(batch, 4, n_bins, height, width)

    # Expected bin index per side = softmax-weighted sum of [0, 1, ..., reg_max].
    bin_probs = F.softmax(logits, dim=2)
    bin_index = torch.arange(n_bins, dtype=bin_probs.dtype, device=bin_probs.device)
    dist = (bin_probs * bin_index.view(1, 1, n_bins, 1, 1)).sum(dim=2)  # [B, 4, H, W]
    left, top, right, bottom = dist.unbind(dim=1)

    # Cell centres in input-image space, shaped to broadcast against [B, H, W].
    cols = torch.arange(width, device=bbox_pred.device)
    rows = torch.arange(height, device=bbox_pred.device)
    x_center = ((cols + 0.5) * stride).view(1, 1, width)
    y_center = ((rows + 0.5) * stride).view(1, height, 1)

    # Corners from centre +/- (distance in cells * stride).
    x1 = x_center - left * stride
    y1 = y_center - top * stride
    x2 = x_center + right * stride
    y2 = y_center + bottom * stride

    # Corner format -> centre format, then flatten the H x W grid.
    bboxes = torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
    return bboxes.view(batch, -1, 4)


This function processes class prediction scores by applying a sigmoid activation to convert raw logits into probabilities. It then flattens the predictions and creates a binary mask (`score_mask`) that marks positions where the confidence score exceeds a given threshold (`score_thresh`).


In [19]:
def process_cls_scores(cls_pred,score_thresh=0.4):
    """Turn single-class logits into flattened probabilities and a threshold mask.

    Args:
        cls_pred: logits of shape [B, 1, H, W]; any other channel count
            trips the assertion.
        score_thresh: probabilities strictly above this value are flagged.

    Returns:
        ``(score_mask, probs)``, both of shape [B, H*W].
    """
    batch, n_cls, height, width = cls_pred.shape
    assert n_cls == 1
    probs = torch.sigmoid(cls_pred).view(batch, height * width)
    return probs > score_thresh, probs

This function filters and formats model predictions (bounding boxes and classification scores) into a usable list of detections with coordinates and confidence scores, suitable for evaluation or visualization. It combines multi-scale predictions (from different strides) and selects only high-confidence boxes.

In [20]:
def get_final_predictions(preds, strides=(8, 16, 32), score_thresh=0.4, reg_max=15):
    """Collect confident boxes from multi-scale predictions.

    Args:
        preds: iterable of ``(bbox_pred, cls_pred)`` pairs, one per scale.
        strides: stride of each scale, indexed in the same order as ``preds``.
        score_thresh: minimum class probability (exclusive) for a box to be kept.
        reg_max: largest distribution bin index, forwarded to ``decode_bboxes``.

    Returns:
        List of ``(xc, yc, w, h, score)`` tuples of Python floats.

    NOTE: detections of every image in the batch are appended to the same flat
    list, so the image a box belongs to cannot be recovered from the result.
    """
    all_boxes = []  # Final bounding boxes with scores

    for level, (bbox_pred, cls_pred) in enumerate(preds):
        stride = strides[level]  # Each feature map corresponds to a stride (scale)

        # Decode the bounding boxes from distribution format to [xc, yc, w, h]
        boxes = decode_bboxes(bbox_pred, stride=stride, reg_max=reg_max)

        # Sigmoid scores plus a mask of the confident positions
        conf_mask, confs = process_cls_scores(cls_pred, score_thresh=score_thresh)

        for batch_idx in range(boxes.shape[0]):
            keep = conf_mask[batch_idx]
            # One tolist() per tensor instead of five .item() calls per box.
            kept_boxes = boxes[batch_idx][keep].tolist()
            kept_scores = confs[batch_idx][keep].tolist()
            all_boxes.extend(
                (x, y, w, h, score)
                for (x, y, w, h), score in zip(kept_boxes, kept_scores)
            )

    return all_boxes


In [21]:
def decode_from_distr(pred, reg_max=15):
    """Convert per-side bin logits into expected distances.

    Args:
        pred: tensor of shape [B, 4*(reg_max+1), H, W].
        reg_max: largest bin index of each distribution.

    Returns:
        Tensor of shape [B, H*W, 4] with the expected bin index of each of the
        four sides. The values are in bin units: no stride scaling and no
        conversion to box coordinates is applied here.
    """
    B, _, H, W = pred.shape
    pred = pred.view(B, 4, reg_max + 1, H, W)

    # Softmax over the bin axis gives one distribution per side and cell.
    prob = F.softmax(pred, dim=2)

    # Expected value = sum_k k * p(k), k in [0, reg_max].
    proj = torch.arange(reg_max + 1, dtype=prob.dtype, device=prob.device)
    exp = torch.sum(prob * proj[None, None, :, None, None], dim=2)  # [B, 4, H, W]

    # [B, 4, H, W] -> [B, H*W, 4]
    return exp.view(B, 4, -1).permute(0, 2, 1)

The `get_dfl_targets` function initializes a zero tensor to hold **distributional regression targets** for bounding box distances in a format compatible with **DFL (Distribution Focal Loss)**. Each box gets a `[4, reg_max + 1]` target representing class-like labels over bins for the four box directions (left, top, right, bottom).


In [22]:
def get_dfl_targets(boxes, reg_max=15):
    """Allocate an all-zero DFL target tensor for ``boxes``.

    Args:
        boxes: 3-D tensor; only its first two sizes and its device are used.
        reg_max: largest bin index; each side gets ``reg_max + 1`` bins.

    Returns:
        Zero tensor of shape [B, N, 4, reg_max + 1] on ``boxes.device``.
        No bins are filled in.
    """
    batch, num_boxes, _ = boxes.shape
    target_shape = (batch, num_boxes, 4, reg_max + 1)
    return torch.zeros(target_shape, device=boxes.device)

In [24]:
def make_anchors(feats, strides):
    """Build anchor points and their strides for every feature level.

    Args:
        feats: sequence of 4-D feature maps; ``feats[i]`` belongs to ``strides[i]``.
        strides: one stride per level.

    Returns:
        ``(anchor_points, stride_tensor)``: anchor_points is [sum(h*w), 2] holding
        the (x, y) cell centres scaled by the level stride, stride_tensor is
        [sum(h*w), 1]. Both are float32 and moved to ``feats[0].device``.
    """
    level_points = []
    level_strides = []

    for level, stride in enumerate(strides):
        height, width = feats[level].shape[2:]
        rows = torch.arange(height, dtype=torch.float32)
        cols = torch.arange(width, dtype=torch.float32)
        row_grid, col_grid = torch.meshgrid(rows, cols, indexing='ij')

        # One (x, y) pair per cell, row-major over the h x w grid.
        cells = torch.stack((col_grid.reshape(-1), row_grid.reshape(-1)), dim=1)
        level_points.append(cells * stride + stride / 2)
        level_strides.append(torch.full((cells.shape[0], 1), stride, dtype=torch.float32))

    points = torch.cat(level_points, 0).to(feats[0].device)
    stride_column = torch.cat(level_strides, 0).to(feats[0].device)
    return points, stride_column

In [25]:
def decode_bboxes_dfl(bbox_pred, anchor_points, stride_tensor, reg_max=15):
    """Decode DFL bin logits into corner-format boxes.

    Args:
        bbox_pred: logits of shape [B, N, 4*(reg_max+1)].
        anchor_points: [N, 2] anchor coordinates.
        stride_tensor: [N, 1] stride of each anchor.
        reg_max: largest bin index of each distribution.

    Returns:
        Tensor of shape [B, N, 4]: the anchor minus the first two expected
        distances, then the anchor plus the last two, distances scaled by stride.
    """
    n_batch, n_anchors, _ = bbox_pred.shape
    n_bins = reg_max + 1

    # Per-side distribution over bins, then its expected value.
    side_probs = F.softmax(bbox_pred.view(n_batch, n_anchors, 4, n_bins), dim=-1)
    bin_index = torch.arange(n_bins, dtype=side_probs.dtype, device=side_probs.device)
    distances = (side_probs * bin_index.view(1, 1, 1, n_bins)).sum(dim=-1)  # [B, N, 4]

    near, far = distances[..., :2], distances[..., 2:]
    anchors = anchor_points[None].expand(n_batch, -1, -1)
    strides = stride_tensor[None].expand(n_batch, -1, -1)

    top_left = anchors - near * strides
    bottom_right = anchors + far * strides
    return torch.cat([top_left, bottom_right], dim=-1)


# Non - Max Suppression

In [23]:
import torchvision.ops as ops

def apply_nms(boxes,scores,iou_threshold=0.5):
    """Run non-maximum suppression on centre-format boxes.

    Args:
        boxes: tensor whose first four columns are (xc, yc, w, h).
        scores: one score per box.
        iou_threshold: IoU threshold handed to ``torchvision.ops.nms``.

    Returns:
        int64 tensor with the indices kept by ``ops.nms``; an empty int64
        tensor on ``boxes.device`` when there are no boxes.
    """
    if boxes.numel()==0:
        # Stay on the input's device so both return paths agree.
        return torch.empty(0, dtype=torch.int64, device=boxes.device)

    # Convert [xc, yc, w, h] -> [x1, y1, x2, y2]
    centers = boxes[:, :2]
    half_sizes = boxes[:, 2:4] / 2
    boxes_xyxy = torch.cat([centers - half_sizes, centers + half_sizes], dim=1)

    return ops.nms(boxes_xyxy, scores, iou_threshold=iou_threshold)

# Defining Loss Functions

### 📊 Comparison of YOLOv8 Loss Functions

| Loss Function               | Used For              | Formula                                                                                                       | Key Characteristics                                                                                                                                                             |
|----------------------------|------------------------|---------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Focal Loss**             | Classification         | $\text{FL}(p_t) = -\alpha_t (1 - p_t)^\gamma \log(p_t)$<br>where $p_t = \begin{cases} p & y = 1 \\ 1 - p & y = 0 \end{cases}$ | - Tackles class imbalance by focusing training on hard negatives<br>- Uses parameters $\gamma$ (focusing factor), $\alpha$ (class balance)<br>- Common in dense detectors like RetinaNet and YOLO |
| **Distribution Focal Loss** (DFL) | Box Side Regression     | $\text{DFL} = (1 - w) \cdot \text{NLL}(l) + w \cdot \text{NLL}(r)$<br>where $t$ is the target distance, $l = \lfloor t \rfloor$, $r = l + 1$, $w = t - l$ | - Treats bounding box regression as classification over discrete bins<br>- Improves localization accuracy<br>- Used for bounding box side predictions in YOLOv8                 |
| **CIoU Loss**              | Bounding Box Regression | $\text{CIoU} = 1 - \text{IoU} + \frac{\rho^2(\mathbf{b}, \mathbf{b}^{gt})}{c^2} + \alpha v$<br>where $\rho$ is center distance, $c$ is diagonal of smallest enclosing box, $v = \frac{4}{\pi^2}\left(\arctan\frac{w^{gt}}{h^{gt}} - \arctan\frac{w}{h}\right)^2$ measures aspect-ratio mismatch and $\alpha = \frac{v}{(1 - \text{IoU}) + v}$ | - Improves over IoU by considering object center distance and aspect ratio<br>- Encourages better convergence<br>- Strong performance in precise localization tasks             |


In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Binary focal loss computed on logits.

    ``loss = alpha_t * (1 - p_t) ** gamma * BCE(logits, targets)`` where
    ``p_t`` is the predicted probability of the true class.

    Args:
        gamma: focusing exponent applied to ``1 - p_t``.
        alpha: weight of the positive class (negatives get ``1 - alpha``);
            a negative value disables the alpha weighting.
        reduction: ``'mean'``, ``'sum'``, or anything else for no reduction.
    """

    def __init__(self,gamma=2.0,alpha=0.25,reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self,logits,targets):
        """Return the focal loss of ``logits`` against float ``targets`` of the same shape."""
        probs = torch.sigmoid(logits)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        # Probability assigned to the correct class of each element.
        p_t = targets * probs + (1 - targets) * (1 - probs)
        loss = ce_loss * ((1 - p_t) ** self.gamma)

        if self.alpha >= 0:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            loss = alpha_t * loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss
         

In [None]:
class DistributionFocalLoss(nn.Module):
    """Distribution Focal Loss over ``reg_max + 1`` discrete distance bins.

    Args:
        reg_max: largest valid bin index.
    """

    def __init__(self, reg_max=15):
        super().__init__()
        self.reg_max = reg_max

    def forward(self, pred, target):
        """Return the mean DFL.

        Args:
            pred: bin logits whose last dimension has ``reg_max + 1`` entries.
            target: continuous target distances, one per distribution in ``pred``.

        A target ``t`` is split between its two neighbouring bins ``l = floor(t)``
        and ``l + 1`` with weights ``1 - (t - l)`` and ``t - l``.
        """
        # Keep targets inside [0, reg_max]; an out-of-range value would otherwise
        # become an invalid class index for nll_loss.
        target = target.clamp(0, self.reg_max)

        dis_left = target.long()
        # For target == reg_max the right bin collapses onto the left one;
        # its weight is 0 in that case.
        dis_right = torch.clamp(dis_left + 1, max=self.reg_max)

        weight_right = target - dis_left
        weight_left = 1 - weight_right

        log_prob = pred.log_softmax(dim=-1).view(-1, pred.shape[-1])
        loss_left = F.nll_loss(log_prob, dis_left.view(-1), reduction='none')
        loss_right = F.nll_loss(log_prob, dis_right.view(-1), reduction='none')

        loss = (weight_left.view(-1) * loss_left + weight_right.view(-1) * loss_right).view(target.shape)
        return loss.mean()

In [None]:
def bbox_ciou_loss(pred_boxes, target_boxes, eps=1e-7):
    """Mean Complete-IoU loss between paired boxes.

    ``CIoU = IoU - rho^2 / c^2 - alpha * v`` where ``rho`` is the distance
    between box centres, ``c`` the diagonal of the smallest enclosing box and
    ``v`` the aspect-ratio mismatch. The loss is ``1 - CIoU``.

    Args:
        pred_boxes: [N, 4] predicted boxes in xyxy format.
        target_boxes: [N, 4] target boxes in xyxy format.
        eps: small constant guarding the divisions.

    Returns:
        Scalar tensor with the mean loss; 0 when there are no boxes.
    """
    if pred_boxes.numel() == 0:
        # mean() over nothing would be NaN; return a zero that stays in the graph.
        return pred_boxes.sum() * 0.0

    px1, py1, px2, py2 = pred_boxes[:, :4].unbind(dim=1)
    gx1, gy1, gx2, gy2 = target_boxes[:, :4].unbind(dim=1)

    pw, ph = px2 - px1, py2 - py1
    gw, gh = gx2 - gx1, gy2 - gy1

    # Intersection over union
    inter_w = (torch.min(px2, gx2) - torch.max(px1, gx1)).clamp(min=0)
    inter_h = (torch.min(py2, gy2) - torch.max(py1, gy1)).clamp(min=0)
    inter = inter_w * inter_h
    union = pw * ph + gw * gh - inter + eps
    iou = inter / union

    # Squared distance between box centres
    center_dist = ((px1 + px2) / 2 - (gx1 + gx2) / 2) ** 2 + ((py1 + py2) / 2 - (gy1 + gy2) / 2) ** 2

    # Squared diagonal of the smallest enclosing box
    enclose_w = torch.max(px2, gx2) - torch.min(px1, gx1)
    enclose_h = torch.max(py2, gy2) - torch.min(py1, gy1)
    diagonal = enclose_w ** 2 + enclose_h ** 2 + eps

    # Aspect-ratio term; this is what separates CIoU from DIoU.
    v = (4 / math.pi ** 2) * (torch.atan(gw / (gh + eps)) - torch.atan(pw / (ph + eps))) ** 2
    with torch.no_grad():
        # alpha is a trade-off weight and is treated as a constant for gradients.
        alpha = v / (1 - iou + v + eps)

    ciou = iou - center_dist / diagonal - alpha * v
    loss = 1.0 - ciou.clamp(min=-1.0, max=1.0)
    return loss.mean()