In [130]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """A block of Conv2D -> BatchNorm -> ReLU."""
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

class YOLOBackbone(nn.Module):
    def __init__(self):
        super(YOLOBackbone, self).__init__()
        self.layers = nn.Sequential(
            ConvBlock(3, 32, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),
            ConvBlock(32, 64, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),
            ConvBlock(64, 128, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),
            ConvBlock(128, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2), 
            ConvBlock(256, 512, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),
            ConvBlock(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),  # 14 -> 7
        )

    def forward(self, x):
        return self.layers(x)


class YOLOHead(nn.Module):
    def __init__(self, grid_size, num_classes, num_anchors):
        super(YOLOHead, self).__init__()
        self.grid_size = grid_size
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        self.detector = nn.Conv2d(1024, num_anchors * (5 + num_classes), kernel_size=1)

    def forward(self, x):
        return self.detector(x).permute(0, 2, 3, 1).contiguous()



class YOLO(nn.Module):
    def __init__(self, grid_size=7, num_classes=2, num_anchors=3):
        super(YOLO, self).__init__()
        self.backbone = YOLOBackbone()
        self.head = YOLOHead(grid_size, num_classes, num_anchors)

    def forward(self, x):
        features = self.backbone(x)
        predictions = self.head(features)
        return predictions

# Example usage
model = YOLO(grid_size=7, num_classes=2, num_anchors=3)
print(model)

YOLO(
  (backbone): YOLOBackbone(
    (layers): Sequential(
      (0): ConvBlock(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (2): ConvBlock(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): ConvBlock(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=F

In [131]:
import numpy as np

In [132]:
import numpy as np

def generate_anchors(scales, ratios):
    """Generates anchor boxes for given scales and aspect ratios."""
    anchors = []
    for scale in scales:
        for ratio in ratios:
            width = scale * np.sqrt(ratio)
            height = scale / np.sqrt(ratio)
            anchors.append((width, height))
    return np.array(anchors)

# Example: Scales and ratios
scales = [0.1, 0.2, 0.4]
ratios = [0.5, 1, 2]
anchors = generate_anchors(scales, ratios)
print("Anchor Boxes:", anchors)

Anchor Boxes: [[0.07071068 0.14142136]
 [0.1        0.1       ]
 [0.14142136 0.07071068]
 [0.14142136 0.28284271]
 [0.2        0.2       ]
 [0.28284271 0.14142136]
 [0.28284271 0.56568542]
 [0.4        0.4       ]
 [0.56568542 0.28284271]]


In [133]:
def convert_to_yolo_format(width, height, bbox):
    """Converts absolute bounding box to YOLO format."""
    x_min, y_min, x_max, y_max = bbox
    x_center = (x_min + x_max) / 2 / width
    y_center = (y_min + y_max) / 2 / height
    box_width = (x_max - x_min) / width
    box_height = (y_max - y_min) / height
    return [x_center, y_center, box_width, box_height]

In [134]:
import torchvision.transforms as transforms

train_transforms = transforms.Compose([
    transforms.Resize((448, 448)),
    # transforms.RandomHorizontalFlip(),
    # transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor()
])

In [152]:
def yolo_loss(predictions, targets, num_classes=2, lambda_coord=5, lambda_noobj=0.5):
    """
    Computes YOLO loss.
    - predictions: Predicted tensor of shape [batch_size, S, S, B*(5+C)]
    - targets: Ground truth tensor of shape [batch_size, S, S, B*(5+C)]
    """
    batch_size, S, S, total_features = predictions.shape
    B = 3  # number of anchors
    C = num_classes
    
    # Reshape predictions and targets to separate anchors
    predictions = predictions.view(batch_size, S, S, B, 5 + C)
    targets = targets.view(batch_size, S, S, B, 5 + C)
    
    # Unpack predictions and targets
    pred_boxes = predictions[..., :4]  # [batch_size, S, S, B, 4]
    pred_conf = predictions[..., 4]    # [batch_size, S, S, B]
    pred_classes = predictions[..., 5:] # [batch_size, S, S, B, C]
    
    target_boxes = targets[..., :4]    # [batch_size, S, S, B, 4]
    target_conf = targets[..., 4]      # [batch_size, S, S, B]
    target_classes = targets[..., 5:]  # [batch_size, S, S, B, C]
    
    # Localization Loss (only for cells with objects)
    obj_mask = target_conf > 0
    box_loss = lambda_coord * torch.sum(obj_mask.unsqueeze(-1) * (pred_boxes - target_boxes) ** 2)

    # Confidence Loss
    obj_loss = torch.sum(obj_mask * (pred_conf - target_conf) ** 2)
    noobj_loss = lambda_noobj * torch.sum((~obj_mask) * pred_conf ** 2)

    # Classification Loss (only for cells with objects)
    class_loss = torch.sum(obj_mask.unsqueeze(-1) * (pred_classes - target_classes) ** 2)

    # Total Loss
    total_loss = box_loss + obj_loss + noobj_loss + class_loss
    return total_loss

In [153]:
def encode_yolo_targets(boxes, S=7, B=3, C=2):
    """
    boxes: tensor of shape [num_boxes, 5] (class, x_center, y_center, width, height)
    Returns: tensor of shape [S, S, B*5 + B*C] = [S, S, 21] for B=3, C=2
    """
    target = torch.zeros((S, S, B*5 + B*C))  # Changed from B*5 + C to B*5 + B*C
    if len(boxes) == 0:
        return target
    
    for box in boxes:
        class_idx, x, y, w, h = box
        # Convert to Python scalars
        class_idx = int(class_idx.item()) if hasattr(class_idx, 'item') else int(class_idx)
        x = float(x.item()) if hasattr(x, 'item') else float(x)
        y = float(y.item()) if hasattr(y, 'item') else float(y)
        w = float(w.item()) if hasattr(w, 'item') else float(w)
        h = float(h.item()) if hasattr(h, 'item') else float(h)
        
        i, j = int(y * S), int(x * S)  # grid cell indices
        if i >= S: i = S-1
        if j >= S: j = S-1
        if i < 0: i = 0
        if j < 0: j = 0
        
        # Fill all anchors for this cell
        for anchor_idx in range(B):
            # Box coordinates and objectness (5 values per anchor)
            anchor_offset = anchor_idx * 5
            target[i, j, anchor_offset:anchor_offset+5] = torch.tensor([x, y, w, h, 1])
            
            # Class probabilities (C values per anchor)
            class_offset = B*5 + anchor_idx * C
            if class_idx < C:
                target[i, j, class_offset + class_idx] = 1  # one-hot class for this anchor
    
    return target

In [154]:
# Test shapes
for batch_idx, (images, targets) in enumerate(train_loader):
    if batch_idx == 0:  # Only test first batch
        images = images.to(device)
        targets = targets.to(device)
        
        predictions = model(images)
        
        print("Predictions shape:", predictions.shape)
        print("Targets shape:", targets.shape)
        
        # Test loss function
        loss = yolo_loss(predictions, targets, num_classes=2)
        print("Loss computed successfully:", loss.item())
        break

Predictions shape: torch.Size([8, 7, 7, 21])
Targets shape: torch.Size([8, 7, 7, 21])
Loss computed successfully: 736.7039794921875


In [None]:
import numpy as np

In [155]:
import torch
from torch.utils.data import Dataset
import cv2
import os
import xml.etree.ElementTree as ET
from torchvision import transforms


class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, class_to_idx, transforms=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transforms = transforms
        self.class_to_idx = class_to_idx  # e.g., {"cat": 0, "dog": 1}
        self.images = [f for f in os.listdir(img_dir) if f.endswith(('.jpg', '.png'))]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_filename = self.images[idx]
        img_path = os.path.join(self.img_dir, img_filename)
        label_filename = os.path.splitext(img_filename)[0] + ".xml"
        label_path = os.path.join(self.label_dir, label_filename)

        # Load image using PIL
        image = Image.open(img_path).convert("RGB")
        width, height = image.size

        # Parse XML and extract boxes
        boxes = []
        tree = ET.parse(label_path)
        root = tree.getroot()

        for obj in root.findall('object'):
            label = obj.find('name').text.lower().strip()
            class_idx = self.class_to_idx[label]

            bbox = obj.find('bndbox')
            x_min = float(bbox.find('xmin').text)
            y_min = float(bbox.find('ymin').text)
            x_max = float(bbox.find('xmax').text)
            y_max = float(bbox.find('ymax').text)

            # Convert to YOLO format: [class, x_center, y_center, width, height]
            x_center = (x_min + x_max) / 2 / width
            y_center = (y_min + y_max) / 2 / height
            box_width = (x_max - x_min) / width
            box_height = (y_max - y_min) / height

            boxes.append([class_idx, x_center, y_center, box_width, box_height])

        if self.transforms:
            image = self.transforms(image)
        
                # Handle empty boxes case
        if len(boxes) == 0:
            encoded_target = encode_yolo_targets(torch.empty(0, 5), S=7, B=3, C=2)
        else:
            encoded_target = encode_yolo_targets(torch.tensor(boxes), S=7, B=3, C=2)
        
        return image, encoded_target


class_to_idx = {"cat": 0, "dog": 1} 
# Example: Initialize DataLoader
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor


train_dataset = YOLODataset(img_dir=r"C:\Users\Harshal\playground\object-detection\dataset\images", label_dir=r"C:\Users\Harshal\playground\object-detection\dataset\annotation", class_to_idx=class_to_idx, transforms=train_transforms)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


In [156]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x177e2735c90>

In [157]:
# Reinitialize model with correct architecture
model = YOLO(grid_size=7, num_classes=2, num_anchors=3)
print(model)

# Test the output shape
x = torch.randn(1, 3, 448, 448)
output = model(x)
print("Output shape:", output.shape)  # Should be [1, 7, 7, 75]

YOLO(
  (backbone): YOLOBackbone(
    (layers): Sequential(
      (0): ConvBlock(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (2): ConvBlock(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): ConvBlock(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=F

In [158]:
import torch
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
num_epochs = 10
learning_rate = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 2

# Initialize model, loss function, and optimizer
model = YOLO(grid_size=7, num_classes=num_classes, num_anchors=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Move model to train mode
model.train()

for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch_idx, (images, targets) in enumerate(loop):
        images = images.to(device)
        targets = targets.to(device)

        # Forward pass
        predictions = model(images)

        # Compute loss
        loss = yolo_loss(predictions, targets, num_classes=num_classes)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Update tqdm loop description
        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")


Epoch [1/10]: 100%|██████████| 3/3 [00:07<00:00,  2.60s/it, loss=34.7]


Epoch 1 Loss: 484.0259


Epoch [2/10]: 100%|██████████| 3/3 [00:06<00:00,  2.26s/it, loss=56.3]   


Epoch 2 Loss: 1119.0332


Epoch [3/10]: 100%|██████████| 3/3 [00:06<00:00,  2.15s/it, loss=31.9]


Epoch 3 Loss: 198.5142


Epoch [4/10]: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it, loss=15.5]


Epoch 4 Loss: 215.1785


Epoch [5/10]: 100%|██████████| 3/3 [00:06<00:00,  2.30s/it, loss=10.3]


Epoch 5 Loss: 71.1342


Epoch [6/10]: 100%|██████████| 3/3 [00:12<00:00,  4.29s/it, loss=6.58]


Epoch 6 Loss: 65.1223


Epoch [7/10]: 100%|██████████| 3/3 [00:15<00:00,  5.27s/it, loss=4.97]


Epoch 7 Loss: 53.2828


Epoch [8/10]: 100%|██████████| 3/3 [00:16<00:00,  5.44s/it, loss=5.55]


Epoch 8 Loss: 34.6401


Epoch [9/10]: 100%|██████████| 3/3 [00:14<00:00,  4.77s/it, loss=8.23]


Epoch 9 Loss: 26.4058


Epoch [10/10]: 100%|██████████| 3/3 [00:06<00:00,  2.30s/it, loss=8.03]

Epoch 10 Loss: 27.9706





In [159]:
# Save the trained model
torch.save(model.state_dict(), 'yoloObjectdetection_cat_dog_model.pth')
print("Model saved successfully!")

Model saved successfully!
