In [11]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# torch.cuda.get_device_name() raises on machines without a usable CUDA device,
# so only query the GPU name when the CUDA path was actually selected.
if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

Using device: cuda
GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [12]:
import torch.nn as nn

# Architecture of the YOLOv1 convolutional layers
![arch](papers/arch.png)

In [13]:
# Layer-by-layer description of the convolutional backbone of YOLO version 1.

# Each tuple: (kernel_size, out_channels, stride, padding) for one conv block.
# "M": a max-pooling layer with a 2x2 kernel and stride 2.

# A list [tuple_a, tuple_b, n]: apply conv block a followed by conv block b, and repeat that pair n times.

architecture_config = [
    (7, 64, 2, 3),  # 7x7 conv, 64 filters, stride 2 (halves the spatial size)
    "M",

    (3, 192, 1, 1),
    "M",

    # 1x1 convs reduce the channel count before the following 3x3 convs.
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",

    [(1, 256, 1, 0), (3, 512, 1, 1), 4],  # (1x1 -> 3x3) pair repeated 4 times
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",

    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # (1x1 -> 3x3) pair repeated 2 times
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),  # stride 2: the last downsampling step, done by a conv instead of a pool

    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [14]:
class CNNBlock(nn.Module):
    """Convolution -> batch normalization -> leaky ReLU, applied in that order.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of filters of the convolution (and features of
            the batch norm that follows it).
        **kwargs: Extra keyword arguments handed straight to ``nn.Conv2d``
            (e.g. ``kernel_size``, ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # The convolution carries no bias term because batch normalization
        # is applied directly on its output.
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs, bias=False)
        self.batchnorm = nn.BatchNorm2d(num_features=out_channels)
        self.leakyRelu = nn.LeakyReLU(negative_slope=0.1)

    def forward(self, x):
        """Return ``leaky_relu(batchnorm(conv(x)))``."""
        convolved = self.conv(x)
        normalized = self.batchnorm(convolved)
        return self.leakyRelu(normalized)

In [15]:
class YoloV1(nn.Module):
    """YOLOv1 network: a convolutional backbone followed by a fully connected head.

    The backbone is built from the module-level ``architecture_config``; the
    head flattens the backbone output and maps it to ``S * S * (C + B * 5)``
    values per image.

    Args:
        in_channels: Number of channels of the input images (default 3).
        **kwargs: Forwarded unchanged to ``_create_fcs``; must supply
            ``split_size``, ``num_boxes`` and ``num_classes``.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Run ``x`` through the backbone and then through the fully connected head."""
        x = self.darknet(x)
        return self.fcs(x)

    @staticmethod
    def _conv_block(in_channels, spec):
        """Build one ``CNNBlock`` from a ``(kernel_size, out_channels, stride, padding)`` tuple."""
        return CNNBlock(
            in_channels,
            spec[1],
            kernel_size=spec[0],
            stride=spec[2],
            padding=spec[3],
        )

    def _create_conv_layers(self, architecture):
        """Translate an architecture description into an ``nn.Sequential``.

        Supported entries:
            * ``tuple`` ``(kernel_size, out_channels, stride, padding)``: one conv block.
            * ``"M"``: a 2x2 max-pool with stride 2.
            * ``list`` ``[tuple_a, tuple_b, n]``: block a then block b, repeated ``n`` times.

        Args:
            architecture: Iterable of entries in the formats listed above.

        Returns:
            ``nn.Sequential`` containing the layers in the order they were described.

        Raises:
            ValueError: If a string entry is anything other than ``"M"``.
            TypeError: If an entry is not a tuple, string or list.
        """
        layers = []
        in_channels = self.in_channels

        for entry in architecture:
            if isinstance(entry, tuple):
                layers.append(self._conv_block(in_channels, entry))
                in_channels = entry[1]

            elif isinstance(entry, str):
                # Reject typos instead of silently turning every string into a pool.
                if entry != "M":
                    raise ValueError(
                        f"Unknown layer code {entry!r} in architecture; expected 'M'"
                    )
                layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif isinstance(entry, list):
                conv1 = entry[0]  # Tuple
                conv2 = entry[1]  # Tuple
                num_repeats = entry[2]  # Integer

                for _ in range(num_repeats):
                    layers.append(self._conv_block(in_channels, conv1))
                    layers.append(self._conv_block(conv1[1], conv2))
                    # The next repetition (or next entry) consumes conv2's output.
                    in_channels = conv2[1]

            else:
                # Dropping an entry silently would build a different network
                # than the one described, so fail loudly.
                raise TypeError(
                    f"Unsupported architecture entry {entry!r} of type "
                    f"{type(entry).__name__}; expected tuple, str or list"
                )

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the fully connected head.

        Args:
            split_size: Grid size ``S``; the head expects a flattened input of
                ``1024 * S * S`` features.
            num_boxes: Number of boxes ``B`` predicted per grid cell.
            num_classes: Number of classes ``C``.

        Returns:
            ``nn.Sequential`` mapping ``(N, 1024, S, S)`` feature maps to
            ``(N, S * S * (C + B * 5))`` outputs.
        """
        S, B, C = split_size, num_boxes, num_classes
        return nn.Sequential(
            nn.Flatten(start_dim=1),
            # 1024 is the filter count of the last conv entries in architecture_config.
            nn.Linear(1024 * S * S, 496),  # Original paper uses 4096 (too much VRAM required)
            nn.Dropout(0.0),  # p=0.0, so this layer currently drops nothing
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),  # e.g. S * S * 30 when C + B * 5 = 30
        )


def test(S=7, B=2, C=20):
    """Smoke-test YoloV1 by checking the output shape of one forward pass.

    Args:
        S: Grid size passed as ``split_size``.
        B: Boxes per grid cell passed as ``num_boxes``.
        C: Number of classes passed as ``num_classes``.

    Raises:
        AssertionError: If the output shape is not ``(2, S * S * (C + B * 5))``.
    """
    model = YoloV1(split_size=S, num_boxes=B, num_classes=C)
    # architecture_config halves the spatial size six times (two stride-2 convs
    # and four "M" pools), so an S x S grid needs a (64 * S)-pixel input
    # side: 448 for the default S=7.
    side = 64 * S
    x = torch.randn((2, 3, side, side))
    # Only the output shape is inspected, so skip building the autograd graph.
    with torch.no_grad():
        out = model(x)
    print(out.shape)
    expected = (2, S * S * (C + B * 5))
    assert out.shape == expected, f"expected {expected}, got {tuple(out.shape)}"

test()

torch.Size([2, 1470])
