In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [10]:
# nn.Module is the base (parent) class for every model and layer in PyTorch.
# It registers parameters (weights and biases) automatically and exposes them through parameters() for training.
# Only the forward rule has to be written (def forward); PyTorch's autograd handles backpropagation (gradients).
# It supports composition: a layer can be built from other layers (like LEGO bricks).
# It also provides utilities such as .train(), .eval(), .cuda(), .cpu(), and .state_dict() (used to save/load a model).
class Conv(nn.Module):
    """Basic building block: Conv2d -> BatchNorm2d -> SiLU.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels.
        k: Convolution kernel size (default 1).
        s: Convolution stride (default 1).
        p: Padding; when None it is derived from ``k`` by ``autopad``.
        g: Number of convolution groups (default 1).
    """

    def __init__(self, c_in, c_out, k=1, s=1, p=None, g=1):
        super().__init__()
        padding = autopad(k, p)
        # The convolution carries no bias term; BatchNorm2d is applied right after it.
        self.conv = nn.Conv2d(
            in_channels=c_in,
            out_channels=c_out,
            kernel_size=k,
            stride=s,
            padding=padding,
            groups=g,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=c_out)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        """Run convolution, batch normalisation and SiLU activation in that order."""
        convolved = self.conv(x)
        normalised = self.bn(convolved)
        return self.act(normalised)
    
def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: Kernel size. Either a single number or a tuple/list with one entry
            per spatial dimension (``nn.Conv2d`` accepts both forms).
        p: Explicit padding. When it is not None it is returned unchanged.

    Returns:
        ``p`` if it was given. Otherwise ``k // 2`` for a scalar ``k``, or a
        tuple holding ``ki // 2`` for every entry of a tuple/list ``k``.
    """
    if p is not None:
        return p
    if isinstance(k, (tuple, list)):
        # Per-dimension kernel sizes need per-dimension padding; ``k // 2``
        # on a tuple/list would raise TypeError.
        return tuple(ki // 2 for ki in k)
    # padding = kernel_size // 2
    return k // 2

In [11]:
class Bottleneck(nn.Module):
    """Two stacked Conv blocks (1x1 then 3x3) with an optional residual connection.

    Args:
        c_in: Number of channels of the input feature map (e.g. 3 for RGB).
        c_out: Number of channels produced by both convolutions.
        shortcut: If True, the input is added to the output. The addition is
            only enabled when ``c_in == c_out`` so the channel counts match.
    """

    def __init__(self, c_in, c_out, shortcut=True):
        super().__init__()
        self.cv1 = Conv(c_in, c_out, 1, 1)  # 1x1 conv: c_in -> c_out
        # cv2 consumes cv1's output, so its input width must be c_out.
        # (It was c_in, which made forward() fail whenever c_in != c_out.)
        self.cv2 = Conv(c_out, c_out, 3, 1)
        # Skip connection only when input and output have the same channel count.
        self.add = shortcut and c_in == c_out

    def forward(self, x):
        """Return ``x + cv2(cv1(x))`` when the shortcut is active, else ``cv2(cv1(x))``."""
        y = self.cv2(self.cv1(x))
        return x + y if self.add else y  # x + F(x)

In [None]:
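# Cross Stage Partial (CSP) style block.
# The input goes through two parallel 1x1 convs (cv1, cv2); only the cv2 branch passes through the bottlenecks, then the two branches are concatenated and fused by cv3.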
class C2f(nn.Module):
    """CSP-style block with two parallel 1x1 branches fused by a final 1x1 Conv.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels; each branch works on ``c_out // 2`` channels.
        n: Number of Bottleneck blocks chained on the second branch (default 1).
    """

    def __init__(self, c_in, c_out, n=1):
        super().__init__()
        branch_ch = c_out // 2
        self.cv1 = Conv(c_in, branch_ch, k=1, s=1)
        self.cv2 = Conv(c_in, branch_ch, k=1, s=1)
        stacked = [Bottleneck(branch_ch, branch_ch) for _ in range(n)]
        self.m = nn.Sequential(*stacked)
        self.cv3 = Conv(branch_ch * 2, c_out, k=1)

    def forward(self, x):
        """Concatenate the plain branch and the bottleneck branch on dim 1, then fuse."""
        plain_branch = self.cv1(x)
        deep_branch = self.cv2(x)
        deep_branch = self.m(deep_branch)
        merged = torch.cat((plain_branch, deep_branch), dim=1)
        return self.cv3(merged)


In [13]:
class SPPF(nn.Module):
    """Pooling block: a 1x1 Conv, three chained max-pools, then a fusing 1x1 Conv.

    The reduced feature map and the output of each of the three successive
    max-pool applications are concatenated on dim 1 before the final Conv.

    Args:
        c_in: Number of input channels; the first Conv reduces to ``c_in // 2``.
        c_out: Number of output channels.
        k: Max-pool kernel size (default 5); pooling uses stride 1 and padding ``k // 2``.
    """

    def __init__(self, c_in, c_out, k=5):
        super().__init__()
        reduced = c_in // 2
        self.cv1 = Conv(c_in, reduced, k=1, s=1)
        # Four tensors of `reduced` channels each are concatenated in forward().
        self.cv2 = Conv(4 * reduced, c_out, k=1, s=1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Return the fused concatenation of the reduced map and its three pooled versions."""
        levels = [self.cv1(x)]
        for _ in range(3):
            # Each pool is applied to the previous level's output.
            levels.append(self.m(levels[-1]))
        return self.cv2(torch.cat(levels, 1))

In [14]:
class YOLOv8(nn.Module):
    """YOLOv8-style detector: Conv/C2f backbone with SPPF, FPN + PAN neck, three 1x1 heads.

    The backbone contains four stride-2 convolutions, so the three returned
    feature maps are at roughly 1/4, 1/8 and 1/16 of the input resolution
    (they are named p3/p4/p5 in the code).

    Args:
        num_classes: Number of object classes. Each head outputs
            ``4 + 1 + num_classes`` channels per location (presumably 4 box
            values + 1 objectness score + class scores; confirm against the loss).
    """

    def __init__(self, num_classes=9):
        super().__init__()
        # Backbone
        self.layer1 = Conv(3, 32, 3, 2)
        self.layer2 = C2f(32, 64, n=1)
        self.layer3 = Conv(64, 128, 3, 2)
        self.layer4 = C2f(128, 128, n=2)
        self.layer5 = Conv(128, 256, 3, 2)
        self.layer6 = C2f(256, 256, n=2)
        self.layer7 = Conv(256, 512, 3, 2)
        self.layer8 = C2f(512, 512, n=1)
        self.layer9 = SPPF(512, 512)

        # Neck (FPN + PAN)
        # Kept as a module so code referencing `model.upsample` keeps working;
        # forward() reuses its interpolation mode.
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
        self.c2f1 = C2f(512 + 256, 256, n=1)
        self.c2f2 = C2f(256 + 128, 128, n=1)
        self.down1 = Conv(128, 256, 3, 2)
        self.c2f3 = C2f(256 + 256, 256, n=1)
        self.down2 = Conv(256, 512, 3, 2)
        self.c2f4 = C2f(512 + 512, 512, n=1)

        # Head (3 scales)
        self.detect_p3 = nn.Conv2d(128, 4 + 1 + num_classes, 1)
        self.detect_p4 = nn.Conv2d(256, 4 + 1 + num_classes, 1)
        self.detect_p5 = nn.Conv2d(512, 4 + 1 + num_classes, 1)

    def _upsample_to(self, x, ref):
        """Resize ``x`` to the spatial size (last two dims) of ``ref``."""
        # A fixed 2x upsample yields a size mismatch in torch.cat when a backbone
        # map has an odd size (e.g. 100x100 input -> 13 vs 2 * 7 = 14). Resizing to
        # the skip tensor's size equals the 2x upsample when sizes halve cleanly.
        return F.interpolate(x, size=ref.shape[-2:], mode=self.upsample.mode)

    def forward(self, x):
        """Run backbone, neck and heads.

        Args:
            x: Input batch with 3 channels on dim 1 (layer1 is ``Conv(3, 32, ...)``).

        Returns:
            List ``[out_p3, out_p4, out_p5]`` of raw head outputs, from the
            highest-resolution map to the lowest.
        """
        # Backbone
        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        x5 = self.layer5(x4)
        x6 = self.layer6(x5)
        x7 = self.layer7(x6)
        x8 = self.layer8(x7)
        x9 = self.layer9(x8)  # P5

        # FPN (top-down): upsample the deeper map and merge with the backbone skip
        p5 = x9
        p4 = self.c2f1(torch.cat([self._upsample_to(p5, x6), x6], 1))
        p3 = self.c2f2(torch.cat([self._upsample_to(p4, x4), x4], 1))

        # PAN (bottom-up): downsample the finer map and merge with the FPN output
        n4 = self.c2f3(torch.cat([self.down1(p3), p4], 1))
        n5 = self.c2f4(torch.cat([self.down2(n4), p5], 1))

        # Head
        out_p3 = self.detect_p3(p3)
        out_p4 = self.detect_p4(n4)
        out_p5 = self.detect_p5(n5)

        return [out_p3, out_p4, out_p5]

In [15]:
if __name__ == "__main__":
    # Smoke test: build the model, push one random 640x640 RGB-sized batch through it
    # and print the shape of each of the three head outputs.
    model = YOLOv8(num_classes=9)
    dummy = torch.randn(1, 3, 640, 640)
    # Only shapes are needed: eval() keeps BatchNorm running statistics untouched and
    # no_grad() avoids building an autograd graph for the forward pass.
    model.eval()
    with torch.no_grad():
        outputs = model(dummy)
    model.train()  # restore the default mode of a freshly constructed module
    for i, out in enumerate(outputs):
        print(f"Scale P{i+3}: {out.shape}")

TypeError: C2f.__init__() got an unexpected keyword argument 'n'