In [5]:
from google.colab import drive
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.modules.upsampling import Upsample
import torch.nn.functional as F
#drive.mount('/content/gdrive')

In [None]:
%cd gdrive/MyDrive/yolov3

### Torch modules

In [110]:
class CnnBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size = 3, padding = 0, stride = 1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size = kernel_size , stride= stride, padding = padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        return out

class Upsample(nn.Module):
    def __init__(self, in_channels, out_channels, scale=2):
        super(Upsample, self).__init__()

        self.upsample = nn.Sequential(
            CnnBlock(in_channels = in_channels, out_channels = out_channels, kernel_size = 1),
            nn.Upsample(scale_factor = scale)
        )

    def forward(self, x):
        return self.upsample(x)

class Downsample(nn.Module):
    def __init__(self, in_channels, out_channels, scale=2):
        super(Downsample, self).__init__()

        self.downsample = CnnBlock(in_channels = in_channels, out_channels = out_channels, kernel_size = 3, stride = 2, padding = 1)

    def forward(self, x):
        return self.downsample(x)

class CnnBlockNoBnActiv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        out = self.conv(x)
        return out

class ScaledPrediction(nn.Module):
    def __init__(self, channels, nclasses, padding = 1):
        super().__init__()
        self.nclasses = nclasses
        self.cnn = CnnBlock(channels, (nclasses + 5) * 3, kernel_size = 1,  padding = 0)
        #self.scaled_pred = nn.Sequential(
        #                    CnnBlock(channels, channels * 2, kernel_size = 3,  padding = 1),
        #                    CnnBlock(channels * 2, (nclasses + 5) * 3, kernel_size = 1,  padding = 0)
        #)

    def forward(self, x):
        out = self.cnn(x)
        # out = out.reshape(x.shape[0], 3, self.nclasses + 5, x.shape[2], x.shape[3]).permute(0, 1, 3, 4, 2)
        #print('ScaledPred.shape: ', out.shape)
        return out.reshape(x.shape[0], 3, self.nclasses + 5, x.shape[2], x.shape[3]).permute(0, 1, 3, 4, 2)

class SpatialPyramidPooling(nn.Module):
    def __init__(self):
        super().__init__()

        self.pyramid = nn.Sequential(
                                      nn.MaxPool2d(5, 1, 5 // 2),
                                      nn.MaxPool2d(9, 1, 9 // 2),
                                      nn.MaxPool2d(13, 1, 13 // 2)
                                      )
        #print(self.pyramid)

    def forward(self, x):
        features = [block(x) for block in self.pyramid]
        features = torch.cat([x] + features, dim=1)
        print('SSP out shape:', features.shape)
        return features


In [111]:
class PathAggregationNet(nn.Module):
    def __init__(self):
        super().__init__()

        self.feature_transform3 = CnnBlock(in_channels = 64, out_channels = 128, kernel_size = 1)

        self.feature_transform4 = CnnBlock(in_channels = 160 , out_channels = 256, kernel_size = 1)

        self.resample5_4 = Upsample(in_channels = 512, out_channels = 256)
        self.resample4_3 = Upsample(in_channels = 256, out_channels = 128)
        self.resample3_4 = Downsample(in_channels = 128, out_channels = 256)
        self.resample4_5 = Downsample(in_channels = 256, out_channels = 512)

        self.downstream_conv5 = nn.Sequential(
            # 2048, 512
            CnnBlock(in_channels = 2048, out_channels = 512, kernel_size = 1),
            # 512, 1024
            CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3,  padding = 1),
            # 1024, 512
            CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1)
        )

        self.downstream_conv4 = nn.Sequential(
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
            CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3, padding = 1),
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
            CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
        )
        self.downstream_conv3 = nn.Sequential(
            CnnBlock(in_channels = 256, out_channels = 128, kernel_size = 1),
            CnnBlock(in_channels = 128, out_channels = 256, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 256, out_channels = 128, kernel_size = 1),
            CnnBlock(in_channels = 128, out_channels = 256, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 256, out_channels = 128, kernel_size = 1),
        )

        self.upstream_conv4 = nn.Sequential(
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
            CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
            CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1),
        )
        self.upstream_conv5 = nn.Sequential(
            CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1),
            CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1),
            CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3,  padding = 1),
            CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1)
        )

    def forward(self, scale1, scale2, scale3):
        print("PanNet In 1 shape:", scale1.shape)
        print("PanNet In 2 shape:", scale2.shape)
        print("PanNet In 3 shape:", scale3.shape)
        #features = [self.feature_transform3(features[0]), self.feature_transform4(features[1]), features[2]]
        x1 = self.feature_transform3(scale1)
        x2 = self.feature_transform4(scale2)
        x3 = scale3

        downstream_feature5 = self.downstream_conv5(x3)
        route1 = torch.cat((x2, self.resample5_4(downstream_feature5)), dim = 1)
        downstream_feature4 = self.downstream_conv4(route1)
        route2 = torch.cat((x1, self.resample4_3(downstream_feature4)), dim = 1)
        downstream_feature3 = self.downstream_conv3(route2)

        route3 = torch.cat((self.resample3_4(downstream_feature3), downstream_feature4), dim = 1)
        upstream_feature4 = self.upstream_conv4(route3)
        route4 = torch.cat((self.resample4_5(upstream_feature4), downstream_feature5), dim = 1)
        upstream_feature5 = self.upstream_conv5(route4)

        return downstream_feature3, upstream_feature4, upstream_feature5


In [112]:
# We try to stay true as close as possible to the darknet yolov3.cfg
# we however made changes and do not count [route], [shortcut] or
# [yolo] blocks as seperate layers in the network. These are generally
# not counted as seprate layers by the darknet framework either.
class YoloV4_EfficentNet(nn.Module):
      def __init__(self, nclasses = 20):
        super(YoloV4_EfficentNet, self).__init__()
        self.nclasses = nclasses

        self.efficientnetbackbone =  nn.Sequential(
              *list(models.efficientnet_v2_s(pretrained=True).children())[:-2]
              )

        self.yolov4coadaptation = nn.Sequential(
            CnnBlock(in_channels = 1280, out_channels = 512, kernel_size = 1, padding = 0), # L1 (done)
            CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3, padding = 1), # L2 (done)
            CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1, padding = 0), # L3 (done)
            )

        self.yolov4neck = nn.Sequential(
            SpatialPyramidPooling(),
            PathAggregationNet(),
            )



        self.yolov4head = nn.Sequential(

              )

      def forward(self, x):

          # the original yolov4 backbone Darknet53 CPS returns features maps at
          # different scales, which are then further processed by the SSP and
          # PaNet. Lastly the predictions are also made at 3 different scales.
          # We adjust the backbone to accomodate an efficentnet backbone. The
          # principle however stays the same.
          backbone_scale1 = self.efficientnetbackbone[0][:4](x)
          backbone_scale2 = self.efficientnetbackbone[0][4:6](backbone_scale1)
          # scale 3 is out of final passed onto following parts of the architecture
          backbone_scale3 = self.efficientnetbackbone[0][6:](backbone_scale2)

          x = self.yolov4coadaptation(backbone_scale3)
          ssp_out = self.yolov4neck[0](x)
          panet_scale1, panet_scale2, panet_scale3 = self.yolov4neck[1](backbone_scale1, backbone_scale2, ssp_out)
          print('PANET out features[0] shape:', panet_scale1.shape)
          print('PANET out features[1] shape:', panet_scale2.shape)
          print('PANET out features[2] shape:', panet_scale3.shape)

          return x


In [113]:
if __name__ == "__main__":
    img_size = 416
    nclasses = 20
    model = YoloV4_EfficentNet(nclasses = nclasses)
    x = torch.randn((2, 3, img_size, img_size))
    out = model(x)
    #assert model(x)[0].shape == (2, 3, img_size//32, img_size//32, nclasses + 5)
    #assert model(x)[1].shape == (2, 3, img_size//16, img_size//16, nclasses + 5)
    #assert model(x)[2].shape == (2, 3, img_size//8, img_size//8, nclasses + 5)
    print("Success!")



SSP out shape: torch.Size([2, 2048, 13, 13])
PanNet In 1 shape: torch.Size([2, 64, 52, 52])
PanNet In 2 shape: torch.Size([2, 160, 26, 26])
PanNet In 3 shape: torch.Size([2, 2048, 13, 13])
PANET out features[0] shape: torch.Size([2, 128, 52, 52])
PANET out features[1] shape: torch.Size([2, 256, 26, 26])
PANET out features[2] shape: torch.Size([2, 512, 13, 13])
Success!


#### YoloV3

In [36]:
class YoloV3_EfficentNet(nn.Module):
      def __init__(self, nclasses = 20):
        super(YoloV3_EfficentNet, self).__init__()

        self.efficientnetbackbone =  nn.Sequential(
              *list(models.efficientnet_v2_s(pretrained=True).children())[:-2]
              )

        self.yolov4neck = nn.Sequential(
                                        SpatialPyramidPooling(),
                                        PathAggregationNet()

        )

        self.yolov3head = nn.Sequential(

              # We try to stay true as close as possible to the darknet yolov3.cfg
              # we however made changes and do not count [route], [shortcut] or
              # [yolo] blocks as seperate layers in the network. These are generally
              # not counted as seprate layers by the darknet framework either.

              CnnBlock(in_channels = 1280, out_channels = 512, kernel_size = 1, padding = 0), # L1 (done)
              CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3, padding = 1), # L2 (done)
              CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1, padding = 0), # L3 (done)

              ## SSP
              SpatialPyramidPooling(),
              PathAggregationNet(),

              CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3, padding = 1), # L4 (-4)
              CnnBlock(in_channels = 1024, out_channels = 512, kernel_size = 1, padding = 0), # L5 (61th in yolov3.cfg)
              CnnBlock(in_channels = 512, out_channels = 1024, kernel_size = 3, padding = 1), # L6

              # pred at first scale 1
              ScaledPrediction(1024, nclasses),                                               # L7

              # Route1: Concat output of L8 with out of L4 (Layer -4 before)
              CnnBlock(in_channels = 1024, out_channels = 256, kernel_size = 1, padding = 0),  # L8

              # Upsample palcehooder: Upsample is in the forward pass                         # U1

              # Route 2: Concat output of L10 with layer -1 before skip upsample so L8
              # Also concat with layer indexed as 61 (since we use EfficentNet, we will have to see whether)
              # EfficentNet has a suitable layer which we can concat
              CnnBlock(in_channels = 1024 + 256, out_channels = 256, kernel_size = 1, padding = 0), # L9

              # Due to concat in the forward pass between tensors outs of previous layers
              # with channel size 512, 256, 256 we adjust input channels
              CnnBlock(in_channels = 512 + 256 + 256 , out_channels = 512, kernel_size = 3, padding = 1), # L10
              CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1, padding = 0), # L11

              CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3, padding = 1), # L12 (-4)
              CnnBlock(in_channels = 512, out_channels = 256, kernel_size = 1, padding = 0), # L13
              CnnBlock(in_channels = 256, out_channels = 512, kernel_size = 3, padding = 1), # L14

              # pred at 2nd scale
              ScaledPrediction(512, nclasses),                                               # L15

              # Block 3

              # Route 3: Concat output of L16 with out of L12 (Layer -4 before)
              CnnBlock(in_channels = 512, out_channels = 128, kernel_size = 1, padding = 0), # L16

              # Upsample palcehooder: Upsample is in the forward pass                         # U2

              # Route 4: Concat output of L17 with layer -1 before (output of upsample)
              # Also concat with layer indexed as 36 (since we use EfficentNet, we will have to see whether)
              # EfficentNet has a suitable layer which we can concat
              CnnBlock(in_channels = 512 + 128, out_channels = 128, kernel_size = 1, padding = 0), # L17

              CnnBlock(in_channels = 128 + 128 + 128, out_channels = 256, kernel_size = 3, padding = 1), # L18
              CnnBlock(in_channels = 256, out_channels = 128, kernel_size = 1, padding = 0), # L19
              CnnBlock(in_channels = 128, out_channels = 256, kernel_size = 3, padding = 1), # L20
              CnnBlock(in_channels = 256, out_channels = 128, kernel_size = 1, padding = 0), # L21
              CnnBlock(in_channels = 128, out_channels = 256, kernel_size = 3, padding = 1), # L22
              # pred at 3rd scale
              ScaledPrediction(256, nclasses), # L23
              )

      def forward(self, x):

          # From here Backbone.
          # Yolov3 has a skip connection from Layer 36, which is in the backbone
          # of the darknet53 network connecting to L18 of the Yolohead.
          # We select suitable candidate layer L49 from the EfficentNet backbone
          # to mimic the behavior of the very same skip connection.
          # So we extract the output of L49 in the EfficentNet backbone for
          # concatination (see route 4: in the forward pass).
          backboneoutL49_a = self.efficientnetbackbone[0][:2](x)
          print('Intermediate A backbone shape:', backboneoutL49_a.shape)
          backboneoutL49_b = self.efficientnetbackbone[0][:3](x)
          print('Intermediate B backbone shape:', backboneoutL49_b.shape)
          backboneoutL49_c = self.efficientnetbackbone[0][:4](x)
          print('Intermediate C backbone shape:', backboneoutL49_c.shape)
          backboneoutL49_d = self.efficientnetbackbone[0][:5](x)
          print('Intermediate d backbone shape:', backboneoutL49_d.shape)
          backboneoutL49_e = self.efficientnetbackbone[0][:6](x)
          print('Intermediate e backbone shape:', backboneoutL49_e.shape)

          backboneoutL49_f = self.efficientnetbackbone[0][:7](x)
          print('Intermediate f backbone shape:', backboneoutL49_f.shape)
          backboneoutL49_g = self.efficientnetbackbone[0][:8](x)
          print('Intermediate egbackbone shape:', backboneoutL49_g.shape)
          # pass out L49 back into the backbone.
          x = self.efficientnetbackbone[0][5:](backboneoutL49_c)
          print("Backbone shape:", x.shape)
          x = self.yolov4neck(x)
          print("Neck shape:", x.shape)
          # From here YoloHead.
          x = self.yolov3head[0](x)                       # L1
          print("Yolohead L1 out.shape:", x.shape)
          x = self.yolov3head[1](x)                       # L2
          print("Yolohead L2 out.shape:", x.shape)
          x = self.yolov3head[2](x)                       # L3
          print("Yolohead L3 out.shape:", x.shape)
          outL4 = self.yolov3head[3](x)                   # L4
          print("Yolohead L4 out.shape:", outL4.shape)
          outL5 = self.yolov3head[4](outL4)               # L5
          print("Yolohead L5 out.shape:", x.shape)
          x = self.yolov3head[5](outL5)                   # L6
          print("Yolohead L6 out.shape:", x.shape)


          # Save the output of the first ScaledPrediction layer
          scaled_pred1 = self.yolov3head[6](x)           # L7
          print("Yolohead L7 out.shape:", scaled_pred1.shape)


          outL8 = self.yolov3head[7](x)           # L8
          print("Yolohead L8 out.shape:", outL8.shape)
          # Route 2: L4, L8 to L9
          x = torch.cat((outL4, outL8), dim=1)
          print("Concat 1 (outL4, x) :", x.shape)
          upsample1 = F.interpolate(x, scale_factor=2, mode='nearest') # U1
          print("Yolohead U1:", upsample1.shape)


          outL5 = F.interpolate(outL5, scale_factor=2, mode='nearest') # U1
          outL8 = F.interpolate(outL8, scale_factor=2, mode='nearest') # U1
          # Upsampling is between L8 and L9, so output of layers beyond L8
          # are not upsampled

          outL9 = self.yolov3head[8](upsample1)             # L9
          print("Route 2 vars", outL8.shape, outL5.shape, outL9.shape)
          print("Yolohead Layer 9 out.shape:", outL9.shape)
          # Route 2: L5, L8 to L9 to L10
          x = torch.cat((outL5, outL8, outL9), dim=1)
          print("Concat 2 Yolohead (outL5, outL8, upsample1) :", x.shape)
          x = self.yolov3head[9](x)                     # L10
          print("Yolohead L10 out.shape:", x.shape)
          x = self.yolov3head[10](x)                    # L11
          print("Yolohead L11 out.shape:", x.shape)
          outL12 = self.yolov3head[11](x)                # L12
          print("Yolohead L12 out.shape:", outL12.shape)
          x = self.yolov3head[12](outL12)                # L13
          print("Yolohead L13 out.shape:", x.shape)
          x = self.yolov3head[13](x)                    # L14
          print("Yolohead L14 out.shape:", x.shape)
          # Save the output of the second ScaledPrediction layer
          scaled_pred2 = self.yolov3head[14](x)         # L15
          print("Scaled Pred 2 Yolohead Layer 15 out.shape:", x.shape)


          outL16 = self.yolov3head[15](x)        # L16
          print("Yolohead L16 out.shape:", outL16.shape)
          # Route 3: L12, L16 to L17
          x = torch.cat((outL12, outL16), dim=1)
          print("Concat 3 (outL12, x) :", x.shape)
          upsample2 = F.interpolate(x, scale_factor=2, mode='nearest') # U2
          outL16 = F.interpolate(outL16, scale_factor=2, mode='nearest') # U2
          backboneoutL49 = F.interpolate(backboneoutL49, scale_factor=2, mode='nearest') # U2
          print("UYolohead U2:", upsample2.shape)
          outL17 = self.yolov3head[16](upsample2)            # L17
          print("Yolohead L17 out.shape:", outL17.shape)
          x = torch.cat((backboneoutL49, outL16, outL17), dim=1)
          print('Concat 4:', backboneoutL49.shape, outL16.shape, outL17.shape)
          x = self.yolov3head[17](x)                    # L18
          print("Yolohead L18 out.shape:", x.shape)
          x = self.yolov3head[18](x)                    # L19
          print("Yolohead L19 out.shape:", x.shape)
          x = self.yolov3head[19](x)                    # L20
          print("Yolohead L20 out.shape:", x.shape)
          x = self.yolov3head[20](x)                    # L21
          print("Yolohead L21 out.shape:", x.shape)
          x = self.yolov3head[21](x)                    # L22
          print("Yolohead L22 out.shape:", x.shape)
          # Save the output of the third ScaledPrediction layer
          scaled_pred3 = self.yolov3head[22](x)         # L23
          print("Yolohead L23 out.shape:", scaled_pred3.shape)

          return scaled_pred1, scaled_pred2, scaled_pred3


In [37]:
if __name__ == "__main__":
    img_size = 416
    nclasses = 20
    model = YoloV3_EfficentNet(nclasses = nclasses)
    x = torch.randn((2, 3, img_size, img_size))
    out = model(x)
    #assert model(x)[0].shape == (2, 3, img_size//32, img_size//32, nclasses + 5)
    #assert model(x)[1].shape == (2, 3, img_size//16, img_size//16, nclasses + 5)
    #assert model(x)[2].shape == (2, 3, img_size//8, img_size//8, nclasses + 5)
    print("Success!")



Intermediate A backbone shape: torch.Size([2, 24, 208, 208])
Intermediate B backbone shape: torch.Size([2, 48, 104, 104])
Intermediate C backbone shape: torch.Size([2, 64, 52, 52])
Intermediate d backbone shape: torch.Size([2, 128, 26, 26])
Intermediate e backbone shape: torch.Size([2, 160, 26, 26])
Intermediate f backbone shape: torch.Size([2, 256, 13, 13])
Intermediate egbackbone shape: torch.Size([2, 1280, 13, 13])


RuntimeError: ignored

In [None]:
PathAggregationNet()

PathAggregationNet(
  (panet): Sequential(
    (0): CnnBlock(
      (conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1))
      (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): LeakyReLU(negative_slope=0.1)
    )
    (1): CnnBlock(
      (conv): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): LeakyReLU(negative_slope=0.1)
    )
    (2): CnnBlock(
      (conv): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1))
      (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): LeakyReLU(negative_slope=0.1)
    )
    (3): CnnBlock(
      (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
      (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): LeakyReLU(negative_slope=0.1)
    )
    (4): CnnBlock(
      

In [None]:
scaled pred: torch.Size([2, 75, 13, 13])
scaled pred: torch.Size([2, 75, 26, 26])
scaled pred: torch.Size([2, 75, 52, 52])

scaled pred: torch.Size([2, 75, 13, 13])
scaled pred: torch.Size([2, 75, 26, 26])
scaled pred: torch.Size([2, 75, 52, 52])

In [None]:
print((2, 3, 416//32, 416//32, 20 + 5))
print((2, 3, 416//16, 416//16, 20 + 5))
print((2, 3, 416//8, 416//8, 20 + 5))

(2, 3, 13, 13, 25)
(2, 3, 26, 26, 25)
(2, 3, 52, 52, 25)
