In [1]:
import torch
import torch.nn as nn

 As described in the ResNet paper, ResNet50 is built from bottleneck convolution blocks of the form:

 1x1,64 conv -> 3x3,64 conv -> 1x1,256 conv

 Here 1x1 and 3x3 are the kernel sizes, and 64 and 256 are the numbers of output channels of each convolution.

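 Within a block the spatial size (height x width) is preserved unless the 3x3 convolution uses stride 2, in which case it is halved; the number of channels changes as shown above.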

 The last conv layer of each block multiplies the number of channels by an expansion factor of 4 (e.g. 64 -> 256).

In [2]:
# ResNet50
class block(nn.Module):
    """Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv, plus a skip connection.

    Every convolution is followed by batch normalisation. ReLU is applied after
    the first two convolutions and once more after the skip connection is added.

    Args:
        in_channels: Number of channels of the incoming tensor.
        out_channels: Number of channels produced by the first two convolutions;
            the last convolution emits ``out_channels * expansion`` channels.
        identity_downsample: Optional module applied to the block input before
            the addition, used when the input does not already have the shape
            of the main path's output.
        stride: Stride of the 3x3 convolution.
    """

    # Channel multiplier of the last 1x1 convolution. Kept on the class (not
    # only on instances) so code that assembles a network can read
    # ``block.expansion`` without having to build a block first.
    expansion = 4

    def __init__(self, in_channels, out_channels, identity_downsample=None, stride=1):
        super(block, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # Only the 3x3 convolution carries the stride of the block.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, stride=1, padding=0)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU()
        # identity_downsample matches the dimensions of the skip connection
        # with those of the main connection.
        self.identity_downsample = identity_downsample

    def forward(self, x):
        """Run the three conv/bn stages, add the skip connection and apply ReLU."""
        identity = x

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.bn3(x)

        # Bring the skip connection to the shape of the main path so the
        # addition below is well defined.
        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x

In [3]:
class ResNet(nn.Module):
    """ResNet classifier assembled from four stages of a repeated residual block.

    Args:
        block: Residual block class. It is called as
            ``block(in_channels, out_channels, identity_downsample, stride)``
            and is expected to emit ``out_channels * expansion`` channels,
            where ``expansion`` is read from ``block.expansion`` when the class
            exposes it and defaults to 4 otherwise.
        layers: Sequence whose first four entries give how many blocks to stack
            in each stage, e.g. ``[3, 4, 6, 3]`` for ResNet50.
        image_channels: Number of channels of the input images.
        num_classes: Number of outputs of the final fully connected layer.
    """

    def __init__(self, block, layers, image_channels, num_classes):
        super(ResNet, self).__init__()
        # Number of channels entering the next stage; updated by _make_layer.
        self.in_channels = 64

        # Stem (not a residual layer): 7x7 conv, batch norm, ReLU, 3x3 max pool.
        self.conv1 = nn.Conv2d(image_channels, out_channels=64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages
        self.layer1 = self._make_layer(block, layers[0], out_channels=64, stride=1)
        self.layer2 = self._make_layer(block, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layer(block, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layer(block, layers[3], out_channels=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # The last stage emits 512 * expansion channels (2048 for expansion 4).
        self.fc = nn.Linear(512 * self._block_expansion(block), num_classes)

    @staticmethod
    def _block_expansion(block):
        """Return the block's channel multiplier, or 4 if the class does not expose one."""
        # Blocks that only assign ``self.expansion`` inside __init__ have no
        # class-level attribute, so fall back to the bottleneck value of 4.
        return getattr(block, "expansion", 4)

    def forward(self, x):
        """Apply the stem, the four residual stages, global pooling and the classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten to (batch_size, features) before the fully connected layer.
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)

        return x

    def _make_layer(self, block, num_residual_blocks, out_channels, stride):
        """Build one stage as an ``nn.Sequential`` of residual blocks.

        The first block receives ``stride`` and, when needed, a projection for
        the skip connection; the remaining ``num_residual_blocks - 1`` blocks
        use the block's default stride and no projection. The first block is
        always created, whatever the value of ``num_residual_blocks``.
        Updates ``self.in_channels`` to the number of channels the stage emits.
        """
        expanded_channels = out_channels * self._block_expansion(block)
        identity_downsample = None

        # If the spatial size is reduced (stride != 1) or the channel count
        # changes, the skip connection must be adapted so it can be added to
        # the output of the main path.
        if stride != 1 or self.in_channels != expanded_channels:
            identity_downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, expanded_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(expanded_channels),
            )

        layers = [block(self.in_channels, out_channels, identity_downsample, stride)]
        # Every following block takes the expanded channel count as input.
        self.in_channels = expanded_channels

        for _ in range(num_residual_blocks - 1):
            layers.append(block(self.in_channels, out_channels))

        # Unpack the list so nn.Sequential chains the blocks in order.
        return nn.Sequential(*layers)

In [4]:
def Resnet50(img_channels=3, num_classes=1000):
    """Build a ResNet whose four stages repeat the block 3, 4, 6 and 3 times."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(block, stage_depths, img_channels, num_classes)


def Resnet101(img_channels=3, num_classes=1000):
    """Build a ResNet whose four stages repeat the block 3, 4, 23 and 3 times."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(block, stage_depths, img_channels, num_classes)


def Resnet152(img_channels=3, num_classes=1000):
    """Build a ResNet whose four stages repeat the block 3, 8, 36 and 3 times."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(block, stage_depths, img_channels, num_classes)



In [5]:
def tests():
    """Smoke test: push a random batch through Resnet50 and check the output shape.

    The model and the input are placed on the GPU when one is available and on
    the CPU otherwise, so the cell also runs on machines without CUDA.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = Resnet50().to(device)
    net.eval()
    x = torch.randn(2, 3, 224, 224, device=device)
    # No gradients are needed for a shape check.
    with torch.no_grad():
        y = net(x)
    assert y.shape == (2, 1000), y.shape
    print(y.shape)
tests()

torch.Size([2, 1000])
