## 3D ResNet Architecture Implementation in PyTorch

based on https://www.kaggle.com/code/banerz/resnet-implementation-in-pytorch

<img src="https://pytorch.org/assets/images/resnet.png">

Image source https://pytorch.org/assets/images/resnet.png

In [1]:
import torch
from torch import nn

ResNet18 and Resnet34 are built on same residual block with `3x3` kernel whereas ResNet architectures from 50 to 152 layers has a point-wise convolution with 64 output feature maps followed by a convolution with `3x3` kernel with 64 output feature maps, and finally ends with a point-wise convolution with 64*4 =256 channels.

So we will build two residual blocks
- Basicblock block (for ResNet 18 & 34)
- Bottleneck block (for ResNet 50,101 & 152)

### Basicblock

`down_sample` in the code below is a boolean function which is required to change the number of channels and the size of the input feature map such that it can be added with the previous output.

Why is it required?

Let's say you have a input tensor `x` with dimensions `[3,28,28]`. After passing this input into the function `F(x)`, the output tensor size is `[16,24,24]`. As you can clearly see both the dimensions are different, therefore `F(x)+x` operation cannot be performed, we need to change the input dimensions to make it work. Hence, the `down_sample` layer in the code ensures that both the output tensors have same dimensions.

This is what a `"skip connection"` is.

<img src="https://miro.medium.com/max/1140/1*D0F3UitQ2l5Q0Ak-tjEdJg.png" width=25% height = 25%>

image source https://miro.medium.com/max/1140/1*D0F3UitQ2l5Q0Ak-tjEdJg.png

In [14]:

class BasicBlock(nn.Module):
    def __init__(self,in_features=64,out_features=64,stride=[1,1],down_sample=False):
        # stride : list
        # the value at corresponding indices are the strides of corresponding layers in a residual block

        super(BasicBlock,self).__init__()

        self.conv1 = nn.Conv3d(in_features,out_features,3,stride[0],padding=1,bias=False) #weight layer
        self.bn1 = nn.BatchNorm3d(out_features) #weight layer

        self.relu = nn.ReLU(True) #relu

        self.conv2 = nn.Conv3d(out_features,out_features,3,stride[1],padding=1,bias=False) #weight layer
        self.bn2 = nn.BatchNorm3d(out_features) #weight layer

        self.down_sample = down_sample
        if down_sample:
            self.downsample = nn.Sequential(
                    nn.Conv3d(in_features,out_features,1,2,bias=False),
                    nn.BatchNorm3d(out_features)
                )

    def forward(self,x):
        x0=x.clone()
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.down_sample:
            x0 = self.downsample(x0)
        x = x + x0    # F(x)+x
        x= self.relu(x)
        return x


### Bottleneck block

This is similar to the previous basicblock, but this block has 3 conv layers, with 2 point-wise convolutional layers and a conv3x3 layers in the middle. The last layer always outputs number of channels 4 times of input channels. Basically expanding the depth of the feature maps.

In [15]:


class Bottleneck(nn.Module):
    def __init__(self,inFeatures=64,outFeatures=64,kSize=[1,3,1],stride=[1,2,1],
    dn_sample=False,dnSample_stride=1) -> None:
        super(Bottleneck,self).__init__()


        self.conv1 = nn.Conv3d(inFeatures,outFeatures,kSize[0],stride[0],bias=False)
        self.bn1 = nn.BatchNorm3d(outFeatures)
        self.conv2 = nn.Conv3d(outFeatures,outFeatures,kSize[1],stride[1],padding=1,bias=False)
        self.bn2 = nn.BatchNorm3d(outFeatures)
        self.conv3 = nn.Conv3d(outFeatures,outFeatures*4,kSize[2],stride[2],bias=False)
        self.bn3 = nn.BatchNorm3d(outFeatures*4)
        self.relu = nn.ReLU(True)


        self.ds = dn_sample
        if dn_sample:
            self.downSample = nn.Sequential(
                nn.Conv3d(inFeatures,outFeatures*4,1,stride=dnSample_stride,bias=False),
                nn.BatchNorm3d(outFeatures*4)
            )


    def forward(self,x):
        x0 = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)
        if self.ds:
            x0 = self.downSample(x0)
        x = x+x0
        return x


`num_residual_block=[3,4,6,3]` : number of residual blocks in each layer. (as shown in figure above). This means there will be four layers since the length of the list is 4, and each layer will have respective number of residual blocks.

In [16]:
class ResNet(nn.Module):

    def __init__(self,in_channels=3,num_residual_block=[3,4,6,3],num_class=1000,block_type='normal'):
        super(ResNet,self).__init__()

        self.conv1 = nn.Conv3d(in_channels,64,7,2,3,bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(True)
        self.maxpool = nn.MaxPool3d(3,2,1)

        if block_type.lower() == 'bottleneck':
            self.resnet,outchannels = self.__bottlenecks(num_residual_block)
        else:
            self.resnet,outchannels = self.__layers(num_residual_block)


        self.avgpool = nn.AdaptiveAvgPool3d((1,1,1))
        self.fc = nn.Linear(in_features=outchannels,out_features=num_class,bias=True)


    def forward(self,x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.resnet(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def __layers(self,num_residual_block):
        layer=[]
        layer += [BasicBlock()]*num_residual_block[0]
        inchannels=64
        for numOFlayers in num_residual_block[1:]:
            stride = [2,1] #updating the stride, the first layer of residual block
            # will have a stride of two and the 2nd layer of the residual block have
            # a stride of 1
            downsample=True
            outchannels = inchannels*2
            for _ in range(numOFlayers):
                layer.append(BasicBlock(inchannels,outchannels,stride,down_sample=downsample))
                inchannels = outchannels
                downsample = False
                stride=[1,1]

        return nn.Sequential(*layer),outchannels


    def __bottlenecks(self,numres):

        layer=[]

        stride = [1,1,1]
        dnStride=1
        inchan = 64
        for i,numOFlayers in enumerate(numres):
            dn_sample = True
            outchan = 64*(2**i)

            for _ in range(numOFlayers):
                layer+=[
                    Bottleneck(inchan,outchan,stride=stride,
                    dn_sample=dn_sample,dnSample_stride=dnStride)
                ]
                inchan = outchan*4
                dn_sample = False
                stride = [1,1,1]
            dn_sample=True
            stride = [1,2,1]
            dnStride=2


        return nn.Sequential(*layer),inchan

In [17]:


def  resnet18(**kwargs):
    return ResNet(num_residual_block=[2,2,2,2],**kwargs)

def resnet34(**kwargs):
    return ResNet(num_residual_block=[3,4,6,3],**kwargs)

def resnet50(**kwargs):
    return ResNet(num_residual_block=[3,4,6,3],block_type='bottleneck',**kwargs)

def resnet101(**kwargs):
    return ResNet(num_residual_block=[3,4,23,3],block_type='bottleneck',**kwargs)

def resnet152(**kwargs):
    return ResNet(num_residual_block=[3,8,36,3],block_type='bottleneck',**kwargs)


In [6]:
!pip install torchsummary



In [18]:
from torchsummary import summary
model18 = resnet18()
summary(model18,(3, 25, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 13, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 13, 56, 56]             128
              ReLU-3       [-1, 64, 13, 56, 56]               0
         MaxPool3d-4        [-1, 64, 7, 28, 28]               0
            Conv3d-5        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-6        [-1, 64, 7, 28, 28]             128
              ReLU-7        [-1, 64, 7, 28, 28]               0
            Conv3d-8        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-9        [-1, 64, 7, 28, 28]             128
             ReLU-10        [-1, 64, 7, 28, 28]               0
       BasicBlock-11        [-1, 64, 7, 28, 28]               0
           Conv3d-12        [-1, 64, 7, 28, 28]         110,592
      BatchNorm3d-13        [-1, 64, 7, 28, 28]             128
             ReLU-14        [-1, 64, 7,

In [19]:
model34 = resnet34()
summary(model34,(3, 25, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 13, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 13, 56, 56]             128
              ReLU-3       [-1, 64, 13, 56, 56]               0
         MaxPool3d-4        [-1, 64, 7, 28, 28]               0
            Conv3d-5        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-6        [-1, 64, 7, 28, 28]             128
              ReLU-7        [-1, 64, 7, 28, 28]               0
            Conv3d-8        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-9        [-1, 64, 7, 28, 28]             128
             ReLU-10        [-1, 64, 7, 28, 28]               0
       BasicBlock-11        [-1, 64, 7, 28, 28]               0
           Conv3d-12        [-1, 64, 7, 28, 28]         110,592
      BatchNorm3d-13        [-1, 64, 7, 28, 28]             128
             ReLU-14        [-1, 64, 7,

In [20]:
model50 = resnet50()
summary(model50,(3, 25, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 13, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 13, 56, 56]             128
              ReLU-3       [-1, 64, 13, 56, 56]               0
         MaxPool3d-4        [-1, 64, 7, 28, 28]               0
            Conv3d-5        [-1, 64, 7, 28, 28]           4,096
       BatchNorm3d-6        [-1, 64, 7, 28, 28]             128
            Conv3d-7        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-8        [-1, 64, 7, 28, 28]             128
            Conv3d-9       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-10       [-1, 256, 7, 28, 28]             512
             ReLU-11       [-1, 256, 7, 28, 28]               0
           Conv3d-12       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-13       [-1, 256, 7, 28, 28]             512
       Bottleneck-14       [-1, 256, 7,

In [21]:
model101 = resnet101()
summary(model101,(3, 25, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 13, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 13, 56, 56]             128
              ReLU-3       [-1, 64, 13, 56, 56]               0
         MaxPool3d-4        [-1, 64, 7, 28, 28]               0
            Conv3d-5        [-1, 64, 7, 28, 28]           4,096
       BatchNorm3d-6        [-1, 64, 7, 28, 28]             128
            Conv3d-7        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-8        [-1, 64, 7, 28, 28]             128
            Conv3d-9       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-10       [-1, 256, 7, 28, 28]             512
             ReLU-11       [-1, 256, 7, 28, 28]               0
           Conv3d-12       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-13       [-1, 256, 7, 28, 28]             512
       Bottleneck-14       [-1, 256, 7,

In [22]:
model152 = resnet152()
summary(model152,(3, 25, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 13, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 13, 56, 56]             128
              ReLU-3       [-1, 64, 13, 56, 56]               0
         MaxPool3d-4        [-1, 64, 7, 28, 28]               0
            Conv3d-5        [-1, 64, 7, 28, 28]           4,096
       BatchNorm3d-6        [-1, 64, 7, 28, 28]             128
            Conv3d-7        [-1, 64, 7, 28, 28]         110,592
       BatchNorm3d-8        [-1, 64, 7, 28, 28]             128
            Conv3d-9       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-10       [-1, 256, 7, 28, 28]             512
             ReLU-11       [-1, 256, 7, 28, 28]               0
           Conv3d-12       [-1, 256, 7, 28, 28]          16,384
      BatchNorm3d-13       [-1, 256, 7, 28, 28]             512
       Bottleneck-14       [-1, 256, 7,

### Summary of total number of parameters in each resnet architecture

|Architecture|Total Trainable parameters|
|:---:|:---:|
|ResNet18| 33,716,904|
|ResNet34| 64,026,536|
|ResNet50| 48,247,976|
|ResNet101| 87,294,120|
|ResNet152|  119,452,840|



Torchsummary is a good way to check whether the implementation is correct or not, it will throw an error if the dimensions in the hidden layers do not match, otherwise it will output the total number of trainable parameters with other details.


If this notebook helps you then let me know by upvoting, if you find any mistakes, then constructive criticisms are always welcome. :)

# Проверка на данных

In [29]:
!pip install mediapipe=='0.10.9'
!pip install opencv-python=='4.8.0.76'

Collecting mediapipe==0.10.9
  Downloading mediapipe-0.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.9)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.9 sounddevice-0.4.6


In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
dataloader_dir = "/content/drive/MyDrive/slovo/dataloaders/"
val_dataloader = torch.load(dataloader_dir + 'color_frames_val_dataloader.pth')
print(f"Val dataloader contains: {len(val_dataloader)} videos")

Val dataloader contains: 60 videos


In [34]:
use_cuda = True
seed = 1
use_cuda = use_cuda and torch.cuda.is_available()

if use_cuda:
  for dl in [val_dataloader]:
    dl.num_workers = 0
    dl.pin_memory = True
  torch.cuda.manual_seed(seed)  # Set the random seed for the current GPU
else:
  torch.manual_seed(seed)

device = torch.device("cuda" if use_cuda else "cpu")

print(f"Device: {device}")

Device: cpu


In [35]:
frames, labels = next(iter(val_dataloader))
frames = frames.to(device)
resnet = resnet50()
resnet.to(device)
print(f"Frames shape: {frames.shape}") # (batch_size, -channels, frames, w, h) torch.Size([1, 3, 25, 112, 112])
print(f"Labels shape: {labels.shape}") # batch_size
print(f"Labels: {labels}") # class index
print(f"Output shape: {resnet(frames/255).shape}") # batch size, number of classes

Frames shape: torch.Size([1, 3, 25, 112, 112])
Labels shape: torch.Size([1])
Labels: tensor([27])
Output shape: torch.Size([1, 1000])


In [38]:
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(num_ftrs, 30) # reset out features fir the last layer to the number of classes
resnet.fc

Linear(in_features=2048, out_features=30, bias=True)

In [40]:
#print(resnet)
print(f"Output shape: {resnet(frames/255).shape}")

Output shape: torch.Size([1, 30])


In [43]:
import torch.optim as optim
lr = 0.00001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet.parameters(), lr=lr)

In [44]:
from utils import VideoDataset, validate_model, train_model, display_frames, display_learning_dynamic, classification_model_metrics

In [45]:
use_cuda = True
seed = 1

train_batch_size = 1
test_batch_size = 1
val_batch_size = 1
IMG_SIZE = 112

limit_list = [] #["динозавр", "пингвин", "кролик"]

min_frame_count = 25

lr = 0.00001
momentum = 0.0001
epoches = 15
save = True

annotations_file = "/content/drive/MyDrive/slovo/SLOVO_DF_SHORT.tsv" #"/home/jupyter/mnt/s3/rsl-videos/slovo/slovo_annotations/SLOVO_DATAFRAME.tsv"
dataloader_dir = "/content/drive/MyDrive/slovo/dataloaders/"

save_path = '/content/drive/MyDrive/slovo/models' #'/content/model'
model_name = 'ResNet50_lr10_5'

In [49]:
import pandas as pd
import numpy as np

In [50]:
video_labels = pd.read_csv(annotations_file, sep='\t')
video_labels['group_rank'] = video_labels.groupby(['text']).cumcount()+1;
video_labels['dataset'] = np.where(video_labels['group_rank']<17,'train', np.where(video_labels['group_rank']<19,'val', 'test'))
if len(limit_list)>0:
  video_labels = video_labels[video_labels.text.isin(limit_list)]
video_labels.tail(5)

Unnamed: 0.1,Unnamed: 0,attachment_id,text,begin,end,group_rank,dataset
595,55,25b5fb58-46dd-4fd6-9928-734460795b22,мышь,22,83,20,test
596,56,336e0e7a-ff68-427c-9a77-52626b81edd1,паук,42,74,19,test
597,57,d0ec4c00-6ff5-4089-ac9d-5a33a76fb037,паук,3,70,20,test
598,58,59ae408b-f87a-422a-bc57-2ff5af28ee57,бабочка,2,74,19,test
599,59,e421d77c-1dcd-4124-b2a4-b2415569a4dd,бабочка,63,131,20,test


In [51]:
classes = tuple(video_labels.text.unique())
n_class = len(classes)
print(f"Total number of classes: {n_class}. Example classes: {classes[:5]}")

Total number of classes: 30. Example classes: ('пингвин', 'жираф', 'лягушка', 'бегемот', 'козел')


In [52]:
def save_model_dyn(save_path, model_name, train_loss_dynamic, val_accuracy_dynamic):
    with open(f'{save_path}/{model_name}_loss_acc_dynamic.json', 'w') as f:
        json.dump(
            {
                'train_loss_dynamic': train_loss_dynamic,
                'val_accuracy_dynamic': val_accuracy_dynamic
            },
            f)

In [53]:
train_dataloader = torch.load(dataloader_dir + 'color_frames_train_dataloader.pth')
print(f"Train dataloader contains: {len(train_dataloader)} videos")

Train dataloader contains: 480 videos


In [55]:
import json

In [56]:
best_acc = 1/30

previous_epochs = 0
num_epoches = 1

# if use_cuda:
#   model = torch.nn.DataParallel(resnet, device_ids=range(torch.cuda.device_count()))
#   cudnn.benchmark = True

epochs = list()
train_loss_dynamic = list()
val_accuracy_dynamic = list()
for epoch in range(previous_epochs+1, previous_epochs + num_epoches + 1):
  epochs.append(epoch)
  print(f'Train Epoch {epoch}/{previous_epochs + num_epoches}', end=": ")
  train_loss = train_model(resnet, optimizer, criterion, train_dataloader, device, scheduler=None)
  print(f"train_loss={train_loss}", end="; ")
  train_loss_dynamic.append(train_loss)
  if epoch % 1 == 0:
    predict_acc, best_acc = validate_model(resnet, classes, epoch, criterion, optimizer, val_dataloader, device, best_acc, save, save_path, model_name)
    print(f"val acc={predict_acc:.3f}")
    val_accuracy_dynamic.append(predict_acc.item())
    save_model_dyn(save_path, model_name, train_loss_dynamic, val_accuracy_dynamic)

Train Epoch 1/1: train_loss=4.055; val acc=0.033


# Загрузим веса

In [58]:
resnet2 = resnet50()
resnet2.to(device)
num_ftrs = resnet2.fc.in_features
resnet2.fc = nn.Linear(num_ftrs, 30) # reset out features fir the last layer to the number of classes
resnet2.fc

Linear(in_features=2048, out_features=30, bias=True)

In [None]:
# Load test epoch model state
state_file = 'ResNet50_lr10_5-1-Val_acc-0.050.pth'
epoch_number = state_file.split('-')[1]
print(f"Epoch: {epoch_number}")
PATH = save_path+'/'+state_file
loaded_path = torch.load(PATH)
classes = loaded_path['classes']
num_ftrs = resnet2.fc.in_features
n_class = len(classes)
resnet2.fc = nn.Linear(num_ftrs, n_class)
resnet2.load_state_dict({k.replace('module.',''):v for k,v in loaded_path['model_state_dict'].items()})
resnet2.to(device)

In [60]:
predict_acc, best_acc = validate_model(resnet2, classes, epoch, criterion, optimizer, val_dataloader, device, best_acc, save, save_path, model_name)
print(f"val acc={predict_acc:.3f}")

val acc=0.050
