In [4]:
import torch
import torch.nn as nn
import os
from tqdm import tqdm
from torch.nn import Conv2d
from torch import Tensor
from torchvision.ops import DeformConv2d
from typing import Any, Callable, List, Optional, Sequence, Tuple, Type, Union
from torchvision import transforms
from PIL import Image

## Data Preprocessing

In [5]:
import torch
import torchvision
import torch.nn as nn
import os
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset
from PIL import Image
import tqdm
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter 

In [6]:
writer = SummaryWriter("Trains//R2+1D")

In [7]:
if not os.path.exists("runs/R2+1D"):
    print("Log directory does not exist.")
else:
    print("Log directory exists.")

Log directory does not exist.


In [8]:
def input_cleaner(path=r'C:\Users\User\Desktop\Dataset2_cleaned'):
    del_list = []
    for folder in tqdm.tqdm(os.listdir(path)):
        if len(os.listdir(path + '\\' + folder)) != 8:
            del_list.append(path + '\\' + folder)
    return del_list
del_list = input_cleaner()
       

  0%|          | 0/222 [00:00<?, ?it/s]

100%|██████████| 222/222 [00:00<00:00, 6302.57it/s]


In [9]:
len(del_list)

0

In [10]:
for folder in del_list:
    for file in os.listdir(folder):
        os.remove(folder + '\\' + file)

In [11]:
for folder in os.listdir(r'C:\Users\User\Desktop\Dataset2_cleaned'):
    if not os.listdir(r'C:\Users\User\Desktop\Dataset2_cleaned' + '\\' + folder):
        os.rmdir(r'C:\Users\User\Desktop\Dataset2_cleaned' + '\\' + folder)

In [12]:
labels_dict = {
    'Car': torch.tensor(0),
    'Clear': torch.tensor(1),
    'Human': torch.tensor(2),
    'LineNoise': torch.tensor(1),
    'Noise': torch.tensor(1)
}

In [13]:
inverse_label_dict = {
    0: 'Car',
    1: 'Noise',
    2: 'Human'
}

In [14]:
def data_maker(path=r'C:\Users\User\Desktop\Dataset2_cleaned'):
    video_lib = []
    labels = []

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
        ])
    
    for folder in tqdm.tqdm(os.listdir(path)):
        frames = []
        labels.append(labels_dict[folder.split(",")[0][2:-1]])
        for img_path in os.listdir(path + '\\' + folder):
            img = Image.open(path + '\\' + folder + '\\' + img_path)
            frames.append(transform(img))
        
        # video_tensor = torch.stack(frames, dim=1)
        # video_tensor.unsqueeze(0)

        video_lib.append(frames)
    
    return video_lib, labels

In [15]:
video_data, labels = data_maker()

100%|██████████| 222/222 [00:01<00:00, 138.75it/s]


In [16]:
labels[0]

tensor(0)

In [17]:
len(labels), len(video_data)

(222, 222)

In [18]:
class VideoDataset(Dataset):
    def __init__(self, video_tensors, labels):
        self.video_tensors = video_tensors
        self.labels = labels
    
    def __getitem__(self, index):
        video = torch.tensor(np.array(self.video_tensors[index]))
        label = torch.tensor(self.labels[index])
        return video, label
    
    def __len__(self):
        return len(self.video_tensors)

In [19]:
dataset = VideoDataset(video_data, labels)
    
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])


val_size = int(0.2 * train_size)
train_size = train_size - val_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])


In [20]:
train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = 8,
    shuffle = True,    
    num_workers=0
)

val_loader = DataLoader(
    dataset = val_dataset,
    batch_size = 8,
    shuffle = False
)

test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = 8,
    shuffle = False,
    num_workers = 0
)




In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)

<torch._C.Generator at 0x1e0ffa31dd0>

In [22]:
device

device(type='cuda', index=0)

## R(2def+1)D Model Defineing

In [23]:
class ExtendedDeformConv2d(DeformConv2d):
    @staticmethod
    def get_downsample_stride(stride: int) -> Tuple[int, int, int]:
        return stride, stride, stride

In [24]:
class DeformableFrameConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # Offset convolution to learn deformable convolution parameters
        self.offset_conv = nn.Conv2d(
            in_channels, 
            2 * kernel_size * kernel_size,  # 2 coordinates for x and y
            kernel_size=kernel_size, 
            stride=stride, 
            padding=padding
        )
        
        self.deform_conv = DeformConv2d(
            in_channels, 
            out_channels, 
            kernel_size=kernel_size, 
            stride=stride, 
            padding=padding 
        )
        
    def forward(self, x):
        # x shape: [batch_size, channels, frames, height, width]
        batch_size, channels, frames, height, width = x.shape
        
        # Will store processed frames
        output = []
        
        for frame in range(frames):
            current_frame = x[:, :, frame, :, :]
            
            offset = self.offset_conv(current_frame)
            
            deformed_frame = self.deform_conv(current_frame, offset)
            
            output.append(deformed_frame)
        
        return torch.stack(output, dim=2)

In [131]:
class BasicDeformableBlock(nn.Module):

    expansion = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        conv_builder: Callable[..., nn.Module],
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
    ) -> None:
        midplanes = int((planes + inplanes) / 2)
        print(f"planes {planes}, inplanes {inplanes}")
        super().__init__()
        self.offset_conv1 = nn.Conv2d(
            inplanes, 
            2 * 3 * 3,  # 2 coordinates for x and y
            kernel_size=3, 
            stride=stride, 
            padding=1
        )
        
        self.deform_conv1 = conv_builder(
            inplanes,
            midplanes,
            kernel_size=3,
            stride=stride,
            padding=1
        )

        self.temp_conv0 = nn.Sequential(
            nn.Conv3d(inplanes, planes, kernel_size=(3, 1, 1), stride=(stride, 1, 1), padding=(1, 0, 0)), 
            nn.BatchNorm3d(planes), 
            nn.ReLU(inplace=True)
        )

        self.temp_conv1 = nn.Sequential(
            nn.Conv3d(midplanes, midplanes, kernel_size=(3, 1, 1), stride=(stride, 1, 1), padding=(1, 0, 0)), 
            nn.BatchNorm3d(planes), 
            nn.ReLU(inplace=True)
        )

        self.offset_conv2 = nn.Conv2d(
            midplanes,
            2 * 3 * 3,
            kernel_size=3,
            stride=stride,
            padding=1
        )

        self.deform_conv2 = conv_builder(
            midplanes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=1
        )

        self.temp_conv2 = nn.Sequential(
            nn.Conv3d(planes, planes, kernel_size=(3, 1, 1), stride=(stride, 1, 1), padding=(1, 0, 0)), 
            nn.BatchNorm3d(planes), 
            nn.ReLU(inplace=True)
        )
    
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        print(x.shape)
        residual = self.temp_conv0(x)
        print("residual shape", residual.shape)

        # x shape: [batch_size, channels, frames, height, width]
        batch_size, channels, frames, height, width = x.shape
        
        # Will store processed frames
        first_output = torch.zeros(
            batch_size, self.deform_conv1.out_channels, frames, height, width
        )
        print("going to first loop")
        for frame in range(frames):
            current_frame = x[:, :, frame, :, :]
            
            offset1 = self.offset_conv1(current_frame)
            
            deformed_frame = self.deform_conv1(current_frame, offset1)
            
            first_output[:, :, frame, :, :] = deformed_frame
        print("first output shape is", first_output.shape)
        out = first_output
        # out = torch.stack(output, dim=2)
        # out = self.temp_conv1(x)

        batch_size, channels, frames, height, width = out.shape
        second_output = torch.zeros(
            batch_size, self.deform_conv2.out_channels, frames, height, width
        )
        for frame in range(frames):
            current_frame = out[:, :, frame, :, :]
            
            offset2 = self.offset_conv2(current_frame)
            
            deformed_frame = self.deform_conv2(current_frame, offset2)
            
            second_output[:, :, frame, :, :] = deformed_frame
        print("second output shape is", second_output.shape)
        out = second_output
        # out = torch.stack(output, dim=2)
        # out = self.temp_conv2(out)
        # out = self.conv2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
    
    

In [101]:
class R2Plus1dStem(nn.Sequential):
    """R(2+1)D stem is different than the default one as it uses separated 3D convolution"""

    def __init__(self) -> None:
        super().__init__(
            nn.Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False),
            nn.BatchNorm3d(45),
            nn.ReLU(inplace=True),
            nn.Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
        )

In [123]:
class VideoResNet(nn.Module):
    def __init__(
            self,
            block: Type[Union[BasicDeformableBlock]],
            conv_makers: Sequence[Type[Union[ExtendedDeformConv2d]]],
            nums_of_layers: List[int], 
            stem: Callable[..., nn.Module],
            num_classes: int = 2,
            zero_init_residual: bool = False
    ) -> None:
        super().__init__()
        self.inplanes = 64

        self.stem = stem()

        self.layer1 = self._make_layer(64, block, conv_makers[0], 128, nums_of_layers[0], stride=1)
        self.layer2 = self._make_layer(128, block, conv_makers[1], 256, nums_of_layers[1], stride=1)
        self.layer3 = self._make_layer(256, block, conv_makers[2], 512, nums_of_layers[2], stride=1)
        self.layer4 = self._make_layer(512, block, conv_makers[3], 1024, nums_of_layers[3], stride=1)

        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.fc = nn.Linear(1024 * block.expansion, num_classes)

        #init weights 
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
        
        # if zero_init_residual:
        #     for m in self.modules():
        #         if isinstance(m, Bottleneck):
        #             nn.init.constant_(m.bn3.weight, 0)
        

    def forward(self, x: Tensor) -> Tensor:
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        #flatten layer to fc
        x = x.flatten(1)
        x = self.fc(x)

        return x




    def _make_layer(
            self,
            inplanes,
            block: Type[Union[BasicDeformableBlock]],
            conv_builder: Type[Union[ExtendedDeformConv2d]],
            out_planes: int,
            num_of_blocks: int,
            stride: int = 1,
            ) -> nn.Sequential:
        downsample = None

        if stride != 1 or self.inplanes != out_planes *  block.expansion:
            ds_stride = conv_builder.get_downsample_stride(stride)
            downsample = nn.Sequential(
                nn.Conv3d(self.inplanes, out_planes*block.expansion, kernel_size=1, stride=ds_stride, bias=False),
                nn.BatchNorm3d(out_planes * block.expansion)
            )
        layers = []
        layers.append(block(inplanes, out_planes, conv_builder, stride, downsample))

        self.inplanes = out_planes * block.expansion
        for i in range(1, num_of_blocks):
            layers.append(block(self.inplanes, out_planes, conv_builder))
        
        return nn.Sequential(*layers)
    


In [124]:
def make_video_resnet(
    block: Type[Union[BasicDeformableBlock]],
    conv_makers: Sequence[Type[Union[ExtendedDeformConv2d]]],
    layers: List[int],
    stem: Callable[..., nn.Module],
) -> VideoResNet:

    model = VideoResNet(block, conv_makers, layers, stem)

    return model

In [132]:
model = make_video_resnet(
        BasicDeformableBlock,
        [ExtendedDeformConv2d] * 4,
        [2, 2, 2, 2],
        R2Plus1dStem,
    )

planes 128, inplanes 64
planes 128, inplanes 128
planes 256, inplanes 128
planes 256, inplanes 256
planes 512, inplanes 256
planes 512, inplanes 512
planes 1024, inplanes 512
planes 1024, inplanes 1024


## Working

In [133]:
for vid, _ in train_loader:
    vid1 = vid.permute(0, 2, 1, 3, 4)
    print(model(vid1))
    break

  label = torch.tensor(self.labels[index])


torch.Size([8, 64, 8, 56, 56])
residual shape torch.Size([8, 128, 8, 56, 56])
going to first loop
first output shape is torch.Size([8, 96, 8, 56, 56])
second output shape is torch.Size([8, 128, 8, 56, 56])
torch.Size([8, 128, 8, 56, 56])
residual shape torch.Size([8, 128, 8, 56, 56])
going to first loop
first output shape is torch.Size([8, 128, 8, 56, 56])
second output shape is torch.Size([8, 128, 8, 56, 56])
torch.Size([8, 128, 8, 56, 56])
residual shape torch.Size([8, 256, 8, 56, 56])
going to first loop
first output shape is torch.Size([8, 192, 8, 56, 56])
second output shape is torch.Size([8, 256, 8, 56, 56])
torch.Size([8, 256, 8, 56, 56])
residual shape torch.Size([8, 256, 8, 56, 56])
going to first loop
first output shape is torch.Size([8, 256, 8, 56, 56])
second output shape is torch.Size([8, 256, 8, 56, 56])
torch.Size([8, 256, 8, 56, 56])
residual shape torch.Size([8, 512, 8, 56, 56])
going to first loop
first output shape is torch.Size([8, 384, 8, 56, 56])
second output sha

## Others 

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
from typing import Tuple

class Conv2Plus1D(tf.keras.Sequential):
    def __init__(
        self, in_planes: int, out_planes: int, midplanes: int, stride: int = 1, padding: int = 1
    ) -> None:
        layers_list = [
            layers.Conv3D(
                filters=midplanes,
                kernel_size=(1, 3, 3),
                strides=(1, stride, stride),
                padding='same' if padding > 0 else 'valid',
                use_bias=False,
                input_shape=(None, None, None, in_planes),
            ),
            layers.BatchNormalization(),
            layers.ReLU(),
            layers.Conv3D(
                filters=out_planes,
                kernel_size=(3, 1, 1),
                strides=(stride, 1, 1),
                padding='same' if padding > 0 else 'valid',
                use_bias=False,
            ),
        ]
        super().__init__(layers_list)

    @staticmethod
    def get_downsample_stride(stride: int) -> Tuple[int, int, int]:
        return stride, stride, stride

conv_layer = Conv2Plus1D(in_planes=64, out_planes=128, midplanes=64, stride=2, padding=1)
print(conv_layer.summary())


Model: "conv2_plus1d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d (Conv3D)             (None, None, None, None,  36864     
                              64)                                
                                                                 
 batch_normalization (BatchN  (None, None, None, None,  256      
 ormalization)                64)                                
                                                                 
 re_lu (ReLU)                (None, None, None, None,  0         
                              64)                                
                                                                 
 conv3d_1 (Conv3D)           (None, None, None, None,  24576     
                              128)                               
                                                                 
Total params: 61,696
Trainable params: 61,568
Non-trai

In [3]:
class R2Plus1dStem(tf.keras.Sequential):
    """R(2+1)D stem is different than the default one as it uses separated 3D convolution"""

    def __init__(self) -> None:
        layers_list = [
            layers.Conv3D(
                filters=45,
                kernel_size=(1, 7, 7),
                strides=(1, 2, 2),
                padding='same',  # Corresponds to PyTorch padding=(0, 3, 3)
                use_bias=False,
                input_shape=(None, None, None, 3),  # Input shape for the first layer
            ),
            layers.BatchNormalization(),
            layers.ReLU(),
            layers.Conv3D(
                filters=64,
                kernel_size=(3, 1, 1),
                strides=(1, 1, 1),
                padding='same',  # Corresponds to PyTorch padding=(1, 0, 0)
                use_bias=False,
            ),
            layers.BatchNormalization(),
            layers.ReLU(),
        ]
        super().__init__(layers_list)

# Example usage
stem_layer = R2Plus1dStem()
print(stem_layer.summary())

Model: "r2_plus1d_stem"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_2 (Conv3D)           (None, None, None, None,  6615      
                              45)                                
                                                                 
 batch_normalization_1 (Batc  (None, None, None, None,  180      
 hNormalization)              45)                                
                                                                 
 re_lu_1 (ReLU)              (None, None, None, None,  0         
                              45)                                
                                                                 
 conv3d_3 (Conv3D)           (None, None, None, None,  8640      
                              64)                                
                                                                 
 batch_normalization_2 (Batc  (None, None, None, Non

In [6]:
from typing import Callable, Optional

class Bottleneck(tf.keras.layers.Layer):
    expansion = 4

    def __init__(
        self,
        inplanes: int,
        out_planes: int,
        conv_builder: Callable[..., tf.keras.layers.Layer],
        stride: int = 1,
        downsample: Optional[tf.keras.layers.Layer] = None,
    ) -> None:
        super().__init__()
        
        midplanes = (inplanes * out_planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * out_planes)

        self.conv1 = tf.keras.Sequential([
            layers.Conv3D(
                filters=out_planes,
                kernel_size=1,
                strides=1,
                padding='valid',
                use_bias=False
            ),
            layers.BatchNormalization(),
            layers.ReLU(),
        ])

        self.conv2 = tf.keras.Sequential([
            conv_builder(out_planes, out_planes, midplanes, stride),
            layers.BatchNormalization(),
            layers.ReLU(),
        ])

        self.conv3 = tf.keras.Sequential([
            layers.Conv3D(
                filters=out_planes * self.expansion,
                kernel_size=1,
                strides=1,
                padding='valid',
                use_bias=False
            ),
            layers.BatchNormalization(),
        ])

        self.relu = layers.ReLU()

        if downsample is None and (stride != 1 or inplanes != out_planes * self.expansion):
            self.downsample = tf.keras.Sequential([
                layers.Conv3D(
                    filters=out_planes * self.expansion,
                    kernel_size=1,
                    strides=(stride, stride, stride),
                    padding='valid',
                    use_bias=False
                ),
                layers.BatchNormalization(),
            ])
        else:
            self.downsample = downsample

        self.stride = stride

    def call(self, x):
        residual = x

        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

def example_conv_builder(in_planes, out_planes, midplanes, stride):
    return layers.Conv3D(
        filters=out_planes,
        kernel_size=(3, 3, 3),
        strides=(stride, stride, stride),
        padding='same',
        use_bias=False
    )

bottleneck_layer = Bottleneck(
    inplanes=64,
    out_planes=128,
    conv_builder=example_conv_builder,
    stride=2
)

input_tensor = tf.random.normal([1, 8, 112, 112, 64])  # Batch size , Depth , Height , Width, Channels 
output = bottleneck_layer(input_tensor)
print(output.shape)

(1, 4, 56, 56, 512)


In [9]:
class BasicBlock(tf.keras.layers.Layer):
    expansion = 1
    
    def __init__(
        self, 
        inplanes: int, 
        planes: int, 
        conv_builder, 
        stride: int = 1, 
        downsample = None
    ):
        super().__init__()
        
        # Calculate midplanes similar to PyTorch implementation
        midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes)
        
        # Simulating Sequential with a custom conv1 method
        self.conv1 = tf.keras.Sequential([
            conv_builder(inplanes, planes, midplanes, stride),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU()
        ])
        
        # Simulating Sequential with a custom conv2 method
        self.conv2 = tf.keras.Sequential([
            conv_builder(planes, planes, midplanes),
            tf.keras.layers.BatchNormalization()
        ])
        
        self.downsample = downsample
        self.stride = stride
        self.relu = tf.keras.layers.ReLU()
    
    def call(self, x):
        residual = x
        
        out = self.conv1(x)
        out = self.conv2(out)
        
        if self.downsample is not None:
            residual = self.downsample(x)
        
        out += residual
        out = self.relu(out)
        
        return out
    

def simple_conv_builder(in_channels, out_channels, midplanes, stride=1):
    return tf.keras.layers.Conv3D(
        filters=out_channels, 
        kernel_size=3, 
        strides=stride, 
        padding='same'
    )

block = BasicBlock(
    inplanes=64, 
    planes=128, 
    conv_builder=simple_conv_builder
)

x = tf.random.normal((1, 32, 32, 32, 64))

output = block(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

InvalidArgumentError: Exception encountered when calling layer "basic_block_1" "                 f"(type BasicBlock).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} required broadcastable shapes [Op:AddV2]

Call arguments received by layer "basic_block_1" "                 f"(type BasicBlock):
  • x=tf.Tensor(shape=(1, 32, 32, 32, 64), dtype=float32)

## Deformable Conv Layer TF

In [None]:
import tensorflow as tf
from keras.layers import Conv2D


class DeformableConvLayer(Conv2D):
    """Only support "channel last" data format"""
    def __init__(self,
                 filters,
                 kernel_size,
                 strides=(1, 1),
                 padding='same',
                 data_format=None,
                 dilation_rate=(1, 1),
                 num_deformable_group=None,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """`kernel_size`, `strides` and `dilation_rate` must have the same value in both axis.

        :param num_deformable_group: split output channels into groups, offset shared in each group. If
        this parameter is None, then set  num_deformable_group=filters.
        """
        super().__init__(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            dilation_rate=dilation_rate,
            activation=activation,
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
            bias_initializer=bias_initializer,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            kernel_constraint=kernel_constraint,
            bias_constraint=bias_constraint,
            **kwargs)
        self.kernel = None
        self.bias = None
        self.offset_layer_kernel = None
        self.offset_layer_bias = None
        if num_deformable_group is None:
            num_deformable_group = filters
        if filters % num_deformable_group != 0:
            raise ValueError('"filters" mod "num_deformable_group" must be zero')
        self.num_deformable_group = num_deformable_group

    def build(self, input_shape):
        input_dim = int(input_shape[-1])
        # kernel_shape = self.kernel_size + (input_dim, self.filters)
        # we want to use depth-wise conv
        kernel_shape = self.kernel_size + (self.filters * input_dim, 1)
        self.kernel = self.add_weight(
            name='kernel',
            shape=kernel_shape,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
            trainable=True,
            dtype=self.dtype)
        if self.use_bias:
            self.bias = self.add_weight(
                name='bias',
                shape=(self.filters,),
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
                trainable=True,
                dtype=self.dtype)

        # create offset conv layer
        offset_num = self.kernel_size[0] * self.kernel_size[1] * self.num_deformable_group
        self.offset_layer_kernel = self.add_weight(
            name='offset_layer_kernel',
            shape=self.kernel_size + (input_dim, offset_num * 2),  # 2 means x and y axis
            initializer=tf.zeros_initializer(),
            regularizer=self.kernel_regularizer,
            trainable=True,
            dtype=self.dtype)
        self.offset_layer_bias = self.add_weight(
            name='offset_layer_bias',
            shape=(offset_num * 2,),
            initializer=tf.zeros_initializer(),
            # initializer=tf.random_uniform_initializer(-5, 5),
            regularizer=self.bias_regularizer,
            trainable=True,
            dtype=self.dtype)
        self.built = True

    def call(self, inputs, training=None, **kwargs):
        # get offset, shape [batch_size, out_h, out_w, filter_h, * filter_w * channel_out * 2]
        offset = tf.nn.conv3d(inputs,
                              filters=self.offset_layer_kernel,
                              strides=[1, *self.strides, 1],
                              padding=self.padding.upper(),
                              dilations=[1, *self.dilation_rate, 1])
        print(offset.shape, "offset.shape")
        offset += self.offset_layer_bias

        # add padding if needed
        inputs = self._pad_input(inputs)

        # some length
        if inputs.get_shape()[0] is None:
            batch_size = 1
        else:
            batch_size = int(inputs.get_shape()[0])
        channel_in = int(inputs.get_shape()[-1])
        in_h, in_w = [int(i) for i in inputs.get_shape()[1: 3]]  # input feature map size
        out_h, out_w = [int(i) for i in offset.get_shape()[1: 3]]  # output feature map size
        filter_h, filter_w = self.kernel_size

        # get x, y axis offset
        offset = tf.reshape(offset, [batch_size, out_h, out_w, -1, 2])
        y_off, x_off = offset[:, :, :, :, 0], offset[:, :, :, :, 1]

        # input feature map gird coordinates
        y, x = self._get_conv_indices([in_h, in_w])
        y, x = [tf.expand_dims(i, axis=-1) for i in [y, x]]
        y, x = [tf.tile(i, [batch_size, 1, 1, 1, self.num_deformable_group]) for i in [y, x]]
        y, x = [tf.reshape(i, [*i.shape[0: 3], -1]) for i in [y, x]]
        y = tf.cast(y, tf.float32)
        x = tf.cast(x, tf.float32)
        
        # add offset
        y, x = y + y_off, x + x_off
        y = tf.clip_by_value(y, 0, in_h - 1)
        x = tf.clip_by_value(x, 0, in_w - 1)

        # get four coordinates of points around (x, y)
        y0 = tf.cast(tf.floor(y), tf.int32)
        x0 = tf.cast(tf.floor(x), tf.int32)
        y1, x1 = y0 + 1, x0 + 1
        # clip
        y0, y1 = [tf.clip_by_value(i, 0, in_h - 1) for i in [y0, y1]]
        x0, x1 = [tf.clip_by_value(i, 0, in_w - 1) for i in [x0, x1]]

        # get pixel values
        indices = [[y0, x0], [y0, x1], [y1, x0], [y1, x1]]
        p0, p1, p2, p3 = [DeformableConvLayer._get_pixel_values_at_point(inputs, i) for i in indices]

        # cast to float
        x0 = tf.cast(x0, tf.float32)
        x1 = tf.cast(x1, tf.float32)
        y0 = tf.cast(y0, tf.float32)
        y1 = tf.cast(y1, tf.float32)


        # weights
        w0 = (y1 - y) * (x1 - x)
        w1 = (y1 - y) * (x - x0)
        w2 = (y - y0) * (x1 - x)
        w3 = (y - y0) * (x - x0)
        # expand dim for broadcast
        w0, w1, w2, w3 = [tf.expand_dims(i, axis=-1) for i in [w0, w1, w2, w3]]
        # bilinear interpolation
        pixels = tf.add_n([w0 * p0, w1 * p1, w2 * p2, w3 * p3])

        # reshape the "big" feature map
        pixels = tf.reshape(pixels, [batch_size, out_h, out_w, filter_h, filter_w, self.num_deformable_group, channel_in])
        pixels = tf.transpose(pixels, [0, 1, 3, 2, 4, 5, 6])
        pixels = tf.reshape(pixels, [batch_size, out_h * filter_h, out_w * filter_w, self.num_deformable_group, channel_in])

        # copy channels to same group
        feat_in_group = self.filters // self.num_deformable_group
        pixels = tf.tile(pixels, [1, 1, 1, 1, feat_in_group])
        pixels = tf.reshape(pixels, [batch_size, out_h * filter_h, out_w * filter_w, -1])

        # depth-wise conv
        out = tf.nn.depthwise_conv2d(pixels, self.kernel, [1, filter_h, filter_w, 1], 'VALID')
        # add the output feature maps in the same group
        out = tf.reshape(out, [batch_size, out_h, out_w, self.filters, channel_in])
        out = tf.reduce_sum(out, axis=-1)
        if self.use_bias:
            out += self.bias
        return self.activation(out)

    def _pad_input(self, inputs):
        """Check if input feature map needs padding, because we don't use the standard Conv() function.

        :param inputs:
        :return: padded input feature map
        """
        # When padding is 'same', we should pad the feature map.
        # if padding == 'same', output size should be `ceil(input / stride)`
        if self.padding == 'same':
            in_shape = inputs.get_shape().as_list()[1: 3]
            padding_list = []
            for i in range(2):
                filter_size = self.kernel_size[i]
                dilation = self.dilation_rate[i]
                dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
                same_output = (in_shape[i] + self.strides[i] - 1) // self.strides[i]
                valid_output = (in_shape[i] - dilated_filter_size + self.strides[i]) // self.strides[i]
                if same_output == valid_output:
                    padding_list += [0, 0]
                else:
                    p = dilated_filter_size - 1
                    p_0 = p // 2
                    padding_list += [p_0, p - p_0]
            if sum(padding_list) != 0:
                padding = [[0, 0],
                           [padding_list[0], padding_list[1]],  # top, bottom padding
                           [padding_list[2], padding_list[3]],  # left, right padding
                           [0, 0]]
                inputs = tf.pad(inputs, padding)
        return inputs

    def _get_conv_indices(self, feature_map_size):
        """the x, y coordinates in the window when a filter sliding on the feature map

        :param feature_map_size:
        :return: y, x with shape [1, out_h, out_w, filter_h * filter_w]
        """
        feat_h, feat_w = [int(i) for i in feature_map_size[0: 2]]

        x, y = tf.meshgrid(tf.range(feat_w), tf.range(feat_h))
        x, y = [tf.reshape(i, [1, *i.get_shape(), 1]) for i in [x, y]]  # shape [1, h, w, 1]
        x, y = [tf.image.extract_patches(i,
                                               [1, *self.kernel_size, 1],
                                               [1, *self.strides, 1],
                                               [1, *self.dilation_rate, 1],
                                               'VALID')
                for i in [x, y]]  # shape [1, out_h, out_w, filter_h * filter_w]
        return y, x

    @staticmethod
    def _get_pixel_values_at_point(inputs, indices):
        """get pixel values

        :param inputs:
        :param indices: shape [batch_size, H, W, I], I = filter_h * filter_w * channel_out
        :return:
        """
        y, x = indices
        batch, h, w, n = y.get_shape().as_list()[0: 4]

        batch_idx = tf.reshape(tf.range(0, batch), (batch, 1, 1, 1))
        b = tf.tile(batch_idx, (1, h, w, n))
        pixel_idx = tf.stack([b, y, x], axis=-1)
        return tf.gather_nd(inputs, pixel_idx)


if __name__ == '__main__':
    layer = DeformableConvLayer(32, [5, 5])
    layer.build([16])
    # layer.call()
    print(layer.offset_layer_kernel)
    # print(layer)


## Deformable Conv Layer Torch


In [None]:
class DeformableConvLayer(Conv2d):

    def __init__(
            self,
            in_channels,
            filters,
            kernel_size,
            strides=(1, 1),
            padding='same',
            # data_format=None,
            dilation_rate=(1,1),
            num_deformable_group = None,
            # activation = None,
            use_bias = True,
            **kwargs
    ):
        """`kernel_size`, `strides` and `dilation_rate` must have the same value in both axis.
        
        :param num_deformable_group: split output channels into groups, offset shared in each group. If 
        this parameter is None, then  set num_deformable_group=filters.
        """
        # Conv2d()
        super().__init__(
            in_channels=in_channels,
            out_channels=filters,
            kernel_size=kernel_size,
            stride=strides,
            padding=padding,
            # data_format=data_format,
            dilation=dilation_rate,
            # activation=activation,
            bias=use_bias,
            **kwargs)
        self.kernel = None
        self.bias_tensor = None
        self.offset_layer_kernel = None
        self.offsetlayer_bias = None
        self.use_bias = use_bias
        if num_deformable_group is None:
            num_deformable_group = filters
        if filters % num_deformable_group != 0:
            raise ValueError('"filters" mod "num_deformable_group must be zero')
        self.num_deformable_group = num_deformable_group
    
    def build(self, input_shape):
        input_dim = int(input_shape[-1])
        #kernel_shape = self.kernel_size + (input_dim, self.filters)
        # we want to use depth-wise conv
        kernel_shape = self.kernel_size + (self.out_channels * input_dim, 1)
        self.kernel = nn.Parameter(
            torch.zeros(kernel_shape, dtype=torch.float, requires_grad=True)
        )
        nn.init.xavier_uniform_(self.kernel)
        print(self.bias)
        if self.use_bias:
            self.bias = nn.Parameter(
                torch.zeros(kernel_shape, dtype=torch.float),
                requires_grad=True
            )
        nn.init.zeros_(self.bias)
        
        # create offset conv layer 
        offset_num = self.kernel_size[0] * self.kernel_size[1] * self.num_deformable_group
        self.offset_layer_kernel = nn.Parameter(
            torch.zeros(self.kernel_size + (input_dim, offset_num * 2), dtype=torch.float), # 2 mean x and y axis 
            requires_grad=True
                    )
        nn.init.zeros_(self.offset_layer_kernel)

        self.offset_layer_bias = nn.Parameter(
            torch.zeros(offset_num * 2,),
            requires_grad=True
        )
        nn.init.zeros_(self.offset_layer_bias)

    def forward(self, inputs, training=None, **kwargs):
        print(f"inputs shape is {inputs.shape}")
        #get offset shape [batch_size, out_h, out_w, filter_h * filter_w * chanel_out * 2]
        # offset = nn.Conv2d(inputs, 
        #           self.offset_layer_kernel, 
        #           bias=self.offset_layer_bias, 
        #           stride=self.strides, 
        #           padding=self.padding, 
        #           dilation=[1, self.dilation_rate, 1])
        offset_num = self.kernel_size[0] * self.kernel_size[1] * self.num_deformable_group
        
        offset = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels= offset_num * 2,  # We want 2 channels for each offset (x, y)
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True  # We keep bias True as in TensorFlow
        )

        # Manually set the kernel weights and biases
        offset.weight = self.offset_layer_kernel
        offset.bias = self.offset_layer_bias
        offset += self.offset_layer_bias
        
        # add padding if needed 
        inputs =  self._pad_input(inputs)

        # handle batch size None
        if input.shape[0] is None:
            batch_size = 1
        else:
            batch_size = int(inputs.shape[0])
        channel_in = int(inputs.shape[-1])
        in_h, in_w = [int(i) for i in input.shape[1: 3]] # input feature map size 
        out_w, out_h = [int(i) for i in offset.shape[1: 3]] # output feature map size 
        filter_h, filter_w = self.kernel_size

        # get x, y axis offset 
        offset = torch.reshape(offset, [batch_size, out_h, out_w, -1, 2]) 
        y_off, x_off = offset[:, :, :, :, 0], offset[:, :, :, :, 0]

        # input features 
        y, x = self._get_conv_indices([in_h, in_w])
        y, x = [torch.tile(i, [batch_size, 1, 1, 1, self.num_deformable_group]) for i in [y, x]]
        y, x = [torch.reshape(i, [*i.shape[0:3], -1]) for i in [y, x]]
        y = y.to(torch.float32)
        x = x.to(torch.float32)

        # add offset
        y, x = y + y_off, x + x_off 
        y = torch.clip(y, 0, in_h - 1)
        x = torch.clip(x, 0, in_w - 1)

        # get four coordinates of points around (x, y)
        y0 = torch.floor(y).to(torch.int32)
        x0 = torch.floor(x).to(torch.int32)
        y1, x1 = y0 + 1, x0 + 1

        #clip
        y0, y1 = [torch.clip(i, 0, in_h - 1) for i in [y0, y1]]
        x0, x1 = [torch.clip(i, 0, in_w - 1) for i in [x0, x1]]

        # get pixel values 
        indices = [[y0, x0], [y0, x1], [y1, x0], [y1, x1]]
        p0, p1, p2, p3 = [DeformableConvLayer._get_pixel_values_at_point(inputs, i) for i in indices]

        # cast to float
        x0 = x0.to(torch.float32)
        x1 = x1.to(torch.float32)
        y0 = y0.to(torch.float32)
        y1 = y1.to(torch.float32) 

        # weights
        w0 = (y1 - y) * (x1 - x)
        w1 = (y1 - y) * (x - x0)
        w2 = (y - y0) * (x1 - x)
        w3 = (y - y0) * (x - x0)

        # expand dim for broadcast
        w0, w1, w2, w3 = [w.unsqueeze(-1) for w in [w0, w1, w2, w3]]

        # bilinear interpolation
        pixels = w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3

        # reshape the "big" feature map
        pixels = pixels.reshape(batch_size, out_h, out_w, filter_h, filter_w, self.num_deformable_group, channel_in)
        pixels = pixels.permute(0, 1, 3, 2, 4, 5, 6)
        pixels = pixels.reshape(batch_size, out_h * filter_h, out_w * filter_w, self.num_deformable_group, channel_in)

        # copy channels to same group
        feat_in_group = self.filters // self.num_deformable_group
        pixels = torch.tile(pixels, [1, 1, 1, 1, feat_in_group])
        pixels = torch.reshape(pixels, [batch_size, out_h * filter_h, out_w * filter_w, -1])

        out = torch.nn.functional.conv2d(pixels, self.kernel, stride=(filter_h, filter_w), groups=pixels.shape[3])

        # add the output feature maps in the same group
        out = torch.reshape(out, [batch_size, out_h, out_w, self.filters, channel_in])
        out = torch.sum(out, axis=-1)

        if self.use_bias:
            out += self.bias
        return self.activation(out)
    
    def _pad_input(self, inputs):
        """Check if input feature map needs padding, because we don't use the standart Conv() function.

        :param inputs:
        :return: padded input feature map 
        """

        #When paddin is 'same', we should pad the feature map.
        # if padding == 'same', output size should be `ceil(input / stride)`
        if self.padding == 'same':
            in_shape = inputs.shape[1:3]
            padding_list = []
            for i in range(2):
                filter_size = self.kernel_size[i]
                dilation = self.dilation_rate[i]
                dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
                same_output = (in_shape[i] + self.strides[i] - 1) // self.strides[i]
                valid_output = (in_shape[i] - dilated_filter_size + self.strides[i]) // self.strides[i]
                if same_output == valid_output:
                    padding_list += [0, 0]
                else:
                    p = dilated_filter_size - 1
                    p_0 = p // 2
                    padding_list += [p_0, p - p_0] 
    
    def _get_conv_indices(self, feature_map_size):
        """the x, y coordinates in the window when a filter sliding on the feature map

        :param feature_map_size:
        :return: y, x with shape [1, out_h, out_w, filter_h * filter_w]
        """
        feat_h, feat_w = [int(i) for i in feature_map_size[0: 2]]

        x, y = torch.meshgrid(torch.range(feat_w),  torch.range(feat_h))
        x, y = [torch.reshape(i, [1, *i.shape, 1]) for i in [x, y]]
        x, y = [nn.functional.unfold(i, 
                                     kernel_size=[1, *self.kernel_size, 1],
                                     stride=[1, *self.strides, 1],
                                     dilation=[1, *self.dilation_rate, 1],
                                     padding=0) 
                                     for i in [x, y]]
        print(f"Shapes after unfold. x.shape {x.shape} y.shape {y.shape}")
        return y, x
    
    def _get_pixel_values_at_point(inputs, indices):
        """get pixel values

        :param inputs:
        :param indices: shape [batch_size, H, W, I], I = filter_h * filter_w * chanel_out
        :return:
        """

        y, x = indices 
        batch, h, w, n = y.shape[0: 4] 

        batch_idx = torch.reshape(torch.range(0, batch), (batch, 1, 1, 1))
        b = torch.tile(batch_idx, (1, h, w, n))
        print(f"b.shape is {b.shape}")
        pixel_idx = torch.stack([b, y, x], dim=-1)
        print(f"pixel index {pixel_idx}")
        return  torch.index_select(inputs, pixel_idx)


In [None]:
input_tensor = torch.randint(0, 3, (112, 112, 3))
Conv2d()

In [110]:
layer = DeformableConvLayer(3, 32, [5, 5])
layer.build([20, 10, 3])
layer(input_tensor)
print(layer.offset_layer_kernel)
# print(layer)

Parameter containing:
tensor([ 0.0258,  0.0370, -0.1138, -0.0252, -0.0704,  0.1035,  0.0233,  0.1130,
         0.0458, -0.0329, -0.0951, -0.0125, -0.0268, -0.0197, -0.0618, -0.0790,
         0.0018, -0.0489, -0.0547,  0.0347, -0.0726, -0.1150,  0.0094, -0.0300,
         0.0680,  0.0602,  0.0736,  0.0996,  0.0058, -0.0634,  0.0613,  0.0226],
       requires_grad=True)
inputs shape is torch.Size([112, 112, 3])


AttributeError: 'Conv2d' object has no attribute 'shape'