### `Conv2d` result comparison:
Prepare the weight, bias, input, and output-gradient data shared by both implementations.

In [1]:
import numpy as np

# Seeded generator so the tortto-vs-PyTorch comparison uses the same data on every run.
SEED = 0
rng = np.random.default_rng(SEED)

# Input batch, laid out as (N, C_in, H, W).
data = rng.standard_normal((128, 4, 64, 64), dtype=np.float32)
# Conv2d weight: (out_channels, in_channels // groups, kH, kW) = (16, 4 // 2, 3, 2).
weight = rng.standard_normal((16, 2, 3, 2), dtype=np.float32)
# One bias value per output channel.
bias = rng.standard_normal(16, dtype=np.float32)
# Gradient w.r.t. the Conv2d output, (N, C_out, H_out, W_out), for the layer
# configured in the cells below (stride=(1, 3), padding=(2, 3), dilation=(1, 2)):
#   H_out = (64 + 2*2 - 1*(3 - 1) - 1) // 1 + 1 = 66
#   W_out = (64 + 2*3 - 2*(2 - 1) - 1) // 3 + 1 = 23
grad = rng.standard_normal((128, 16, 66, 23), dtype=np.float32)

tortto `Conv2d`:

In [2]:
import tortto as tt
import tortto.nn as nn

# Layer hyper-parameters, collected once and unpacked into the constructor.
tortto_conv_args = dict(
    in_channels=4,
    out_channels=16,
    kernel_size=(3, 2),
    stride=(1, 3),
    padding=(2, 3),
    dilation=(1, 2),
    groups=2,
    bias=True,
    padding_mode='zeros',  # only support zero padding for now
)
m = nn.Conv2d(**tortto_conv_args)

# Overwrite the layer's weight and bias with the shared NumPy arrays.
m.weight = nn.Parameter(tt.tensor(weight))
m.bias = nn.Parameter(tt.tensor(bias))

# Forward pass on the shared input, then backward with the shared output gradient.
x = tt.tensor(data, requires_grad=True)
y = m(x)
grad_tt = tt.tensor(grad)
y.backward(grad_tt)

PyTorch `Conv2d`:

In [3]:
import torch

# Same hyper-parameters as the tortto layer, unpacked into the PyTorch constructor.
torch_conv_args = dict(
    in_channels=4,
    out_channels=16,
    kernel_size=(3, 2),
    stride=(1, 3),
    padding=(2, 3),
    dilation=(1, 2),
    groups=2,
    bias=True,
    padding_mode='zeros',
)
m_torch = torch.nn.Conv2d(**torch_conv_args)

# Overwrite the layer's weight and bias with the shared NumPy arrays.
m_torch.weight = torch.nn.Parameter(torch.tensor(weight))
m_torch.bias = torch.nn.Parameter(torch.tensor(bias))

# Forward pass on the shared input, then backward with the shared output gradient.
x_torch = torch.tensor(data, requires_grad=True)
y_torch = m_torch(x_torch)
grad_torch = torch.tensor(grad)
y_torch.backward(grad_torch)

Output and gradient comparison:

In [4]:
def report_match(actual, reference, rtol):
    """Print whether `actual` equals the PyTorch tensor `reference` within tolerance.

    `reference` is detached and converted to NumPy before the comparison; the
    absolute tolerance is fixed at 1e-5 and the relative tolerance is `rtol`.
    """
    print(np.allclose(actual, reference.detach().numpy(), atol=1e-5, rtol=rtol))


report_match(y.detach().numpy(), y_torch, rtol=1e-5)         # output
report_match(x.grad, x_torch.grad, rtol=1e-5)                # input grad
report_match(m.weight.grad, m_torch.weight.grad, rtol=1e-3)  # weight grad
report_match(m.bias.grad, m_torch.bias.grad, rtol=1e-3)      # bias grad

True
True
True
True


### `Conv2d` speed comparison on GPU:

In [5]:
# List the GPUs reported by `nvidia-smi` (shell command run through IPython's `!` escape).
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 2080 (UUID: GPU-d7a0f83a-dd13-2c12-cf81-2d76458e89a8)


In [6]:
from cupyx.profiler import benchmark

# Call `.cuda()` on both layers once, outside the timed functions.
m = m.cuda()
m_torch = m_torch.cuda()


def tortto_conv2d_gpu():
    """Run one tortto forward + backward pass through `m`.

    The input and gradient tensors are built from the NumPy arrays and sent
    through `.cuda()` inside this function, so that work is part of the
    measured time as well.
    """
    inp = tt.tensor(data, requires_grad=True).cuda()
    out = m(inp)
    upstream = tt.tensor(grad).cuda()
    out.backward(upstream)


def torch_conv2d_gpu():
    """Run one PyTorch forward + backward pass through `m_torch`.

    Mirrors `tortto_conv2d_gpu`: tensor construction and the `.cuda()` calls
    happen inside the function and are therefore included in the timing.
    """
    inp = torch.tensor(data, requires_grad=True).cuda()
    out = m_torch(inp)
    upstream = torch.tensor(grad).cuda()
    out.backward(upstream)


# Time each implementation over 50 repetitions and print the summaries.
tortto_timing = benchmark(tortto_conv2d_gpu, (), n_repeat=50)
print(tortto_timing, '\n')
torch_timing = benchmark(torch_conv2d_gpu, (), n_repeat=50)
print(torch_timing)

tortto_conv2d_gpu   :    CPU: 11462.856 us   +/- 303.534 (min: 11264.330 / max: 12227.156) us     GPU-0: 11468.411 us   +/- 303.613 (min: 11269.728 / max: 12232.576) us 

torch_conv2d_gpu    :    CPU: 10081.922 us   +/- 50.814 (min: 10002.339 / max: 10189.587) us     GPU-0: 10094.026 us   +/- 50.711 (min: 10015.264 / max: 10201.856) us


As shown above, in this example tortto `Conv2d` is only slightly slower than PyTorch (about 11.5 ms vs. 10.1 ms per timed iteration, i.e. roughly 14% slower).  
In real training scenarios, it is less than 3 times slower than PyTorch.