In [1]:
import os, sys
sys.path.insert(0, '..')

In [2]:
from pathlib import Path
import torch
from torchvision.io import read_image, write_png
from profiling.profiler import profile
from utils import *

In [3]:
from numba import cuda
from numba.cuda import as_cuda_array as ca

<img width="500" src="../images/image.png" id="jupyter"/>

In [14]:
# 2D fixture: a single-channel, bias-free PyTorch convolution used as the
# reference implementation, a random 1000x2000 input, and the layer's filter
# extracted as a plain 2D tensor for the hand-written kernels.
k_size = 3
conv = torch.nn.Conv2d(
    in_channels=1,
    out_channels=1,
    kernel_size=k_size,
    padding=k_size // 2,  # with the default stride, keeps output size == input size for odd k_size
    bias=False,
).cuda()
m1 = torch.rand(1000, 2000).contiguous().cuda()
# Conv2d weights are laid out (out_channels, in_channels, kH, kW); [0][0] selects the only 2D filter.
f = conv.weight[0][0].detach().contiguous().cuda()

## Basic convolution kernel (without shared memory)

Start with a Numba implementation, which is easier to debug.

In [5]:
@cuda.jit
def conv2d_k(m, f, out, r):
    """Numba CUDA kernel: each thread computes one element of `out`.

    m   -- 2D input array, indexed [row, col]
    f   -- 2D filter array
    out -- 2D output array, written in place
    r   -- offset subtracted from both the row and the column index, so that
           filter element [r, r] lines up with the output position

    Input positions that fall outside `m` are skipped, which is equivalent to
    treating the input as zero-padded.
    """
    # First grid axis maps to the output row, second to the output column.
    out_r, out_c = cuda.grid(2)

    # Threads that land outside the output have nothing to do.
    if out_r >= out.shape[0] or out_c >= out.shape[1]:
        return

    acc = 0
    for fi in range(f.shape[0]):
        src_r = out_r + fi - r
        # The whole filter row lies outside the input: none of its terms contribute.
        if src_r < 0 or src_r >= m.shape[0]:
            continue
        for fj in range(f.shape[1]):
            src_c = out_c + fj - r
            if 0 <= src_c < m.shape[1]:
                acc += m[src_r, src_c] * f[fi, fj]

    out[out_r, out_c] = acc


def conv_2d(m, f, block_size=32):
    """Convolve a 2D CUDA tensor with a 2D filter using the Numba kernel `conv2d_k`.

    Args:
        m: 2D CUDA tensor, indexed [row, col].
        f: 2D CUDA tensor holding the filter. The offset passed to the kernel
            is ``f.shape[0] // 2``, applied to both rows and columns.
        block_size: edge length of the square thread block. A block holds
            ``block_size ** 2`` threads and the TOTAL block size is limited to
            1024 threads, so values above 32 will fail at launch.

    Returns:
        A new contiguous tensor with the same shape, dtype and device as `m`.
    """
    h, w = m.shape
    # The kernel assigns every in-bounds output element exactly once, so the
    # buffer does not need an extra zero-fill pass before the launch.
    out = torch.empty(h, w, dtype=m.dtype, device=m.device)
    if h == 0 or w == 0:
        # Nothing to compute; a zero-sized grid is not a valid launch configuration.
        return out
    # One thread per output element; round the grid up so partial tiles are covered.
    blocks = cdiv(h, block_size), cdiv(w, block_size)
    conv2d_k[blocks, (block_size, block_size)](ca(m), ca(f), ca(out), f.shape[0] // 2)
    return out

In [6]:
# Sanity check: the Numba kernel must agree with PyTorch's Conv2d on every element.
# m1[None,] adds the leading channel axis Conv2d expects; its (1, H, W) result
# broadcasts against the (H, W) tensor returned by conv_2d.
torch.isclose(conv(m1[None,]), conv_2d(m1,f), atol=1e-7).all()

tensor(True, device='cuda:0')

In [7]:
# %timeit conv_2d(m1,f)

In [18]:
# Baseline: time PyTorch's own convolution (no_grad skips autograd bookkeeping).
# NOTE(review): CUDA kernels launch asynchronously and nothing here calls
# torch.cuda.synchronize(); confirm the figure reflects kernel time, not just launch overhead.
%timeit with torch.no_grad(): conv(m1[None,])

113 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


Now rewrite it as a CUDA kernel.

In [8]:
# Load the 2D CUDA source; the cells below call mod.conv2d and mod.conv2d_shared.
# presumably load_cu_file (from utils) compiles the file into a callable extension module — TODO confirm
mod = load_cu_file('./conv2d.cu')

In [15]:
# Sanity check: the CUDA kernel must match PyTorch's Conv2d element-wise (default isclose tolerances).
torch.isclose(conv(m1[None,]), mod.conv2d(m1,f)).all()

tensor(True, device='cuda:0')

We see that we're slightly slower than PyTorch (154 µs vs. 113 µs).

In [16]:
# Time the naive (global-memory only) CUDA kernel on the same input as the PyTorch baseline.
# NOTE(review): no explicit torch.cuda.synchronize() here — verify whether mod.conv2d blocks until the kernel finishes.
%timeit mod.conv2d(m1,f)

154 µs ± 254 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## With shared memory

Now we're going to use tiled convolution, where the threads of a block cooperatively load a patch (tile) of the matrix into shared memory and then reuse it when computing the convolution. Another option is to also load the padding (halo) cells around each tile so that the tile's edges are fully covered, but for big matrices those reads already have a high chance of hitting the L2 cache.

In [17]:
# Sanity check: the shared-memory (tiled) kernel must also match PyTorch's Conv2d element-wise.
torch.isclose(conv(m1[None,]), mod.conv2d_shared(m1,f)).all()

tensor(True, device='cuda:0')

Unexpectedly, the tiled convolution runs slower than the naive one (185 µs vs. 154 µs).

In [20]:
# Re-time the naive kernel so it can be compared directly with the tiled version timed in the next cell.
%timeit mod.conv2d(m1, f)

154 µs ± 463 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [19]:
# Time the shared-memory (tiled) kernel on the same input and filter as the naive kernel.
# NOTE(review): no explicit torch.cuda.synchronize() here — verify whether mod.conv2d_shared blocks until the kernel finishes.
%timeit mod.conv2d_shared(m1, f)

185 µs ± 909 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [44]:
from functools import partial
# Profile the same call that is validated and timed in this notebook: mod.conv2d(m1, f).
# Every other call site passes the 2-D m1; binding m1[None,] here would hand the
# kernel wrapper a 3-D tensor and profile a different workload.
# presumably profile() forwards its remaining arguments to the callable — TODO confirm
profile(partial(mod.conv2d, m1), f)

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
            ProfilerStep*        46.95%     292.000us        67.68%     421.000us     210.500us             2  
              aten::zeros         2.89%      18.000us        18.97%     118.000us      59.000us             2  
              aten::empty         4.66%      29.000us         4.66%      29.000us      14.500us             2  
              aten::zero_         1.61%      10.000us        11.41%      71.000us      35.500us             2  
              aten::fill_         3.54%      22.000us         9.81%      61.000us      30.500us             2  
         cudaLaunchKernel         8.04%      50.000us         8.04%      50.000us      12.500us         

STAGE:2024-04-15 23:31:10 38918:38918 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-04-15 23:31:10 38918:38918 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-04-15 23:31:10 38918:38918 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


# 1D conv

In [22]:
# Load the 1D CUDA source; the final cell calls mod_1d.conv1d and mod_1d.conv1d_shared.
# presumably load_cu_file (from utils) compiles the file into a callable extension module — TODO confirm
mod_1d = load_cu_file('./conv1d.cu')

In [30]:
# 1D fixture: reference Conv1d layer, a random length-2000 signal, and the layer's
# filter as a plain 1D tensor.
# NOTE: this rebinds k_size, conv, m1 and f, replacing the 2D objects created earlier
# in the notebook; re-run the 2D setup cell before going back to the 2D checks.
k_size = 3
conv = torch.nn.Conv1d(
    in_channels=1,
    out_channels=1,
    kernel_size=k_size,
    padding=k_size // 2,  # with the default stride, keeps output length == input length for odd k_size
    bias=False,
).cuda()
m1 = torch.rand(2000).contiguous().cuda()
# Conv1d weights are laid out (out_channels, in_channels, k); [0][0] selects the only filter.
f = conv.weight[0][0].detach().contiguous().cuda()

In [40]:
# Both 1D kernels (plain and shared-memory) must match PyTorch's Conv1d element-wise.
# m1[None,] adds the leading channel axis; the (1, L) reference broadcasts against each kernel's output.
assert torch.isclose(conv(m1[None,]), mod_1d.conv1d(m1,f)).all()
assert torch.isclose(conv(m1[None,]), mod_1d.conv1d_shared(m1,f)).all()