In [7]:
import os

import numpy as np
import torch

from torch.nn.functional import conv2d
from torch.utils.cpp_extension import load_inline
from torch.profiler import profile, record_function, ProfilerActivity

In [8]:
def trace_handler(prof):
    """Print the profiler summary and dump a Chrome trace for the finished cycle.

    Args:
        prof: the active profiler object handed to ``on_trace_ready``; it must
            provide ``key_averages()``, ``export_chrome_trace()`` and ``step_num``.

    Side effects:
        Prints the per-operator table sorted by self CUDA time and writes
        ``tmp/test_trace_<step_num>.json`` relative to the working directory.
    """
    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
    # Make sure the output directory exists so the export does not fail on a
    # fresh checkout where "tmp" has never been created.
    os.makedirs("tmp", exist_ok=True)
    prof.export_chrome_trace("tmp/test_trace_" + str(prof.step_num) + ".json")

def profile_func(func, *tensors, trace_handler=trace_handler):
    """Call ``func(*tensors)`` ten times under the PyTorch profiler.

    The schedule uses wait=1, warmup=1, active=2, repeat=1: the profiler skips
    the first step, warms up on the second and records the third and fourth.
    Once those are recorded the trace becomes available and ``trace_handler``
    is invoked with the profiler.

    Args:
        func: callable to profile.
        *tensors: positional arguments forwarded to ``func`` on every step.
        trace_handler: callback passed to the profiler as ``on_trace_ready``.
    """
    activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    schedule = torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1)
    # For TensorBoard output, use
    # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log') instead.
    profiler = torch.profiler.profile(
        activities=activities,
        schedule=schedule,
        on_trace_ready=trace_handler,
    )
    with profiler as p:
        for _ in range(10):
            func(*tensors)
            # Tell the profiler that the next iteration is about to start.
            p.step()

In [9]:
cuda_source = '''
// 3x3 "same" convolution (no filter flip, zero padding) of a 2-D float matrix,
// using shared-memory input tiles and a filter stored in constant memory.
#define FILTER_RADIUS 1
#define IN_TILE_DIM 5
// A block is IN_TILE_DIM x IN_TILE_DIM threads and loads a halo of
// FILTER_RADIUS cells on every side, so it only produces
// OUT_TILE_DIM x OUT_TILE_DIM output elements.
#define OUT_TILE_DIM ((IN_TILE_DIM) - 2 * (FILTER_RADIUS))

__constant__ float F_c[2*FILTER_RADIUS+1][2*FILTER_RADIUS+1] = {{1,2,3},
                                                                {4,5,6},
                                                                {7,8,9}};

// N: input matrix, P: output matrix, both row-major with `height` rows of
// `width` elements.
__global__ void convolution_tiled_2D_const_mem_kernel(float* N, float* P,
													  int width, int height) {
	int col = blockIdx.x * OUT_TILE_DIM + threadIdx.x - FILTER_RADIUS;
	int row = blockIdx.y * OUT_TILE_DIM + threadIdx.y - FILTER_RADIUS;
	// Loading input tile; cells outside the matrix act as zero padding
	__shared__ float N_s[IN_TILE_DIM][IN_TILE_DIM];
	if (row >= 0 && row < height && col >= 0 && col < width) {
		N_s[threadIdx.y][threadIdx.x] = N[row * width + col];
	} else {
		N_s[threadIdx.y][threadIdx.x] = 0.0f;
	}
	__syncthreads();
	// Calculating output elements
	int tileCol = threadIdx.x - FILTER_RADIUS;
	int tileRow = threadIdx.y - FILTER_RADIUS;
	// Turning off the threads that fall outside the matrix ...
	if (col >= 0 && col < width && row >= 0 && row < height) {
		// ... and the halo threads at the edges of the block
		if (tileCol >= 0 && tileCol < OUT_TILE_DIM && tileRow >= 0
								&& tileRow < OUT_TILE_DIM) {
			float Pvalue = 0.0f;
			for (int fRow = 0; fRow < 2 * FILTER_RADIUS + 1; fRow++) {
				for (int fCol = 0; fCol < 2 * FILTER_RADIUS + 1; fCol++) {
					Pvalue += F_c[fRow][fCol] * N_s[tileRow+fRow][tileCol+fCol];
				}
			}
			P[row*width+col] = Pvalue;
		}
	}
}

// Host entry point: returns a new tensor with the same shape as `matrix`.
torch::Tensor conv2d(torch::Tensor matrix) {
    TORCH_CHECK(matrix.is_cuda(), "conv2d: input must be a CUDA tensor");
    TORCH_CHECK(matrix.dim() == 2, "conv2d: input must be a 2-D matrix");
    TORCH_CHECK(matrix.scalar_type() == at::kFloat, "conv2d: input must be float32");

    // The kernel indexes the buffer as dense row-major memory.
    auto input = matrix.contiguous();
    const int height = static_cast<int>(input.size(0));
    const int width = static_cast<int>(input.size(1));

    auto result = torch::empty_like(input);
    if (height == 0 || width == 0) {
        return result;  // nothing to compute; a zero-sized grid is not launchable
    }

    dim3 threads_per_block(IN_TILE_DIM, IN_TILE_DIM); // launches thread blocks whose dimension matches that of the input tiles
    // Every block only writes OUT_TILE_DIM outputs per axis, so the grid has
    // to be sized by the output tile to cover the whole matrix.
    dim3 number_of_blocks((width + OUT_TILE_DIM - 1) / OUT_TILE_DIM,
                          (height + OUT_TILE_DIM - 1) / OUT_TILE_DIM);

    convolution_tiled_2D_const_mem_kernel<<<number_of_blocks, threads_per_block>>>(
        input.data_ptr<float>(), result.data_ptr<float>(), width, height);

    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "conv2d: kernel launch failed: ", cudaGetErrorString(launch_status));

    return result;
    }
'''

cpp_source = "torch::Tensor conv2d(torch::Tensor matrix);"

In [10]:
# load_inline writes its generated sources into build_directory and expects the
# directory to exist already, so create it before compiling.
os.makedirs("tmp", exist_ok=True)

# Compile the inline CUDA/C++ sources and expose the C++ `conv2d` function
# as `conv2d_extension.conv2d`.
conv2d_extension = load_inline(
    name="conv2d_extension",
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=["conv2d"],
    with_cuda=True,
    extra_cuda_cflags=[
        "-O2",
        # "-allow-unsupported-compiler",
        # "'--expt-relaxed-constexpr'",
    ],
    build_directory="tmp",
)


ImportError: /home/panagiotis/Desktop/dev/git/cuda/tmp/conv2d_extension_v2.so: cannot open shared object file: No such file or directory

In [5]:
# Small 3x3 input on the GPU, used as a smoke test for the custom kernel.
a = torch.tensor(
    [
        [1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9],
    ],
    device='cuda',
)
print(conv2d_extension.conv2d(a))

tensor([[ 94., 154., 106.],
        [186., 285., 186.],
        [106., 154.,  94.]], device='cuda:0')


In [6]:
%%timeit
conv2d_extension.conv2d(a)

5.63 µs ± 326 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [7]:
# Reference filter: the same 1..9 coefficients as the F_c constant in cuda_source.
weight = torch.arange(1., 10., device='cuda').reshape(3, 3)
weight

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]], device='cuda:0')

In [8]:
# torch's conv2d wants (batch, channel, height, width) inputs and
# (out_channels, in_channels, kH, kW) weights, hence the two leading unit axes.
conv2d(a[None, None], weight[None, None], padding='same')

tensor([[[[ 94., 154., 106.],
          [186., 285., 186.],
          [106., 154.,  94.]]]], device='cuda:0')

In [9]:
%%timeit
conv2d(a.reshape((1,1,3,3)), weight.reshape((1,1,3,3)), padding='same')

15.8 µs ± 956 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


### Try on a bigger matrix

In [23]:
# Seeded local generator so the random 100x100 input (and every result derived
# from it) is reproducible without touching the global RNG state.
a_100 = torch.randint(100, (100, 100), generator=torch.Generator().manual_seed(0)).to('cuda').float()

In [29]:
# Run the custom CUDA kernel on the 100x100 input. The displayed repr is
# truncated, so only the corner values of the result are visible here.
conv2d_extension.conv2d(a_100)

tensor([[1892., 2686., 2242.,  ..., 1579., 1087.,  584.],
        [2269., 3042., 2179.,  ..., 2061., 1487.,  991.],
        [2435., 3080., 2394.,  ..., 2737., 2559., 1880.],
        ...,
        [2079., 2336., 1982.,  ..., 1515., 1633., 1080.],
        [1703., 2401., 2168.,  ..., 2826., 2959., 1857.],
        [ 724., 1169., 1061.,  ..., 1452., 1615.,  977.]], device='cuda:0')

In [13]:
%%timeit
conv2d_extension.conv2d(a_100)

5.76 µs ± 612 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [25]:
reference_100 = conv2d(a_100.reshape((1,1,100,100)), weight.reshape((1,1,3,3)), padding='same')
# The truncated reprs only show the corners of each result, so compare every
# element of the custom kernel's output against the PyTorch reference.
torch.testing.assert_close(conv2d_extension.conv2d(a_100), reference_100.reshape(100, 100))
reference_100

tensor([[[[1892., 2686., 2242.,  ..., 1579., 1087.,  584.],
          [2269., 3042., 2179.,  ..., 2061., 1487.,  991.],
          [2435., 3080., 2394.,  ..., 2737., 2559., 1880.],
          ...,
          [2079., 2336., 1982.,  ..., 1515., 1633., 1080.],
          [1703., 2401., 2168.,  ..., 2826., 2959., 1857.],
          [ 724., 1169., 1061.,  ..., 1452., 1615.,  977.]]]], device='cuda:0')

In [14]:
%%timeit
conv2d(a_100.reshape((1,1,100,100)), weight.reshape((1,1,3,3)), padding='same')

14.7 µs ± 926 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
