In [2]:
%%writefile test_kernel.cu
#include <torch/extension.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <limits>

// CUDA kernel: writes the element-wise square of `input` into `output`.
//
// Each thread handles at most one element, selected by its global 1-D thread
// index; threads whose index is >= `size` do nothing.
//
//   input  - pointer to the `size` source elements
//   output - pointer to the `size` destination elements
//   size   - number of elements to process
template <typename scalar_t>
__global__ void square_cuda_kernel(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    int size) {

    // Form the global index in 64-bit arithmetic. A 32-bit signed index turns
    // negative once the grid covers more than INT_MAX threads, which would
    // slip past the bounds check below and access memory out of range.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < size) {
        const scalar_t value = input[idx];
        output[idx] = value * value;
    }
}

// Host-side helper that validates the tensors and launches square_cuda_kernel.
//
//   input  - source tensor; must be a contiguous CUDA tensor of float or double
//   output - destination tensor; must live on the same device as `input` and
//            have the same dtype and element count, and be contiguous
//
// Raises a c10::Error (via TORCH_CHECK) when a precondition is violated or the
// kernel launch fails. Empty tensors are a no-op.
void square_cuda_launcher(torch::Tensor input, torch::Tensor output) {
    TORCH_CHECK(input.is_cuda(), "square_cuda: input must be a CUDA tensor");
    TORCH_CHECK(output.is_cuda(), "square_cuda: output must be a CUDA tensor");
    TORCH_CHECK(input.device() == output.device(),
                "square_cuda: input and output must be on the same device");
    // The kernel indexes raw memory linearly, so strided layouts would be
    // read/written at the wrong offsets.
    TORCH_CHECK(input.is_contiguous(), "square_cuda: input must be contiguous");
    TORCH_CHECK(output.is_contiguous(), "square_cuda: output must be contiguous");
    TORCH_CHECK(input.scalar_type() == output.scalar_type(),
                "square_cuda: input and output must have the same dtype");
    TORCH_CHECK(input.numel() == output.numel(),
                "square_cuda: input and output must have the same number of elements");

    const auto numel = input.numel();
    if (numel == 0) {
        // A grid with zero blocks is an invalid launch configuration.
        return;
    }
    // The kernel takes its element count as `int`; refuse to truncate silently.
    TORCH_CHECK(numel <= std::numeric_limits<int>::max(),
                "square_cuda: tensors with more than INT_MAX elements are not supported");

    const int size = static_cast<int>(numel);
    const int threads = 256;
    // Ceil-divide in the wide type so `numel + threads - 1` cannot overflow int.
    const int blocks = static_cast<int>((numel + threads - 1) / threads);

    // Run on the tensor's device and on PyTorch's current stream so the kernel
    // is ordered correctly with respect to other PyTorch work.
    const c10::cuda::CUDAGuard device_guard(input.device());
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // AT_DISPATCH_FLOATING_TYPES is a PyTorch macro that dispatches on the
    // runtime dtype (float/double) and defines `scalar_t` inside the lambda.
    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "square_cuda", ([&] {
        square_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            size);
        // Surface launch-configuration errors immediately instead of at some
        // later, unrelated CUDA call.
        C10_CUDA_KERNEL_LAUNCH_CHECK();
    }));
}

Writing test_kernel.cu


In [9]:
!pip install ninja
!ls
!ls /usr/local/bin/python -l

sample_data  test_ext.cpp  test_kernel.cu
-r-xr-xr-x 1 root root 155 Jan  1  2000 /usr/local/bin/python


In [5]:
import torch
from torch.utils.cpp_extension import load

# 这里会自动编译 my_kernel.cu 和 my_ext.cpp
# verbose=True 可以看到编译过程的日志，便于排错
my_custom_op = load(
    name="my_custom_op_jit",
    sources=["test_ext.cpp", "test_kernel.cu"],
    verbose=True
)
x = torch.rand(1024, 1024, device='cuda')

%timeit y = my_custom_op.square(x)
%timeit y = x*x

ImportError: /root/.cache/torch_extensions/py312_cu126/my_custom_op_jit/my_custom_op_jit.so: cannot open shared object file: No such file or directory