In [None]:
%%writefile elementwise_fp32.cu
#include <limits>

#include <torch/extension.h>
#include <cuda_runtime.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

// Element-wise float32 addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Each thread handles at most one element, selected by its global thread
// index; threads whose index is >= N do nothing, so the launch grid may be
// rounded up past N.
//
//   A, B : input arrays, read at indices [0, N)
//   C    : output array, written at indices [0, N)
//   N    : number of elements to process
__global__ void elementwise_add_f32_kernel(const float* A, const float* B, float* C, int N) {
    // Build the global index in 64 bits. blockIdx.x * blockDim.x is an
    // unsigned 32-bit product; narrowing it to int turns any value above
    // INT_MAX into a negative number, which would pass the `idx < N` test
    // and index out of bounds.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

// Host-side launcher for elementwise_add_f32_kernel: writes A + B into C.
//
//   A, B : input tensors
//   C    : output tensor, overwritten in place
//
// All three tensors must be contiguous float32 CUDA tensors on the same
// device with the same number of elements; a c10::Error is raised otherwise.
// An empty input is a no-op. The kernel is enqueued on PyTorch's current
// stream for the tensors' device and is not synchronized here.
void elementwise_add_f32(torch::Tensor A, torch::Tensor B, torch::Tensor C) {
    TORCH_CHECK(A.is_cuda() && B.is_cuda() && C.is_cuda(),
                "elementwise_add_f32: A, B and C must be CUDA tensors");
    TORCH_CHECK(A.device() == B.device() && A.device() == C.device(),
                "elementwise_add_f32: A, B and C must be on the same device");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 &&
                B.scalar_type() == torch::kFloat32 &&
                C.scalar_type() == torch::kFloat32,
                "elementwise_add_f32: A, B and C must be float32");
    // The kernel indexes raw memory linearly, so strided views would be
    // read/written at the wrong addresses.
    TORCH_CHECK(A.is_contiguous() && B.is_contiguous() && C.is_contiguous(),
                "elementwise_add_f32: A, B and C must be contiguous");
    TORCH_CHECK(A.numel() == B.numel() && A.numel() == C.numel(),
                "elementwise_add_f32: A, B and C must have the same number of elements");

    const int threads = 256;
    const int64_t numel = A.numel();
    if (numel == 0) {
        return;  // a grid of 0 blocks is an invalid launch configuration
    }
    // The kernel takes an int count; also keep the rounded-up thread count
    // (numel + threads - 1) representable in int.
    TORCH_CHECK(numel <= static_cast<int64_t>(std::numeric_limits<int>::max()) - (threads - 1),
                "elementwise_add_f32: too many elements for a 32-bit element count: ", numel);

    const int N = static_cast<int>(numel);
    const int blocks = (N + threads - 1) / threads;  // ceil(N / threads)

    // Launch on the tensors' device and on PyTorch's current stream so the
    // kernel is ordered with the surrounding PyTorch operations.
    const c10::cuda::CUDAGuard device_guard(A.device());
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    elementwise_add_f32_kernel<<<blocks, threads, 0, stream>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        N
    );
    // Surface launch-configuration errors immediately instead of at some
    // later, unrelated CUDA call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
}

// Python bindings: registers elementwise_add_f32 on the extension module.
// The module name comes from the TORCH_EXTENSION_NAME macro, which is not
// defined in this file.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "elementwise_add_f32",   // name visible from Python
        &elementwise_add_f32,    // host launcher defined above
        "Elementwise Add f32"    // Python-side docstring
    );
}


In [None]:
# 测试代码如下
import time
import torch
from torch.utils.cpp_extension import load

# Benchmark only: switch autograd off for the whole session.
torch.set_grad_enabled(False)

# Build the CUDA source written by the previous cell and import the resulting
# extension module; verbose=True echoes the build log.
lib = load(
    "elementwise_lib",
    ["elementwise_fp32.cu"],
    extra_cflags=["-std=c++17"],
    extra_cuda_cflags=["-O3"],
    verbose=True,
)

def run_benchmark(func, a, b, out=None, iters=1000, warmup=10):
    """Time ``func(a, b, out)`` and report the mean latency per call.

    ``func`` is first called ``warmup`` times without being timed, then called
    ``iters`` times between two timestamps. When CUDA is available the device
    is synchronized before the first timestamp and before the second, so that
    asynchronously launched GPU work is fully included in the measurement.

    Args:
        func: Callable invoked as ``func(a, b, out)``; its return value is ignored.
        a: First argument forwarded to ``func``.
        b: Second argument forwarded to ``func``.
        out: Third argument forwarded to ``func`` (defaults to ``None``).
        iters: Number of timed calls; must be positive.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Mean wall-clock time per call, in milliseconds. The same value is
        also printed.

    Raises:
        ValueError: If ``iters`` is not positive.
    """
    if iters <= 0:
        raise ValueError(f"iters must be a positive integer, got {iters}")

    # Without a GPU there is nothing to wait for, and synchronize() would raise.
    use_cuda = torch.cuda.is_available()

    # warmup
    for _ in range(warmup):
        func(a, b, out)
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution; time.time() is neither.
    start = time.perf_counter()
    for _ in range(iters):
        func(a, b, out)
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    mean_time = elapsed * 1000 / iters
    print(f"Time per call: {mean_time:.6f} ms")
    return mean_time

# Test data
N = 1024 * 1024  # 2**20 (about one million) elements
a = torch.randn(N, device="cuda", dtype=torch.float32)
b = torch.randn(N, device="cuda", dtype=torch.float32)
c = torch.zeros_like(a)      # written only by the custom kernel
c_ref = torch.zeros_like(a)  # written only by torch.add

# Benchmark the custom kernel
run_benchmark(lib.elementwise_add_f32, a, b, c)

# Benchmark PyTorch's built-in add. It gets its own output buffer: writing
# into `c` here would overwrite the custom kernel's result and make the
# check below compare PyTorch against itself.
run_benchmark(lambda x, y, z: torch.add(x, y, out=z), a, b, c_ref)

# Check the custom kernel's output against the reference sum
print("验证结果是否一致:", torch.allclose(a + b, c))
