In [None]:
import torch
from torch.utils.cpp_extension import load_inline

# The extension below is compiled with nvcc and launches a kernel, so a
# CUDA-capable runtime is a hard requirement.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")

device = torch.device("cuda")

# CUDA translation unit (compiled by nvcc).
#
# Both the kernel AND the host-side launcher live here: the `<<<...>>>` launch
# syntax is only understood by nvcc, so it must not appear in `cpp_sources`,
# which is handed to the host C++ compiler.
cuda_source = """
#include <torch/types.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

// Element-wise c[i] = a[i] + b[i] for i in [0, size).
// 64-bit indexing so tensors with more than INT_MAX elements do not overflow.
__global__ void add_kernel(const float* a, const float* b, float* c, int64_t size) {
    const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

// Host launcher: writes a + b into the pre-allocated output tensor c.
void add_wrapper(torch::Tensor a, torch::Tensor b, torch::Tensor c) {
    // The kernel dereferences raw device pointers with flat indexing, so every
    // assumption it relies on is validated up front instead of crashing the GPU.
    TORCH_CHECK(a.is_cuda() && b.is_cuda() && c.is_cuda(),
                "add_wrapper: all tensors must be CUDA tensors");
    TORCH_CHECK(a.device() == b.device() && a.device() == c.device(),
                "add_wrapper: all tensors must be on the same device");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32 &&
                b.scalar_type() == torch::kFloat32 &&
                c.scalar_type() == torch::kFloat32,
                "add_wrapper: all tensors must be float32");
    TORCH_CHECK(a.is_contiguous() && b.is_contiguous() && c.is_contiguous(),
                "add_wrapper: all tensors must be contiguous");
    TORCH_CHECK(a.numel() == b.numel() && a.numel() == c.numel(),
                "add_wrapper: all tensors must have the same number of elements");

    const int64_t size = a.numel();
    if (size == 0) {
        return;  // a zero-block grid is an invalid launch configuration
    }

    // Launch on the tensors' device and on PyTorch's current stream so the
    // kernel is ordered correctly with respect to surrounding PyTorch ops.
    const c10::cuda::CUDAGuard device_guard(a.device());
    const int threads = 256;
    const unsigned int blocks =
        static_cast<unsigned int>((size + threads - 1) / threads);

    add_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        size
    );
    // Surface launch failures immediately rather than on some later CUDA call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
}
"""

# C++ translation unit (compiled by the host compiler).
#
# Only the declaration is needed here. No PYBIND11_MODULE block is written by
# hand: passing `functions=[...]` to load_inline generates the bindings, and a
# second manual module definition would clash with the generated one.
cpp_source = """
#include <torch/extension.h>

void add_wrapper(torch::Tensor a, torch::Tensor b, torch::Tensor c);
"""

# Build and load the inline extension.
# No explicit `-arch` flag is passed: PyTorch selects the architecture of the
# visible GPU (or honours TORCH_CUDA_ARCH_LIST), whereas a hard-coded
# `-arch=sm_75` yields a binary that cannot run on older devices.
custom_ops = load_inline(
    name="custom_add_extension",
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=["add_wrapper"],
    with_cuda=True,
)