In [1]:
%%writefile matmul.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Naive matrix-multiplication kernel: C = A * B.
//
// The index expressions below treat A as a row-major M x N matrix, B as a
// row-major N x K matrix and C as a row-major M x K matrix. Each thread
// computes exactly one element of C; threads whose (row, col) falls outside
// the M x K output return without touching memory.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so edge blocks contain threads
    // that have no output element to compute.
    if (row >= M || col >= K) {
        return;
    }

    // Flat offsets are computed in size_t: a 32-bit `row * N` or `i * K`
    // overflows once a matrix holds more than 2^31 elements.
    const size_t k_stride = static_cast<size_t>(K);
    const size_t a_row_offset = static_cast<size_t>(row) * static_cast<size_t>(N);
    const size_t c_offset = static_cast<size_t>(row) * k_stride + static_cast<size_t>(col);

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int i = 0; i < N; ++i) {
        acc += A[a_row_offset + i] * B[static_cast<size_t>(i) * k_stride + col];
    }
    C[c_offset] = acc;
}

// Host-side wrapper exported with C linkage so it can be called through a
// plain C ABI (e.g. from ctypes).
//
// Launches matrix_multiplication_kernel on A, B and C with 16x16 thread
// blocks, then blocks until the device has finished. A, B and C are passed
// straight to the kernel, so they must be pointers the device can dereference.
// Errors are reported on stdout; the function itself returns nothing.
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // An empty output needs no work. Returning here also avoids launching
    // with a zero-sized grid (which CUDA rejects as an invalid configuration)
    // and keeps a negative size out of the unsigned ceil-division below.
    if (M <= 0 || K <= 0) {
        return;
    }

    const dim3 threadsPerBlock(16, 16);
    // Ceil-divide so that partially filled blocks at the right/bottom edges
    // are still launched; the kernel bounds-checks the surplus threads.
    const dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);

    // Launch-time errors (bad configuration, etc.) surface here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }

    // Wait for the GPU to finish. Errors raised while the kernel is running
    // (e.g. an illegal memory access) are only reported by the synchronizing
    // call, so its status must be checked as well.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }
}

Writing matmul.cu


In [1]:
# Build matmul.cu into a shared library; -fPIC is forwarded to the host compiler.
!nvcc -Xcompiler -fPIC -shared matmul.cu -o libmatmul.so

In [None]:
import torch
import ctypes
import numpy as np

# 1. Load the compiled shared library.
lib = ctypes.CDLL('./libmatmul.so')

# 2. Declare the C signature so ctypes converts the arguments correctly:
#    void solve(const float* A, const float* B, float* C, int M, int N, int K)
# The pointers are declared as c_void_p because raw memory addresses are
# passed in; the three dimensions are plain C ints.
lib.solve.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int,
    ctypes.c_int,
    ctypes.c_int,
]
# solve() returns void. Without this, ctypes assumes an int return value and
# would hand back a meaningless number.
lib.solve.restype = None

def cuda_matmul(a_tensor, b_tensor):
    """Multiply two 2-D matrices with the custom CUDA kernel behind ``lib.solve``.

    Args:
        a_tensor: 2-D tensor of shape (M, N).
        b_tensor: 2-D tensor of shape (N, K).

    Returns:
        A new float32 CUDA tensor of shape (M, K) holding ``a_tensor @ b_tensor``.

    Raises:
        ValueError: if either input is not 2-D (raised by the shape unpacking).
        AssertionError: if the inner dimensions of the two inputs differ.
    """
    # Read the dimensions; unpacking rejects anything that is not 2-D.
    M, N = a_tensor.shape
    N_b, K = b_tensor.shape

    assert N == N_b, f"矩阵维度不匹配: {N} != {N_b}"

    # The kernel dereferences raw pointers as densely packed float32 values,
    # so the inputs must be float32, on the GPU and contiguous. Both calls
    # return the tensor unchanged when it already satisfies the requirement,
    # so valid inputs are not copied.
    a_tensor = a_tensor.to(device='cuda', dtype=torch.float32).contiguous()
    b_tensor = b_tensor.to(device='cuda', dtype=torch.float32).contiguous()

    # Allocate the output matrix C on the GPU.
    c_tensor = torch.zeros((M, K), device='cuda', dtype=torch.float32)

    # 3. Call the CUDA function.
    # data_ptr() yields the address of the tensor's first element; for these
    # CUDA tensors that is the device address the kernel reads and writes.
    lib.solve(
        ctypes.c_void_p(a_tensor.data_ptr()),
        ctypes.c_void_p(b_tensor.data_ptr()),
        ctypes.c_void_p(c_tensor.data_ptr()),
        ctypes.c_int(M),
        ctypes.c_int(N),
        ctypes.c_int(K)
    )

    return c_tensor

# --- Test section ---

# Fix the RNG seed so the random inputs, and therefore any reported error,
# are the same on every run.
torch.manual_seed(0)

# Problem size: A is M x N, B is N x K.
M, N, K = 1024, 512, 1024

print(f"正在进行矩阵乘法测试: [{M}x{N}] * [{N}x{K}]")

# Create random inputs directly on the GPU.
A = torch.randn(M, N, device='cuda', dtype=torch.float32)
B = torch.randn(N, K, device='cuda', dtype=torch.float32)

# 1. Run the custom CUDA kernel.
C_custom = cuda_matmul(A, B)

# 2. Run PyTorch's built-in matmul as the reference answer.
C_torch = torch.matmul(A, B)

# 3. Compare the results, allowing a small floating-point tolerance.
if torch.allclose(C_custom, C_torch, atol=1e-3):
    print("✅ 测试通过！结果正确。")
else:
    print("❌ 测试失败。结果不一致。")
    print("最大误差:", (C_custom - C_torch).abs().max().item())

In [None]:
# Performance benchmark.
# Each timed statement ends with torch.cuda.synchronize() so that the
# measurement includes the GPU work rather than only the time to enqueue it.
# Note that the cuda_matmul figure also includes allocating the output tensor.
print("PyTorch 原生:")
%timeit torch.matmul(A, B); torch.cuda.synchronize()

print("CUDA Kernel:")
%timeit cuda_matmul(A, B); torch.cuda.synchronize()