In [None]:
%%writefile transpose_v1.cu
#include <cuda_runtime.h>

// Naive out-of-place transpose: each thread copies exactly one element.
//   input  : rows x cols matrix, row-major
//   output : cols x rows matrix, row-major
// Threads adjacent in x read adjacent input elements, but the elements they
// write in output are `rows` floats apart.
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    // The grid is rounded up to whole blocks, so edge threads may fall outside the matrix.
    if (row < rows && col < cols)
    {
        // Compute linear offsets in size_t: row * cols + col overflows a
        // 32-bit int once the matrix holds more than INT_MAX elements.
        size_t src = (size_t)row * cols + col;
        size_t dst = (size_t)col * rows + row;
        output[dst] = input[src];
    }
}

// Host entry point: launches the transpose kernel and blocks until it finishes.
// input, output are device pointers (i.e. pointers to memory on the GPU)
//   input  : rows x cols floats
//   output : cols x rows floats
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    // An empty matrix has nothing to copy, and a grid with a zero-sized
    // dimension is an invalid launch configuration.
    if (rows <= 0 || cols <= 0) {
        return;
    }

    dim3 threadsPerBlock(16, 16);
    // Ceil-divide so partially filled blocks at the right/bottom edges are covered.
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    // Wait for the kernel so the result is complete when solve() returns.
    cudaDeviceSynchronize();
}

: 

In [None]:
# Build transpose_v1.cu into a shared library that the Python cell loads via ctypes.
# -shared emits a .so; -Xcompiler -fPIC passes -fPIC through to the host compiler.
!nvcc -shared -o libtranspose_v1.so transpose_v1.cu -Xcompiler -fPIC

In [None]:
import torch
import ctypes
import numpy as np

# 1. Load the compiled shared library.
lib = ctypes.CDLL('./libtranspose_v1.so')

# 2. Declare the C signature of the entry point:
#    void solve(const float* input, float* output, int rows, int cols)
# Pointers are passed as c_void_p (raw device addresses), ints as c_int.
lib.solve.argtypes = [
    ctypes.c_void_p,  # input  (device pointer)
    ctypes.c_void_p,  # output (device pointer)
    ctypes.c_int,     # rows
    ctypes.c_int,     # cols
]
# solve() returns void; without this ctypes would assume an int return value.
lib.solve.restype = None

def cuda_transpose(a_tensor):
    """Transpose a 2-D tensor with the custom CUDA kernel.

    Args:
        a_tensor: 2-D tensor of shape (M, N). It is moved to the GPU, cast to
            float32 and made contiguous if it is not already, because the
            kernel reads the raw buffer as densely packed floats.

    Returns:
        A new float32 CUDA tensor of shape (N, M) holding the transpose.

    Raises:
        ValueError: if ``a_tensor`` is not 2-dimensional.
    """
    if a_tensor.dim() != 2:
        raise ValueError(
            f"cuda_transpose expects a 2-D tensor, got {a_tensor.dim()} dimension(s)"
        )

    # The kernel dereferences data_ptr() as a row-major float32 device buffer,
    # so normalise device, dtype and memory layout before taking the address.
    a_tensor = a_tensor.to(device='cuda', dtype=torch.float32).contiguous()

    M, N = a_tensor.shape

    # The kernel writes every output element, so no zero-fill is needed.
    c_tensor = torch.empty((N, M), device=a_tensor.device, dtype=torch.float32)

    # Nothing to copy for an empty matrix; skip the launch entirely because a
    # zero-sized grid is not a valid kernel launch configuration.
    if M == 0 or N == 0:
        return c_tensor

    # Pass the raw device addresses (data_ptr()) plus the input dimensions.
    lib.solve(
        ctypes.c_void_p(a_tensor.data_ptr()),
        ctypes.c_void_p(c_tensor.data_ptr()),
        ctypes.c_int(M),
        ctypes.c_int(N),
    )

    return c_tensor

# --- Test section ---

# Matrix dimensions
M, N = 1024, 512

print(f"正在进行矩阵转置: {M}x{N}")

# Fix the seed so the random input is reproducible across runs.
torch.manual_seed(0)

# Random input data (created directly on the GPU)
A = torch.randn(M, N, device='cuda', dtype=torch.float32)

# 1. Run the custom CUDA kernel
C_custom = cuda_transpose(A)

# 2. Run PyTorch's built-in transpose (used as the reference answer)
C_torch = torch.transpose(A, -2, -1)

# 3. Compare against the reference
# A small floating-point tolerance is allowed
if torch.allclose(C_custom, C_torch, atol=1e-3):
    print("✅ 测试通过！结果正确。")
else:
    print("❌ 测试失败。结果不一致。")
    print(C_custom)
    print("最大误差:", (C_custom - C_torch).abs().max().item())

# 4. Shapes that are not multiples of the thread-block size exercise the
#    kernel's bounds check, which the block-aligned 1024x512 case never hits.
#    A transpose only moves values, so the result must match exactly.
for edge_rows, edge_cols in [(1, 1), (17, 33), (100, 3)]:
    edge_input = torch.randn(edge_rows, edge_cols, device='cuda', dtype=torch.float32)
    assert torch.equal(cuda_transpose(edge_input), edge_input.T), (
        f"transpose mismatch for shape {edge_rows}x{edge_cols}"
    )

正在进行矩阵转置: 1024x512
✅ 测试通过！结果正确。


In [6]:
%timeit C_custom = cuda_transpose(A); torch.cuda.synchronize()

%timeit C_torch = torch.transpose(A, -2, -1); torch.cuda.synchronize()

51.7 µs ± 60.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
8.72 µs ± 109 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


上面的矩阵转置算子有很大的问题： 在input里面是一行一行的取数据的，但是在output里面是一列一列的放数据的。

1. 取一行数据可以触发内存合并，连续的线程取连续的数据可以合并为一起请求。速度非常快
2. 但是往 output 里写数据时，同一个 warp 中相邻的线程写入的地址并不连续，步长为 rows 个 float，无法触发内存合并：一次 warp 的写操作会被拆分成许多独立的全局内存事务，显存带宽利用率很低。（注意：这里的问题是全局内存的非合并访问，而不是共享内存的 bank 冲突——这个 kernel 并没有使用共享内存。）

可视化代码： https://gemini.google.com/share/d587562e89e7


In [None]:
%%writefile transpose_v2.cu
#include <cuda_runtime.h>

// Side length of the square tile that each block stages through shared memory.
const int TILE_DIM = 32;
// Number of thread rows per block; each thread handles TILE_DIM / BLOCK_ROWS tile rows.
const int BLOCK_ROWS = 8;

// Tiled out-of-place transpose.
//   input  : rows x cols matrix, row-major
//   output : cols x rows matrix, row-major
// Each block loads one TILE_DIM x TILE_DIM tile of input into shared memory,
// then writes it back transposed. In both phases threads adjacent in x touch
// adjacent global addresses; the transposition itself happens in the
// shared-memory indexing.
// Must be launched with blockDim = (TILE_DIM, BLOCK_ROWS).
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    // One extra column of padding shifts each tile row by one bank, so the
    // column-wise reads in the store phase do not hit the same bank.
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    int tile_col = blockIdx.x * TILE_DIM;  // first input column covered by this block
    int tile_row = blockIdx.y * TILE_DIM;  // first input row covered by this block

    // Load phase: threadIdx.x walks along an input row.
    int col = tile_col + threadIdx.x;
    for (int j = threadIdx.y; j < TILE_DIM; j += BLOCK_ROWS) {
        int row = tile_row + j;
        if (row < rows && col < cols) {
            // size_t offset: row * cols + col can exceed the range of a 32-bit int.
            tile[j][threadIdx.x] = input[(size_t)row * cols + col];
        }
    }

    // Every thread must finish loading before any thread reads another
    // thread's element from the tile.
    __syncthreads();

    // Store phase: threadIdx.x now walks along an output row, i.e. along
    // what was a column of the input tile.
    int out_col = tile_row + threadIdx.x;      // output column == input row
    for (int j = threadIdx.y; j < TILE_DIM; j += BLOCK_ROWS) {
        int out_row = tile_col + j;            // output row == input column
        // Same validity condition as the load of tile[threadIdx.x][j], so only
        // elements that were actually loaded are written out.
        if (out_row < cols && out_col < rows) {
            output[(size_t)out_row * rows + out_col] = tile[threadIdx.x][j];
        }
    }
}

// Host entry point: launches the tiled transpose and blocks until it finishes.
// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    // An empty matrix has nothing to copy, and a grid with a zero-sized
    // dimension is an invalid launch configuration.
    if (rows <= 0 || cols <= 0) {
        return;
    }

    // The block shape must match what the kernel assumes.
    dim3 threadsPerBlock(TILE_DIM, BLOCK_ROWS);
    // One block per tile; ceil-divide so partial tiles at the edges are covered.
    dim3 blocksPerGrid((cols + TILE_DIM - 1) / TILE_DIM,
                       (rows + TILE_DIM - 1) / TILE_DIM);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    // Wait for the kernel so the result is complete when solve() returns.
    cudaDeviceSynchronize();
}