## Multi-GPU Memory Sharing

VMM enables fine-grained control over which GPUs can access which memory:

```cpp
// Grant GPU 1 access to memory on GPU 0
CUmemAccessDesc accessDesc;
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 1;  // GPU 1 gets access
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
cuMemSetAccess(ptr_on_gpu0, size, &accessDesc, 1);
```

In [None]:
%%writefile multi_gpu_vmm.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Stores `value` into each of the first `n` elements of `data`.
// Launch layout: 1D grid of 1D blocks, one element per thread; threads whose
// global index falls at or beyond `n` exit without touching memory.
__global__ void writeKernel(int* data, int n, int value) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = value;
}

// Adds the first `n` elements of `data` into `*sum`.
//
// Each block reduces its 256 inputs in shared memory and then issues a single
// atomic add, so `*sum` must be initialised (e.g. to 0) before the launch.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x == 256. The shared
// buffer and the reduction's starting stride are both sized for exactly
// that block width.
__global__ void readKernel(int* data, int n, long long* sum) {
    __shared__ long long sdata[256];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute 0 so the tree below needs no bounds checks.
    sdata[tid] = (idx < n) ? data[idx] : 0;
    __syncthreads();

    // Tree reduction: halve the number of active threads each step. The barrier
    // sits outside the `if` so every thread in the block reaches it.
    for (int s = 128; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // CUDA has no atomicAdd overload for signed 64-bit integers. Going through
    // the unsigned long long overload yields the same bit pattern because
    // two's-complement addition is identical for signed and unsigned operands.
    if (tid == 0) {
        atomicAdd(reinterpret_cast<unsigned long long*>(sum),
                  static_cast<unsigned long long>(sdata[0]));
    }
}

// Threads per block used for both kernel launches; readKernel's shared-memory
// reduction is written for exactly this block size.
static const int kVmmThreadsPerBlock = 256;

// Requested allocation size in bytes, before rounding up to the VMM granularity.
static const size_t kVmmRequestedBytes = 1 << 20;

// Reports a failed CUDA runtime call on stderr. Returns 1 on failure, 0 on success.
static int vmmRuntimeFailed(cudaError_t result, const char* what, int line) {
    if (result == cudaSuccess) return 0;
    fprintf(stderr, "%s failed at line %d: %s\n", what, line,
            cudaGetErrorString(result));
    return 1;
}

// Reports a failed CUDA driver call on stderr. Returns 1 on failure, 0 on success.
static int vmmDriverFailed(CUresult result, const char* what, int line) {
    if (result == CUDA_SUCCESS) return 0;
    const char* msg = NULL;
    cuGetErrorString(result, &msg);
    fprintf(stderr, "%s failed at line %d: %s\n", what, line,
            msg ? msg : "unknown driver error");
    return 1;
}

// Both macros make the ENCLOSING function return 1 when the call fails, so
// they may only be used inside functions that return int.
#define VMM_RT_CHECK(call)                                                    \
    do {                                                                      \
        if (vmmRuntimeFailed((call), #call, __LINE__)) return 1;              \
    } while (0)

#define VMM_DRV_CHECK(call)                                                   \
    do {                                                                      \
        if (vmmDriverFailed((call), #call, __LINE__)) return 1;               \
    } while (0)

// Tracks one VMM allocation and how far its setup progressed, so that
// vmmRelease() can undo exactly the steps that succeeded.
struct VmmAllocation {
    CUdeviceptr ptr;                      // reserved virtual address range
    size_t size;                          // size in bytes, rounded to granularity
    CUmemGenericAllocationHandle handle;  // physical memory handle
    bool reserved;                        // cuMemAddressReserve succeeded
    bool created;                         // cuMemCreate succeeded
    bool mapped;                          // cuMemMap succeeded
};

// Reserves a virtual range, creates physical memory on `ownerDevice` and maps
// the two together. `requestedBytes` is rounded up to the allocation
// granularity. Returns 0 on success, 1 on failure; on failure `alloc` records
// the completed steps and must still be passed to vmmRelease().
static int vmmAllocate(VmmAllocation* alloc, int ownerDevice, size_t requestedBytes) {
    // Force creation of the device's primary context so the driver-API calls
    // below run with a current context.
    VMM_RT_CHECK(cudaSetDevice(ownerDevice));
    VMM_RT_CHECK(cudaFree(0));

    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = ownerDevice;

    size_t granularity = 0;
    VMM_DRV_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
                                                CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    if (granularity == 0) {
        fprintf(stderr, "Driver reported a zero allocation granularity\n");
        return 1;
    }
    alloc->size = ((requestedBytes + granularity - 1) / granularity) * granularity;

    VMM_DRV_CHECK(cuMemAddressReserve(&alloc->ptr, alloc->size, granularity, 0, 0));
    alloc->reserved = true;

    VMM_DRV_CHECK(cuMemCreate(&alloc->handle, alloc->size, &prop, 0));
    alloc->created = true;

    VMM_DRV_CHECK(cuMemMap(alloc->ptr, alloc->size, 0, alloc->handle, 0));
    alloc->mapped = true;
    return 0;
}

// Grants read/write access on the whole mapping to `count` devices (1 or 2)
// whose ordinals are listed in `devices`. Returns 0 on success, 1 on failure.
static int vmmGrantAccess(const VmmAllocation* alloc, const int* devices, int count) {
    CUmemAccessDesc descs[2] = {};
    if (count < 1 || count > 2) {
        fprintf(stderr, "vmmGrantAccess supports 1 or 2 devices, got %d\n", count);
        return 1;
    }
    for (int i = 0; i < count; i++) {
        descs[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        descs[i].location.id = devices[i];
        descs[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    }
    VMM_DRV_CHECK(cuMemSetAccess(alloc->ptr, alloc->size, descs, (size_t)count));
    return 0;
}

// Undoes whichever of map / create / reserve succeeded, in reverse order, and
// resets `alloc`. Every step is attempted even if an earlier one fails.
// Returns 0 if all attempted steps succeeded, 1 otherwise.
static int vmmRelease(VmmAllocation* alloc) {
    int status = 0;
    if (alloc->mapped) {
        status |= vmmDriverFailed(cuMemUnmap(alloc->ptr, alloc->size),
                                  "cuMemUnmap", __LINE__);
    }
    if (alloc->created) {
        status |= vmmDriverFailed(cuMemRelease(alloc->handle),
                                  "cuMemRelease", __LINE__);
    }
    if (alloc->reserved) {
        status |= vmmDriverFailed(cuMemAddressFree(alloc->ptr, alloc->size),
                                  "cuMemAddressFree", __LINE__);
    }
    VmmAllocation cleared = {};
    *alloc = cleared;
    return status;
}

// Runs writeKernel on `device` to set `n` ints at `data` to `value`, then
// waits for completion. Returns 0 on success, 1 on failure.
static int fillOnDevice(int device, CUdeviceptr data, int n, int value) {
    VMM_RT_CHECK(cudaSetDevice(device));
    const int blocks = (n + kVmmThreadsPerBlock - 1) / kVmmThreadsPerBlock;
    writeKernel<<<blocks, kVmmThreadsPerBlock>>>((int*)data, n, value);
    VMM_RT_CHECK(cudaGetLastError());       // launch-configuration errors
    VMM_RT_CHECK(cudaDeviceSynchronize());  // execution errors
    return 0;
}

// Runs readKernel on `device` over `n` ints at `data` and stores the total in
// `*h_sum`. The temporary device accumulator is freed on every path.
// Returns 0 on success, 1 on failure.
static int sumOnDevice(int device, CUdeviceptr data, int n, long long* h_sum) {
    VMM_RT_CHECK(cudaSetDevice(device));

    long long* d_sum = NULL;
    VMM_RT_CHECK(cudaMalloc(&d_sum, sizeof(long long)));

    cudaError_t err = cudaMemset(d_sum, 0, sizeof(long long));
    if (err == cudaSuccess) {
        const int blocks = (n + kVmmThreadsPerBlock - 1) / kVmmThreadsPerBlock;
        readKernel<<<blocks, kVmmThreadsPerBlock>>>((int*)data, n, d_sum);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: also waits for the kernel and surfaces its errors.
        err = cudaMemcpy(h_sum, d_sum, sizeof(long long), cudaMemcpyDeviceToHost);
    }

    const cudaError_t freeErr = cudaFree(d_sum);
    if (err == cudaSuccess) err = freeErr;
    VMM_RT_CHECK(err);
    return 0;
}

// Single-GPU fallback: GPU 0 owns, writes and reads the VMM allocation.
// Returns 0 on success, 1 on failure; the caller releases `alloc`.
static int runSingleGpuDemo(VmmAllocation* alloc) {
    printf("Multi-GPU VMM requires 2+ GPUs. Demonstrating single-GPU case.\n");

    if (vmmAllocate(alloc, 0, kVmmRequestedBytes)) return 1;

    const int devices[] = {0};
    if (vmmGrantAccess(alloc, devices, 1)) return 1;

    const int n = (int)(alloc->size / sizeof(int));
    if (fillOnDevice(0, alloc->ptr, n, 1)) return 1;

    long long h_sum = 0;
    if (sumOnDevice(0, alloc->ptr, n, &h_sum)) return 1;
    printf("Sum: %lld (expected %d)\n", h_sum, n);
    return 0;
}

// Two-GPU demo: memory lives on GPU 0, GPU 0 writes it and GPU 1 reads it
// through the same virtual address. Returns 0 on success, 1 on failure; the
// caller releases `alloc`.
static int runMultiGpuDemo(VmmAllocation* alloc) {
    // Check P2P capability before allocating anything.
    int canAccess = 0;
    VMM_RT_CHECK(cudaDeviceCanAccessPeer(&canAccess, 1, 0));
    if (!canAccess) {
        printf("GPU 1 cannot access GPU 0 memory\n");
        return 1;
    }
    printf("P2P access supported between GPU 0 and GPU 1\n");

    // Allocate on GPU 0 with VMM.
    if (vmmAllocate(alloc, 0, kVmmRequestedBytes)) return 1;

    // Grant access to BOTH GPUs.
    const int devices[] = {0, 1};
    if (vmmGrantAccess(alloc, devices, 2)) return 1;
    printf("Granted access to GPU 0 and GPU 1\n");

    // GPU 0 writes.
    const int n = (int)(alloc->size / sizeof(int));
    if (fillOnDevice(0, alloc->ptr, n, 42)) return 1;
    printf("GPU 0 wrote data\n");

    // GPU 1 reads (P2P access!).
    long long h_sum = 0;
    if (sumOnDevice(1, alloc->ptr, n, &h_sum)) return 1;
    printf("GPU 1 read sum: %lld (expected %lld)\n", h_sum, (long long)n * 42);
    return 0;
}

// Entry point: picks the single- or multi-GPU demo from the device count,
// runs it, and always releases the VMM allocation afterwards.
// Returns 0 on success and 1 if any CUDA call fails or no suitable GPU exists.
int main() {
    int deviceCount = 0;
    VMM_RT_CHECK(cudaGetDeviceCount(&deviceCount));
    printf("Found %d GPU(s)\n", deviceCount);
    if (deviceCount < 1) {
        fprintf(stderr, "No CUDA-capable GPU found\n");
        return 1;
    }

    VMM_DRV_CHECK(cuInit(0));

    VmmAllocation alloc = {};
    const bool multiGpu = deviceCount >= 2;
    int status = multiGpu ? runMultiGpuDemo(&alloc) : runSingleGpuDemo(&alloc);

    // Cleanup runs on success and failure alike.
    if (vmmRelease(&alloc)) status = 1;

    if (multiGpu && status == 0) {
        printf("Multi-GPU VMM complete!\n");
    }
    return status;
}

In [None]:
!nvcc multi_gpu_vmm.cu -o multi_gpu_vmm -lcuda && ./multi_gpu_vmm

## Fabric Handles for NVLink Clusters

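For larger clusters connected through NVLink/NVSwitch, fabric handles let an allocation be exported and then imported by GPUs in other processes or on other nodes. The allocation must be created with `prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC` for the export to succeed: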

```cpp
// Export allocation as fabric handle
CUmemFabricHandle fabricHandle;
cuMemExportToShareableHandle(&fabricHandle, handle, 
                              CU_MEM_HANDLE_TYPE_FABRIC, 0);

// Import on another GPU
CUmemGenericAllocationHandle importedHandle;
cuMemImportFromShareableHandle(&importedHandle, &fabricHandle,
                                CU_MEM_HANDLE_TYPE_FABRIC);
```

## Key Takeaways

1. **Fine-grained access** - Control exactly which GPUs access which memory
2. **`cuMemSetAccess`** - Grant R/W permissions per device
3. **P2P over VMM** - More control than `cudaDeviceEnablePeerAccess`
4. **Fabric handles** - Scale to large NVLink clusters