## Why Virtual Memory Management?

Traditional `cudaMalloc`:
- Allocates both virtual address AND physical memory together
- Fixed size - cannot grow without copy
- Simple but inflexible

VMM approach:
- Reserve virtual address range (cheap, can be huge)
- Allocate physical memory separately
- Map physical to virtual on demand
- **Growable data structures without copying!**

In [None]:
%%writefile vmm_basics.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Stores `value` into each of the first `n` elements of `data`.
// Each thread derives one flat 1D index from its block and thread coordinates
// and writes at most one element; threads whose index is >= n do nothing.
__global__ void fillKernel(int* data, int n, int value) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;  // tail threads of the last block have no element to write
    }
    data[gid] = value;
}

// Adds the sum of the first `n` ints of `data` into `*result`.
//
// Each block reduces its slice in shared memory and then issues a single
// atomic add of its partial sum, so `*result` accumulates across blocks (and
// across launches) - the caller chooses its starting value.
//
// Launch requirements: 1D blocks whose size is a power of two and at most 256
// (the capacity of the static shared buffer below).
__global__ void sumKernel(int* data, int n, long long* result) {
    __shared__ long long sdata[256];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute 0 so every slot used by the tree is defined.
    sdata[tid] = (idx < n) ? data[idx] : 0;
    __syncthreads();

    // Tree reduction. Start from half the *actual* block size rather than a
    // hard-coded 128, so smaller power-of-two blocks never read unwritten slots.
    for (int s = blockDim.x >> 1; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();  // reached by every thread: the barrier is outside the branch
    }

    if (tid == 0) {
        // atomicAdd has no signed 64-bit overload; the unsigned one performs the
        // same two's-complement addition, so reinterpret the operands.
        atomicAdd(reinterpret_cast<unsigned long long*>(result),
                  static_cast<unsigned long long>(sdata[0]));
    }
}

// Reports a failed Driver API call on stderr. Returns true when `res` is CUDA_SUCCESS.
static bool driverOk(CUresult res, const char* what) {
    if (res == CUDA_SUCCESS) return true;
    const char* msg = NULL;
    if (cuGetErrorString(res, &msg) != CUDA_SUCCESS || msg == NULL) {
        msg = "unknown error";
    }
    fprintf(stderr, "%s: %d (%s)\n", what, (int)res, msg);
    return false;
}

// Reports a failed Runtime API call on stderr. Returns true when `err` is cudaSuccess.
static bool runtimeOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s: %d (%s)\n", what, (int)err, cudaGetErrorString(err));
    return false;
}

// Rounds `bytes` up to the next multiple of `granularity` (which must be non-zero).
static size_t roundUpToGranularity(size_t bytes, size_t granularity) {
    return ((bytes + granularity - 1) / granularity) * granularity;
}

// Everything the demo acquires, with flags recording how far setup got so that
// cleanup releases exactly what exists.
struct VmmDemoResources {
    CUdeviceptr ptr;                      // base of the reserved virtual range
    size_t reserveSize;                   // bytes of virtual address space reserved
    CUmemGenericAllocationHandle handle;  // physical allocation
    size_t physSize;                      // bytes of physical memory created/mapped
    bool reserved;
    bool created;
    bool mapped;
    long long* d_sum;                     // device-side accumulator for sumKernel
};

// Releases whatever runVmmDemo acquired, in reverse order of acquisition.
// Returns false if any release step reported an error.
static bool releaseVmmDemo(VmmDemoResources* r) {
    bool ok = true;
    if (r->mapped) {
        // Kernel launches are asynchronous; wait before tearing down the mapping
        // they may still be touching (matters on the error paths).
        ok = runtimeOk(cudaDeviceSynchronize(), "Failed to synchronize device") && ok;
    }
    if (r->d_sum != NULL) {
        ok = runtimeOk(cudaFree(r->d_sum), "Failed to free result buffer") && ok;
        r->d_sum = NULL;
    }
    if (r->mapped) {
        ok = driverOk(cuMemUnmap(r->ptr, r->physSize), "Failed to unmap memory") && ok;
        r->mapped = false;
    }
    if (r->created) {
        ok = driverOk(cuMemRelease(r->handle), "Failed to release physical memory") && ok;
        r->created = false;
    }
    if (r->reserved) {
        ok = driverOk(cuMemAddressFree(r->ptr, r->reserveSize),
                      "Failed to free address range") && ok;
        r->reserved = false;
    }
    return ok;
}

// Runs the reserve -> create -> map -> set-access -> use sequence on `device`.
// Acquired resources are recorded in `r`; the caller releases them with
// releaseVmmDemo() regardless of the return value. Returns true on success.
static bool runVmmDemo(VmmDemoResources* r, int device) {
    // Initialize CUDA Driver API
    if (!driverOk(cuInit(0), "Failed to initialize driver API")) return false;
    if (!runtimeOk(cudaSetDevice(device), "Failed to select device")) return false;
    // Force creation of the primary context before using the Driver API;
    // toolkits before CUDA 12.0 may defer it past cudaSetDevice().
    if (!runtimeOk(cudaFree(0), "Failed to initialize CUDA context")) return false;

    // Step 1: Get allocation properties and granularity
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device;

    size_t granularity = 0;
    if (!driverOk(cuMemGetAllocationGranularity(&granularity, &prop,
                                                CU_MEM_ALLOC_GRANULARITY_MINIMUM),
                  "Failed to query allocation granularity")) {
        return false;
    }
    if (granularity == 0) {
        // Guard the divisions below.
        fprintf(stderr, "Driver reported a zero allocation granularity\n");
        return false;
    }
    printf("Allocation granularity: %zu bytes\n", granularity);

    // Step 2: Reserve virtual address range (1 GB, kept a multiple of the granularity)
    r->reserveSize = roundUpToGranularity((size_t)(1ULL << 30), granularity);
    if (!driverOk(cuMemAddressReserve(&r->ptr, r->reserveSize, granularity, 0, 0),
                  "Failed to reserve address")) {
        return false;
    }
    r->reserved = true;
    printf("Reserved %zu MB virtual address at 0x%llx\n",
           r->reserveSize / (1024*1024), (unsigned long long)r->ptr);

    // Step 3: Create physical memory (1 MB initially, rounded up to the granularity)
    r->physSize = roundUpToGranularity((size_t)1 << 20, granularity);
    if (!driverOk(cuMemCreate(&r->handle, r->physSize, &prop, 0),
                  "Failed to create physical memory")) {
        return false;
    }
    r->created = true;
    printf("Created %zu KB physical memory\n", r->physSize / 1024);

    // Step 4: Map physical to virtual (at the start of the reserved range)
    if (!driverOk(cuMemMap(r->ptr, r->physSize, 0, r->handle, 0),
                  "Failed to map memory")) {
        return false;
    }
    r->mapped = true;
    printf("Mapped physical to virtual\n");

    // Step 5: Set access permissions
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = device;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    if (!driverOk(cuMemSetAccess(r->ptr, r->physSize, &accessDesc, 1),
                  "Failed to set access")) {
        return false;
    }
    printf("Access permissions set\n");

    // Use the memory!
    const int n = static_cast<int>(r->physSize / sizeof(int));
    int* data = reinterpret_cast<int*>(r->ptr);
    const int threads = 256;  // sumKernel's shared buffer holds 256 partials
    const int blocks = (n + threads - 1) / threads;

    fillKernel<<<blocks, threads>>>(data, n, 1);
    if (!runtimeOk(cudaGetLastError(), "Failed to launch fillKernel")) return false;

    if (!runtimeOk(cudaMalloc(&r->d_sum, sizeof(long long)),
                   "Failed to allocate result buffer")) {
        r->d_sum = NULL;
        return false;
    }
    if (!runtimeOk(cudaMemset(r->d_sum, 0, sizeof(long long)),
                   "Failed to clear result buffer")) {
        return false;
    }

    sumKernel<<<blocks, threads>>>(data, n, r->d_sum);
    if (!runtimeOk(cudaGetLastError(), "Failed to launch sumKernel")) return false;

    // The blocking copy also surfaces any fault raised while the kernels ran.
    long long h_sum = 0;
    if (!runtimeOk(cudaMemcpy(&h_sum, r->d_sum, sizeof(long long),
                              cudaMemcpyDeviceToHost),
                   "Failed to copy result to host")) {
        return false;
    }
    printf("Sum of %d ones = %lld\n", n, h_sum);
    if (h_sum != n) {
        fprintf(stderr, "Unexpected sum: expected %d\n", n);
        return false;
    }
    return true;
}

// Entry point: runs the VMM demo on device 0, always releases what was
// acquired, and exits non-zero if any step (including cleanup) failed.
int main() {
    VmmDemoResources resources = {};
    const bool ran = runVmmDemo(&resources, 0);

    // Cleanup
    const bool released = releaseVmmDemo(&resources);
    if (!ran || !released) {
        return 1;
    }

    printf("\nVMM workflow complete!\n");
    return 0;
}

In [None]:
!nvcc vmm_basics.cu -o vmm_basics -lcuda && ./vmm_basics

## VMM Workflow Summary

```
┌─────────────────────────────────────────────────────┐
│                Virtual Address Space                │
│  ┌────────────────────────────────────────────────┐ │
│  │ cuMemAddressReserve (1 GB)                     │ │
│  │ ┌──────────┐                                   │ │
│  │ │ Mapped   │ ← cuMemMap                        │ │
│  │ │ (1 MB)   │                                   │ │
│  │ └──────────┘                                   │ │
│  │ [Unmapped space - can grow into later]         │ │
│  └────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────┘
           ↑
           │ cuMemMap
           ↓
┌─────────────────────────────────────────────────────┐
│              Physical Memory (GPU RAM)              │
│  ┌────────────────────┐                             │
│  │ cuMemCreate (1 MB) │                             │
│  │ (handle)           │                             │
│  └────────────────────┘                             │
└─────────────────────────────────────────────────────┘
```

## Key Takeaways

1. **Virtual ≠ Physical** - Reserve huge VA, allocate small physical
2. **Granularity** - Physical allocation and mapping sizes must be multiples of the value returned by `cuMemGetAllocationGranularity`
3. **Access control** - Must explicitly set R/W permissions
4. **Growth without copy** - Map more physical to reserved VA