## Why Custom Allocators?

- **Reduce fragmentation** - Pool similar-sized allocations
- **Avoid synchronization** - Pre-allocate memory up front so the hot path never calls the slow, blocking `cudaMalloc`/`cudaFree`
- **Enable growth** - Expand a buffer in place, without copying, using CUDA's Virtual Memory Management (VMM) API
- **Application-specific** - Optimize for your workload

In [None]:
%%writefile simple_pool_allocator.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>
#include <stack>

// Simple fixed-size block pool allocator.
//
// One device slab of `blockSz * nBlocks` bytes is obtained with a single
// cudaMalloc and carved into `nBlocks` equally sized blocks. allocate() and
// deallocate() only pop/push a host-side free list, so they never call into
// the CUDA allocator.
//
// Block addresses are `pool + i * blockSz`; choose a block size that keeps the
// alignment your kernels need.
//
// If the slab cannot be allocated, or the requested size is zero or overflows
// size_t, the pool is created empty: available() == 0 and allocate() returns
// nullptr. The class does no locking, and the pool must outlive every block it
// has handed out (the destructor frees the whole slab).
class BlockPoolAllocator {
    char* pool;                    // base of the device slab, nullptr if empty
    size_t blockSize;              // bytes per block
    size_t numBlocks;              // blocks in the slab (0 if creation failed)
    std::stack<void*> freeBlocks;  // blocks currently available
    std::vector<char> inUse;       // inUse[i] != 0 while block i is handed out

    // Index of the block that starts at `block`; caller guarantees validity.
    size_t indexOf(const void* block) const {
        return static_cast<size_t>(static_cast<const char*>(block) - pool) / blockSize;
    }

public:
    // Creates a pool of `nBlocks` blocks of `blockSz` bytes each.
    BlockPoolAllocator(size_t blockSz, size_t nBlocks)
        : pool(nullptr), blockSize(blockSz), numBlocks(nBlocks) {
        const size_t maxSize = static_cast<size_t>(-1);
        bool ok = blockSize > 0 && numBlocks > 0 &&
                  numBlocks <= maxSize / blockSize;  // guards the multiply below
        if (!ok) {
            fprintf(stderr,
                    "BlockPoolAllocator: invalid pool size (%zu blocks of %zu bytes)\n",
                    numBlocks, blockSize);
        } else {
            // Allocate entire pool
            cudaError_t err = cudaMalloc(&pool, blockSize * numBlocks);
            if (err != cudaSuccess) {
                fprintf(stderr, "BlockPoolAllocator: cudaMalloc failed: %s\n",
                        cudaGetErrorString(err));
                pool = nullptr;
                ok = false;
            }
        }

        if (!ok) {
            // Leave a consistent, empty pool instead of a list of bad pointers.
            numBlocks = 0;
            return;
        }

        // Initialize free list (last block ends up on top of the stack)
        inUse.assign(numBlocks, 0);
        for (size_t i = 0; i < numBlocks; i++) {
            freeBlocks.push(pool + i * blockSize);
        }

        printf("Created pool: %zu blocks of %zu bytes\n", numBlocks, blockSize);
    }

    // The pool owns the slab; copying would free it twice.
    BlockPoolAllocator(const BlockPoolAllocator&) = delete;
    BlockPoolAllocator& operator=(const BlockPoolAllocator&) = delete;

    // Returns a free block, or nullptr when the pool is exhausted or empty.
    void* allocate() {
        if (freeBlocks.empty()) return nullptr;

        void* block = freeBlocks.top();
        freeBlocks.pop();
        inUse[indexOf(block)] = 1;
        return block;
    }

    // Returns `ptr` to the pool. nullptr is ignored. Pointers that are not the
    // start of a block of this pool, or that are already free, are rejected
    // with a message on stderr so they cannot corrupt the free list.
    void deallocate(void* ptr) {
        if (ptr == nullptr) return;

        // Compare as integers: `ptr` may not point into the slab at all.
        const size_t addr = reinterpret_cast<size_t>(ptr);
        const size_t base = reinterpret_cast<size_t>(pool);
        const bool inside = pool != nullptr && addr >= base &&
                            addr - base < blockSize * numBlocks;
        if (!inside || (addr - base) % blockSize != 0) {
            fprintf(stderr,
                    "BlockPoolAllocator: ignoring deallocate(%p): not a block of this pool\n",
                    ptr);
            return;
        }

        const size_t index = (addr - base) / blockSize;
        if (!inUse[index]) {
            fprintf(stderr,
                    "BlockPoolAllocator: ignoring deallocate(%p): block is already free\n",
                    ptr);
            return;
        }

        inUse[index] = 0;
        freeBlocks.push(ptr);
    }

    // Number of blocks that can still be allocated.
    size_t available() const { return freeBlocks.size(); }

    ~BlockPoolAllocator() {
        cudaFree(pool);  // cudaFree(nullptr) is a no-op
    }
};

// Stores each element's own index, converted to float, into data[0..n).
// Each thread handles the single element at its global 1D index; threads
// whose index is >= n do nothing.
__global__ void useBlock(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) return;
    data[gid] = static_cast<float>(gid);
}

// Demo driver for the block pool: creates a pool, takes 10 blocks from it,
// fills each one on the GPU, and hands the blocks back.
// Returns 0 on success, 1 if a kernel launch or its execution failed.
int main() {
    // Create pool of 4 KB blocks (1024 floats each)
    const int floatsPerBlock = 1024;
    size_t blockSize = floatsPerBlock * sizeof(float);
    size_t numBlocks = 100;

    BlockPoolAllocator pool(blockSize, numBlocks);

    printf("Available blocks: %zu\n", pool.available());

    // Launch enough thread blocks to cover every float of a pool block;
    // useBlock writes one element per thread.
    const int threads = 256;
    const int gridSize = (floatsPerBlock + threads - 1) / threads;

    // Allocate several blocks
    std::vector<void*> allocated;
    for (int i = 0; i < 10; i++) {
        void* ptr = pool.allocate();
        if (ptr) {
            allocated.push_back(ptr);
            useBlock<<<gridSize, threads>>>((float*)ptr, floatsPerBlock);

            // Launch-configuration errors are only visible via cudaGetLastError.
            cudaError_t launchErr = cudaGetLastError();
            if (launchErr != cudaSuccess) {
                fprintf(stderr, "useBlock launch failed: %s\n",
                        cudaGetErrorString(launchErr));
                return 1;
            }
        }
    }

    // Errors raised while the kernels run surface at the next synchronization.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        fprintf(stderr, "useBlock execution failed: %s\n",
                cudaGetErrorString(syncErr));
        return 1;
    }

    printf("After 10 allocations: %zu available\n", pool.available());

    // Return blocks to pool
    for (void* ptr : allocated) {
        pool.deallocate(ptr);
    }

    printf("After deallocation: %zu available\n", pool.available());

    return 0;
}

In [None]:
!nvcc simple_pool_allocator.cu -o simple_pool_allocator && ./simple_pool_allocator

## VMM-Based Growable Buffer

In [None]:
%%writefile vmm_growable_buffer.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

// Device buffer that grows in place using the CUDA driver's virtual memory
// management (VMM) API.
//
// The constructor reserves `maxSize` bytes (rounded up to the allocation
// granularity) of virtual address space. grow() then creates physical memory
// and maps it directly behind what is already mapped, so data() never changes
// and existing contents are never copied.
//
// If any setup step fails, the buffer is left empty: capacity() == 0 and
// grow() returns false for every non-zero size. Failures are reported on
// stderr. The class does no locking and is not copyable.
class GrowableGPUBuffer {
    // One physical allocation and the number of bytes it maps.
    struct Chunk {
        CUmemGenericAllocationHandle handle;
        size_t size;
    };

    CUdeviceptr ptr;              // base of the reserved range, 0 if none
    size_t reservedSize;          // bytes of reserved virtual address space
    size_t mappedSize;            // bytes currently backed by physical memory
    size_t granularity;           // minimum allocation granularity, 0 if unknown
    int device;
    CUmemAllocationProp prop;
    CUmemAccessDesc accessDesc;
    std::vector<Chunk> chunks;    // mapped back to back starting at ptr

    // Returns true on CUDA_SUCCESS; otherwise reports `what` on stderr.
    static bool check(CUresult res, const char* what) {
        if (res == CUDA_SUCCESS) return true;
        const char* msg = nullptr;
        if (cuGetErrorString(res, &msg) != CUDA_SUCCESS || msg == nullptr) {
            msg = "unknown error";
        }
        fprintf(stderr, "GrowableGPUBuffer: %s failed: %s\n", what, msg);
        return false;
    }

public:
    // Reserves address space for up to `maxSize` bytes; maps nothing yet.
    GrowableGPUBuffer(size_t maxSize)
        : ptr(0), reservedSize(0), mappedSize(0), granularity(0), device(0),
          prop(), accessDesc() {  // prop/accessDesc are zero-initialized here
        if (!check(cuInit(0), "cuInit")) return;

        // cudaFree(0) is a harmless call that makes the runtime set up its
        // context for the current device before the driver API is used.
        cudaError_t rtErr = cudaFree(0);
        if (rtErr == cudaSuccess) rtErr = cudaGetDevice(&device);
        if (rtErr != cudaSuccess) {
            fprintf(stderr, "GrowableGPUBuffer: runtime setup failed: %s\n",
                    cudaGetErrorString(rtErr));
            return;
        }

        // Setup allocation properties
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device;

        size_t gran = 0;
        if (!check(cuMemGetAllocationGranularity(&gran, &prop,
                                                 CU_MEM_ALLOC_GRANULARITY_MINIMUM),
                   "cuMemGetAllocationGranularity") ||
            gran == 0) {
            return;
        }
        granularity = gran;

        if (maxSize == 0) {
            fprintf(stderr, "GrowableGPUBuffer: maximum size must be non-zero\n");
            return;
        }

        // Reserve virtual address space
        const size_t wanted = align(maxSize);
        CUdeviceptr base = 0;
        if (!check(cuMemAddressReserve(&base, wanted, granularity, 0, 0),
                   "cuMemAddressReserve")) {
            return;
        }
        ptr = base;
        reservedSize = wanted;

        // Setup access descriptor
        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        accessDesc.location.id = device;
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

        printf("GrowableBuffer: Reserved %zu MB\n", reservedSize / (1024*1024));
    }

    // The buffer owns driver handles and an address range; copying would
    // release them twice.
    GrowableGPUBuffer(const GrowableGPUBuffer&) = delete;
    GrowableGPUBuffer& operator=(const GrowableGPUBuffer&) = delete;

    // Rounds `size` up to a multiple of the allocation granularity.
    // Returns `size` unchanged when the granularity is unknown, and the
    // largest size_t value when rounding up would overflow.
    size_t align(size_t size) const {
        if (granularity == 0) return size;
        const size_t rem = size % granularity;
        if (rem == 0) return size;
        const size_t pad = granularity - rem;
        const size_t maxSize = static_cast<size_t>(-1);
        if (size > maxSize - pad) return maxSize;
        return size + pad;
    }

    // Ensures at least `newSize` bytes (rounded up to the granularity) are
    // mapped. Returns true if that holds on return, false if the request
    // exceeds capacity() or a driver call failed. On failure the buffer is
    // left exactly as it was before the call.
    bool grow(size_t newSize) {
        newSize = align(newSize);
        if (newSize <= mappedSize) return true;
        if (newSize > reservedSize) return false;

        const size_t toMap = newSize - mappedSize;
        const CUdeviceptr target = ptr + mappedSize;

        // Create new physical allocation
        CUmemGenericAllocationHandle handle;
        if (!check(cuMemCreate(&handle, toMap, &prop, 0), "cuMemCreate")) {
            return false;
        }

        // Map to virtual address
        if (!check(cuMemMap(target, toMap, 0, handle, 0), "cuMemMap")) {
            cuMemRelease(handle);
            return false;
        }

        // Set access; undo the mapping if this fails so nothing leaks.
        if (!check(cuMemSetAccess(target, toMap, &accessDesc, 1), "cuMemSetAccess")) {
            cuMemUnmap(target, toMap);
            cuMemRelease(handle);
            return false;
        }

        Chunk chunk;
        chunk.handle = handle;
        chunk.size = toMap;
        chunks.push_back(chunk);
        mappedSize = newSize;

        return true;
    }

    // Base device address of the buffer (null if nothing was reserved).
    void* data() const { return (void*)ptr; }
    // Bytes currently mapped and usable.
    size_t size() const { return mappedSize; }
    // Bytes of reserved address space, i.e. the upper bound for grow().
    size_t capacity() const { return reservedSize; }

    ~GrowableGPUBuffer() {
        // Unmap each chunk with the exact range it was mapped with, then
        // release its physical memory.
        CUdeviceptr cursor = ptr;
        for (const Chunk& c : chunks) {
            cuMemUnmap(cursor, c.size);
            cuMemRelease(c.handle);
            cursor += c.size;
        }
        if (reservedSize != 0) cuMemAddressFree(ptr, reservedSize);
    }
};

// Sets data[i] = value for every i in [0, n).
// Each thread handles the single element at its global 1D index; threads
// whose index is >= n do nothing.
__global__ void fillKernel(int* data, int n, int value) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) return;
    data[gid] = value;
}

// Demo driver for the growable buffer: maps 1 MB, fills it, grows the same
// buffer to 100 MB, fills only the new part, and checks that both regions hold
// the expected values.
// Returns 0 if everything succeeded, 1 on any growth, CUDA, or verification
// failure.
int main() {
    // Reports a failed runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) return true;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return false;
    };

    // Create buffer with 1GB max capacity
    GrowableGPUBuffer buffer(1ULL << 30);

    // Start with 1MB
    const size_t size1 = (size_t)1 << 20;
    if (!buffer.grow(size1)) {
        // Launching a kernel on an unmapped range would fault, so stop here.
        fprintf(stderr, "grow(1MB) failed\n");
        return 1;
    }
    printf("After grow(1MB): %zu bytes mapped\n", buffer.size());

    const int threads = 256;
    int n1 = size1 / sizeof(int);
    fillKernel<<<(n1 + threads - 1) / threads, threads>>>((int*)buffer.data(), n1, 1);
    if (!ok(cudaGetLastError(), "fillKernel launch (first region)")) return 1;
    if (!ok(cudaDeviceSynchronize(), "fillKernel (first region)")) return 1;

    // Grow to 100MB - NO COPY NEEDED!
    const size_t size2 = (size_t)100 << 20;
    if (!buffer.grow(size2)) {
        fprintf(stderr, "grow(100MB) failed\n");
        return 1;
    }
    printf("After grow(100MB): %zu bytes mapped\n", buffer.size());

    // Original data still at same address, new space available.
    // Only the newly added elements are filled, so size the grid for them.
    int n2 = size2 / sizeof(int);
    const int added = n2 - n1;
    fillKernel<<<(added + threads - 1) / threads, threads>>>(
        (int*)buffer.data() + n1, added, 2);
    if (!ok(cudaGetLastError(), "fillKernel launch (new region)")) return 1;
    if (!ok(cudaDeviceSynchronize(), "fillKernel (new region)")) return 1;

    // Verify: read back the first element of each region
    int h_val1 = 0, h_val2 = 0;
    if (!ok(cudaMemcpy(&h_val1, buffer.data(), sizeof(int), cudaMemcpyDeviceToHost),
            "cudaMemcpy (first region)")) return 1;
    if (!ok(cudaMemcpy(&h_val2, (int*)buffer.data() + n1, sizeof(int),
                       cudaMemcpyDeviceToHost),
            "cudaMemcpy (new region)")) return 1;

    printf("Original region value: %d (expected 1)\n", h_val1);
    printf("New region value: %d (expected 2)\n", h_val2);

    if (h_val1 != 1 || h_val2 != 2) {
        fprintf(stderr, "\nGrowable buffer FAILED - unexpected values after growth\n");
        return 1;
    }

    printf("\nGrowable buffer SUCCESS - no copying during growth!\n");

    return 0;
}

In [None]:
!nvcc vmm_growable_buffer.cu -o vmm_growable_buffer -lcuda && ./vmm_growable_buffer

## Key Takeaways

1. **Block pools** - Fast allocation for fixed-size objects
2. **VMM growable** - Expand without copying
3. **Pre-allocation** - Avoid runtime allocation overhead
4. **Application-specific** - Design for your workload patterns