## Why Custom Allocators?

- **Reduce fragmentation** - Pool similar-sized allocations
- **Avoid synchronization** - Pre-allocate memory up front so hot paths skip `cudaMalloc`/`cudaFree`, which can implicitly synchronize the device
- **Enable growth** - Expand without copy using VMM
- **Application-specific** - Optimize for your workload

In [None]:
%%writefile simple_pool_allocator.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>
#include <stack>

// Simple fixed-size block pool allocator
// Fixed-size block pool allocator.
//
// Carves a single cudaMalloc'd region into `nBlocks` blocks of `blockSz` bytes
// and hands them out from a host-side LIFO free list. If the backing
// allocation fails (or the requested geometry is empty/overflows) the pool is
// left empty, so allocate() simply returns nullptr.
//
// The pool owns its device memory and is therefore non-copyable.
class BlockPoolAllocator {
    char* pool = nullptr;
    size_t blockSize;
    size_t numBlocks;
    std::stack<void*> freeBlocks;
    std::vector<bool> handedOut;  // handedOut[i] is true while block i is allocated

    // Translates ptr into its block index. Returns false unless ptr is exactly
    // the start address of one of this pool's blocks.
    bool blockIndex(const void* ptr, size_t* index) const {
        if (pool == nullptr || ptr == nullptr || blockSize == 0) return false;
        const size_t base = reinterpret_cast<size_t>(pool);
        const size_t addr = reinterpret_cast<size_t>(ptr);
        if (addr < base) return false;
        const size_t offset = addr - base;
        if (offset % blockSize != 0) return false;
        if (offset / blockSize >= numBlocks) return false;
        *index = offset / blockSize;
        return true;
    }

public:
    BlockPoolAllocator(size_t blockSz, size_t nBlocks)
        : blockSize(blockSz), numBlocks(nBlocks) {
        // Allocate entire pool (rejecting empty or overflowing geometries)
        cudaError_t err = cudaErrorInvalidValue;
        if (blockSize != 0 && numBlocks != 0 &&
            numBlocks <= static_cast<size_t>(-1) / blockSize) {
            err = cudaMalloc(&pool, blockSize * numBlocks);
        }
        if (err != cudaSuccess) {
            fprintf(stderr,
                    "BlockPoolAllocator: could not allocate %zu blocks of %zu bytes: %s\n",
                    numBlocks, blockSize, cudaGetErrorString(err));
            pool = nullptr;
            numBlocks = 0;  // keep the free list empty so nothing invalid is handed out
        }

        // Initialize free list
        handedOut.assign(numBlocks, false);
        for (size_t i = 0; i < numBlocks; i++) {
            freeBlocks.push(pool + i * blockSize);
        }

        printf("Created pool: %zu blocks of %zu bytes\n", numBlocks, blockSize);
    }

    // Copying would make two objects cudaFree the same region.
    BlockPoolAllocator(const BlockPoolAllocator&) = delete;
    BlockPoolAllocator& operator=(const BlockPoolAllocator&) = delete;

    // Returns a device pointer to a free block, or nullptr when the pool is exhausted.
    void* allocate() {
        if (freeBlocks.empty()) return nullptr;

        void* block = freeBlocks.top();
        freeBlocks.pop();

        size_t index = 0;
        if (blockIndex(block, &index)) handedOut[index] = true;
        return block;
    }

    // Returns a block to the pool. nullptr is ignored; pointers that are not a
    // currently allocated block of this pool are reported and ignored, so the
    // free list can never contain foreign or duplicate entries.
    void deallocate(void* ptr) {
        if (ptr == nullptr) return;

        size_t index = 0;
        if (!blockIndex(ptr, &index)) {
            fprintf(stderr, "BlockPoolAllocator: %p is not a block of this pool\n", ptr);
            return;
        }
        if (!handedOut[index]) {
            fprintf(stderr, "BlockPoolAllocator: double free of block %zu\n", index);
            return;
        }
        handedOut[index] = false;
        freeBlocks.push(ptr);
    }

    // Number of blocks currently available for allocate().
    size_t available() const { return freeBlocks.size(); }

    ~BlockPoolAllocator() {
        cudaFree(pool);  // no-op for nullptr
    }
};

// Stores each element's own global index (as float) into data.
// One thread per element; threads whose index is >= n write nothing.
__global__ void useBlock(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) return;
    data[gid] = static_cast<float>(gid);
}

// Demo: create a pool, take 10 blocks, initialise each on the GPU, then
// return them. Exits with status 1 if a kernel launch or execution fails.
int main() {
    // Create pool of 4KB blocks (1024 floats each)
    const int elemsPerBlock = 1024;
    size_t blockSize = elemsPerBlock * sizeof(float);  // 4KB
    size_t numBlocks = 100;

    BlockPoolAllocator pool(blockSize, numBlocks);

    printf("Available blocks: %zu\n", pool.available());

    // Size the grid so every element of a block gets a thread; a single
    // 256-thread block would only touch the first 256 of the 1024 floats.
    const int threads = 256;
    const int grid = (elemsPerBlock + threads - 1) / threads;

    // Allocate several blocks
    std::vector<void*> allocated;
    for (int i = 0; i < 10; i++) {
        void* ptr = pool.allocate();
        if (ptr) {
            allocated.push_back(ptr);
            useBlock<<<grid, threads>>>((float*)ptr, elemsPerBlock);
        }
    }

    // Launch-configuration errors surface via cudaGetLastError, execution
    // errors via the synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "useBlock failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("After 10 allocations: %zu available\n", pool.available());

    // Return blocks to pool
    for (void* ptr : allocated) {
        pool.deallocate(ptr);
    }

    printf("After deallocation: %zu available\n", pool.available());

    return 0;
}

In [None]:
!nvcc simple_pool_allocator.cu -o simple_pool_allocator && ./simple_pool_allocator

## VMM-Based Growable Buffer

In [None]:
%%writefile vmm_growable_buffer.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

// Device buffer that grows in place using the CUDA virtual memory management
// (VMM) driver API.
//
// The constructor reserves `maxSize` bytes of virtual address space (rounded
// up to the allocation granularity); grow() backs more of that range with
// physical memory, so data() never changes and existing contents are never
// copied. If setup fails the object is left empty: capacity() is 0 and grow()
// returns false for any non-zero size.
//
// The buffer owns driver resources and is therefore non-copyable.
class GrowableGPUBuffer {
    CUdeviceptr ptr = 0;
    size_t reservedSize = 0;
    size_t mappedSize = 0;
    size_t granularity = 0;
    int device = 0;
    CUmemAllocationProp prop = {};
    CUmemAccessDesc accessDesc = {};
    std::vector<CUmemGenericAllocationHandle> handles;

    // Reports a failed driver call on stderr; returns true when res is CUDA_SUCCESS.
    static bool driverOk(CUresult res, const char* what) {
        if (res == CUDA_SUCCESS) return true;
        const char* msg = nullptr;
        if (cuGetErrorString(res, &msg) != CUDA_SUCCESS || msg == nullptr) {
            msg = "unknown error";
        }
        fprintf(stderr, "GrowableBuffer: %s failed: %s\n", what, msg);
        return false;
    }

public:
    GrowableGPUBuffer(size_t maxSize) {
        if (!driverOk(cuInit(0), "cuInit")) return;

        // Driver-API calls act on the current context; cudaFree(0) is the usual
        // idiom for making the runtime create and bind the primary context.
        cudaError_t rtErr = cudaGetDevice(&device);
        if (rtErr == cudaSuccess) rtErr = cudaFree(0);
        if (rtErr != cudaSuccess) {
            fprintf(stderr, "GrowableBuffer: runtime init failed: %s\n",
                    cudaGetErrorString(rtErr));
            return;
        }

        // Setup allocation properties (prop is already zero-initialized)
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device;

        if (!driverOk(cuMemGetAllocationGranularity(&granularity, &prop,
                                                    CU_MEM_ALLOC_GRANULARITY_MINIMUM),
                      "cuMemGetAllocationGranularity")) {
            granularity = 0;
            return;
        }

        // Reserve virtual address space
        const size_t wanted = align(maxSize);
        if (!driverOk(cuMemAddressReserve(&ptr, wanted, granularity, 0, 0),
                      "cuMemAddressReserve")) {
            ptr = 0;
            return;
        }
        reservedSize = wanted;

        // Setup access descriptor (accessDesc is already zero-initialized)
        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        accessDesc.location.id = device;
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

        printf("GrowableBuffer: Reserved %zu MB\n", reservedSize / (1024*1024));
    }

    // Copying would make two objects unmap/release/free the same resources.
    GrowableGPUBuffer(const GrowableGPUBuffer&) = delete;
    GrowableGPUBuffer& operator=(const GrowableGPUBuffer&) = delete;

    // Rounds size up to a multiple of the allocation granularity. Returns size
    // unchanged when the granularity is unknown, and the maximum size_t value
    // when rounding would overflow (which no reservation can satisfy).
    size_t align(size_t size) {
        if (granularity == 0) return size;
        if (size > static_cast<size_t>(-1) - (granularity - 1)) {
            return static_cast<size_t>(-1);
        }
        return ((size + granularity - 1) / granularity) * granularity;
    }

    // Ensures at least newSize bytes (rounded up to the granularity) are
    // mapped. Returns true on success or when already large enough; returns
    // false if newSize exceeds the reservation or a driver call fails, in
    // which case the buffer is left exactly as it was.
    bool grow(size_t newSize) {
        newSize = align(newSize);
        if (newSize <= mappedSize) return true;
        if (newSize > reservedSize) return false;

        const size_t toMap = newSize - mappedSize;
        const CUdeviceptr target = ptr + mappedSize;

        // Create new physical allocation
        CUmemGenericAllocationHandle handle;
        if (cuMemCreate(&handle, toMap, &prop, 0) != CUDA_SUCCESS) return false;

        // Map to virtual address
        if (cuMemMap(target, toMap, 0, handle, 0) != CUDA_SUCCESS) {
            cuMemRelease(handle);
            return false;
        }

        // Set access; on failure undo the mapping so nothing leaks
        if (cuMemSetAccess(target, toMap, &accessDesc, 1) != CUDA_SUCCESS) {
            cuMemUnmap(target, toMap);
            cuMemRelease(handle);
            return false;
        }

        handles.push_back(handle);
        mappedSize = newSize;

        return true;
    }

    // Base device address of the reserved range (0 if setup failed).
    void* data() const { return (void*)ptr; }
    // Bytes currently backed by physical memory.
    size_t size() const { return mappedSize; }
    // Bytes of reserved virtual address space.
    size_t capacity() const { return reservedSize; }

    ~GrowableGPUBuffer() {
        if (ptr == 0) return;  // nothing was reserved
        if (mappedSize > 0) cuMemUnmap(ptr, mappedSize);
        for (auto& h : handles) cuMemRelease(h);
        cuMemAddressFree(ptr, reservedSize);
    }
};

// Sets each of the first n ints of data to value.
// One thread per element; threads whose index is >= n write nothing.
__global__ void fillKernel(int* data, int n, int value) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) return;
    data[gid] = value;
}

// Demo: grow a VMM-backed buffer from 1MB to 100MB and verify that the data
// written before the growth is still in place. Exits with status 1 if any
// step fails or the verification does not match.
int main() {
    // Reports a failed runtime call; returns true when it succeeded.
    auto ok = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        }
        return err == cudaSuccess;
    };

    // Create buffer with 1GB max capacity
    GrowableGPUBuffer buffer(1ULL << 30);

    // Start with 1MB
    size_t size1 = 1 << 20;
    if (!buffer.grow(size1)) {
        fprintf(stderr, "grow(1MB) failed\n");
        return 1;
    }
    printf("After grow(1MB): %zu bytes mapped\n", buffer.size());

    // grow() rounds up to the allocation granularity, so take the boundary of
    // the original region from what was actually mapped.
    int n1 = (int)(buffer.size() / sizeof(int));
    fillKernel<<<(n1+255)/256, 256>>>((int*)buffer.data(), n1, 1);
    if (!ok(cudaGetLastError(), "fillKernel launch") ||
        !ok(cudaDeviceSynchronize(), "fillKernel")) {
        return 1;
    }

    // Grow to 100MB - NO COPY NEEDED!
    size_t size2 = 100 << 20;
    if (!buffer.grow(size2)) {
        fprintf(stderr, "grow(100MB) failed\n");
        return 1;
    }
    printf("After grow(100MB): %zu bytes mapped\n", buffer.size());

    // Original data still at same address, new space available
    int n2 = (int)(buffer.size() / sizeof(int));
    int newCount = n2 - n1;
    if (newCount <= 0) {
        fprintf(stderr, "grow(100MB) mapped no additional memory\n");
        return 1;
    }
    fillKernel<<<(newCount+255)/256, 256>>>((int*)buffer.data() + n1, newCount, 2);
    if (!ok(cudaGetLastError(), "fillKernel launch") ||
        !ok(cudaDeviceSynchronize(), "fillKernel")) {
        return 1;
    }

    // Verify
    int h_val1 = 0, h_val2 = 0;
    if (!ok(cudaMemcpy(&h_val1, buffer.data(), sizeof(int), cudaMemcpyDeviceToHost),
            "cudaMemcpy (original region)") ||
        !ok(cudaMemcpy(&h_val2, (int*)buffer.data() + n1, sizeof(int), cudaMemcpyDeviceToHost),
            "cudaMemcpy (new region)")) {
        return 1;
    }

    printf("Original region value: %d (expected 1)\n", h_val1);
    printf("New region value: %d (expected 2)\n", h_val2);

    if (h_val1 != 1 || h_val2 != 2) {
        fprintf(stderr, "\nGrowable buffer FAILED - unexpected values\n");
        return 1;
    }

    printf("\nGrowable buffer SUCCESS - no copying during growth!\n");

    return 0;
}

In [None]:
!nvcc vmm_growable_buffer.cu -o vmm_growable_buffer -lcuda && ./vmm_growable_buffer

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

Complete these exercises to practice custom allocator design:

In [None]:
%%writefile custom_allocator_exercises.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <stack>
#include <map>

/*
 * Custom Allocator Exercises
 * 
 * Exercise 1: Slab Allocator
 * - Manage multiple size classes (small, medium, large)
 * - Each size class has its own pool
 * - Reduces fragmentation for varied allocation sizes
 * 
 * Exercise 2: Buddy Allocator
 * - Binary tree-based allocation
 * - Efficient splitting and coalescing
 * - Power-of-2 sizes
 * 
 * Exercise 3: Arena Allocator with VMM
 * - Fast bump-pointer allocation
 * - Expand arena using VMM when needed
 * - Bulk deallocation (reset)
 */

// Evaluates a CUDA driver API call exactly once. On any result other than
// CUDA_SUCCESS, prints the error description with file/line and exits with
// status 1. The argument is parenthesized and the local uses a reserved-style
// name so arbitrary expressions (including ones mentioning `err`) are safe.
// cuGetErrorString leaves the string NULL for unrecognized codes, so a
// fallback is substituted before printing.
#define CU_CHECK(call) do { \
    CUresult cuCheckResult_ = (call); \
    if (cuCheckResult_ != CUDA_SUCCESS) { \
        const char* errStr = nullptr; \
        if (cuGetErrorString(cuCheckResult_, &errStr) != CUDA_SUCCESS || errStr == nullptr) { \
            errStr = "unknown error"; \
        } \
        printf("CUDA Driver Error: %s at %s:%d\n", errStr, __FILE__, __LINE__); \
        exit(1); \
    } \
} while(0)

// Evaluates a CUDA runtime API call exactly once. On any result other than
// cudaSuccess, prints the error description with file/line and exits with
// status 1. The argument is parenthesized and the local uses a reserved-style
// name so arbitrary expressions (including ones mentioning `err`) are safe.
#define CUDA_CHECK(call) do { \
    cudaError_t cudaCheckResult_ = (call); \
    if (cudaCheckResult_ != cudaSuccess) { \
        printf("CUDA Error: %s at %s:%d\n", cudaGetErrorString(cudaCheckResult_), __FILE__, __LINE__); \
        exit(1); \
    } \
} while(0)

// Exercise 1: Slab Allocator
// Exercise 1: Slab Allocator
//
// Keeps one fixed-size block pool ("slab") per size class, keyed by block size
// in a std::map (so iteration visits size classes in ascending order).
// allocate()/deallocate() are exercise stubs to be completed by the reader.
//
// The allocator owns device memory and is therefore non-copyable. The code
// deliberately avoids structured bindings so it builds regardless of which
// C++ dialect nvcc defaults to.
class SlabAllocator {
    struct Slab {
        char* base;
        size_t blockSize;
        size_t numBlocks;
        std::stack<void*> freeList;
    };

    std::map<size_t, Slab> slabs;  // size class -> slab

public:
    SlabAllocator() {
        // TODO Exercise 1a: Initialize size classes
        // Common sizes: 256B, 1KB, 4KB, 16KB, 64KB
        printf("Exercise 1: Slab Allocator\n");
        printf("Initialize size classes: 256B, 1KB, 4KB, 16KB, 64KB\n\n");

        // YOUR CODE HERE:
        // std::vector<size_t> sizes = {256, 1024, 4096, 16384, 65536};
        // for (size_t sz : sizes) {
        //     initSlab(sz, 100);  // 100 blocks per slab
        // }
    }

    // Copying would make two objects cudaFree the same slabs.
    SlabAllocator(const SlabAllocator&) = delete;
    SlabAllocator& operator=(const SlabAllocator&) = delete;

    // Creates the slab for one size class: numBlocks blocks of blockSize bytes.
    // Empty geometries and size classes that already exist are skipped, so an
    // existing slab's device memory is never overwritten and leaked.
    void initSlab(size_t blockSize, size_t numBlocks) {
        if (blockSize == 0 || numBlocks == 0) {
            printf("Skipping empty slab request (%zu blocks of %zu bytes)\n",
                   numBlocks, blockSize);
            return;
        }
        if (slabs.count(blockSize) != 0) {
            printf("Slab for %zu-byte blocks already exists; skipping\n", blockSize);
            return;
        }

        // Allocate slab memory before registering the slab
        char* base = nullptr;
        CUDA_CHECK(cudaMalloc(&base, blockSize * numBlocks));

        // Build the slab in place to avoid copying its free list
        Slab& slab = slabs[blockSize];
        slab.base = base;
        slab.blockSize = blockSize;
        slab.numBlocks = numBlocks;

        // Initialize free list
        for (size_t i = 0; i < numBlocks; i++) {
            slab.freeList.push(slab.base + i * blockSize);
        }

        printf("Created slab: %zu blocks of %zu bytes\n", numBlocks, blockSize);
    }

    // Exercise stub: currently only logs the request and returns nullptr.
    void* allocate(size_t size) {
        // TODO Exercise 1b: Find appropriate size class
        // Round up to nearest size class

        // YOUR CODE HERE:
        // for (auto& entry : slabs) {          // ascending block size
        //     Slab& slab = entry.second;
        //     if (size <= entry.first && !slab.freeList.empty()) {
        //         void* ptr = slab.freeList.top();
        //         slab.freeList.pop();
        //         return ptr;
        //     }
        // }

        printf("Allocate %zu bytes - find matching slab\n", size);
        return nullptr;
    }

    // Exercise stub: currently only logs the call.
    void deallocate(void* ptr, size_t size) {
        // TODO Exercise 1c: Return to appropriate slab
        // YOUR CODE HERE
        printf("Deallocate pointer back to slab\n");
    }

    // Prints used/total block counts for every size class.
    void printStats() {
        printf("\nSlab Statistics:\n");
        for (const auto& entry : slabs) {
            const Slab& slab = entry.second;
            size_t used = slab.numBlocks - slab.freeList.size();
            printf("  %zu B: %zu/%zu used\n", entry.first, used, slab.numBlocks);
        }
    }

    ~SlabAllocator() {
        for (auto& entry : slabs) {
            cudaFree(entry.second.base);
        }
    }
};

// Exercise 2: Simple Buddy Allocator
// Exercise 2: Simple Buddy Allocator
//
// Scaffolding for a power-of-two buddy allocator. The constructor computes
// the pool geometry; allocating the pool, the per-level bitmaps and the
// allocate/deallocate logic are left as exercises.
//
// `pool` starts as nullptr so the destructor is safe while Exercise 2a is
// still unimplemented. The allocator is non-copyable because it will own
// device memory once the exercise is completed.
class BuddyAllocator {
    char* pool = nullptr;
    size_t poolSize;
    size_t minBlockSize;
    int levels;

    // Bitmap for tracking allocations at each level
    std::vector<std::vector<bool>> allocated;

public:
    // size:     requested pool size in bytes, rounded up to a power of two
    //           (clamped to the largest power of two representable in size_t).
    // minBlock: smallest block size; 0 is treated as 1 so the level
    //           computation terminates.
    BuddyAllocator(size_t size, size_t minBlock = 256)
        : minBlockSize(minBlock) {
        if (minBlockSize == 0) minBlockSize = 1;

        // Round up to power of 2, stopping before the shift would overflow to 0
        const size_t maxPow2 = ~(static_cast<size_t>(-1) >> 1);
        poolSize = 1;
        while (poolSize < size && poolSize < maxPow2) poolSize <<= 1;

        // Calculate levels
        levels = 0;
        size_t s = poolSize;
        while (s >= minBlockSize) {
            levels++;
            s >>= 1;
        }

        printf("Exercise 2: Buddy Allocator\n");
        printf("Pool size: %zu, Min block: %zu, Levels: %d\n\n",
               poolSize, minBlockSize, levels);

        // TODO Exercise 2a: Allocate pool memory
        // CUDA_CHECK(cudaMalloc(&pool, poolSize));

        // TODO Exercise 2b: Initialize allocation bitmaps
        // allocated.resize(levels);
        // for (int i = 0; i < levels; i++) {
        //     allocated[i].resize(size_t(1) << i, false);
        // }
    }

    // Copying would make two objects cudaFree the same pool.
    BuddyAllocator(const BuddyAllocator&) = delete;
    BuddyAllocator& operator=(const BuddyAllocator&) = delete;

    // Exercise stub: currently only logs the request and returns nullptr.
    void* allocate(size_t size) {
        // TODO Exercise 2c: Find appropriate level and allocate
        // 1. Round size up to power of 2
        // 2. Find the level for this size
        // 3. Find a free block at this level (split larger if needed)
        // 4. Mark block as allocated

        printf("Allocate %zu bytes using buddy algorithm\n", size);
        return nullptr;
    }

    // Exercise stub: currently only logs the call.
    void deallocate(void* ptr, size_t size) {
        // TODO Exercise 2d: Deallocate and coalesce
        // 1. Find the block in the tree
        // 2. Mark as free
        // 3. Check if buddy is also free
        // 4. If so, coalesce and repeat at parent level

        printf("Deallocate and coalesce with buddy if free\n");
    }

    ~BuddyAllocator() {
        if (pool) cudaFree(pool);
    }
};

// Exercise 3: Arena Allocator with VMM
// Exercise 3: Arena Allocator with VMM
//
// Bump-pointer arena over a reserved virtual address range that is backed
// with physical memory on demand. Reserving the range (3a) and mapping
// memory (3b) are left as exercises; until both are completed allocate()
// returns nullptr instead of handing out addresses that are not mapped.
//
// `vaBase` starts as 0 so allocate() and the destructor never read an
// uninitialized address. The arena is non-copyable because it owns driver
// resources.
class VMMArenaAllocator {
    CUdeviceptr vaBase = 0;
    size_t vaSize;         // Total reserved VA
    size_t committed;      // Currently mapped physical memory
    size_t used;           // Current allocation offset
    size_t granularity = 0;
    CUmemAllocationProp prop = {};
    std::vector<CUmemGenericAllocationHandle> handles;

public:
    VMMArenaAllocator(size_t maxSize) : vaSize(maxSize), committed(0), used(0) {
        printf("Exercise 3: VMM Arena Allocator\n");
        printf("Reserve %zu bytes of virtual address space\n\n", maxSize);

        CU_CHECK(cuInit(0));

        int device = 0;
        CUDA_CHECK(cudaGetDevice(&device));
        // Driver-API calls act on the current context; cudaFree(0) is the usual
        // idiom for making the runtime create and bind the primary context.
        CUDA_CHECK(cudaFree(0));

        // prop is already zero-initialized
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device;

        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
                                                CU_MEM_ALLOC_GRANULARITY_MINIMUM));

        // TODO Exercise 3a: Reserve virtual address range
        // vaSize = ((vaSize + granularity - 1) / granularity) * granularity;
        // CU_CHECK(cuMemAddressReserve(&vaBase, vaSize, granularity, 0, 0));

        printf("Arena initialized with granularity: %zu bytes\n", granularity);
    }

    // Copying would make two objects unmap/release/free the same resources.
    VMMArenaAllocator(const VMMArenaAllocator&) = delete;
    VMMArenaAllocator& operator=(const VMMArenaAllocator&) = delete;

    // Bump-allocates size bytes (rounded up to a multiple of 256). Returns
    // nullptr when the request does not fit in the reserved range or when the
    // required memory is not mapped.
    void* allocate(size_t size) {
        // Align to 256 bytes (rejecting sizes whose rounding would overflow)
        if (size > static_cast<size_t>(-1) - 255) return nullptr;
        size = ((size + 255) / 256) * 256;

        // Never bump past the reserved virtual address range
        if (used > vaSize || size > vaSize - used) {
            printf("Arena exhausted: %zu bytes requested, %zu of %zu reserved bytes in use\n",
                   size, used, vaSize);
            return nullptr;
        }

        // TODO Exercise 3b: Expand if needed
        if (used + size > committed) {
            size_t needed = used + size - committed;
            needed = ((needed + granularity - 1) / granularity) * granularity;

            printf("Expanding arena by %zu bytes\n", needed);

            // YOUR CODE HERE:
            // CUmemGenericAllocationHandle handle;
            // CU_CHECK(cuMemCreate(&handle, needed, &prop, 0));
            // CU_CHECK(cuMemMap(vaBase + committed, needed, 0, handle, 0));
            // 
            // CUmemAccessDesc accessDesc = {};
            // accessDesc.location = prop.location;
            // accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            // CU_CHECK(cuMemSetAccess(vaBase + committed, needed, &accessDesc, 1));
            // 
            // handles.push_back(handle);
            // committed += needed;
        }

        // Refuse to hand out addresses that are not reserved and mapped
        if (vaBase == 0 || used + size > committed) {
            printf("Cannot allocate %zu bytes: arena memory is not mapped "
                   "(complete Exercises 3a/3b)\n", size);
            return nullptr;
        }

        // Bump allocation
        void* ptr = (void*)(vaBase + used);
        used += size;

        printf("Allocated %zu bytes at offset %zu\n", size, used - size);
        return ptr;
    }

    // Releases every allocation at once by rewinding the bump offset; mapped
    // memory stays mapped for reuse.
    void reset() {
        // TODO Exercise 3c: Reset arena (keep memory mapped)
        printf("Reset arena - used: %zu -> 0, committed: %zu (unchanged)\n", 
               used, committed);
        used = 0;
    }

    size_t getUsed() const { return used; }
    size_t getCommitted() const { return committed; }
    size_t getReserved() const { return vaSize; }

    ~VMMArenaAllocator() {
        // Cleanup
        if (vaBase) {
            if (committed > 0) cuMemUnmap(vaBase, committed);
            for (auto& handle : handles) {
                cuMemRelease(handle);
            }
            cuMemAddressFree(vaBase, vaSize);
        }
    }
};

// Test kernel
// Test kernel: stores each element's own global index into data.
// One thread per element; threads whose index is >= n write nothing.
__global__ void testKernel(int* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) return;
    data[gid] = gid;
}

// Driver for the three allocator exercises: constructs each allocator, issues
// a fixed set of requests against it and prints the resulting state.
int main() {
    printf("=== Custom Allocator Exercises ===\n\n");

    // Exercise 1: Slab Allocator
    printf("--- Exercise 1: Slab Allocator ---\n");
    SlabAllocator slab;
    // Test allocations of various sizes; the intended size classes are
    // 256B, 1KB, 4KB and 16KB respectively.
    printf("Test allocating: 100B, 500B, 2KB, 10KB\n");
    const size_t slabRequests[] = {100, 500, 2048, 10000};
    for (size_t request : slabRequests) {
        slab.allocate(request);
    }
    slab.printStats();
    printf("\n");

    // Exercise 2: Buddy Allocator
    printf("--- Exercise 2: Buddy Allocator ---\n");
    BuddyAllocator buddy(1 << 20, 256);  // 1MB pool, 256B minimum
    printf("Test allocating: 1KB, 4KB, 16KB\n");
    const size_t buddyRequests[] = {1024, 4096, 16384};
    for (size_t request : buddyRequests) {
        buddy.allocate(request);
    }
    printf("\n");

    // Exercise 3: VMM Arena
    printf("--- Exercise 3: VMM Arena Allocator ---\n");
    VMMArenaAllocator arena(1ULL << 30);  // 1GB reserve
    printf("Test sequential allocations:\n");
    int remaining = 5;
    while (remaining-- > 0) {
        arena.allocate(1 << 20);  // 1MB each
    }
    printf("Used: %zu, Committed: %zu, Reserved: %zu\n",
           arena.getUsed(), arena.getCommitted(), arena.getReserved());

    printf("Reset arena...\n");
    arena.reset();
    printf("After reset - Used: %zu\n", arena.getUsed());

    printf("\n=== Exercises Complete ===\n");
    printf("Uncomment TODO sections to implement each allocator!\n");

    return 0;
}

In [None]:
!nvcc -arch=sm_75 -o custom_allocator_exercises custom_allocator_exercises.cu -lcuda && ./custom_allocator_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises explore allocator concepts in Python:

1. **Pool allocator simulation**: Implement a Python class that pre-allocates device arrays and manages a free list
2. **Memory tracking**: Create a wrapper around `cuda.to_device()` that tracks allocations and reports fragmentation
3. **Arena pattern**: Implement a bump allocator using a pre-allocated NumPy/CuPy array with offset tracking

## Key Takeaways

1. **Block pools** - Fast allocation for fixed-size objects
2. **VMM growable** - Expand without copying
3. **Pre-allocation** - Avoid runtime allocation overhead
4. **Application-specific** - Design for your workload patterns