## The Problem with Synchronous Allocation

Traditional `cudaMalloc`:
- Implicitly synchronizes the device
- Breaks async execution pipelines
- Adds driver overhead to every allocation

Stream-ordered allocation:
- Allocation and free are ordered with the other work in a stream
- No implicit synchronization
- Memory pools for fast reuse

In [None]:
%%writefile stream_ordered_alloc.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Stores sqrtf(i) into data[i] for every index i in [0, n).
// Each thread computes one flat 1D global index from its block and thread
// coordinates; threads whose index is at or beyond n exit without writing.
__global__ void processKernel(float* data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = sqrtf(static_cast<float>(gid));
}

// Evaluates a CUDA runtime call; on failure prints the call site and the
// runtime's error string to stderr and returns 1 from the enclosing function.
// Only usable inside a function that returns int (here: main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Benchmarks ITERATIONS rounds of allocate -> processKernel -> free, first
// with cudaMalloc/cudaFree and then with cudaMallocAsync/cudaFreeAsync, and
// prints both elapsed times and their ratio.
// Returns 0 on success, 1 if any CUDA call fails or the device lacks
// stream-ordered memory pool support.
int main() {
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);
    const int ITERATIONS = 100;
    const int THREADS = 256;
    const int BLOCKS = (N + THREADS - 1) / THREADS;  // ceil-div so the tail is covered

    // cudaMallocAsync/cudaFreeAsync only work when the device reports memory
    // pool support; bail out early instead of launching on a garbage pointer.
    int device = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    int poolsSupported = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&poolsSupported,
                                      cudaDevAttrMemoryPoolsSupported, device));
    if (!poolsSupported) {
        fprintf(stderr, "Device %d does not support stream-ordered memory pools\n", device);
        return 1;
    }

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Warm-up launch outside the timed regions so one-time first-launch cost
    // is not charged to whichever benchmark happens to run first.
    {
        float* d_warm = nullptr;
        CUDA_CHECK(cudaMalloc(&d_warm, size));
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_warm, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_warm));
    }

    // Benchmark synchronous (traditional) allocation.
    // Events are recorded on `stream`, the stream that carries the work, so
    // the measurement does not depend on default-stream semantics.
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < ITERATIONS; i++) {
        float* d_data = nullptr;
        CUDA_CHECK(cudaMalloc(&d_data, size));  // Synchronizes!
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
        CUDA_CHECK(cudaGetLastError());         // catch bad launch configuration
        CUDA_CHECK(cudaFree(d_data));           // Synchronizes!
    }
    CUDA_CHECK(cudaEventRecord(stop, stream));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float syncMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&syncMs, start, stop));

    // Benchmark stream-ordered (async) allocation.
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < ITERATIONS; i++) {
        float* d_data = nullptr;
        CUDA_CHECK(cudaMallocAsync(&d_data, size, stream));  // Non-blocking!
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaFreeAsync(d_data, stream));           // Non-blocking!
    }
    // `stop` is queued behind all of the work above in the same stream, so
    // waiting on the event also waits for every allocation, kernel and free.
    CUDA_CHECK(cudaEventRecord(stop, stream));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float asyncMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&asyncMs, start, stop));

    printf("Synchronous (cudaMalloc/cudaFree): %.2f ms\n", syncMs);
    printf("Stream-ordered (Async):            %.2f ms\n", asyncMs);
    printf("Speedup: %.1fx\n", syncMs / asyncMs);

    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

In [None]:
!nvcc stream_ordered_alloc.cu -o stream_ordered_alloc && ./stream_ordered_alloc

## Memory Pool Configuration

In [None]:
%%writefile mempool_config.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call; on failure prints the call site and the
// runtime's error string to stderr and returns 1 from the enclosing function.
// Only usable inside a function that returns int (here: main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Sets a release threshold on device 0's default memory pool, then prints the
// pool's used/reserved byte counters before an allocation, after a 100 MB
// cudaMallocAsync, after the matching cudaFreeAsync, and after trimming the
// pool to 0 bytes.
// Returns 0 on success, 1 if any CUDA call fails or the device lacks
// stream-ordered memory pool support.
int main() {
    const uint64_t MB = 1024ULL * 1024ULL;

    int device = 0;
    CUDA_CHECK(cudaSetDevice(device));

    // The pool APIs below fail on devices without memory pool support.
    int poolsSupported = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&poolsSupported,
                                      cudaDevAttrMemoryPoolsSupported, device));
    if (!poolsSupported) {
        fprintf(stderr, "Device %d does not support stream-ordered memory pools\n", device);
        return 1;
    }

    // Get default memory pool
    cudaMemPool_t pool;
    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&pool, device));

    // Configure pool: set release threshold
    // Memory below this threshold is kept for reuse
    uint64_t threshold = 256 * MB;  // 256 MB
    CUDA_CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));
    // uint64_t is not guaranteed to be `unsigned long long`, so every value
    // printed with %llu is cast explicitly to match the format specifier.
    printf("Set release threshold: %llu MB\n", (unsigned long long)(threshold / MB));

    // Query pool attributes
    uint64_t usedBytes = 0, reservedBytes = 0;
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("Used: %llu bytes, Reserved: %llu bytes\n",
           (unsigned long long)usedBytes, (unsigned long long)reservedBytes);

    // Allocate some memory
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    float* d_data = nullptr;
    size_t size = 100 * 1024 * 1024;  // 100 MB
    CUDA_CHECK(cudaMallocAsync(&d_data, size, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After 100MB alloc - Used: %llu MB, Reserved: %llu MB\n",
           (unsigned long long)(usedBytes / MB), (unsigned long long)(reservedBytes / MB));

    // Free it
    CUDA_CHECK(cudaFreeAsync(d_data, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After free - Used: %llu MB, Reserved: %llu MB\n",
           (unsigned long long)(usedBytes / MB), (unsigned long long)(reservedBytes / MB));
    printf("(Memory kept for reuse up to threshold)\n");

    // Trim pool to release memory
    CUDA_CHECK(cudaMemPoolTrimTo(pool, 0));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After trim - Reserved: %llu MB\n", (unsigned long long)(reservedBytes / MB));

    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

In [None]:
!nvcc mempool_config.cu -o mempool_config && ./mempool_config

## Multi-Stream Pool Sharing

In [None]:
%%writefile multistream_pool.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Writes val into data[i] for every index i in [0, n).
// One element per thread, addressed by a flat 1D global index; threads whose
// index is at or beyond n return without touching memory.
__global__ void kernel(float* data, int n, float val) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = val;
}

// Evaluates a CUDA runtime call; on failure prints the call site and the
// runtime's error string to stderr and returns 1 from the enclosing function.
// Only usable inside a function that returns int (here: main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Allocates a buffer with cudaMallocAsync on stream1, fills it there, makes
// stream2 wait on an event recorded in stream1, overwrites the buffer from
// stream2, and frees it with cudaFreeAsync on stream2.
// Returns 0 only if every CUDA call succeeded and the values read back match
// what stream2's kernel wrote; returns 1 otherwise.
int main() {
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);
    const int THREADS = 256;
    const int BLOCKS = (N + THREADS - 1) / THREADS;  // ceil-div so the tail is covered
    const float FIRST_VALUE = 1.0f;
    const float SECOND_VALUE = 2.0f;

    cudaStream_t stream1, stream2;
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamCreate(&stream2));

    // Allocate in stream1
    float* d_data = nullptr;
    CUDA_CHECK(cudaMallocAsync(&d_data, size, stream1));
    kernel<<<BLOCKS, THREADS, 0, stream1>>>(d_data, N, FIRST_VALUE);
    CUDA_CHECK(cudaGetLastError());

    // Create dependency: stream2 waits for stream1
    cudaEvent_t event;
    CUDA_CHECK(cudaEventCreate(&event));
    CUDA_CHECK(cudaEventRecord(event, stream1));
    CUDA_CHECK(cudaStreamWaitEvent(stream2, event));

    // Now stream2 can use the data
    kernel<<<BLOCKS, THREADS, 0, stream2>>>(d_data, N, SECOND_VALUE);
    CUDA_CHECK(cudaGetLastError());

    // Read back the first and last element (still in stream2, before the
    // free) so the success message below reflects what the device produced.
    float h_check[2] = {0.0f, 0.0f};
    CUDA_CHECK(cudaMemcpyAsync(&h_check[0], d_data, sizeof(float),
                               cudaMemcpyDeviceToHost, stream2));
    CUDA_CHECK(cudaMemcpyAsync(&h_check[1], d_data + (N - 1), sizeof(float),
                               cudaMemcpyDeviceToHost, stream2));

    // Free in stream2
    CUDA_CHECK(cudaFreeAsync(d_data, stream2));

    CUDA_CHECK(cudaStreamSynchronize(stream2));

    const bool verified = (h_check[0] == SECOND_VALUE) && (h_check[1] == SECOND_VALUE);
    if (verified) {
        printf("Multi-stream pool sharing successful!\n");
    } else {
        fprintf(stderr, "Verification failed: expected %.1f, got first=%.1f last=%.1f\n",
                SECOND_VALUE, h_check[0], h_check[1]);
    }

    CUDA_CHECK(cudaEventDestroy(event));
    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaStreamDestroy(stream2));

    return verified ? 0 : 1;
}

In [None]:
!nvcc multistream_pool.cu -o multistream_pool && ./multistream_pool

## Key Takeaways

1. **`cudaMallocAsync`** - Non-blocking allocation ordered within a stream
2. **`cudaFreeAsync`** - Non-blocking deallocation ordered within a stream
3. **Memory pools** - Reuse freed memory without returning it to the OS
4. **Release threshold** - Controls how much unused memory the pool keeps before releasing it back to the OS
5. **Multi-stream** - Use events to express cross-stream dependencies