## The Problem with Synchronous Allocation

Traditional `cudaMalloc`:
- Implicitly synchronizes the device
- Breaks async execution pipelines
- Overhead on every allocation

Stream-ordered allocation:
- Allocation tied to stream ordering
- No implicit synchronization
- Memory pools for fast reuse

In [None]:
%%writefile stream_ordered_alloc.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Writes sqrtf(i) into data[i] for every global thread index i below n.
// Expects a 1D launch; threads whose index falls at or past n do nothing.
__global__ void processKernel(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = sqrtf((float)gid);
}

// Abort main() with a non-zero exit status when a CUDA runtime call fails.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            return 1;                                                  \
        }                                                              \
    } while (0)

/*
 * Times ITERATIONS rounds of "allocate -> processKernel -> free" twice:
 * once with cudaMalloc/cudaFree and once with cudaMallocAsync/cudaFreeAsync,
 * then prints both durations and their ratio.
 *
 * Returns 0 on success, 1 if any CUDA call or kernel launch reports an error.
 */
int main() {
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);
    const int ITERATIONS = 100;
    const int THREADS = 256;
    const int BLOCKS = (N + THREADS - 1) / THREADS;  // ceil-div so every element is covered

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Untimed warm-up of both paths so one-time costs (first kernel launch,
    // first use of the stream-ordered allocator) are not charged to whichever
    // benchmark happens to run first.
    {
        float* d_warm = nullptr;
        CUDA_CHECK(cudaMalloc(&d_warm, size));
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_warm, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaFree(d_warm));

        CUDA_CHECK(cudaMallocAsync(&d_warm, size, stream));
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_warm, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaFreeAsync(d_warm, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    // Benchmark the traditional path. The events are recorded in `stream`,
    // the same stream the kernels run in, so they bracket exactly that work.
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < ITERATIONS; i++) {
        float* d_data = nullptr;
        CUDA_CHECK(cudaMalloc(&d_data, size));  // not stream-ordered
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
        CUDA_CHECK(cudaGetLastError());         // launch-configuration errors
        CUDA_CHECK(cudaFree(d_data));           // implicitly synchronizes the device
    }
    CUDA_CHECK(cudaEventRecord(stop, stream));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float syncMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&syncMs, start, stop));

    // Benchmark the stream-ordered path: allocation, kernel and free are all
    // enqueued on `stream` without blocking the host between iterations.
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < ITERATIONS; i++) {
        float* d_data = nullptr;
        CUDA_CHECK(cudaMallocAsync(&d_data, size, stream));
        processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaFreeAsync(d_data, stream));
    }
    CUDA_CHECK(cudaEventRecord(stop, stream));
    // Waiting on `stop` also waits for everything enqueued before it in `stream`.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float asyncMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&asyncMs, start, stop));

    printf("Synchronous (cudaMalloc/cudaFree): %.2f ms\n", syncMs);
    printf("Stream-ordered (Async):            %.2f ms\n", asyncMs);
    printf("Speedup: %.1fx\n", syncMs / asyncMs);

    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

In [None]:
!nvcc stream_ordered_alloc.cu -o stream_ordered_alloc && ./stream_ordered_alloc

## Memory Pool Configuration

In [None]:
%%writefile mempool_config.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Abort main() with a non-zero exit status when a CUDA runtime call fails.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            return 1;                                                  \
        }                                                              \
    } while (0)

/*
 * Configures the default memory pool of device 0 with a 256 MB release
 * threshold, then prints the pool's used/reserved byte counters before an
 * allocation, after a 100 MB cudaMallocAsync, after cudaFreeAsync, and after
 * trimming the pool to zero.
 *
 * Returns 0 on success, 1 if any CUDA call reports an error.
 */
int main() {
    const unsigned long long MB = 1024ULL * 1024ULL;

    int device = 0;
    CUDA_CHECK(cudaSetDevice(device));

    // Get default memory pool
    cudaMemPool_t pool;
    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&pool, device));

    // Configure pool: set release threshold
    // Memory below this threshold is kept for reuse
    uint64_t threshold = 256 * 1024 * 1024;  // 256 MB
    CUDA_CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));
    // uint64_t is not necessarily `unsigned long long`, so cast to match %llu.
    printf("Set release threshold: %llu MB\n", (unsigned long long)threshold / MB);

    // Query pool attributes
    uint64_t usedBytes = 0, reservedBytes = 0;
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("Used: %llu bytes, Reserved: %llu bytes\n",
           (unsigned long long)usedBytes, (unsigned long long)reservedBytes);

    // Allocate some memory
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    float* d_data = nullptr;
    size_t size = 100 * 1024 * 1024;  // 100 MB
    CUDA_CHECK(cudaMallocAsync(&d_data, size, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After 100MB alloc - Used: %llu MB, Reserved: %llu MB\n",
           (unsigned long long)usedBytes / MB, (unsigned long long)reservedBytes / MB);

    // Free it; the synchronize makes sure the free has taken effect in stream
    // order before the counters are read again.
    CUDA_CHECK(cudaFreeAsync(d_data, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &usedBytes));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After free - Used: %llu MB, Reserved: %llu MB\n",
           (unsigned long long)usedBytes / MB, (unsigned long long)reservedBytes / MB);
    printf("(Memory kept for reuse up to threshold)\n");

    // Trim pool to release memory
    CUDA_CHECK(cudaMemPoolTrimTo(pool, 0));
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reservedBytes));
    printf("After trim - Reserved: %llu MB\n", (unsigned long long)reservedBytes / MB);

    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

In [None]:
!nvcc mempool_config.cu -o mempool_config && ./mempool_config

## Multi-Stream Pool Sharing

In [None]:
%%writefile multistream_pool.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Stores val into data[i] for every global thread index i below n.
// Expects a 1D launch; threads whose index falls at or past n do nothing.
__global__ void kernel(float* data, int n, float val) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = val;
}

// Abort main() with a non-zero exit status when a CUDA runtime call fails.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            return 1;                                                  \
        }                                                              \
    } while (0)

/*
 * Allocates a buffer in stream1, fills it there, then hands it to stream2
 * through an event dependency; stream2 overwrites the buffer and frees it.
 * One element is copied back before the free so the success message is only
 * printed when stream2's write was actually observed.
 *
 * Returns 0 on success, 1 on any CUDA error or if the read-back value is wrong.
 */
int main() {
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);
    const int THREADS = 256;
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    cudaStream_t stream1, stream2;
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamCreate(&stream2));

    // Allocate in stream1
    float* d_data = nullptr;
    CUDA_CHECK(cudaMallocAsync(&d_data, size, stream1));
    kernel<<<BLOCKS, THREADS, 0, stream1>>>(d_data, N, 1.0f);
    CUDA_CHECK(cudaGetLastError());

    // Create dependency: stream2 waits for stream1
    cudaEvent_t event;
    CUDA_CHECK(cudaEventCreate(&event));
    CUDA_CHECK(cudaEventRecord(event, stream1));
    CUDA_CHECK(cudaStreamWaitEvent(stream2, event));

    // Now stream2 can use the data
    kernel<<<BLOCKS, THREADS, 0, stream2>>>(d_data, N, 2.0f);
    CUDA_CHECK(cudaGetLastError());

    // Read the last element back in stream2, ordered after the second kernel
    // and before the free, to confirm the cross-stream hand-off worked.
    float lastValue = 0.0f;
    CUDA_CHECK(cudaMemcpyAsync(&lastValue, d_data + (N - 1), sizeof(float),
                               cudaMemcpyDeviceToHost, stream2));

    // Free in stream2
    CUDA_CHECK(cudaFreeAsync(d_data, stream2));

    // stream2's work was ordered after stream1's via the event, so waiting on
    // stream2 covers everything enqueued above.
    CUDA_CHECK(cudaStreamSynchronize(stream2));

    if (lastValue != 2.0f) {
        fprintf(stderr, "Unexpected value %f in buffer (expected 2.0)\n", lastValue);
        return 1;
    }

    printf("Multi-stream pool sharing successful!\n");

    CUDA_CHECK(cudaEventDestroy(event));
    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaStreamDestroy(stream2));

    return 0;
}

In [None]:
!nvcc multistream_pool.cu -o multistream_pool && ./multistream_pool

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

Complete these exercises to practice stream-ordered memory allocation:

In [None]:
%%writefile stream_ordered_exercises.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>

/*
 * Stream-Ordered Memory Allocation Exercises
 * 
 * Exercise 1: Implement a pipeline with async allocation
 * - Create multiple streams
 * - Use cudaMallocAsync in each stream
 * - Overlap allocation, compute, and deallocation
 * 
 * Exercise 2: Memory pool configuration
 * - Create a custom memory pool
 * - Set release threshold
 * - Measure memory reuse efficiency
 * 
 * Exercise 3: Cross-stream memory sharing
 * - Allocate in stream A
 * - Use cudaEventRecord to synchronize
 * - Access memory in stream B
 */

// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the error string with file and line to stderr and
// terminates the process with exit status 1.
// The argument is parenthesized and the temporary uses a trailing-underscore
// name so it cannot capture an identifier used inside the call expression.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA Error: %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(1); \
    } \
} while(0)

// Writes sqrtf(i) * multiplier into data[i] for every global thread index i
// below n. Expects a 1D launch; out-of-range threads do nothing.
__global__ void processKernel(float* data, int n, float multiplier) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    const float root = sqrtf((float)gid);
    data[gid] = root * multiplier;
}

// Adds the sum of data[0..n) into *result.
//
// Each block loads one element per thread into shared memory (0.0f for
// indices at or past n), tree-reduces the 256-slot buffer, and thread 0 adds
// the block total to *result with a single atomicAdd.
//
// Launch requirements: 1D blocks with blockDim.x <= 256 (the size of the
// static shared buffer). *result must be initialized by the caller (e.g.
// zeroed) before the launch; this kernel only accumulates into it.
__global__ void reduceKernel(float* data, int n, float* result) {
    const int kSlots = 256;  // size of the shared buffer; power of two
    __shared__ float sdata[kSlots];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (idx < n) ? data[idx] : 0.0f;
    // When the block has fewer than kSlots threads, no thread owns the upper
    // slots; zero them so the fixed-size tree below never reads uninitialized
    // shared memory. With a 256-thread block this loop body never runs.
    for (int i = tid + blockDim.x; i < kSlots; i += blockDim.x) {
        sdata[i] = 0.0f;
    }
    __syncthreads();

    // Halve the active range each step; the barrier sits outside the `if` so
    // every thread in the block reaches it.
    for (int s = kSlots / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // One atomic per block rather than one per thread.
    if (tid == 0) atomicAdd(result, sdata[0]);
}

// Exercise 1: Async Allocation Pipeline
//
// Scaffold for a round-robin pipeline over NUM_STREAMS streams. It creates
// the streams and two timing events, prints the instructions, runs the
// (initially empty) iteration loop between the two event records, waits for
// all streams and for the stop event, prints the elapsed time, and cleans up.
// Any failing CUDA call terminates the program through CUDA_CHECK.
void exercise1_asyncPipeline() {
    printf("=== Exercise 1: Async Allocation Pipeline ===\n");
    
    const int NUM_STREAMS = 4;
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);
    const int ITERATIONS = 10;
    
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }
    
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    // TODO Exercise 1a: Implement pipelined async allocation
    // For each iteration:
    //   1. cudaMallocAsync a buffer in stream[i % NUM_STREAMS]
    //   2. Launch processKernel in that stream
    //   3. cudaFreeAsync in that stream
    
    printf("Implement async allocation pipeline with %d streams\n", NUM_STREAMS);
    printf("Each iteration should:\n");
    printf("  - cudaMallocAsync %zu bytes\n", size);
    printf("  - Launch processKernel\n");
    printf("  - cudaFreeAsync\n\n");
    
    // YOUR CODE HERE:
    CUDA_CHECK(cudaEventRecord(start));
    for (int iter = 0; iter < ITERATIONS; iter++) {
        cudaStream_t stream = streams[iter % NUM_STREAMS];
        float* d_data = nullptr;
        
        // TODO: Uncomment and complete:
        // CUDA_CHECK(cudaMallocAsync(&d_data, size, stream));
        // processKernel<<<(N+255)/256, 256, 0, stream>>>(d_data, N, 1.0f);
        // CUDA_CHECK(cudaGetLastError());
        // CUDA_CHECK(cudaFreeAsync(d_data, stream));
    }
    CUDA_CHECK(cudaEventRecord(stop));
    
    // Sync all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }
    
    // `stop` was recorded on the default stream, which the loop above does
    // not wait on. cudaEventElapsedTime reports cudaErrorNotReady for an
    // event that has not completed, so wait for it explicitly.
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    printf("Pipeline time: %.3f ms for %d iterations\n\n", ms, ITERATIONS);
    
    // Cleanup
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    }
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

// Exercise 2: Memory Pool Configuration
//
// Scaffold: prints the instructions, then performs five stream-ordered
// allocate/free rounds of SIZE bytes on one stream and prints the two
// statistics variables each round. Both stay 0 until the commented-out pool
// queries are enabled by the student.
void exercise2_poolConfiguration() {
    printf("=== Exercise 2: Memory Pool Configuration ===\n");

    const size_t SIZE = 1 << 24;  // 16 MB
    const int ROUNDS = 5;

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // TODO Exercise 2a: Get the current device's memory pool
    cudaMemPool_t memPool;
    int device;
    CUDA_CHECK(cudaGetDevice(&device));

    printf("Get current device memory pool and configure it:\n");
    printf("  - cudaDeviceGetMemPool\n");
    printf("  - cudaMemPoolSetAttribute for release threshold\n\n");

    // YOUR CODE HERE:
    // CUDA_CHECK(cudaDeviceGetMemPool(&memPool, device));

    // TODO Exercise 2b: Set release threshold
    // uint64_t threshold = UINT64_MAX;  // Never release back to OS
    // CUDA_CHECK(cudaMemPoolSetAttribute(memPool,
    //            cudaMemPoolAttrReleaseThreshold, &threshold));

    // TODO Exercise 2c: Allocate and free multiple times, measure reuse
    printf("Allocate/free %zu bytes multiple times and observe memory reuse\n", SIZE);

    for (int round = 0; round < ROUNDS; ++round) {
        float* d_data;
        CUDA_CHECK(cudaMallocAsync(&d_data, SIZE, stream));

        // Pool statistics for this round; enable the queries below to fill them.
        size_t currentSize = 0;
        size_t highWatermark = 0;
        // CUDA_CHECK(cudaMemPoolGetAttribute(memPool,
        //            cudaMemPoolAttrUsedMemCurrent, &currentSize));
        // CUDA_CHECK(cudaMemPoolGetAttribute(memPool,
        //            cudaMemPoolAttrUsedMemHigh, &highWatermark));

        printf("  Iteration %d: allocated, current=%zu, high=%zu\n",
               round, currentSize, highWatermark);

        CUDA_CHECK(cudaFreeAsync(d_data, stream));
    }

    // Wait for every enqueued allocation and free before tearing down.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    printf("\n");
    CUDA_CHECK(cudaStreamDestroy(stream));
}

// Exercise 3: Cross-Stream Memory Sharing
//
// Scaffold: creates two streams and two events, prints the seven-step
// hand-off pattern, synchronizes both streams, and cleans up. The pattern
// itself is left commented out for the student to enable.
void exercise3_crossStreamSharing() {
    printf("=== Exercise 3: Cross-Stream Memory Sharing ===\n");

    const int N = 1 << 20;
    const size_t size = N * sizeof(float);

    cudaStream_t streamA;
    cudaStream_t streamB;
    CUDA_CHECK(cudaStreamCreate(&streamA));
    CUDA_CHECK(cudaStreamCreate(&streamB));

    cudaEvent_t dataReady;
    cudaEvent_t computeDone;
    CUDA_CHECK(cudaEventCreate(&dataReady));
    CUDA_CHECK(cudaEventCreate(&computeDone));

    // Instruction text, printed line by line in this order.
    static const char* const kPatternLines[] = {
        "Pattern: Allocate in streamA, use in streamB\n",
        "  1. cudaMallocAsync in streamA\n",
        "  2. cudaEventRecord(dataReady, streamA)\n",
        "  3. cudaStreamWaitEvent(streamB, dataReady)\n",
        "  4. Use memory in streamB\n",
        "  5. cudaEventRecord(computeDone, streamB)\n",
        "  6. cudaStreamWaitEvent(streamA, computeDone)\n",
        "  7. cudaFreeAsync in streamA\n\n",
    };
    const size_t lineCount = sizeof(kPatternLines) / sizeof(kPatternLines[0]);
    for (size_t line = 0; line < lineCount; ++line) {
        printf("%s", kPatternLines[line]);
    }

    // TODO Exercise 3: Implement the cross-stream pattern
    float* d_data;
    float* d_result;

    // YOUR CODE HERE:
    // Step 1: Allocate in streamA
    // CUDA_CHECK(cudaMallocAsync(&d_data, size, streamA));
    // CUDA_CHECK(cudaMallocAsync(&d_result, sizeof(float), streamA));
    // CUDA_CHECK(cudaMemsetAsync(d_result, 0, sizeof(float), streamA));

    // Step 2: Signal data is ready
    // CUDA_CHECK(cudaEventRecord(dataReady, streamA));

    // Step 3: streamB waits for data
    // CUDA_CHECK(cudaStreamWaitEvent(streamB, dataReady));

    // Step 4: Use in streamB
    // processKernel<<<(N+255)/256, 256, 0, streamB>>>(d_data, N, 2.0f);
    // reduceKernel<<<(N+255)/256, 256, 0, streamB>>>(d_data, N, d_result);

    // Step 5-6: Signal compute done, streamA waits
    // CUDA_CHECK(cudaEventRecord(computeDone, streamB));
    // CUDA_CHECK(cudaStreamWaitEvent(streamA, computeDone));

    // Step 7: Free in streamA
    // CUDA_CHECK(cudaFreeAsync(d_data, streamA));
    // CUDA_CHECK(cudaFreeAsync(d_result, streamA));

    // Drain both streams before reporting completion.
    CUDA_CHECK(cudaStreamSynchronize(streamA));
    CUDA_CHECK(cudaStreamSynchronize(streamB));

    printf("Cross-stream sharing pattern complete!\n\n");

    CUDA_CHECK(cudaEventDestroy(dataReady));
    CUDA_CHECK(cudaEventDestroy(computeDone));
    CUDA_CHECK(cudaStreamDestroy(streamA));
    CUDA_CHECK(cudaStreamDestroy(streamB));
}

// Entry point: prints a banner, runs the three exercise scaffolds in order,
// then prints the closing hint. Always returns 0; a failing CUDA call inside
// an exercise terminates the process through CUDA_CHECK instead.
int main() {
    printf("=== Stream-Ordered Memory Allocation Exercises ===\n\n");

    // Table of exercises, executed front to back.
    void (*const exercises[])() = {
        exercise1_asyncPipeline,
        exercise2_poolConfiguration,
        exercise3_crossStreamSharing,
    };
    const size_t exerciseCount = sizeof(exercises) / sizeof(exercises[0]);
    for (size_t which = 0; which < exerciseCount; ++which) {
        exercises[which]();
    }

    printf("=== Exercises Complete ===\n");
    printf("Uncomment the TODO sections to complete each exercise!\n");

    return 0;
}

In [None]:
!nvcc -arch=sm_75 -o stream_ordered_exercises stream_ordered_exercises.cu && ./stream_ordered_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises explore stream-ordered concepts accessible in Python:

1. **Stream management**: Create multiple CUDA streams with `cuda.stream()` and measure overlap efficiency
2. **Async transfers**: Use `cuda.to_device(data, stream=s)` for async memory transfers between streams
3. **Event synchronization**: Implement producer-consumer pattern using `cuda.event()` for cross-stream sync

## Key Takeaways

1. **`cudaMallocAsync`** - Non-blocking allocation tied to stream
2. **`cudaFreeAsync`** - Non-blocking deallocation
3. **Memory pools** - Reuse memory without returning to OS
4. **Release threshold** - Control when pool releases memory
5. **Multi-stream** - Use events for cross-stream dependencies