## Adaptive Grid Sizing

CUDA Dynamic Parallelism (CDP) lets a kernel decide at runtime how much parallelism to spawn, sizing each child launch from the data it actually sees.

In [None]:
%%writefile adaptive_grid.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Worker kernel: squares the elements of `data` in [start, end) into `output`.
//
// Launch layout: 1D grid of 1D blocks; the thread with flat offset k handles
// element start + k. Threads whose element falls at or past `end` do nothing,
// so the grid may be rounded up to a whole number of blocks.
__global__ void processRange(int* data, int start, int end, int* output) {
    const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
    const int idx = start + offset;

    // Tail threads of the last block have no element to process.
    if (idx >= end) return;

    const int value = data[idx];
    output[idx] = value * value;
}

// Coordinator kernel: launches one child grid per region, sized to that
// region's item count.
//
// Launch layout: one block per region (blockIdx.x selects the region). Only
// the first thread of each block does any work, so launching with more than
// one thread per block does not produce duplicate child grids.
//
// Parameters:
//   data       - input elements, regions stored back to back
//   counts     - counts[r] is the number of items in region r
//   numRegions - number of entries in `counts`
//   output     - receives the squared elements, same layout as `data`
//
// Must be compiled with relocatable device code (-rdc=true) and linked
// against cudadevrt, because it launches a kernel from device code.
__global__ void adaptiveProcess(int* data, int* counts, int numRegions, int* output) {
    // A single launcher per block: every thread reaching the launch below
    // would otherwise spawn its own, redundant child grid.
    if (threadIdx.x != 0 || threadIdx.y != 0 || threadIdx.z != 0) return;

    int region = blockIdx.x;
    if (region >= numRegions) return;

    // Empty regions need no child grid (and no offset computation).
    int count = counts[region];
    if (count <= 0) return;

    // The region's first element sits after all preceding regions.
    int start = 0;
    for (int i = 0; i < region; i++) start += counts[i];
    int end = start + count;

    // Adapt grid size to actual workload (ceil-div so the tail is covered).
    int threadsPerBlock = 128;
    int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;

    printf("Region %d: %d items -> launching %d blocks\n", region, count, blocks);
    processRange<<<blocks, threadsPerBlock>>>(data, start, end, output);

    // Device-side launches fail silently unless the error is queried.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        printf("Region %d: child launch failed: %s\n", region, cudaGetErrorString(launchErr));
    }
}

// Checks the result of a CUDA runtime call and, on failure, prints a
// diagnostic and leaves main() with exit code 1.
// NOTE: expands to `return 1;`, so it is only usable directly inside main().
// Allocations are not released on this path; the process is exiting anyway.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Host driver: builds an irregular workload, runs the adaptive coordinator
// kernel, and checks every output element against a CPU reference.
// Returns 0 on success, 1 on a CUDA error or a result mismatch.
int main() {
    // Variable-sized regions (simulating irregular workload)
    int h_counts[] = {100, 5, 500, 20, 1000};
    // Derived from the array so adding or removing regions cannot desync it.
    int numRegions = (int)(sizeof(h_counts) / sizeof(h_counts[0]));
    int totalItems = 0;
    for (int i = 0; i < numRegions; i++) totalItems += h_counts[i];

    printf("Total items: %d across %d regions\n", totalItems, numRegions);

    int *d_data, *d_counts, *d_output;
    CUDA_CHECK(cudaMalloc(&d_data, totalItems * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_counts, numRegions * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_output, totalItems * sizeof(int)));

    // Initialize data
    int* h_data = new int[totalItems];
    for (int i = 0; i < totalItems; i++) h_data[i] = i;
    CUDA_CHECK(cudaMemcpy(d_data, h_data, totalItems * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_counts, h_counts, numRegions * sizeof(int), cudaMemcpyHostToDevice));

    // Launch adaptive coordinator: one single-thread block per region.
    adaptiveProcess<<<numRegions, 1>>>(d_data, d_counts, numRegions, d_output);
    CUDA_CHECK(cudaGetLastError());       // bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while running

    // Verify results
    int* h_output = new int[totalItems];
    CUDA_CHECK(cudaMemcpy(h_output, d_output, totalItems * sizeof(int), cudaMemcpyDeviceToHost));
    printf("\nSample results: output[0]=%d, output[100]=%d, output[600]=%d\n",
           h_output[0], h_output[100], h_output[600]);

    // Compare every element with the CPU reference (the square of the input).
    int mismatches = 0;
    for (int i = 0; i < totalItems; i++) {
        if (h_output[i] != h_data[i] * h_data[i]) mismatches++;
    }
    if (mismatches == 0) {
        printf("Verification: PASSED (%d items)\n", totalItems);
    } else {
        printf("Verification: FAILED (%d of %d items wrong)\n", mismatches, totalItems);
    }

    delete[] h_data;
    delete[] h_output;
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_counts));
    CUDA_CHECK(cudaFree(d_output));

    return mismatches == 0 ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt adaptive_grid.cu -o adaptive_grid && ./adaptive_grid

## Quadtree Construction with CDP

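Quadtrees adaptively subdivide 2D space based on point density. This is a natural fit for CDP, because each node can decide at runtime whether to launch child kernels for its four quadrants.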

In [None]:
%%writefile cdp_quadtree.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define MAX_POINTS_PER_NODE 4
#define MAX_DEPTH 8

// A 2D point with single-precision coordinates.
struct Point {
    float x;  // horizontal coordinate
    float y;  // vertical coordinate
};

// Axis-aligned rectangle described by its minimum and maximum corners.
struct BoundingBox {
    float minX, minY, maxX, maxY;

    // Returns true when `p` lies inside the box. All four edges are
    // inclusive, so a point exactly on an edge shared by two boxes is
    // contained by both of them.
    __device__ bool contains(Point p) {
        const bool insideX = (minX <= p.x) && (p.x <= maxX);
        const bool insideY = (minY <= p.y) && (p.y <= maxY);
        return insideX && insideY;
    }

    // Returns one quarter of this box, split at its midpoint:
    //   0 = NW, 1 = NE, 2 = SW, 3 = SE
    // where "north" is the larger-Y half and "west" the smaller-X half.
    // Any other value of `q` returns the box unchanged.
    __device__ BoundingBox quadrant(int q) {
        if (q < 0 || q > 3) return *this;

        const float midX = (minX + maxX) / 2;
        const float midY = (minY + maxY) / 2;

        const bool west  = (q == 0 || q == 2);
        const bool north = (q == 0 || q == 1);

        BoundingBox child;
        child.minX = west  ? minX : midX;
        child.maxX = west  ? midX : maxX;
        child.minY = north ? midY : minY;
        child.maxY = north ? maxY : midY;
        return child;
    }
};

// Returns how many of the first `n` entries of `points` fall inside `box`
// (as decided by BoundingBox::contains). Scans the array sequentially in
// the calling thread.
__device__ int countPointsInBox(Point* points, int n, BoundingBox box) {
    int inside = 0;
    for (Point* p = points; p < points + n; ++p) {
        inside += box.contains(*p) ? 1 : 0;
    }
    return inside;
}

// Recursive quadtree construction.
//
// Each launch of this kernel represents one candidate node covering `box`.
// It is launched as <<<1, 1>>>: the single thread counts the points in the
// box and, if the node must be split, launches one child grid per quadrant.
//
// Parameters:
//   points    - device array of all points
//   n         - number of entries in `points`
//   box       - region covered by this node
//   depth     - depth of this node (root is 0)
//   nodeCount - device counter, atomically incremented once per created node
//
// Boxes that contain no points, or that sit at depth >= MAX_DEPTH, create no
// node at all. Must be compiled with -rdc=true and linked against cudadevrt.
__global__ void buildQuadtree(Point* points, int n, BoundingBox box, int depth, int* nodeCount) {
    int count = countPointsInBox(points, n, box);

    if (count == 0 || depth >= MAX_DEPTH) {
        return;
    }

    // Count this node
    int nodeId = atomicAdd(nodeCount, 1);

    if (count <= MAX_POINTS_PER_NODE) {
        // Leaf node - no further subdivision
        printf("Leaf node %d at depth %d: %d points\n", nodeId, depth, count);
        return;
    }

    // Internal node - subdivide
    printf("Internal node %d at depth %d: %d points -> subdividing\n", nodeId, depth, count);

    // Launch children for each quadrant
    for (int q = 0; q < 4; q++) {
        BoundingBox childBox = box.quadrant(q);
        buildQuadtree<<<1, 1>>>(points, n, childBox, depth + 1, nodeCount);

        // Device-side launches fail silently unless the error is queried.
        cudaError_t launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess) {
            printf("Node %d: launch of quadrant %d failed: %s\n",
                   nodeId, q, cudaGetErrorString(launchErr));
        }
    }

    // No device-side cudaDeviceSynchronize() here: this kernel does nothing
    // after the launches, and a parent grid is not considered complete until
    // all of its child grids have completed, so the host-side synchronize
    // already waits for the whole tree. The device-side call is deprecated
    // since CUDA 11.6 and unavailable in CUDA 12's dynamic parallelism.
}

// Checks the result of a CUDA runtime call and, on failure, prints a
// diagnostic and leaves main() with exit code 1.
// NOTE: expands to `return 1;`, so it is only usable directly inside main().
// Allocations are not released on this path; the process is exiting anyway.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Host driver: generates two clusters of points in the unit square, builds
// the quadtree on the device, and reports how many nodes were created.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int N = 100;
    Point h_points[N];

    // Generate clustered points (fixed seed for reproducible output)
    srand(42);
    for (int i = 0; i < N/2; i++) {
        // Cluster 1: upper-right
        h_points[i] = {0.7f + 0.3f * rand() / RAND_MAX, 0.7f + 0.3f * rand() / RAND_MAX};
    }
    for (int i = N/2; i < N; i++) {
        // Cluster 2: lower-left
        h_points[i] = {0.0f + 0.3f * rand() / RAND_MAX, 0.0f + 0.3f * rand() / RAND_MAX};
    }

    Point* d_points;
    int* d_nodeCount;
    CUDA_CHECK(cudaMalloc(&d_points, N * sizeof(Point)));
    CUDA_CHECK(cudaMalloc(&d_nodeCount, sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_points, h_points, N * sizeof(Point), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_nodeCount, 0, sizeof(int)));

    // Root node covers the whole unit square; the kernel recurses from there.
    BoundingBox rootBox = {0, 0, 1, 1};
    buildQuadtree<<<1, 1>>>(d_points, N, rootBox, 0, d_nodeCount);
    CUDA_CHECK(cudaGetLastError());       // bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while running

    int h_nodeCount;
    CUDA_CHECK(cudaMemcpy(&h_nodeCount, d_nodeCount, sizeof(int), cudaMemcpyDeviceToHost));
    printf("\nTotal nodes created: %d\n", h_nodeCount);

    CUDA_CHECK(cudaFree(d_points));
    CUDA_CHECK(cudaFree(d_nodeCount));

    return 0;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_quadtree.cu -o cdp_quadtree && ./cdp_quadtree

## When CDP Shines vs. When to Avoid

### Good Use Cases ✅
- Recursive divide-and-conquer
- Irregular/adaptive workloads
- Tree/graph traversal
- Workload discovered at runtime

### Avoid When ❌
- Regular, predictable parallelism
- Very deep recursion (>16 levels)
- Launching many tiny kernels
- CPU coordination is acceptable

## Key Takeaways

1. **Adaptive parallelism** - spawn work based on actual data
2. **Hierarchical structures** - quadtree/octree natural fit
3. **Irregular workloads** - regions with variable sizes
4. **Trade-offs** - launch overhead vs. CPU roundtrip

## Next: Day 4 - CDP Optimization & Best Practices