## CDP2 Tail Launch Optimization

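CUDA 12+ introduces CDP2 with tail launch: a child kernel launched into the special `cudaStreamTailLaunch` stream starts only after the parent grid has finished, so the parent never waits on the device and its execution resources are released before the child runs.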

In [None]:
%%writefile cdp_tail_launch.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Recursive halving driven by CDP2 tail launches.
//
// Each level adds `depth` to the first n elements of `data`, then thread 0
// enqueues the next level with n/2 threads and n/2 elements. The kernel
// indexes with threadIdx.x only, so it is meant to run as a single block
// (the host starts it with <<<1, N>>>).
//
// Parameters:
//   data  - device buffer holding at least n ints
//   n     - number of elements processed at this level
//   depth - current recursion level; recursion stops once it reaches 10
//
// Requires CDP2: CUDA 12+, built with -rdc=true and linked with cudadevrt.
__global__ void tailRecursive(int* data, int n, int depth) {
    // n and depth are kernel arguments, so this branch is uniform across the
    // block and the early return cannot strand threads at the barrier below.
    if (n <= 1 || depth >= 10) {
        if (threadIdx.x == 0) {
            printf("Base case at depth %d, n=%d\n", depth, n);
        }
        return;
    }
    
    // Process current level
    int tid = threadIdx.x;
    if (tid < n) {
        data[tid] += depth;
    }
    __syncthreads();
    
    if (tid == 0) {
        // A real tail launch has to target cudaStreamTailLaunch: the child is
        // held back until this grid (and any other work it launched) has
        // completed. A launch without an explicit stream is an ordinary
        // child launch, not a tail launch.
        tailRecursive<<<1, n / 2, 0, cudaStreamTailLaunch>>>(data, n / 2, depth + 1);
        
        // Device-side launches fail silently unless the error is queried.
        cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess) {
            printf("Tail launch failed at depth %d: %s\n",
                   depth, cudaGetErrorString(launchStatus));
        }
    }
    // No cudaDeviceSynchronize - the parent simply exits and the tail-launched
    // child runs afterwards.
}

// Reports a failed CUDA call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: zeroes a 256-element device buffer, runs tailRecursive on it
// and prints three sample values. Returns 0 on success, 1 if any CUDA call
// or the kernel itself failed.
int main() {
    const int N = 256;
    int* d_data = nullptr;
    int h_data[N] = {0};  // defined contents even if the copy-back never happens
    
    bool ok = checkCuda(cudaMalloc(&d_data, N * sizeof(int)), "cudaMalloc")
           && checkCuda(cudaMemset(d_data, 0, N * sizeof(int)), "cudaMemset");
    
    if (ok) {
        tailRecursive<<<1, N>>>(d_data, N, 0);
        // cudaGetLastError catches a bad launch configuration; the synchronize
        // surfaces faults from the parent grid and every tail-launched child.
        ok = checkCuda(cudaGetLastError(), "tailRecursive launch")
          && checkCuda(cudaDeviceSynchronize(), "tailRecursive execution")
          && checkCuda(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy (device to host)");
    }
    
    if (ok) {
        printf("Results: data[0]=%d, data[10]=%d, data[100]=%d\n",
               h_data[0], h_data[10], h_data[100]);
    }
    
    cudaFree(d_data);  // harmless when d_data is still nullptr
    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_tail_launch.cu -o cdp_tail_launch && ./cdp_tail_launch

## Batching Child Launches

Launch overhead can dominate - batch work into fewer, larger kernels.

In [None]:
%%writefile cdp_batching.cu
#include <stdio.h>
#include <cuda_runtime.h>

// BAD: placeholder for the anti-pattern of one tiny child launch per item.
// The loop body is intentionally empty; the kernel performs no work.
__global__ void badPattern(int* data, int n) {
    int item = 0;
    while (item < n) {
        // A tiny kernel would be launched here on every pass,
        // so launch overhead would dwarf the useful work.
        ++item;
    }
}

// GOOD: worker for a batched launch.
// Block b handles task b: it doubles counts[b] consecutive ints of `data`
// starting at offsets[b]. The threads of the block stride over that slice
// in steps of blockDim.x. Blocks with an index >= numTasks do nothing.
__global__ void batchWorker(int* data, int* offsets, int* counts, int numTasks) {
    const int task = blockIdx.x;
    if (task < numTasks) {
        int* const slice = data + offsets[task];
        const int length = counts[task];
        
        int k = threadIdx.x;
        while (k < length) {
            slice[k] *= 2;
            k += blockDim.x;
        }
    }
}

// Tail-launched epilogue for batchCoordinator: prints the completion message.
__global__ void batchReport(int numTasks) {
    printf("Processed %d tasks in single batched launch\n", numTasks);
}

// GOOD: Batch coordinator
// Thread 0 of each block issues ONE batchWorker launch covering all tasks
// (one block per task, 256 threads per block).
//
// Parameters are forwarded unchanged to batchWorker:
//   data     - device buffer the tasks operate on
//   offsets  - per-task start index into data
//   counts   - per-task element count
//   numTasks - number of tasks; no worker is launched when it is <= 0
//
// Device-side cudaDeviceSynchronize() no longer exists in CDP2 (CUDA 12+),
// so the "done" message is printed from a kernel placed in
// cudaStreamTailLaunch, which only starts after this grid and the worker
// grid it launched have completed.
__global__ void batchCoordinator(int* data, int* offsets, int* counts, int numTasks) {
    if (threadIdx.x == 0) {
        if (numTasks > 0) {  // a grid of 0 blocks is an invalid launch configuration
            // One launch for all tasks!
            const int threadsPerBlock = 256;
            batchWorker<<<numTasks, threadsPerBlock>>>(data, offsets, counts, numTasks);
            
            // Device-side launches fail silently unless the error is queried.
            cudaError_t launchStatus = cudaGetLastError();
            if (launchStatus != cudaSuccess) {
                printf("batchWorker launch failed: %s\n", cudaGetErrorString(launchStatus));
            }
        }
        batchReport<<<1, 1, 0, cudaStreamTailLaunch>>>(numTasks);
    }
}

// Reports a failed CUDA call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: splits 10000 ints (all initialised to 1) into 100 equal tasks,
// runs batchCoordinator on them and prints two sample values.
// Returns 0 on success, 1 if any CUDA call or kernel failed.
int main() {
    const int NUM_TASKS = 100;
    const int TOTAL_DATA = 10000;
    
    // Task table: task i covers h_counts[i] elements starting at h_offsets[i].
    int h_offsets[NUM_TASKS], h_counts[NUM_TASKS];
    int offset = 0;
    for (int i = 0; i < NUM_TASKS; i++) {
        h_offsets[i] = offset;
        h_counts[i] = TOTAL_DATA / NUM_TASKS;
        offset += h_counts[i];
    }
    
    // Initialize data
    int* h_data = new int[TOTAL_DATA];
    for (int i = 0; i < TOTAL_DATA; i++) h_data[i] = 1;
    
    // Pointers start as nullptr so the single cleanup path below is safe
    // no matter which step fails.
    int *d_data = nullptr, *d_offsets = nullptr, *d_counts = nullptr;
    
    bool ok = checkCuda(cudaMalloc(&d_data, TOTAL_DATA * sizeof(int)), "cudaMalloc(d_data)")
           && checkCuda(cudaMalloc(&d_offsets, NUM_TASKS * sizeof(int)), "cudaMalloc(d_offsets)")
           && checkCuda(cudaMalloc(&d_counts, NUM_TASKS * sizeof(int)), "cudaMalloc(d_counts)")
           && checkCuda(cudaMemcpy(d_offsets, h_offsets, NUM_TASKS * sizeof(int),
                                   cudaMemcpyHostToDevice), "cudaMemcpy(d_offsets)")
           && checkCuda(cudaMemcpy(d_counts, h_counts, NUM_TASKS * sizeof(int),
                                   cudaMemcpyHostToDevice), "cudaMemcpy(d_counts)")
           && checkCuda(cudaMemcpy(d_data, h_data, TOTAL_DATA * sizeof(int),
                                   cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");
    
    if (ok) {
        batchCoordinator<<<1, 1>>>(d_data, d_offsets, d_counts, NUM_TASKS);
        // cudaGetLastError catches a bad launch configuration; the synchronize
        // surfaces faults from the coordinator and the grids it launched.
        ok = checkCuda(cudaGetLastError(), "batchCoordinator launch")
          && checkCuda(cudaDeviceSynchronize(), "batchCoordinator execution")
          && checkCuda(cudaMemcpy(h_data, d_data, TOTAL_DATA * sizeof(int),
                                  cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }
    
    if (ok) {
        printf("Result check: data[0]=%d, data[5000]=%d\n", h_data[0], h_data[5000]);
    }
    
    delete[] h_data;
    cudaFree(d_data);     // cudaFree(nullptr) is a no-op
    cudaFree(d_offsets);
    cudaFree(d_counts);
    
    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_batching.cu -o cdp_batching && ./cdp_batching

## Memory Optimization

### Use Unified Memory for CDP
- Automatic coherence between parent/child
- Simpler programming model
- Works well with migration hints

In [None]:
%%writefile cdp_unified_memory.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Child kernel: one thread per element, 1D launch.
// Maps each value v at an index below start + count to 2*v + 1;
// threads whose index falls at or past that bound do nothing.
__global__ void childProcess(int* data, int start, int count) {
    const int idx = start + blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= start + count) {
        return;
    }
    const int previous = data[idx];
    data[idx] = previous * 2 + 1;
}

// Tail-launched epilogue for parentWithUnified: reads the child's results.
// Assumes n >= 1 (the caller guards this).
__global__ void reportChildResults(int* data, int n) {
    printf("After child: data[0]=%d, data[n-1]=%d\n", data[0], data[n-1]);
}

// Parent kernel: thread 0 of each block launches childProcess over all n
// elements of `data` (256 threads per block), then queues a tail-launched
// kernel that prints the first and last results.
//
// Parameters:
//   data - buffer of at least n ints, accessible from the device
//   n    - element count; nothing is launched when n <= 0
//
// Device-side cudaDeviceSynchronize() no longer exists in CDP2 (CUDA 12+),
// so the parent cannot wait for its child and read the results itself.
// A grid placed in cudaStreamTailLaunch starts only after this grid and
// the child grid it launched have completed.
__global__ void parentWithUnified(int* data, int n) {
    // With unified memory, parent and child see consistent view
    if (threadIdx.x == 0) {
        if (n <= 0) {
            return;  // avoids a zero-block launch and an out-of-range data[n-1] read
        }
        
        int blocks = (n + 255) / 256;
        childProcess<<<blocks, 256>>>(data, 0, n);
        
        // Device-side launches fail silently unless the error is queried.
        cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess) {
            printf("childProcess launch failed: %s\n", cudaGetErrorString(launchStatus));
        }
        
        // Read the child's results once it has finished.
        reportChildResults<<<1, 1, 0, cudaStreamTailLaunch>>>(data, n);
    }
}

// Reports a failed CUDA call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills a 1024-element managed buffer with 0..N-1, runs
// parentWithUnified on it and prints the first and last element before and
// after. Returns 0 on success, 1 if any CUDA call or kernel failed.
int main() {
    const int N = 1024;
    int* data = nullptr;
    
    // Unified memory - works seamlessly with CDP
    // The allocation must be checked before the host dereferences `data`.
    if (!checkCuda(cudaMallocManaged(&data, N * sizeof(int)), "cudaMallocManaged")) {
        return 1;
    }
    
    for (int i = 0; i < N; i++) data[i] = i;
    printf("Before: data[0]=%d, data[n-1]=%d\n", data[0], data[N-1]);
    
    parentWithUnified<<<1, 1>>>(data, N);
    // cudaGetLastError catches a bad launch configuration; the synchronize
    // surfaces faults from the parent and the grids it launched.
    bool ok = checkCuda(cudaGetLastError(), "parentWithUnified launch")
           && checkCuda(cudaDeviceSynchronize(), "parentWithUnified execution");
    
    // Host can also read (after sync)
    if (ok) {
        printf("Host sees: data[0]=%d, data[n-1]=%d\n", data[0], data[N-1]);
    }
    
    cudaFree(data);
    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_unified_memory.cu -o cdp_unified_memory && ./cdp_unified_memory

## CDP Best Practices Summary

| Practice | Why |
|----------|-----|
| Use base cases | Avoid deep recursion |
| Batch launches | Reduce overhead |
| Prefer tail launch | Resource reuse |
| Use Unified Memory | Simpler coherence |
| Profile with Nsight | Identify bottlenecks |
| Consider alternatives | Sometimes CPU-side coordination is faster |

## Exercise: Optimized CDP Pattern

Take a naive CDP implementation and apply these optimizations:
1. Add base case threshold
2. Batch sibling launches
3. Use unified memory
4. Compare performance

## Week 15 Summary

You've learned:
- **CDP fundamentals**: Parent-child kernels, memory visibility
- **Recursive algorithms**: Quicksort, tree traversal on GPU
- **Adaptive parallelism**: Runtime-determined grid sizes
- **Optimization**: Tail launch, batching, unified memory

## Complete the Checkpoint Quiz!