## CDP2 Tail Launch Optimization

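CUDA 12+ introduces CDP2, which adds *tail launch*: a child grid submitted to the special `cudaStreamTailLaunch` stream starts only after the parent grid (and the other work it launched) has completed. The parent therefore never has to wait on the child with a device-side `cudaDeviceSynchronize()`, which CDP2 no longer provides.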

In [None]:
%%writefile cdp_tail_launch.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Tail-launch recursion demo.
//
// Expected launch: a single block of n threads (n must fit in one block).
// Each level adds `depth` to data[0..n) and then hands the first n/2 elements
// to a child grid at depth + 1. Recursion stops once n <= 1 or depth >= 10.
__global__ void tailRecursive(int* data, int n, int depth) {
    if (n <= 1 || depth >= 10) {
        if (threadIdx.x == 0) {
            printf("Base case at depth %d, n=%d\n", depth, n);
        }
        return;
    }

    // Process current level
    int tid = threadIdx.x;
    if (tid < n) {
        data[tid] += depth;
    }
    // Every thread's update must be in place before thread 0 launches the child.
    __syncthreads();

    // Only one thread submits the next level; the parent does no work afterwards.
    if (tid == 0) {
#ifdef cudaStreamTailLaunch
        // CDP2: the tail-launch stream defers the child until this grid (and
        // any other work it launched) has completed - a real tail launch.
        tailRecursive<<<1, n / 2, 0, cudaStreamTailLaunch>>>(data, n / 2, depth + 1);
#else
        // Toolkits without CDP2 have no tail-launch stream; fall back to a
        // plain fire-and-forget launch in the default stream.
        tailRecursive<<<1, n / 2>>>(data, n / 2, depth + 1);
#endif
    }
    // No device-side cudaDeviceSynchronize(): the host-side synchronize waits
    // for the whole chain because a parent grid completes only after its children.
}

// Report a failed CUDA runtime call on stderr; returns true when `status` is success.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the tail-launch demo: zero a 256-int device buffer, run the
// recursive kernel chain, copy the buffer back and print three samples.
// Returns 0 on success, 1 if any CUDA call or the launch itself failed.
int main() {
    const int N = 256;
    int* d_data = NULL;
    if (!cudaOk(cudaMalloc(&d_data, N * sizeof(int)), "cudaMalloc")) return 1;

    int h_data[N];
    bool ok = cudaOk(cudaMemset(d_data, 0, N * sizeof(int)), "cudaMemset");
    if (ok) {
        tailRecursive<<<1, N>>>(d_data, N, 0);
        // Launch-configuration errors only show up through cudaGetLastError();
        // errors raised while the grids run surface at the synchronize.
        ok = cudaOk(cudaGetLastError(), "tailRecursive launch")
          && cudaOk(cudaDeviceSynchronize(), "tailRecursive execution")
          && cudaOk(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    if (ok) {
        printf("Results: data[0]=%d, data[10]=%d, data[100]=%d\n",
               h_data[0], h_data[10], h_data[100]);
    }

    cudaFree(d_data);
    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_tail_launch.cu -o cdp_tail_launch && ./cdp_tail_launch

## Batching Child Launches

Launch overhead can dominate - batch work into fewer, larger kernels.

In [None]:
%%writefile cdp_batching.cu
#include <stdio.h>
#include <cuda_runtime.h>

// BAD: anti-pattern placeholder - one child launch per loop iteration.
// The loop body is intentionally empty; it only marks where the tiny
// launches would go.
__global__ void badPattern(int* data, int n) {
    int remaining = n;
    while (remaining > 0) {
        // A real version would launch one tiny kernel here and pay the
        // launch overhead on every single pass.
        --remaining;
    }
}

// GOOD: batched worker. Block b handles task b: it doubles counts[b] ints
// starting at data[offsets[b]], with the block's threads striding over them.
// Blocks whose index is not a valid task do nothing.
__global__ void batchWorker(int* data, int* offsets, int* counts, int numTasks) {
    const int task = blockIdx.x;
    if (task < numTasks) {
        const int base = offsets[task];
        const int length = counts[task];

        int k = threadIdx.x;
        while (k < length) {
            data[base + k] *= 2;
            k += blockDim.x;
        }
    }
}

// Continuation for batchCoordinator: prints the completion message. It is
// launched into the same stream as batchWorker, so it runs after the worker
// grid has finished.
__global__ void batchCoordinatorReport(int numTasks) {
    printf("Processed %d tasks in single batched launch\n", numTasks);
}

// GOOD: Batch coordinator
// Thread 0 submits one batchWorker grid with a block per task, followed by a
// one-thread report kernel. No device-side cudaDeviceSynchronize() is used
// (CDP2 does not provide it); in-stream ordering sequences the two children.
__global__ void batchCoordinator(int* data, int* offsets, int* counts, int numTasks) {
    if (threadIdx.x == 0) {
        // One launch for all tasks!
        int threadsPerBlock = 256;
        if (numTasks > 0) {  // a zero-block grid is an invalid launch configuration
            batchWorker<<<numTasks, threadsPerBlock>>>(data, offsets, counts, numTasks);
        }
        batchCoordinatorReport<<<1, 1>>>(numTasks);
    }
}

// Report a failed CUDA runtime call on stderr; returns true when `status` is success.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the batching demo: splits 10000 ints into 100 equal tasks,
// runs the coordinator kernel once and prints two samples of the result.
// Returns 0 on success, 1 if any CUDA call or the launch itself failed.
int main() {
    const int NUM_TASKS = 100;
    const int TOTAL_DATA = 10000;

    // Task i covers the contiguous range [h_offsets[i], h_offsets[i] + h_counts[i]).
    int h_offsets[NUM_TASKS], h_counts[NUM_TASKS];
    int offset = 0;
    for (int i = 0; i < NUM_TASKS; i++) {
        h_offsets[i] = offset;
        h_counts[i] = TOTAL_DATA / NUM_TASKS;
        offset += h_counts[i];
    }

    // Initialize data
    int* h_data = new int[TOTAL_DATA];
    for (int i = 0; i < TOTAL_DATA; i++) h_data[i] = 1;

    int *d_data = NULL, *d_offsets = NULL, *d_counts = NULL;
    bool ok = cudaOk(cudaMalloc(&d_data, TOTAL_DATA * sizeof(int)), "cudaMalloc(d_data)")
           && cudaOk(cudaMalloc(&d_offsets, NUM_TASKS * sizeof(int)), "cudaMalloc(d_offsets)")
           && cudaOk(cudaMalloc(&d_counts, NUM_TASKS * sizeof(int)), "cudaMalloc(d_counts)")
           && cudaOk(cudaMemcpy(d_offsets, h_offsets, NUM_TASKS * sizeof(int),
                                cudaMemcpyHostToDevice), "cudaMemcpy(d_offsets)")
           && cudaOk(cudaMemcpy(d_counts, h_counts, NUM_TASKS * sizeof(int),
                                cudaMemcpyHostToDevice), "cudaMemcpy(d_counts)")
           && cudaOk(cudaMemcpy(d_data, h_data, TOTAL_DATA * sizeof(int),
                                cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");

    if (ok) {
        batchCoordinator<<<1, 1>>>(d_data, d_offsets, d_counts, NUM_TASKS);
        // A bad launch is only visible through cudaGetLastError(); failures
        // inside the parent or child grids surface at the synchronize.
        ok = cudaOk(cudaGetLastError(), "batchCoordinator launch")
          && cudaOk(cudaDeviceSynchronize(), "batchCoordinator execution")
          && cudaOk(cudaMemcpy(h_data, d_data, TOTAL_DATA * sizeof(int),
                               cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }

    if (ok) {
        printf("Result check: data[0]=%d, data[5000]=%d\n", h_data[0], h_data[5000]);
    }

    delete[] h_data;
    // cudaFree accepts null pointers, so cleanup is safe after a partial setup.
    cudaFree(d_data);
    cudaFree(d_offsets);
    cudaFree(d_counts);

    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_batching.cu -o cdp_batching && ./cdp_batching

## Memory Optimization

### Use Unified Memory for CDP
- Automatic coherence between parent/child
- Simpler programming model
- Works well with migration hints

In [None]:
%%writefile cdp_unified_memory.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Child kernel: one thread per element of data[start .. start + count),
// replacing each value v with 2 * v + 1. Threads past the range do nothing.
__global__ void childProcess(int* data, int start, int count) {
    const int element = start + blockIdx.x * blockDim.x + threadIdx.x;
    if (element >= start + count) {
        return;
    }
    const int previous = data[element];
    data[element] = previous * 2 + 1;
}

// Continuation for parentWithUnified: prints the first and last element.
// It is launched into the same stream as childProcess, so it runs after that
// grid has finished and sees its writes. Requires n >= 1.
__global__ void parentWithUnifiedReport(const int* data, int n) {
    printf("After child: data[0]=%d, data[n-1]=%d\n", data[0], data[n-1]);
}

// Parent kernel: thread 0 launches childProcess over data[0..n) and then a
// report kernel that reads the child's results.
// No device-side cudaDeviceSynchronize() is used (CDP2 does not provide it);
// in-stream ordering of the two child launches gives the same guarantee.
__global__ void parentWithUnified(int* data, int n) {
    // n <= 0 would mean a zero-block launch and an out-of-bounds data[n-1] read.
    if (threadIdx.x == 0 && n > 0) {
        int blocks = (n + 255) / 256;
        childProcess<<<blocks, 256>>>(data, 0, n);

        // Results are read in the follow-up kernel, ordered after the child.
        parentWithUnifiedReport<<<1, 1>>>(data, n);
    }
}

// Report a failed CUDA runtime call on stderr; returns true when `status` is success.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the unified-memory demo: fills a managed buffer on the host,
// lets the parent/child kernels transform it, then reads it back on the host.
// Returns 0 on success, 1 if allocation, launch or execution failed.
int main() {
    const int N = 1024;
    int* data = NULL;

    // Unified memory - works seamlessly with CDP.
    // The allocation must be checked: the host loop below dereferences `data`.
    if (!cudaOk(cudaMallocManaged(&data, N * sizeof(int)), "cudaMallocManaged")) return 1;

    for (int i = 0; i < N; i++) data[i] = i;
    printf("Before: data[0]=%d, data[n-1]=%d\n", data[0], data[N-1]);

    parentWithUnified<<<1, 1>>>(data, N);
    bool ok = cudaOk(cudaGetLastError(), "parentWithUnified launch")
           && cudaOk(cudaDeviceSynchronize(), "parentWithUnified execution");

    // Host can also read (after sync)
    if (ok) {
        printf("Host sees: data[0]=%d, data[n-1]=%d\n", data[0], data[N-1]);
    }

    cudaFree(data);
    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_unified_memory.cu -o cdp_unified_memory && ./cdp_unified_memory

## CDP Best Practices Summary

| Practice | Why |
|----------|-----|
| Use base cases | Avoid deep recursion |
| Batch launches | Reduce overhead |
| Prefer tail launch | Resource reuse |
| Use Unified Memory | Simpler coherence |
| Profile with Nsight | Identify bottlenecks |
| Consider alternatives | Sometimes host-side (CPU) coordination is faster |

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

Take a naive CDP implementation and apply these optimizations:
1. Add base case threshold
2. Batch sibling launches
3. Use unified memory
4. Compare performance

In [None]:
%%writefile cdp_optimization_exercises.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <chrono>

/*
 * CDP Optimization Exercises
 * Compile: nvcc -arch=sm_75 -rdc=true -lcudadevrt -o cdp_optimization_exercises cdp_optimization_exercises.cu
 * 
 * Exercise 1: Naive vs Optimized CDP
 * - Compare performance with/without base case threshold
 * 
 * Exercise 2: Batching Child Launches
 * - Reduce launch overhead by batching work
 * 
 * Exercise 3: Unified Memory with CDP
 * - Simplify memory management with managed memory
 * 
 * Exercise 4: Tail Launch Pattern
 * - Use tail launch for resource efficiency
 */

// Evaluate a CUDA runtime call exactly once. On failure, print the source
// location and the error string to stderr and terminate the process.
// The argument is parenthesised and the local uses a reserved-looking name so
// that call expressions mentioning their own `err` variable are not captured.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t checkCudaStatus_ = (call); \
        if (checkCudaStatus_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(checkCudaStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// =============================================================================
// Exercise 1: Naive vs Optimized Base Case
// =============================================================================

// Naive: No base case threshold (many tiny kernels)
//
// Expected launch: <<<1, 1>>>. Writes `depth` at the midpoint of [start, end)
// and launches one single-thread child per non-empty half, all the way down
// to one-element ranges.
__global__ void naiveRecursive(int* data, int start, int end, int depth) {
    if (start >= end) return;

    int mid = (start + end) / 2;
    data[mid] = depth;  // Mark with depth

    // Always recurse - no base case!
    if (mid > start) {
        naiveRecursive<<<1, 1>>>(data, start, mid, depth + 1);
    }
    if (mid + 1 < end) {
        naiveRecursive<<<1, 1>>>(data, mid + 1, end, depth + 1);
    }
    // No device-side cudaDeviceSynchronize(): nothing here consumes the
    // children's results, a parent grid is not complete until its children
    // are, and CDP2 does not provide that call in device code.
}

// Optimized: ranges of at most this many elements are handled in-thread.
#define BASE_THRESHOLD 16

// Sequentially write `depth` into every element of data[start .. end) from
// the calling thread. Does nothing when the range is empty.
__device__ void iterativeProcess(int* data, int start, int end, int depth) {
    int pos = start;
    while (pos < end) {
        data[pos] = depth;
        ++pos;
    }
}

// Optimized recursion with a base-case threshold.
//
// Expected launch: <<<1, 1>>>. Ranges of at most BASE_THRESHOLD elements are
// filled in-thread; larger ranges mark their midpoint and hand each half to a
// child grid, using one stream per half so the halves may run concurrently.
__global__ void optimizedRecursive(int* data, int start, int end, int depth) {
    if (start >= end) return;

    // Base case: process iteratively
    if (end - start <= BASE_THRESHOLD) {
        iterativeProcess(data, start, end, depth);
        return;
    }

    int mid = (start + end) / 2;
    data[mid] = depth;

    // Create streams for concurrent execution. If creation fails, fall back
    // to the default stream instead of launching into an uninitialised handle.
    cudaStream_t leftStream = 0, rightStream = 0;
    bool ownLeft =
        cudaStreamCreateWithFlags(&leftStream, cudaStreamNonBlocking) == cudaSuccess;
    bool ownRight =
        cudaStreamCreateWithFlags(&rightStream, cudaStreamNonBlocking) == cudaSuccess;
    if (!ownLeft) leftStream = 0;
    if (!ownRight) rightStream = 0;

    if (mid > start) {
        optimizedRecursive<<<1, 1, 0, leftStream>>>(data, start, mid, depth + 1);
    }
    if (mid + 1 < end) {
        optimizedRecursive<<<1, 1, 0, rightStream>>>(data, mid + 1, end, depth + 1);
    }

    // No device-side cudaDeviceSynchronize(): nothing here reads the
    // children's output, the parent grid is not complete until its children
    // are, and CDP2 does not provide that call. The streams can be released
    // right after the launches (same pattern as NVIDIA's cdpSimpleQuicksort).
    if (ownLeft) cudaStreamDestroy(leftStream);
    if (ownRight) cudaStreamDestroy(rightStream);
}

// Signature shared by the two recursive kernels compared in exercise 1.
typedef void (*RecursiveKernel)(int* data, int start, int end, int depth);

// Launch `kernel` as <<<1, 1>>> over [0, n) at depth 0 and return the elapsed
// GPU time in milliseconds, measured with events around the launch.
// Any failing CUDA call terminates the program through CHECK_CUDA.
static float timeRecursiveKernel(RecursiveKernel kernel, int* d_data, int n) {
    cudaEvent_t begin, finish;
    CHECK_CUDA(cudaEventCreate(&begin));
    CHECK_CUDA(cudaEventCreate(&finish));

    // Drop any stale, already-ignored error so the post-launch check below
    // reports on this launch only.
    (void)cudaGetLastError();

    CHECK_CUDA(cudaEventRecord(begin));
    kernel<<<1, 1>>>(d_data, 0, n, 0);
    CHECK_CUDA(cudaGetLastError());  // launch-configuration errors
    CHECK_CUDA(cudaEventRecord(finish));
    // The parent grid, and therefore `finish`, completes only after every
    // child grid has completed.
    CHECK_CUDA(cudaDeviceSynchronize());

    float elapsedMs = 0.0f;
    CHECK_CUDA(cudaEventElapsedTime(&elapsedMs, begin, finish));

    CHECK_CUDA(cudaEventDestroy(begin));
    CHECK_CUDA(cudaEventDestroy(finish));
    return elapsedMs;
}

// Exercise 1 driver: times the naive and the thresholded recursive kernels on
// separate zeroed 512-int buffers and prints both times plus the speedup.
void exercise1_base_case() {
    printf("\n=== Exercise 1: Naive vs Optimized Base Case ===\n");

    const int N = 512;  // Small to avoid too many launches
    int *d_data1, *d_data2;

    CHECK_CUDA(cudaMalloc(&d_data1, N * sizeof(int)));
    CHECK_CUDA(cudaMalloc(&d_data2, N * sizeof(int)));
    CHECK_CUDA(cudaMemset(d_data1, 0, N * sizeof(int)));
    CHECK_CUDA(cudaMemset(d_data2, 0, N * sizeof(int)));

    float naiveTime = timeRecursiveKernel(naiveRecursive, d_data1, N);
    float optimizedTime = timeRecursiveKernel(optimizedRecursive, d_data2, N);

    printf("Naive time:     %.3f ms\n", naiveTime);
    printf("Optimized time: %.3f ms\n", optimizedTime);
    if (optimizedTime > 0.0f) {
        printf("Speedup: %.2fx\n", naiveTime / optimizedTime);
    } else {
        // Avoid dividing by a zero measurement.
        printf("Speedup: n/a (optimized time below timer resolution)\n");
    }

    CHECK_CUDA(cudaFree(d_data1));
    CHECK_CUDA(cudaFree(d_data2));
}

// =============================================================================
// Exercise 2: Batching Child Launches
// =============================================================================

// Batch worker: block b services task b. Its threads stride across the task's
// sizes[b] elements starting at data[offsets[b]] and store b * 100 + i at
// local index i. Blocks whose index is not a valid task exit immediately.
__global__ void batchWorkerKernel(int* data, int* offsets, int* sizes, int numTasks) {
    const int task = blockIdx.x;
    if (task < numTasks) {
        const int base = offsets[task];
        const int length = sizes[task];

        // Fill this task's slice
        int k = threadIdx.x;
        while (k < length) {
            data[base + k] = task * 100 + k;
            k += blockDim.x;
        }
    }
}

// Naive: One launch per task
//
// Thread 0 of each block issues numTasks separate one-block launches, one per
// task. This is the deliberately inefficient counterpart to batchedLauncher.
//
// Each launch hands the worker a one-entry table (offsets + i, sizes + i), so
// task i's own range is processed. Passing the table base with numTasks = 1
// would re-process task 0 on every iteration.
// NOTE: the worker sees every task as index 0 of its one-entry table, so the
// stored values are 0 * 100 + j rather than i * 100 + j as in the batched path.
__global__ void naiveLauncher(int* data, int* offsets, int* sizes, int numTasks) {
    if (threadIdx.x != 0) return;  // one launcher thread, like batchedLauncher

    for (int i = 0; i < numTasks; i++) {
        // A single block suffices: the worker strides over the task with
        // blockDim.x threads, and extra blocks would exit immediately.
        batchWorkerKernel<<<1, 128>>>(data, offsets + i, sizes + i, 1);
        // No device-side cudaDeviceSynchronize(): launches from one block into
        // the default stream already run in order, and CDP2 does not provide
        // that call in device code.
    }
}

// Continuation for batchedLauncher: prints the completion message. It is
// launched into the same stream as the worker grid, so it runs after it.
__global__ void batchedLauncherReport(int numTasks) {
    printf("Batched: Processed %d tasks in 1 launch\n", numTasks);
}

// Optimized: Single batched launch
// Thread 0 submits one worker grid with a block per task, followed by a
// one-thread report kernel. No device-side cudaDeviceSynchronize() is used
// (CDP2 does not provide it); in-stream ordering sequences the two children.
__global__ void batchedLauncher(int* data, int* offsets, int* sizes, int numTasks) {
    if (threadIdx.x == 0) {
        // Single launch handles all tasks!
        if (numTasks > 0) {  // a zero-block grid is an invalid launch configuration
            batchWorkerKernel<<<numTasks, 128>>>(data, offsets, sizes, numTasks);
        }
        batchedLauncherReport<<<1, 1>>>(numTasks);
    }
}

// Exercise 2 driver: builds 50 equally sized task descriptors on the host,
// uploads them, runs the batched launcher once and prints three sample values.
void exercise2_batching() {
    printf("\n=== Exercise 2: Batching Child Launches ===\n");

    enum { kTasks = 50, kPerTask = 256, kElements = kTasks * kPerTask };
    const size_t tableBytes = kTasks * sizeof(int);
    const size_t dataBytes = kElements * sizeof(int);

    // Task t owns the contiguous slice starting at t * kPerTask.
    int hostOffsets[kTasks];
    int hostSizes[kTasks];
    int nextOffset = 0;
    for (int t = 0; t != kTasks; ++t) {
        hostOffsets[t] = nextOffset;
        hostSizes[t] = kPerTask;
        nextOffset += kPerTask;
    }

    int* devData = nullptr;
    int* devOffsets = nullptr;
    int* devSizes = nullptr;
    CHECK_CUDA(cudaMalloc(&devData, dataBytes));
    CHECK_CUDA(cudaMalloc(&devOffsets, tableBytes));
    CHECK_CUDA(cudaMalloc(&devSizes, tableBytes));

    CHECK_CUDA(cudaMemcpy(devOffsets, hostOffsets, tableBytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(devSizes, hostSizes, tableBytes, cudaMemcpyHostToDevice));

    // Run batched version
    batchedLauncher<<<1, 1>>>(devData, devOffsets, devSizes, kTasks);
    CHECK_CUDA(cudaDeviceSynchronize());

    // Verify: pull everything back and show the first element of tasks 0, 1 and 2.
    int* results = new int[kElements];
    CHECK_CUDA(cudaMemcpy(results, devData, dataBytes, cudaMemcpyDeviceToHost));
    printf("Sample: data[0]=%d, data[256]=%d, data[512]=%d\n",
           results[0], results[256], results[512]);

    delete[] results;
    cudaFree(devData);
    cudaFree(devOffsets);
    cudaFree(devSizes);
}

// =============================================================================
// Exercise 3: Unified Memory with CDP
// =============================================================================

// Child kernel: stores twice `value` into slot `idx` of the buffer it is given.
// Every launched thread performs the same single store.
__global__ void childWithUnifiedMem(int* uniData, int idx, int value) {
    int* slot = uniData + idx;
    const int doubled = value * 2;
    *slot = doubled;
}

// Continuation for parentWithUnifiedMem: prints sample results. It is launched
// into the same stream as the children, so it runs after all of them and sees
// their writes. Only reads indices that exist for the given n.
__global__ void parentWithUnifiedMemReport(const int* uniData, int n) {
    if (n > 5) {
        printf("Parent: Results ready - uniData[0]=%d, uniData[5]=%d\n",
               uniData[0], uniData[5]);
    } else if (n > 0) {
        // Fewer than six elements: uniData[5] does not exist.
        printf("Parent: Results ready - uniData[0]=%d\n", uniData[0]);
    }
}

// Parent kernel: thread 0 launches one single-thread child per element, each
// writing 2 * (i + 1) into uniData[i], then a report kernel that reads the
// results. No device-side cudaDeviceSynchronize() is used (CDP2 does not
// provide it); launches from one block into the default stream run in order.
__global__ void parentWithUnifiedMem(int* uniData, int n) {
    if (threadIdx.x == 0) {
        printf("Parent: Processing %d elements with unified memory\n", n);

        // Launch children that modify unified memory
        for (int i = 0; i < n; i++) {
            childWithUnifiedMem<<<1, 1>>>(uniData, i, i + 1);
        }

        // Results are read in the follow-up kernel, ordered after the children.
        parentWithUnifiedMemReport<<<1, 1>>>(uniData, n);
    }
}

// Exercise 3 driver: allocates a 10-int managed buffer, zeroes it from the
// host, lets the parent kernel (and its children) fill it, then prints every
// element straight from the host without an explicit copy.
void exercise3_unified_memory() {
    printf("\n=== Exercise 3: Unified Memory with CDP ===\n");

    enum { kCount = 10 };
    int* shared = nullptr;

    // Managed allocation: the same pointer is used on host and device.
    CHECK_CUDA(cudaMallocManaged(&shared, kCount * sizeof(int)));

    // Host-side initialisation
    for (int* p = shared; p != shared + kCount; ++p) {
        *p = 0;
    }

    // The parent kernel launches the children itself.
    parentWithUnifiedMem<<<1, 1>>>(shared, kCount);
    CHECK_CUDA(cudaDeviceSynchronize());

    // Host reads come after the synchronize above.
    printf("Host reading unified memory:\n");
    int slot = 0;
    while (slot < kCount) {
        printf("  uniData[%d] = %d\n", slot, shared[slot]);
        ++slot;
    }

    cudaFree(shared);
}

// =============================================================================
// Exercise 4: Tail Launch Pattern
// =============================================================================

// Tail-launch exercise kernel.
//
// Expected launch: one block of n threads. Each level increments data[0..n)
// and, unless the base case (n <= 1 or depth >= 8) is hit, thread 0 submits a
// child grid over the first n/2 elements at depth + 1.
__global__ void tailLaunchKernel(int* data, int n, int depth) {
    // Process current level
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        data[tid] += 1;  // Increment at each level
    }
    // Reached by every thread (it precedes the early return below), so all
    // updates are in place before thread 0 launches the next level.
    __syncthreads();

    // Base case
    if (n <= 1 || depth >= 8) {
        if (tid == 0) {
            printf("Tail launch: Reached depth %d\n", depth);
        }
        return;
    }

    // Tail launch: the parent does no further work after submitting the child.
    if (tid == 0) {
        int newN = n / 2;
#ifdef cudaStreamTailLaunch
        // CDP2: the tail-launch stream defers the child until this grid (and
        // any other work it launched) has completed - a real tail launch.
        tailLaunchKernel<<<1, newN, 0, cudaStreamTailLaunch>>>(data, newN, depth + 1);
#else
        // Toolkits without CDP2 have no tail-launch stream; fall back to a
        // plain fire-and-forget launch in the default stream.
        tailLaunchKernel<<<1, newN>>>(data, newN, depth + 1);
#endif
        // No cudaDeviceSynchronize() - the parent never waits on the child.
    }
}

// Exercise 4 driver: runs the tail-launch kernel chain on a zeroed 256-int
// device buffer and prints three samples showing how many levels touched
// each position.
void exercise4_tail_launch() {
    printf("\n=== Exercise 4: Tail Launch Pattern ===\n");

    enum { kCount = 256 };
    const size_t bytes = kCount * sizeof(int);

    int* devBuffer = nullptr;
    CHECK_CUDA(cudaMalloc(&devBuffer, bytes));
    CHECK_CUDA(cudaMemset(devBuffer, 0, bytes));

    // First level covers the whole buffer; deeper levels are launched on-device.
    tailLaunchKernel<<<1, kCount>>>(devBuffer, kCount, 0);
    CHECK_CUDA(cudaDeviceSynchronize());

    int* hostCopy = new int[kCount];
    CHECK_CUDA(cudaMemcpy(hostCopy, devBuffer, bytes, cudaMemcpyDeviceToHost));

    // Positions further into the buffer are covered by fewer levels.
    const int atStart = hostCopy[0];
    const int atQuarter = hostCopy[64];
    const int atHalf = hostCopy[128];
    printf("Values at different positions:\n");
    printf("  data[0] = %d (processed at all levels)\n", atStart);
    printf("  data[64] = %d (processed at fewer levels)\n", atQuarter);
    printf("  data[128] = %d (processed at even fewer levels)\n", atHalf);

    delete[] hostCopy;
    cudaFree(devBuffer);
}

// =============================================================================
// Main
// =============================================================================

// Entry point: configures the device-runtime limits used by the exercises and
// runs the four exercises in order. Returns 0 when all of them complete.
int main() {
    printf("CDP Optimization Exercises\n");
    printf("==========================\n");

    // Set CDP limits
    // The sync-depth limit only matters for device-side synchronization and is
    // not supported everywhere, so a rejection is reported but not fatal.
    cudaError_t syncDepthStatus = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);
    if (syncDepthStatus != cudaSuccess) {
        printf("Note: cudaLimitDevRuntimeSyncDepth not applied (%s)\n",
               cudaGetErrorString(syncDepthStatus));
        (void)cudaGetLastError();  // clear it so later checks do not see a stale error
    }
    // The pending-launch budget is needed by the recursive exercises.
    CHECK_CUDA(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 4096));

    exercise1_base_case();
    exercise2_batching();
    exercise3_unified_memory();
    exercise4_tail_launch();

    printf("\n✅ All CDP optimization exercises completed!\n");
    return 0;
}

In [None]:
!nvcc -arch=sm_75 -rdc=true -lcudadevrt -o cdp_optimization_exercises cdp_optimization_exercises.cu && ./cdp_optimization_exercises

### 🔶 Python/Numba Exercises (Alternative)

## Week 15 Summary

You've learned:
- **CDP fundamentals**: Parent-child kernels, memory visibility
- **Recursive algorithms**: Quicksort, tree traversal on GPU
- **Adaptive parallelism**: Runtime-determined grid sizes
- **Optimization**: Tail launch, batching, unified memory

## Complete the Checkpoint Quiz!