In [None]:
%%writefile cdp_basics.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Child grid entry point (parentKernel launches it from device code).
// Doubles the n ints stored in data[offset .. offset + n).
// Expects a 1D launch; threads whose flat index is >= n do nothing, so the
// grid may be rounded up to a whole number of blocks.
__global__ void childKernel(int* data, int offset, int n) {
    const int local = threadIdx.x + blockDim.x * blockIdx.x;
    if (local >= n) {
        return;  // surplus thread in the tail block
    }

    int* chunk = data + offset;
    chunk[local] = chunk[local] * 2;
}

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
// Epilogue for parentKernel on CUDA 12+ (CDP2), where a parent grid can no
// longer block on its children with a device-side cudaDeviceSynchronize().
// It is launched into cudaStreamTailLaunch, so it starts only after the parent
// grid and every child grid the parent launched have finished.
__global__ void parentDoneKernel() {
    printf("Parent: All children completed!\n");
}
#endif

// Parent kernel - launches child kernels
//
// Splits data[0 .. n) into up to 4 contiguous chunks and launches one
// childKernel grid per chunk from the GPU. Only the thread with global index 0
// does any work, so the kernel is meant to be launched as <<<1, 1>>>.
//   data : device pointer to n ints in global memory (children modify it)
//   n    : number of elements; n <= 0 launches no children
__global__ void parentKernel(int* data, int n) {
    const int kNumChunks = 4;
    const int kThreadsPerBlock = 64;

    // Only thread 0 launches children
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid != 0) {
        return;
    }

    printf("Parent: Launching child kernels from GPU!\n");

    // Round the chunk size up so the last n % 4 elements are not dropped
    // when n is not a multiple of 4.
    int chunkSize = n / kNumChunks + (n % kNumChunks != 0 ? 1 : 0);

    // Launch up to 4 child kernels, each processing a chunk
    for (int i = 0; i < kNumChunks; i++) {
        int offset = i * chunkSize;
        if (offset >= n) {
            break;  // nothing left (also covers n <= 0: never launch 0 blocks)
        }

        // The final chunk may be shorter than chunkSize.
        int remaining = n - offset;
        int count = remaining < chunkSize ? remaining : chunkSize;
        int blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;

        childKernel<<<blocks, kThreadsPerBlock>>>(data, offset, count);

        // Device-side launches report configuration errors the same way host
        // launches do: through cudaGetLastError().
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Parent: child launch %d failed (CUDA error %d)\n", i, (int)err);
        }
    }

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
    // CDP2: work that must follow the children goes into the tail-launch stream.
    parentDoneKernel<<<1, 1, 0, cudaStreamTailLaunch>>>();
#else
    // Legacy CDP1: wait for all children to complete
    cudaDeviceSynchronize();
    printf("Parent: All children completed!\n");
#endif
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills a 256-element array with 0..255, lets parentKernel (and
// the child grids it launches) modify it on the GPU, and prints two samples
// before and after. Returns 0 on success, 1 if any allocation or CUDA call fails.
int main() {
    const int N = 256;
    const size_t bytes = N * sizeof(int);

    int* d_data = NULL;
    int* h_data = (int*)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        return 1;
    }

    // Initialize data
    for (int i = 0; i < N; i++) h_data[i] = i;

    bool ok = cudaOk(cudaMalloc(&d_data, bytes), "cudaMalloc") &&
              cudaOk(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    if (ok) {
        printf("Before: data[0]=%d, data[100]=%d\n", h_data[0], h_data[100]);

        // Launch parent kernel (which launches children)
        parentKernel<<<1, 1>>>(d_data, N);

        // cudaGetLastError() catches a bad launch configuration; the sync
        // surfaces faults raised while the parent or its children executed.
        ok = cudaOk(cudaGetLastError(), "parentKernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "parentKernel execution") &&
             cudaOk(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Only report results that were actually copied back.
    if (ok) {
        printf("After:  data[0]=%d, data[100]=%d\n", h_data[0], h_data[100]);
    }

    cudaFree(d_data);  // no-op when d_data is still NULL
    free(h_data);
    return ok ? 0 : 1;
}

In [None]:
# Compile with relocatable device code (-rdc=true) and link the device runtime (-lcudadevrt);
# the binary is run only if compilation succeeds (&&).
!nvcc -rdc=true -lcudadevrt cdp_basics.cu -o cdp_basics && ./cdp_basics

## Key Compilation Flags

- `-rdc=true`: Enable relocatable device code (required for CDP)
- `-lcudadevrt`: Link device runtime library

## Memory Visibility Rules

1. **Global Memory**: Visible to all grids (parent and children)
2. **Shared Memory**: NOT visible across grids
3. **Local Memory**: Private to each thread
4. **Unified (Managed) Memory**: Visible to parent and child grids like global memory, so managed pointers can be passed to child kernels

In [None]:
%%writefile cdp_memory.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Child grid: every thread reports the value it was handed and then stores
// parentValue + threadIdx.x into its own slot of globalData.
// Indexing uses threadIdx.x only, so it expects a single-block launch with
// globalData holding at least blockDim.x ints.
__global__ void childModify(int* globalData, int parentValue) {
    const int slot = threadIdx.x;

    // parentValue arrives as a by-value kernel argument, i.e. a copy of the
    // parent's local variable — the child can read it without any pointer.
    printf("Child[%d]: Received parentValue = %d\n", slot, parentValue);

    // Global memory is writable from the child grid.
    const int stored = parentValue + slot;
    globalData[slot] = stored;
}

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
// Epilogue for parentKernel on CUDA 12+ (CDP2), where device code can no
// longer call cudaDeviceSynchronize(). Launched into cudaStreamTailLaunch, it
// runs only after the parent grid and its child grid have completed, so the
// child's writes to globalData are visible here.
__global__ void reportChildResults(const int* globalData) {
    printf("Parent: globalData = [%d, %d, %d, %d]\n",
           globalData[0], globalData[1], globalData[2], globalData[3]);
}
#endif

// Parent grid for the memory-visibility demo.
// Each block fills a shared array (which is deliberately NOT handed to the
// child), then thread 0 of the block launches childModify with a local
// variable passed by value and a pointer to global memory.
//   globalData : device pointer to at least 4 ints in global memory
__global__ void parentKernel(int* globalData) {
    __shared__ int sharedData[32];
    int localVar = 42;

    // Guard the store so a launch with more than 32 threads per block cannot
    // write past the end of sharedData.
    if (threadIdx.x < 32) {
        sharedData[threadIdx.x] = threadIdx.x * 10;
    }
    __syncthreads();  // reached by every thread: kept outside any branch

    if (threadIdx.x == 0) {
        // Pass value to child (NOT pointer to shared/local!)
        childModify<<<1, 4>>>(globalData, localVar);

        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Parent: child launch failed (CUDA error %d)\n", (int)err);
            return;
        }

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
        // CDP2: check the child's modifications from a tail-launched kernel.
        reportChildResults<<<1, 1, 0, cudaStreamTailLaunch>>>(globalData);
#else
        // Legacy CDP1: block until the child grid has finished.
        cudaDeviceSynchronize();

        // Check child's modifications
        printf("Parent: globalData = [%d, %d, %d, %d]\n",
               globalData[0], globalData[1], globalData[2], globalData[3]);
#endif
    }
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: allocates and zeroes a 32-int device buffer, runs parentKernel
// with one block of 32 threads, and waits for it (and its child grid) to
// finish. Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const size_t bytes = 32 * sizeof(int);
    int* d_data = NULL;

    bool ok = cudaOk(cudaMalloc(&d_data, bytes), "cudaMalloc") &&
              cudaOk(cudaMemset(d_data, 0, bytes), "cudaMemset");

    if (ok) {
        parentKernel<<<1, 32>>>(d_data);

        // Launch-configuration errors show up in cudaGetLastError(); faults
        // during execution (parent or child) show up at the synchronize.
        ok = cudaOk(cudaGetLastError(), "parentKernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "parentKernel execution");
    }

    cudaFree(d_data);  // no-op when d_data is still NULL
    return ok ? 0 : 1;
}

In [None]:
# Build with relocatable device code, link the device runtime, and run the binary only if the build succeeds
!nvcc -rdc=true -lcudadevrt cdp_memory.cu -o cdp_memory && ./cdp_memory

## Device-Side Synchronization

| Function | Scope | Use Case |
|----------|-------|----------|
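| `cudaDeviceSynchronize()` | Wait for ALL children launched by the calling thread block | Legacy CDP1 only: deprecated since CUDA 11.6 and not available in device code under CUDA 12's default CDP2 |
| Launch into `cudaStreamTailLaunch` | Runs after the parent grid and all of its children finish | CUDA 12+ (CDP2) replacement for "wait, then continue" |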
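| Launches into the same device stream | Kernels in one stream run in launch order | Ordered execution (`cudaStreamSynchronize()` is host-only; the device runtime does not provide it) |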
| Implicit sync at parent exit | Automatic | Default behavior |

In [None]:
%%writefile cdp_streams.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Worker grid: announces itself and records id * 100 in result[id].
// Every thread of the launch executes the same print and the same store, so
// it is intended to be launched as <<<1, 1>>>.
__global__ void worker(int id, int* result) {
    printf("Worker %d running on GPU\n", id);

    int* slot = result + id;
    *slot = 100 * id;
}

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
// Epilogue for coordinator on CUDA 12+ (CDP2), where device code can no longer
// call cudaDeviceSynchronize(). Launched into cudaStreamTailLaunch, it runs
// only after the coordinator grid and all workers it launched have finished.
__global__ void reportResults(const int* results) {
    printf("Results: [%d, %d, %d, %d]\n",
           results[0], results[1], results[2], results[3]);
}
#endif

// Launches 4 worker grids, each into its own device-side stream so they may
// run concurrently, then prints the 4 values the workers stored.
// Every thread that executes this kernel creates its own streams and launches
// its own workers, so it is intended to be launched as <<<1, 1>>>.
//   results : device pointer to at least 4 ints in global memory
__global__ void coordinator(int* results) {
    const int kNumWorkers = 4;
    cudaStream_t streams[kNumWorkers];
    bool ownsStream[kNumWorkers];

    // Create device-side streams
    for (int i = 0; i < kNumWorkers; i++) {
        cudaError_t err = cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
        ownsStream[i] = (err == cudaSuccess);
        if (!ownsStream[i]) {
            // Never launch into an uninitialized handle: fall back to the
            // default (NULL) stream, which only costs concurrency.
            streams[i] = 0;
            printf("Coordinator: stream %d creation failed (CUDA error %d)\n", i, (int)err);
        }
    }

    // Launch workers into different streams (concurrent execution)
    for (int i = 0; i < kNumWorkers; i++) {
        worker<<<1, 1, 0, streams[i]>>>(i, results);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Coordinator: worker %d launch failed (CUDA error %d)\n", i, (int)err);
        }
    }

#if CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
    // CDP2: print from a tail-launched kernel, which is ordered after every
    // worker regardless of the stream it was launched into.
    reportResults<<<1, 1, 0, cudaStreamTailLaunch>>>(results);
#else
    // Legacy CDP1: wait for all streams
    cudaDeviceSynchronize();
#endif

    // Cleanup streams (only the ones that were actually created)
    for (int i = 0; i < kNumWorkers; i++) {
        if (ownsStream[i]) {
            cudaStreamDestroy(streams[i]);
        }
    }

#if !(CUDART_VERSION >= 12000 && !defined(CUDA_FORCE_CDP1_IF_SUPPORTED))
    printf("Results: [%d, %d, %d, %d]\n",
           results[0], results[1], results[2], results[3]);
#endif
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: allocates a 4-int device buffer, runs coordinator as a single
// thread, and waits for it and the workers it launches to finish.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const size_t bytes = 4 * sizeof(int);
    int* d_results = NULL;

    // Zero the buffer so the printed results are well-defined even if a
    // worker launch fails on the device.
    bool ok = cudaOk(cudaMalloc(&d_results, bytes), "cudaMalloc") &&
              cudaOk(cudaMemset(d_results, 0, bytes), "cudaMemset");

    if (ok) {
        coordinator<<<1, 1>>>(d_results);

        // Launch-configuration errors show up in cudaGetLastError(); faults
        // during execution show up at the synchronize.
        ok = cudaOk(cudaGetLastError(), "coordinator launch") &&
             cudaOk(cudaDeviceSynchronize(), "coordinator execution");
    }

    cudaFree(d_results);  // no-op when d_results is still NULL
    return ok ? 0 : 1;
}

In [None]:
# Build with relocatable device code, link the device runtime, and run the binary only if the build succeeds
!nvcc -rdc=true -lcudadevrt cdp_streams.cu -o cdp_streams && ./cdp_streams

## Exercise: Parallel Sum with CDP

Implement a parallel sum using dynamic parallelism:
1. Parent divides array into chunks
2. Each child computes partial sum
3. Parent combines partial sums

In [None]:
%%writefile cdp_sum_exercise.cu
#include <stdio.h>
#include <cuda_runtime.h>

// TODO: Implement child kernel that sums a chunk
// Exercise stub (currently a no-op). Per the exercise text, each child grid
// computes the partial sum of one chunk of `data`.
// NOTE(review): the stub does not say whether the chunk is the half-open range
// [start, end) or includes `end` — pick one and keep parallelSum consistent.
// NOTE(review): presumably the result is written through `partialSum`; that
// pointer must refer to global memory, since shared and local memory are not
// visible across grids.
__global__ void sumChunk(float* data, int start, int end, float* partialSum) {
    // Your code here
}

// TODO: Implement parent kernel that coordinates the work
// Exercise stub (currently a no-op). Per the exercise text, the parent divides
// the array into chunks, launches one child per chunk, and combines the
// partial sums.
// NOTE(review): presumably `n` is the element count of `data`, `numChunks` the
// number of child launches, and `result` receives the final sum — confirm
// when implementing.
// NOTE(review): on CUDA 12+ a parent grid cannot wait for its children with a
// device-side cudaDeviceSynchronize(); the combine step then has to run in a
// follow-up kernel (e.g. one launched into cudaStreamTailLaunch).
__global__ void parallelSum(float* data, int n, int numChunks, float* result) {
    // Your code here
}

// Exercise stub: host entry point. It currently does nothing and returns 0;
// the test harness for parallelSum is left to the reader.
int main() {
    // Test your implementation
    return 0;
}

## Key Takeaways

1. **CDP enables GPU-side kernel launches** - no CPU roundtrip needed
2. **Compilation flags required**: `-rdc=true -lcudadevrt`
3. **Memory visibility**: Only global/unified memory shared across grids
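4. **Synchronization**: Device-side `cudaDeviceSynchronize()` waits for children only under legacy CDP1; on CUDA 12+ (CDP2) put follow-up work in `cudaStreamTailLaunch` or rely on the implicit sync at parent exit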
5. **Device streams**: Enable concurrent child execution

## Next: Day 2 - Recursive Algorithms with CDP