In [None]:
# ⚙️ Setup
# Notebook bootstrap: when running on Google Colab, install Numba with pip;
# in any other environment nothing is installed. Afterwards print the course
# notices and, if Numba reports a usable CUDA device, the device name.
import subprocess
import sys

try:
    import google.colab
except ImportError:
    # Not on Colab: skip the pip install entirely.
    pass
else:
    pip_install_numba = [sys.executable, "-m", "pip", "install", "-q", "numba"]
    subprocess.check_call(pip_install_numba)

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Dynamic Parallelism is a CUDA C++ feature (CC 3.5+)!")
if cuda.is_available():
    print(f"GPU: {cuda.get_current_device().name}")

---

## Part 1: What is Dynamic Parallelism?

### Concept

```
Traditional CUDA:
━━━━━━━━━━━━━━━━━━
Host (CPU) launches kernels on Device (GPU)

[Host] ──launch──> [Kernel A]
[Host] ──launch──> [Kernel B]
[Host] ──launch──> [Kernel C]

Dynamic Parallelism:
━━━━━━━━━━━━━━━━━━━━━
GPU kernels can launch other kernels!

[Host] ──launch──> [Kernel A]
                      │
                      ├──launch──> [Child 1]
                      ├──launch──> [Child 2]
                      └──launch──> [Child 3]

Use Cases:
• Recursive algorithms (quicksort, tree traversal)
• Adaptive mesh refinement
• Work that generates more work
• Data-dependent parallelism
```

---

## Part 2: Basic Child Kernel Launch

### CUDA C++ Dynamic Parallelism (Primary)

```cpp
// dynamic_basic.cu - Basic child kernel launch
#include <stdio.h>
#include <cuda_runtime.h>

// ============================================
// Child Kernel - Launched from GPU
// ============================================
/**
 * Child grid body: each thread prints its own index together with the id of
 * the parent thread that launched the grid.
 *
 * @param parentId  Value passed by the launching parent thread; only printed.
 *
 * Uses threadIdx.x only, so it expects a 1D block. Device printf is for
 * demonstration/debugging output.
 */
__global__ void childKernel(int parentId) {
    printf("  Child of parent %d: thread %d\n", parentId,
           static_cast<int>(threadIdx.x));
}

// ============================================
// Parent Kernel - Launches Children
// ============================================
/**
 * Parent grid body: every parent thread launches its own child grid
 * (1 block x 4 threads), waits for it, and reports completion.
 *
 * Uses threadIdx.x only, so it expects a 1D block. Because the launch is not
 * guarded, each thread of the parent block issues a separate child launch.
 *
 * NOTE(review): device-side cudaDeviceSynchronize() belongs to the legacy
 * dynamic-parallelism interface; newer CUDA toolkits may reject it unless
 * explicitly opted in - confirm against the toolkit in use.
 */
__global__ void parentKernel() {
    const int parentThread = threadIdx.x;

    printf("Parent thread %d launching child...\n", parentThread);

    // Device-side launch configuration for the child grid.
    const dim3 childGrid(1);
    const dim3 childBlock(4);
    childKernel<<<childGrid, childBlock>>>(parentThread);

    // Block here (on the device) until the child grid has finished.
    cudaDeviceSynchronize();

    printf("Parent thread %d: child completed\n", parentThread);
}

/**
 * Host entry point: launches parentKernel with one block of two threads and
 * waits for the parent grid (and the child grids it launches) to finish.
 *
 * @return 0 on success, 1 if the launch or the execution reported an error.
 */
int main() {
    printf("Host launching parent kernel...\n");

    parentKernel<<<1, 2>>>();

    // A kernel launch has no return value; configuration/launch failures are
    // reported through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Parent kernel launch failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    // Host-side sync: errors raised while the kernels run surface here.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    printf("All done!\n");
    return 0;
}
```

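**Compile with relocatable device code** (`-rdc=true` is required for device-side kernel launches; on CUDA 12 and newer also pass `-DCUDA_FORCE_CDP1_IF_SUPPORTED`, because device-side `cudaDeviceSynchronize()` is only available through the legacy CDP1 interface, which is not supported on compute capability 9.0+):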
```bash
nvcc -arch=sm_70 -rdc=true -o dynamic_basic dynamic_basic.cu
```

---

## Part 3: Memory Visibility

### Memory Rules for Dynamic Parallelism

```
┌─────────────────────────────────────────────────────────┐
│           MEMORY VISIBILITY RULES                       │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Global Memory:                                         │
│  ✅ Parent writes before launch visible to child        │
│  ✅ Child writes visible to parent after sync           │
│  ⚠️  Must sync properly!                                │
│                                                         │
│  Local Memory:                                          │
│  ❌ Parent's local memory NOT visible to child          │
│  ❌ Child's local memory NOT visible to parent          │
│                                                         │
│  Shared Memory:                                         │
│  ❌ Parent's shared NOT visible to child                │
│  ❌ Each kernel has its own shared memory               │
│                                                         │
│  Constant Memory:                                       │
│  ✅ Visible to all (set before host launch)             │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

### CUDA C++ Memory Example (Primary)

```cpp
// dynamic_memory.cu - Memory visibility
#include <stdio.h>
#include <cuda_runtime.h>

/**
 * Child grid body: reads one element the parent stored in global memory,
 * prints it, and overwrites it with twice its value.
 *
 * @param data  Global-memory array shared with the parent.
 * @param idx   Index of the element to read and update; not bounds-checked.
 *
 * Every thread of the grid touches the same element, so it is intended to be
 * launched with a single thread (the parent in this file uses <<<1, 1>>>).
 */
__global__ void childKernel(float* data, int idx) {
    float* slot = data + idx;

    // Value written by the parent before it launched this grid.
    const float fromParent = *slot;
    printf("Child: read %.1f from parent\n", fromParent);

    // Publish the result back through global memory.
    *slot = fromParent * 2.0f;
}

/**
 * Parent grid body: each thread stores (tid + 1) * 10 into its own element,
 * launches a single-thread child grid on that element, waits for the child,
 * and prints the value the child left behind.
 *
 * @param data  Global-memory array with at least blockDim.x elements; the
 *              kernel indexes it by threadIdx.x without a bounds check.
 *
 * Uses threadIdx.x only, so it expects a single 1D block.
 */
__global__ void parentKernel(float* data) {
    const int slot = threadIdx.x;

    // Seed this thread's element in global memory.
    data[slot] = (float)(slot + 1) * 10.0f;
    printf("Parent %d: wrote %.1f\n", slot, data[slot]);

    // Fence the store before launching.
    // NOTE(review): the dynamic-parallelism memory model already makes the
    // launching thread's earlier global writes visible to its child, so this
    // fence looks redundant - confirm before removing.
    __threadfence();

    // One child grid (1 block x 1 thread) per parent thread.
    childKernel<<<1, 1>>>(data, slot);

    // Without this wait the child's store is not guaranteed to be visible.
    cudaDeviceSynchronize();

    // Re-read the element: it now holds the child's result.
    printf("Parent %d: child wrote %.1f\n", slot, data[slot]);
}

/**
 * Host entry point: allocates one float per parent thread, runs parentKernel
 * in a single block, waits for it, and releases the buffer.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel reported an error.
 */
int main() {
    // One array element per parent thread; the same constant sizes both the
    // allocation and the block so they cannot drift apart.
    const int kParentThreads = 4;
    float* d_data = NULL;

    cudaError_t status = cudaMalloc(&d_data, kParentThreads * sizeof(float));
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    parentKernel<<<1, kParentThreads>>>(d_data);

    // Launch errors come from cudaGetLastError(); execution errors (parent or
    // child) surface at the synchronizing call.
    status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "parentKernel failed: %s\n", cudaGetErrorString(status));
    }

    // Always release the buffer, even when the kernel failed.
    const cudaError_t freeStatus = cudaFree(d_data);
    if (freeStatus != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeStatus));
    }

    return (status == cudaSuccess && freeStatus == cudaSuccess) ? 0 : 1;
}
```

---

## Part 4: Device-Side Streams

### Streams in Dynamic Parallelism

```cpp
// dynamic_streams.cu - Device-side streams
#include <stdio.h>
#include <cuda_runtime.h>

/**
 * Child grid body: every thread atomically adds 1 to the counter.
 *
 * @param data  Pointer to a single int counter.
 */
__global__ void childA(int* data) {
    const int increment = 1;
    atomicAdd(data, increment);
}

/**
 * Child grid body: every thread atomically adds 10 to the counter.
 *
 * @param data  Pointer to a single int counter.
 */
__global__ void childB(int* data) {
    const int increment = 10;
    atomicAdd(data, increment);
}

/**
 * Parent grid body demonstrating device-side stream usage.
 *
 * Phase 1 launches childA and childB on the implicit NULL stream, waits, and
 * prints the counter. Phase 2 launches the same two children on two
 * device-created non-blocking streams, waits, and prints the counter again.
 *
 * @param data  Pointer to a single int counter in global memory that both
 *              children update with atomicAdd.
 *
 * Intended to be launched with a single thread (<<<1, 1>>>): the launches
 * are not guarded, so every parent thread would repeat the whole sequence.
 *
 * The device runtime does not provide cudaStreamSynchronize(), so waiting
 * for children on named streams is done with the device-side
 * cudaDeviceSynchronize() instead.
 */
__global__ void parentKernel(int* data) {
    // ============================================
    // Default Stream (NULL)
    // ============================================
    // Children on default stream serialize with each other
    // but are async with parent

    childA<<<1, 1>>>(data);  // On implicit NULL stream
    childB<<<1, 1>>>(data);  // Waits for childA

    cudaDeviceSynchronize();  // Wait for both
    printf("After default stream children: %d\n", *data);

    // ============================================
    // Named Streams (Device-Side)
    // ============================================
    // Device-side streams must be created with cudaStreamNonBlocking.
    cudaStream_t stream1, stream2;
    if (cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking) != cudaSuccess) {
        printf("Device-side stream creation failed\n");
        return;
    }
    if (cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking) != cudaSuccess) {
        printf("Device-side stream creation failed\n");
        cudaStreamDestroy(stream1);  // Do not leak the first stream
        return;
    }

    // Concurrent children on different streams
    childA<<<1, 1, 0, stream1>>>(data);
    childB<<<1, 1, 0, stream2>>>(data);

    // Per-stream synchronization is not available in device code; wait for
    // all outstanding children of this block instead.
    cudaDeviceSynchronize();

    printf("After named stream children: %d\n", *data);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
}

/**
 * Host entry point: allocates and zeroes a single int counter, runs
 * parentKernel with one thread, waits for it, and releases the counter.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel reported an error.
 */
int main() {
    int* d_data = NULL;

    cudaError_t status = cudaMalloc(&d_data, sizeof(int));
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // The children accumulate into the counter, so it must start at zero.
    status = cudaMemset(d_data, 0, sizeof(int));
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(status));
    } else {
        parentKernel<<<1, 1>>>(d_data);

        // Launch errors come from cudaGetLastError(); execution errors
        // (parent or child) surface at the synchronizing call.
        status = cudaGetLastError();
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess) {
            fprintf(stderr, "parentKernel failed: %s\n",
                    cudaGetErrorString(status));
        }
    }

    // Always release the counter, even on failure.
    const cudaError_t freeStatus = cudaFree(d_data);
    if (freeStatus != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeStatus));
    }

    return (status == cudaSuccess && freeStatus == cudaSuccess) ? 0 : 1;
}
```

---

## Part 5: Synchronization Patterns

### When to Sync

```cpp
// sync_patterns.cu - Synchronization patterns

// Pattern 1: Sync before reading child results
/**
 * Pattern 1: launch a child grid (1 block x 32 threads) and wait for it
 * before reading what it wrote.
 *
 * @param data  Global-memory buffer handed to the child; element 0 is read
 *              back after the wait.
 */
__global__ void pattern1(float* data) {
    const dim3 childGrid(1);
    const dim3 childBlock(32);
    childKernel<<<childGrid, childBlock>>>(data);

    // REQUIRED: the child's writes are only guaranteed visible after this.
    cudaDeviceSynchronize();

    // Safe to read now that the child has completed.
    float result = data[0];
}

// Pattern 2: Fire and forget (parent returns)
/**
 * Pattern 2: fire-and-forget. Launches a child grid (1 block x 32 threads)
 * and returns without waiting for it.
 *
 * @param data  Global-memory buffer handed to the child.
 */
__global__ void pattern2(float* data) {
    const dim3 childGrid(1);
    const dim3 childBlock(32);
    childKernel<<<childGrid, childBlock>>>(data);

    // Deliberately no device-side sync: the parent thread returns right away
    // while the child keeps running. The child's result is meant to be read
    // by the HOST after a host-side sync.
}

// Pattern 3: Implicit sync at parent completion
// When parent kernel ends, all children are synced
// before control returns to host

/**
 * Launches ten child grids (1 block x 32 threads each) without waiting for
 * any of them; child i receives the 32-element slice starting at
 * data + i * 32.
 *
 * @param data  Global-memory buffer; the launches address elements
 *              [0, 320), so it needs at least 320 floats.
 */
__global__ void parentFireForget(float* data) {
    const int kChildren = 10;
    const int kSliceLen = 32;

    for (int offset = 0; offset < kChildren * kSliceLen; offset += kSliceLen) {
        // No explicit sync after the launch.
        childKernel<<<1, 32>>>(data + offset);
    }
    // The parent grid is not considered complete until all of its children
    // have finished, so the runtime joins them when the parent ends.
}

// Host code:
// parentFireForget<<<1, 1>>>(data);
// cudaDeviceSynchronize();  // Waits for parent AND all children
```

### Error Checking

```cpp
/**
 * Device-side error checking around a child launch.
 *
 * Launches a very large child grid (1000000 blocks x 1024 threads), reports
 * a launch failure via cudaGetLastError(), and otherwise waits for the child
 * and reports any error returned by the device-side sync.
 *
 * @param data  Global-memory buffer handed to the child.
 */
__global__ void errorCheckingPattern(float* data) {
    // This launch might fail.
    childKernel<<<1000000, 1024>>>(data);

    // Step 1: did the launch itself get accepted?
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        printf("Launch error: %s\n", cudaGetErrorString(launchStatus));
        return;
    }

    // Step 2: wait for the child and check how its execution ended.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus == cudaSuccess) {
        return;
    }
    printf("Execution error: %s\n", cudaGetErrorString(syncStatus));
}
```

---

## Exercises

### Exercise 1: Simple Recursion
Write a kernel that recursively launches itself (with depth limit).

### Exercise 2: Fan-Out Pattern
One parent thread launches multiple children that process different data.

### Exercise 3: Memory Communication
Parent writes config to global memory, child reads it and writes results.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           DYNAMIC PARALLELISM BASICS                    │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Launch from GPU:                                       │
│  childKernel<<<blocks, threads>>>(args);                │
│                                                         │
│  Synchronization:                                       │
│  cudaDeviceSynchronize();  // Device-side               │
│                                                         │
│  Compile Flag:                                          │
│  nvcc -rdc=true ...                                     │
│                                                         │
│  Memory Rules:                                          │
│  • Global: visible after sync                           │
│  • Local/Shared: NOT visible to child                   │
│  • Use __threadfence() for visibility                   │
│                                                         │
│  Nesting Limit: 24 levels                               │
│  Compute Capability: 3.5+                               │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 4 - Nested Kernel Patterns