In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Imported only to detect the Colab runtime; the module itself is unused.
    import google.colab
    # On Colab, make sure numba is installed before it is imported below.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: numba is expected to be installed already.
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Nested kernels require CUDA C++ with -rdc=true!")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Some numba releases expose the device name as bytes; decode it so the
    # line reads "GPU: <name>" instead of "GPU: b'<name>'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of printing nothing when no GPU is visible.
    print("No CUDA GPU detected - the nvcc cells below need a GPU runtime.")

---

## Part 1: Recursive Patterns

### 🔷 CUDA C++ Parallel Quicksort (Primary)

This example demonstrates recursive quicksort using dynamic parallelism:
- Child kernels are launched from within device code
- Uses non-blocking streams for parallel left/right partitioning
- Falls back to insertion sort for small arrays (MIN_SIZE threshold)
- Limits recursion depth (MAX_DEPTH) so kernel nesting stays bounded (the device runtime caps how deeply launches can nest)

**Compile with:** `nvcc -arch=sm_75 -rdc=true -o quicksort_dp quicksort_dp.cu -lcudadevrt`

In [None]:
%%writefile quicksort_dp.cu
// quicksort_dp.cu - Recursive quicksort with dynamic parallelism
#include <stdio.h>
#include <cuda_runtime.h>

#define MAX_DEPTH 16
#define MIN_SIZE 32  // Switch to sequential sort

// ============================================
// Sequential Sort for Small Arrays
// ============================================
// Sorts data[left..right] (both bounds inclusive) in ascending order, in place.
// Runs sequentially in the calling thread; ranges with fewer than two elements
// are left untouched.
__device__ void insertionSort(int* data, int left, int right) {
    for (int pos = left + 1; pos <= right; ++pos) {
        const int current = data[pos];

        // Shift every larger element of the sorted prefix one slot to the
        // right until the hole sits where `current` belongs.
        int hole = pos;
        for (; hole > left && data[hole - 1] > current; --hole) {
            data[hole] = data[hole - 1];
        }
        data[hole] = current;
    }
}

// ============================================
// Partition
// ============================================
// Partitions data[left..right] (inclusive) around the value stored at
// data[right]: elements <= pivot end up before the returned index, larger
// elements after it, and the pivot itself at the returned index.
__device__ int partition(int* data, int left, int right) {
    const int pivotValue = data[right];
    int boundary = left;  // next slot for an element that is <= pivot

    for (int scan = left; scan < right; ++scan) {
        if (data[scan] > pivotValue) {
            continue;
        }
        const int held = data[boundary];
        data[boundary] = data[scan];
        data[scan] = held;
        ++boundary;
    }

    // Drop the pivot between the two groups.
    const int displaced = data[boundary];
    data[boundary] = data[right];
    data[right] = displaced;

    return boundary;
}

// ============================================
// Recursive Quicksort Kernel
// ============================================
// Sorts data[left..right] (inclusive) in ascending order.
//
// Intended to be launched as <<<1, 1>>>: the single thread partitions the
// range and then launches one child grid per non-trivial half.
//
//   data   device array being sorted in place
//   left   first index of the range
//   right  last index of the range (inclusive)
//   depth  current recursion level; 0 for the outermost launch
//
// The kernel never waits on its children from device code. Device-side
// cudaDeviceSynchronize() is deprecated and is not available in the current
// dynamic-parallelism model, and it is not needed here: a parent grid is
// only considered complete once all of its child grids have completed, so
// the host-side synchronize after the top-level launch still covers the
// whole recursion tree.
__global__ void quicksortKernel(int* data, int left, int right, int depth) {
    // Base case: small array or max depth
    if (right - left < MIN_SIZE || depth >= MAX_DEPTH) {
        insertionSort(data, left, right);
        return;
    }

    // Partition
    int pivotIdx = partition(data, left, right);

    // Left partition (skipped when it has fewer than two elements)
    if (left < pivotIdx - 1) {
        cudaStream_t s1;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        quicksortKernel<<<1, 1, 0, s1>>>(data, left, pivotIdx - 1, depth + 1);
        // Destroying right after the launch is fine: the stream's resources
        // are released once the work queued on it has finished.
        cudaStreamDestroy(s1);
    }

    // Right partition, in its own non-blocking stream so both halves can
    // run concurrently
    if (pivotIdx + 1 < right) {
        cudaStream_t s2;
        cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
        quicksortKernel<<<1, 1, 0, s2>>>(data, pivotIdx + 1, right, depth + 1);
        cudaStreamDestroy(s2);
    }
}

// ============================================
// Host Entry Point
// ============================================
// Device-side entry kernel: starts the recursive sort over data[0..n-1].
//
// No device-side cudaDeviceSynchronize() is issued (deprecated, and not
// available in the current dynamic-parallelism model). This grid is not
// considered complete until the child grid has completed, so the host's
// synchronize after launching this kernel waits for the whole sort.
__global__ void quicksortEntry(int* data, int n) {
    quicksortKernel<<<1, 1>>>(data, 0, n - 1, 0);
}

// Fills an array with pseudo-random values, sorts it on the GPU through
// quicksortEntry, and verifies the result on the host.
// Returns 0 on success, 1 if any CUDA call or the kernel reported an error.
int main() {
    const int N = 1024;
    const size_t bytes = N * sizeof(int);
    int* h_data = new int[N];
    int* d_data = nullptr;

    // Random data (fixed seed, so every run sorts the same input)
    srand(42);
    for (int i = 0; i < N; i++) h_data[i] = rand() % 10000;

    printf("First 10 elements before: ");
    for (int i = 0; i < 10; i++) printf("%d ", h_data[i]);
    printf("\n");

    // Each step runs only if every previous step succeeded, so `err` always
    // holds the first failure.
    cudaError_t err = cudaMalloc(&d_data, bytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
    }

    // Sort
    if (err == cudaSuccess) {
        quicksortEntry<<<1, 1>>>(d_data, N);
        err = cudaGetLastError();       // launch-time errors
    }
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();  // errors raised while the kernels ran
    }

    // Copy the result back for verification
    if (err == cudaSuccess) {
        err = cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(d_data);  // no-op when the allocation itself failed
        delete[] h_data;
        return 1;
    }

    printf("First 10 elements after:  ");
    for (int i = 0; i < 10; i++) printf("%d ", h_data[i]);
    printf("\n");

    // Verify: every element must be >= its predecessor
    bool sorted = true;
    for (int i = 1; i < N; i++) {
        if (h_data[i] < h_data[i-1]) {
            sorted = false;
            break;
        }
    }
    printf("Array sorted: %s\n", sorted ? "YES" : "NO");

    delete[] h_data;
    cudaFree(d_data);
    return 0;
}

In [None]:
# Compile quicksort_dp.cu with -rdc=true (needed for nested kernel launches) and cudadevrt, then run it.
!nvcc -arch=sm_75 -rdc=true -o quicksort_dp quicksort_dp.cu -lcudadevrt && ./quicksort_dp

---

## Part 2: Tree/Graph Traversal

### 🔷 CUDA C++ Parallel Tree Processing (Primary)

This example demonstrates parallel tree traversal using dynamic parallelism:
- Processes a binary tree structure stored in device memory
- Recursively launches child kernels for left and right subtrees
- Uses atomicAdd to accumulate values across all nodes
- Depth guard prevents infinite recursion

In [None]:
%%writefile tree_traversal.cu
// tree_traversal.cu - Parallel tree processing
#include <stdio.h>
#include <cuda_runtime.h>

// One node of a binary tree stored as a flat array: children are referenced
// by their index in that array rather than by pointer.
struct TreeNode {
    float value;     // Payload; processTree adds it into results[0]
    int leftChild;   // Array index of the left child, -1 if none
    int rightChild;  // Array index of the right child, -1 if none
};

// ============================================
// Process Node and Recurse to Children
// ============================================
// Adds the value of node `nodeIdx` to results[0], then launches one child
// grid per existing child so the two subtrees are processed concurrently.
//
//   tree     node array in device memory
//   results  accumulator; only results[0] is written (via atomicAdd)
//   nodeIdx  index of the node to process; negative means "no node"
//   depth    current recursion level; nodes deeper than 20 are skipped
//
// The kernel does not wait on its children from device code. Device-side
// cudaDeviceSynchronize() is deprecated and is not available in the current
// dynamic-parallelism model, and nothing here reads the children's results.
// A parent grid is only considered complete after all of its child grids
// have completed, so the host-side synchronize after the root launch still
// covers the whole tree.
__global__ void processTree(TreeNode* tree, float* results, int nodeIdx, int depth) {
    if (nodeIdx < 0 || depth > 20) return;  // Guard

    TreeNode* node = &tree[nodeIdx];

    // Process this node (accumulate value)
    atomicAdd(&results[0], node->value);

    // Launch each existing child in its own non-blocking stream so siblings
    // can run in parallel.
    const int children[2] = { node->leftChild, node->rightChild };
    for (int c = 0; c < 2; ++c) {
        if (children[c] < 0) continue;

        cudaStream_t s;
        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
        processTree<<<1, 1, 0, s>>>(tree, results, children[c], depth + 1);
        // Safe right after the launch: the stream's resources are released
        // once the work queued on it has finished.
        cudaStreamDestroy(s);
    }
}

// Builds a five-node tree on the host, sums its node values on the GPU with
// processTree, and prints the result.
// Returns 0 on success, 1 if any CUDA call or the kernel reported an error.
int main() {
    /* Build simple tree:
     *       0(10)
     *      /    \
     *    1(20)  2(30)
     *    /  \
     *  3(5) 4(15)
     *
     * (Block comment on purpose: a line comment ending in a backslash would
     * splice the following source line into the comment.)
     */
    const int NUM_NODES = 5;

    TreeNode h_tree[NUM_NODES];
    h_tree[0] = {10.0f, 1, 2};    // Root
    h_tree[1] = {20.0f, 3, 4};    // Left child
    h_tree[2] = {30.0f, -1, -1};  // Right child (leaf)
    h_tree[3] = {5.0f, -1, -1};   // Leaf
    h_tree[4] = {15.0f, -1, -1};  // Leaf

    printf("Tree structure:\n");
    printf("       0(10)\n");
    printf("      /    \\\n");
    printf("    1(20)  2(30)\n");
    printf("    /  \\\n");
    printf("  3(5) 4(15)\n\n");

    TreeNode* d_tree = nullptr;
    float* d_result = nullptr;
    float sum = 0.0f;

    // Each step runs only if every previous step succeeded, so `err` always
    // holds the first failure.
    cudaError_t err = cudaMalloc(&d_tree, sizeof(h_tree));
    if (err == cudaSuccess) err = cudaMalloc(&d_result, sizeof(float));
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_tree, h_tree, sizeof(h_tree), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) err = cudaMemset(d_result, 0, sizeof(float));

    // Process tree starting at root
    if (err == cudaSuccess) {
        processTree<<<1, 1>>>(d_tree, d_result, 0, 0);
        err = cudaGetLastError();       // launch-time errors
    }
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();  // errors raised while the kernels ran
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(&sum, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    }

    const bool ok = (err == cudaSuccess);
    if (ok) {
        printf("Tree sum: %.0f (expected 80 = 10+20+30+5+15)\n", sum);
    } else {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Release device memory on both paths (cudaFree of a null pointer is a no-op)
    cudaFree(d_tree);
    cudaFree(d_result);
    return ok ? 0 : 1;
}

In [None]:
# Compile tree_traversal.cu with -rdc=true (needed for nested kernel launches) and cudadevrt, then run it.
!nvcc -arch=sm_75 -rdc=true -o tree_traversal tree_traversal.cu -lcudadevrt && ./tree_traversal

---

## Part 3: Adaptive Mesh Refinement

### 🔷 CUDA C++ Conditional Work Spawning (Primary)

This example demonstrates adaptive mesh refinement using dynamic parallelism:
- Cells are processed and refined based on error estimates
- Refinement creates 4 child cells (quad subdivision pattern)
- Uses atomicAdd to track dynamic cell allocation
- Recurses only when error exceeds threshold

In [None]:
%%writefile adaptive_mesh.cu
// adaptive_mesh.cu - Adaptive refinement pattern
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Per-cell state of the adaptive mesh; every field is written by processCell.
struct Cell {
    float value;  // Computed as sinf(cellIdx * 0.5f)
    float error;  // Error estimate; compared against ERROR_THRESHOLD
    int refined;  // 1 if the cell was split into 4 children, 0 otherwise
};

#define ERROR_THRESHOLD 0.1f
#define MAX_DEPTH 3

// ============================================
// Process Cell - Refine if Error Too High
// ============================================
// Computes one cell and, if its error estimate is too high and the depth
// limit allows it, reserves four child slots and launches processCell on
// each of them.
//
//   cells       preallocated cell array in device memory
//   cellIdx     index of the cell to process
//   childStart  device counter holding the next free slot in `cells`;
//               advanced by 4 (atomically) for every refined cell
//   depth       refinement level of this cell; 0 for the initial grid
//
// Cells at depth == MAX_DEPTH are still computed and marked unrefined;
// they are only barred from refining further. Returning before the compute
// step at that depth would leave slots that were reserved and counted by
// the parent with unwritten contents.
//
// The kernel does not wait on its children from device code. Device-side
// cudaDeviceSynchronize() is deprecated and is not available in the current
// dynamic-parallelism model. A parent grid is only considered complete
// after all of its child grids have completed, so the host-side
// synchronize covers them.
//
// NOTE(review): nothing here checks childBase + 4 against the capacity of
// `cells`; the caller has to size the array for worst-case refinement.
__global__ void processCell(Cell* cells, int cellIdx, int* childStart, int depth) {
    Cell* cell = &cells[cellIdx];

    // Compute work (simplified)
    cell->value = sinf((float)cellIdx * 0.5f);
    cell->error = fabsf(cell->value - 0.5f) * 0.2f;  // Fake error

    // Check if refinement needed
    if (cell->error > ERROR_THRESHOLD && depth < MAX_DEPTH) {
        cell->refined = 1;

        // Allocate child cells atomically
        int childBase = atomicAdd(childStart, 4);  // 4 children (2D quad)

        // Launch refinement for each child
        for (int i = 0; i < 4; i++) {
            processCell<<<1, 1>>>(cells, childBase + i, childStart, depth + 1);
        }
    } else {
        cell->refined = 0;
    }
}

// ============================================
// Entry Kernel - Start with Coarse Grid
// ============================================
// Entry kernel: each of the first `initialCells` threads launches
// processCell for the coarse cell whose index equals its global thread id.
//
// No device-side cudaDeviceSynchronize() is issued (deprecated, and not
// available in the current dynamic-parallelism model). This grid is not
// considered complete until every child grid has completed, so the host's
// synchronize after this launch waits for the whole refinement.
__global__ void adaptiveMesh(Cell* cells, int initialCells, int* childStart) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < initialCells) {
        processCell<<<1, 1>>>(cells, tid, childStart, 0);
    }
}

// Allocates the cell pool and the slot counter, runs the adaptive
// refinement on the GPU, and reports how many cells exist afterwards.
// Returns 0 on success, 1 if any CUDA call or the kernel reported an error.
int main() {
    const int INITIAL_CELLS = 16;
    const int MAX_CELLS = 10000;  // Preallocate
    const size_t cellBytes = MAX_CELLS * sizeof(Cell);

    Cell* d_cells = nullptr;
    int* d_childStart = nullptr;

    int initialChild = INITIAL_CELLS;  // Children start after initial
    int totalCells = 0;

    // Each step runs only if every previous step succeeded, so `err` always
    // holds the first failure.
    cudaError_t err = cudaMalloc(&d_cells, cellBytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_childStart, sizeof(int));
    // Zero the pool so slots that are never handed out hold defined contents
    if (err == cudaSuccess) err = cudaMemset(d_cells, 0, cellBytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_childStart, &initialChild, sizeof(int), cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        printf("Starting adaptive mesh refinement...\n");
        printf("Initial cells: %d\n", INITIAL_CELLS);
        printf("Error threshold: %.2f\n", ERROR_THRESHOLD);
        printf("Max depth: %d\n\n", MAX_DEPTH);

        // Start adaptive refinement
        adaptiveMesh<<<1, INITIAL_CELLS>>>(d_cells, INITIAL_CELLS, d_childStart);
        err = cudaGetLastError();       // launch-time errors
    }
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();  // errors raised while the kernels ran
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(&totalCells, d_childStart, sizeof(int), cudaMemcpyDeviceToHost);
    }

    const bool ok = (err == cudaSuccess);
    if (ok) {
        printf("Total cells after refinement: %d\n", totalCells);
        printf("Cells created by refinement: %d\n", totalCells - INITIAL_CELLS);
    } else {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Release device memory on both paths (cudaFree of a null pointer is a no-op)
    cudaFree(d_cells);
    cudaFree(d_childStart);
    return ok ? 0 : 1;
}

In [None]:
# Compile adaptive_mesh.cu with -rdc=true (needed for nested kernel launches) and cudadevrt, then run it.
!nvcc -arch=sm_75 -rdc=true -o adaptive_mesh adaptive_mesh.cu -lcudadevrt && ./adaptive_mesh

---

## Part 4: Performance Considerations

### Overhead and Optimization

```
Dynamic Parallelism Overhead:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Each child launch has overhead:
• ~5-20 μs per launch (varies)
• Memory for child kernel state
• Device runtime overhead

Optimization Strategies:
━━━━━━━━━━━━━━━━━━━━━━━

1. Batch Work
   ❌ Launch child for each element
   ✅ Launch child for chunk of elements

2. Limit Depth
   ❌ Recurse until single element
   ✅ Switch to sequential at threshold

3. Use Streams
   ❌ Serial child launches
   ✅ Parallel with non-blocking streams

4. Consider Alternatives
   • Cooperative groups for some cases
   • Flattened iteration
   • Work queues
```

### 🔷 CUDA C++ When NOT to Use Dynamic Parallelism (Primary)

```cpp
// BAD: Simple vector add - no need for DP!
__global__ void vectorAddDP(float* a, float* b, float* c, int n) {
    int tid = threadIdx.x;
    // DON'T DO THIS!
    // Every thread launches its own single-thread child grid just to handle
    // one element (vectorAddChild is not shown in this snippet).
    vectorAddChild<<<1, 1>>>(a, b, c, tid);
}

// GOOD: Just use regular parallelism
// Element-wise c = a + b over the first n elements, one element per thread.
__global__ void vectorAdd(float* a, float* b, float* c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the arrays have nothing to do.
    if (idx >= n) return;
    c[idx] = a[idx] + b[idx];
}
```

---

## Exercises

### Exercise 1: Merge Sort
Implement parallel merge sort using dynamic parallelism.

### Exercise 2: Quadtree Construction
Build a quadtree for 2D points using adaptive subdivision.

### Exercise 3: Fractal Rendering
Use DP to adaptively refine Mandelbrot set regions with high detail.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           NESTED KERNEL PATTERNS                        │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Good Use Cases:                                        │
│  • Recursive algorithms (sort, tree traversal)          │
│  • Data-dependent parallelism                           │
│  • Adaptive refinement                                  │
│                                                         │
│  Optimization:                                          │
│  • Use MIN_SIZE threshold for base case                 │
│  • Limit recursion depth (MAX_DEPTH)                    │
│  • Use non-blocking streams for siblings                │
│  • Batch work in child kernels                          │
│                                                         │
│  Avoid When:                                            │
│  • Regular parallelism works                            │
│  • Work is uniform/predictable                          │
│  • Overhead dominates compute                           │
│                                                         │
│  Memory: Only global memory shared between kernels      │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Week 11 Complete! Next: Week 12 - Multi-GPU & Advanced Topics