## GPU Quicksort with Dynamic Parallelism

Quicksort is a classic recursive algorithm that benefits from CDP:
1. Partition array around pivot
2. Recursively sort left and right subarrays
3. Base case: small arrays sorted with simple algorithm

In [None]:
%%writefile cdp_quicksort.cu
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#define INSERTION_SORT_THRESHOLD 32
#define MAX_DEPTH 16

// Insertion sort over the inclusive index range arr[left..right], ascending,
// in place. Meant for short ranges: each element is walked back past every
// larger element on its left until it reaches its final position.
__device__ void insertionSort(int* arr, int left, int right) {
    for (int next = left + 1; next <= right; ++next) {
        const int pending = arr[next];
        int slot = next;
        // Slide larger predecessors one position to the right until the
        // insertion point for `pending` is found.
        for (; slot > left && arr[slot - 1] > pending; --slot) {
            arr[slot] = arr[slot - 1];
        }
        arr[slot] = pending;
    }
}

// Lomuto partition of the inclusive range arr[left..right], using the last
// element as the pivot. Returns the pivot's final index; within the range,
// everything before that index is <= pivot and everything after it is > pivot.
__device__ int partition(int* arr, int left, int right) {
    const int pivotValue = arr[right];
    int boundary = left;  // first slot not yet holding a value <= pivot

    for (int scan = left; scan < right; ++scan) {
        if (arr[scan] > pivotValue) {
            continue;
        }
        // Move the small element into the "<= pivot" prefix and grow it.
        const int held = arr[boundary];
        arr[boundary] = arr[scan];
        arr[scan] = held;
        ++boundary;
    }

    // Drop the pivot between the two partitions.
    const int displaced = arr[boundary];
    arr[boundary] = arr[right];
    arr[right] = displaced;

    return boundary;
}

// Forward declaration so the launch helper below can name the kernel.
__global__ void quicksortKernel(int* arr, int left, int right, int depth);

// Sorts arr[lo..hi] in a child grid running on its own non-blocking stream.
// If the stream or the launch cannot be created, the calling thread sorts the
// range itself so that a sub-range is never silently left unsorted.
static __device__ void sortRangeInChildGrid(int* arr, int lo, int hi, int depth) {
    cudaStream_t stream;
    if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
        (void)cudaGetLastError();  // clear the recorded failure
        insertionSort(arr, lo, hi);
        return;
    }

    // Discard any stale per-thread error so the check after the launch
    // reflects this launch only.
    (void)cudaGetLastError();
    quicksortKernel<<<1, 1, 0, stream>>>(arr, lo, hi, depth);
    const bool launched = (cudaGetLastError() == cudaSuccess);

    // The stream handle is not needed once the work has been submitted, so it
    // is released immediately instead of waiting for the child to finish.
    cudaStreamDestroy(stream);

    if (!launched) {
        insertionSort(arr, lo, hi);
    }
}

// Recursive quicksort kernel.
//
// Sorts the inclusive range arr[left..right] ascending, in place. Every launch
// in this file uses a <<<1, 1>>> configuration and the body does no thread
// indexing, so it must be run by a single thread.
//
// Ranges shorter than INSERTION_SORT_THRESHOLD, or any range once `depth`
// reaches MAX_DEPTH, are finished with insertionSort. Otherwise the range is
// partitioned and each non-trivial side is handed to a child grid.
//
// The kernel does not wait for its children: it never reads their results,
// and a parent grid is not considered complete until all of its child grids
// have completed, so the host-side synchronize after the top-level launch
// covers the whole recursion. This also avoids calling
// cudaDeviceSynchronize() from device code.
__global__ void quicksortKernel(int* arr, int left, int right, int depth) {
    // Base case: small array or max depth reached
    if (right - left < INSERTION_SORT_THRESHOLD || depth >= MAX_DEPTH) {
        if (left < right) {
            insertionSort(arr, left, right);
        }
        return;
    }

    // Partition the array; the pivot ends up in its final position.
    int pivotIdx = partition(arr, left, right);

    // Sides with fewer than two elements are already sorted.
    if (pivotIdx - 1 > left) {
        sortRangeInChildGrid(arr, left, pivotIdx - 1, depth + 1);
    }
    if (pivotIdx + 1 < right) {
        sortRangeInChildGrid(arr, pivotIdx + 1, right, depth + 1);
    }
}

// Host entry point: sorts the n ints at d_arr ascending, in place, and blocks
// until the whole recursive sort has finished.
//
// d_arr is passed straight to quicksortKernel, so it must be a device pointer.
// Launch and execution failures are reported on stderr; the function still
// returns normally because its interface has no way to signal an error.
void gpuQuicksort(int* d_arr, int n) {
    // Fewer than two elements are already sorted; skip the launch.
    if (d_arr != NULL && n > 1) {
        quicksortKernel<<<1, 1>>>(d_arr, 0, n - 1, 0);
    }

    // Launch-configuration errors surface through cudaGetLastError();
    // faults during execution surface at the synchronize.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "gpuQuicksort failed: %s\n", cudaGetErrorString(status));
    }
}

// Demo driver: fills a 1024-element array with pseudo-random values, sorts it
// on the GPU with gpuQuicksort and verifies the result on the host.
// Returns EXIT_SUCCESS when the array comes back sorted, EXIT_FAILURE otherwise.
int main() {
    const int N = 1024;
    thrust::host_vector<int> h_arr(N);

    // Initialize with random values (fixed seed keeps runs reproducible)
    srand(42);
    for (int i = 0; i < N; i++) {
        h_arr[i] = rand() % 10000;
    }

    printf("Before sort: [%d, %d, %d, ... , %d]\n",
           h_arr[0], h_arr[1], h_arr[2], h_arr[N-1]);

    thrust::device_vector<int> d_arr = h_arr;
    gpuQuicksort(thrust::raw_pointer_cast(d_arr.data()), N);
    h_arr = d_arr;

    printf("After sort:  [%d, %d, %d, ... , %d]\n",
           h_arr[0], h_arr[1], h_arr[2], h_arr[N-1]);

    // Verify sorted; stop at the first out-of-order pair.
    bool sorted = true;
    for (int i = 1; i < N && sorted; i++) {
        sorted = (h_arr[i - 1] <= h_arr[i]);
    }
    printf("Array is %s\n", sorted ? "SORTED" : "NOT SORTED");

    // Let the exit status reflect the verification so a failed sort is
    // visible to whatever ran the program.
    return sorted ? EXIT_SUCCESS : EXIT_FAILURE;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_quicksort.cu -o cdp_quicksort && ./cdp_quicksort

## Recursion Depth Limits

CDP has practical limits:
- **Maximum nesting depth**: 24 levels
- **Pending kernel limit**: 2048 by default
- **Device memory for sync**: ~150KB per level

Configure with `cudaDeviceSetLimit()`:

In [None]:
%%writefile cdp_limits.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Prints one device limit as "  <label>: <value><unit>", or the CUDA error
// text when the query fails (instead of printing an uninitialised value).
static void printLimit(const char* label, cudaLimit limit, const char* unit) {
    size_t value = 0;
    cudaError_t status = cudaDeviceGetLimit(&value, limit);
    if (status == cudaSuccess) {
        printf("  %s: %zu%s\n", label, value, unit);
    } else {
        printf("  %s: unavailable (%s)\n", label, cudaGetErrorString(status));
    }
}

// Requests a new value for one device limit and reports a rejected request on
// stderr; the program carries on so the remaining limits are still shown.
static void requestLimit(const char* label, cudaLimit limit, size_t value) {
    cudaError_t status = cudaDeviceSetLimit(limit, value);
    if (status != cudaSuccess) {
        fprintf(stderr, "Could not set %s to %zu: %s\n",
                label, value, cudaGetErrorString(status));
    }
}

// Prints the device-runtime limits, asks for larger ones, and prints the
// values that are in effect afterwards.
int main() {
    printf("CDP Limits:\n");
    printLimit("Max sync depth", cudaLimitDevRuntimeSyncDepth, "");
    printLimit("Max pending launches", cudaLimitDevRuntimePendingLaunchCount, "");
    printLimit("Stack size per thread", cudaLimitStackSize, " bytes");

    // Increase limits if needed
    requestLimit("sync depth", cudaLimitDevRuntimeSyncDepth, 32);
    requestLimit("pending launch count", cudaLimitDevRuntimePendingLaunchCount, 4096);

    // Re-query rather than assume the requests were honoured.
    printf("\nAfter increasing:\n");
    printLimit("Max sync depth", cudaLimitDevRuntimeSyncDepth, "");
    printLimit("Max pending launches", cudaLimitDevRuntimePendingLaunchCount, "");

    return 0;
}

In [None]:
!nvcc cdp_limits.cu -o cdp_limits && ./cdp_limits

## Binary Tree Traversal

In [None]:
%%writefile cdp_tree.cu
#include <stdio.h>
#include <cuda_runtime.h>

// A binary-tree node stored in a flat array: children are referred to by
// array index instead of by pointer.
struct TreeNode {
    int value;  // Value stored in this node
    int left;   // Index of left child (-1 if none)
    int right;  // Index of right child (-1 if none)
};

// In-order traversal of an index-linked tree: left subtree, then this node,
// then the right subtree. Each visited node claims the next free slot in
// `result` by atomically incrementing *resultIdx.
//
// Every launch in this file uses <<<1, 1>>>; the body does no thread indexing.
// The kernel waits after each child launch so the child's subtree has been
// written out before this node continues.
// NOTE(review): this relies on calling cudaDeviceSynchronize() from device
// code; confirm the toolkit version and build flags in use still support it.
__global__ void processNode(TreeNode* tree, int nodeIdx, int* result, int* resultIdx) {
    if (nodeIdx < 0) {
        return;
    }

    const TreeNode current = tree[nodeIdx];
    const bool hasLeft = current.left >= 0;
    const bool hasRight = current.right >= 0;

    // Left subtree first, and wait for it to finish.
    if (hasLeft) {
        processNode<<<1, 1>>>(tree, current.left, result, resultIdx);
        cudaDeviceSynchronize();
    }

    // Emit this node into the next output slot.
    const int slot = atomicAdd(resultIdx, 1);
    result[slot] = current.value;

    // Right subtree last.
    if (hasRight) {
        processNode<<<1, 1>>>(tree, current.right, result, resultIdx);
        cudaDeviceSynchronize();
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool treeDemoOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds a 7-node tree on the host, runs processNode on it and prints the
// order in which the nodes were visited.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    /*
     *       4
     *      / \
     *     2   6
     *    / \ / \
     *   1  3 5  7
     */
    TreeNode h_tree[7] = {
        {4, 1, 2},   // 0: root
        {2, 3, 4},   // 1: left of root
        {6, 5, 6},   // 2: right of root
        {1, -1, -1}, // 3: leaf
        {3, -1, -1}, // 4: leaf
        {5, -1, -1}, // 5: leaf
        {7, -1, -1}  // 6: leaf
    };
    const int nodeCount = (int)(sizeof(h_tree) / sizeof(h_tree[0]));

    TreeNode* d_tree = NULL;
    int* d_result = NULL;
    int* d_resultIdx = NULL;
    int h_result[7] = {0};  // zeroed so a failed run never prints garbage

    // Allocation and upload; the chain stops at the first failing call.
    bool ok = treeDemoOk(cudaMalloc(&d_tree, sizeof(h_tree)), "cudaMalloc(d_tree)")
           && treeDemoOk(cudaMalloc(&d_result, sizeof(h_result)), "cudaMalloc(d_result)")
           && treeDemoOk(cudaMalloc(&d_resultIdx, sizeof(int)), "cudaMalloc(d_resultIdx)")
           && treeDemoOk(cudaMemcpy(d_tree, h_tree, sizeof(h_tree), cudaMemcpyHostToDevice),
                         "copy tree to device")
           && treeDemoOk(cudaMemset(d_resultIdx, 0, sizeof(int)), "reset result index");

    if (ok) {
        processNode<<<1, 1>>>(d_tree, 0, d_result, d_resultIdx);
        // Launch errors show up in cudaGetLastError(); execution errors show
        // up at the synchronize.
        ok = treeDemoOk(cudaGetLastError(), "processNode launch")
          && treeDemoOk(cudaDeviceSynchronize(), "processNode execution")
          && treeDemoOk(cudaMemcpy(h_result, d_result, sizeof(h_result), cudaMemcpyDeviceToHost),
                        "copy result to host");
    }

    if (ok) {
        printf("In-order traversal: ");
        for (int i = 0; i < nodeCount; i++) printf("%d ", h_result[i]);
        printf("\n");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_tree);
    cudaFree(d_result);
    cudaFree(d_resultIdx);

    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_tree.cu -o cdp_tree && ./cdp_tree

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

Complete these exercises to reinforce your understanding of recursive algorithms with CDP:

**Exercise 1: Binary Search Tree Traversal**
Implement an in-order traversal of a binary search tree using CDP. Each node should spawn child kernels to process left and right subtrees.

**Exercise 2: Merge Sort with CDP**
Implement parallel merge sort using dynamic parallelism. The kernel should recursively divide the array and merge sorted subarrays.

**Exercise 3: Recursive Tree Height**
Calculate the height of a binary tree using CDP. Each node spawns children to find the max depth of subtrees.

In [None]:
%%writefile recursive_exercises.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

// ============================================================
// Exercise 1: Binary Search Tree In-Order Traversal with CDP
// ============================================================

// A binary-tree node stored in a flat array: children are referred to by
// array index instead of by pointer.
struct TreeNode {
    int value;  // Value stored in this node
    int left;   // Index of left child (-1 if none)
    int right;  // Index of right child (-1 if none)
};

// Next free slot in the traversal output; each visited node claims one slot.
// The host resets it to zero before starting a traversal.
__device__ int traversalIndex = 0;

// In-order traversal of an index-linked tree: left subtree, this node, right
// subtree. Each node's value is stored in result[] at the slot obtained by
// atomically incrementing traversalIndex.
//
// Launched as <<<1, 1>>> throughout this file; the body does no thread
// indexing. The kernel waits after each child launch so the child's subtree
// is written out before this node continues.
// NOTE(review): this relies on calling cudaDeviceSynchronize() from device
// code; confirm the toolkit version and build flags in use still support it.
__global__ void inorderTraversal(TreeNode* tree, int nodeIdx, int* result) {
    if (nodeIdx == -1) {
        return;
    }

    const TreeNode current = tree[nodeIdx];

    // Left subtree first, and wait for it to finish.
    if (current.left != -1) {
        inorderTraversal<<<1, 1>>>(tree, current.left, result);
        cudaDeviceSynchronize();
    }

    // Emit this node into the next output slot.
    const int slot = atomicAdd(&traversalIndex, 1);
    result[slot] = current.value;

    // Right subtree last.
    if (current.right != -1) {
        inorderTraversal<<<1, 1>>>(tree, current.right, result);
        cudaDeviceSynchronize();
    }
}

// Reports a failed CUDA runtime call made by exercise 1 on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool ex1CudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "Exercise 1: CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Exercise 1 driver: uploads a 7-node BST, runs inorderTraversal on it and
// prints the visit order next to the expected one. CUDA failures are reported
// on stderr and the traversal line is skipped rather than printed from an
// unfilled buffer.
void exercise1_bst_traversal() {
    printf("=== Exercise 1: BST In-Order Traversal ===\n");

    /*
     * Build a simple BST:
     *        4
     *       / \
     *      2   6
     *     / \ / \
     *    1  3 5  7
     */
    TreeNode h_tree[7] = {
        {4, 1, 2},   // Node 0: root=4, left->1, right->2
        {2, 3, 4},   // Node 1: value=2, left->3, right->4
        {6, 5, 6},   // Node 2: value=6, left->5, right->6
        {1, -1, -1}, // Node 3: value=1 (leaf)
        {3, -1, -1}, // Node 4: value=3 (leaf)
        {5, -1, -1}, // Node 5: value=5 (leaf)
        {7, -1, -1}  // Node 6: value=7 (leaf)
    };

    TreeNode* d_tree = NULL;
    int* d_result = NULL;
    int h_result[7] = {0};
    const int zero = 0;

    // Allocate, upload the tree and reset the traversal index; the chain
    // stops at the first failing call.
    bool ok = ex1CudaOk(cudaMalloc(&d_tree, sizeof(h_tree)), "cudaMalloc(d_tree)")
           && ex1CudaOk(cudaMalloc(&d_result, sizeof(h_result)), "cudaMalloc(d_result)")
           && ex1CudaOk(cudaMemcpy(d_tree, h_tree, sizeof(h_tree), cudaMemcpyHostToDevice),
                        "copy tree to device")
           && ex1CudaOk(cudaMemcpyToSymbol(traversalIndex, &zero, sizeof(int)),
                        "reset traversalIndex");

    if (ok) {
        inorderTraversal<<<1, 1>>>(d_tree, 0, d_result);
        ok = ex1CudaOk(cudaGetLastError(), "inorderTraversal launch")
          && ex1CudaOk(cudaDeviceSynchronize(), "inorderTraversal execution")
          && ex1CudaOk(cudaMemcpy(h_result, d_result, sizeof(h_result), cudaMemcpyDeviceToHost),
                       "copy result to host");
    }

    if (ok) {
        printf("In-order traversal: ");
        for (int i = 0; i < 7; i++) printf("%d ", h_result[i]);
        printf("\nExpected: 1 2 3 4 5 6 7\n\n");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_tree);
    cudaFree(d_result);
}

// ============================================================
// Exercise 2: Parallel Merge Sort with CDP
// ============================================================

// Merges the two adjacent sorted runs arr[left..mid] and arr[mid+1..right]
// (all bounds inclusive) into one sorted run, in place. temp[left..right] is
// used as scratch space; on ties the element from the left run goes first.
__device__ void merge(int* arr, int left, int mid, int right, int* temp) {
    int lhs = left;      // read cursor in the left run
    int rhs = mid + 1;   // read cursor in the right run
    int out = left;      // write cursor in temp

    // Take the smaller head element while both runs still have data.
    for (; lhs <= mid && rhs <= right; ++out) {
        if (arr[rhs] < arr[lhs]) {
            temp[out] = arr[rhs++];
        } else {
            temp[out] = arr[lhs++];
        }
    }

    // Drain whichever run is left over (at most one of these loops runs).
    for (; lhs <= mid; ++lhs, ++out) {
        temp[out] = arr[lhs];
    }
    for (; rhs <= right; ++rhs, ++out) {
        temp[out] = arr[rhs];
    }

    // Copy the merged run back over the original range.
    int pos = left;
    while (pos <= right) {
        arr[pos] = temp[pos];
        ++pos;
    }
}

// Recursive merge sort of the inclusive range arr[left..right], ascending.
// temp is scratch space indexed with the same positions as arr.
//
// While depth < 4 the two halves are sorted by child grids on separate
// non-blocking streams, the kernel waits for them, and the halves are merged.
// From depth 4 on the range is finished with an in-thread insertion sort.
// Launched as <<<1, 1>>> throughout this file; the body does no thread indexing.
// NOTE(review): waiting for the children relies on calling
// cudaDeviceSynchronize() from device code; confirm the toolkit version and
// build flags in use still support it.
__global__ void mergeSort(int* arr, int left, int right, int* temp, int depth) {
    if (left >= right) {
        return;
    }

    const int mid = left + (right - left) / 2;

    // Recursion limit reached: sort this range sequentially and stop.
    if (depth >= 4) {
        for (int next = left + 1; next <= right; ++next) {
            const int pending = arr[next];
            int slot = next;
            for (; slot > left && arr[slot - 1] > pending; --slot) {
                arr[slot] = arr[slot - 1];
            }
            arr[slot] = pending;
        }
        return;
    }

    // Sort both halves concurrently in child grids.
    cudaStream_t lowerStream, upperStream;
    cudaStreamCreateWithFlags(&lowerStream, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&upperStream, cudaStreamNonBlocking);

    mergeSort<<<1, 1, 0, lowerStream>>>(arr, left, mid, temp, depth + 1);
    mergeSort<<<1, 1, 0, upperStream>>>(arr, mid + 1, right, temp, depth + 1);

    // Both halves must be complete before they can be merged.
    cudaDeviceSynchronize();
    cudaStreamDestroy(lowerStream);
    cudaStreamDestroy(upperStream);

    merge(arr, left, mid, right, temp);
}

// Reports a failed CUDA runtime call made by exercise 2 on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool ex2CudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "Exercise 2: CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Exercise 2 driver: sorts a fixed 16-element array with the mergeSort kernel
// and prints it before and after. CUDA failures are reported on stderr and
// the "After" line is skipped when the sort did not complete.
void exercise2_merge_sort() {
    printf("=== Exercise 2: Parallel Merge Sort ===\n");

    const int N = 16;
    int h_arr[] = {64, 34, 25, 12, 22, 11, 90, 45, 33, 21, 88, 15, 44, 72, 19, 56};
    int* d_arr = NULL;
    int* d_temp = NULL;

    printf("Before: ");
    for (int i = 0; i < N; i++) printf("%d ", h_arr[i]);
    printf("\n");

    // mergeSort waits for its children on every level with depth < 4, i.e. on
    // four nested levels. The device runtime only reserves room for two
    // levels of synchronization by default, so ask for more before launching.
    // This is best effort: a rejected request is reported and the run goes on.
    const size_t neededSyncDepth = 4;
    size_t currentSyncDepth = 0;
    if (cudaDeviceGetLimit(&currentSyncDepth, cudaLimitDevRuntimeSyncDepth) != cudaSuccess
        || currentSyncDepth < neededSyncDepth) {
        if (!ex2CudaOk(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, neededSyncDepth),
                       "raise cudaLimitDevRuntimeSyncDepth")) {
            fprintf(stderr, "Exercise 2: continuing with the current sync depth\n");
        }
        // Clear any recorded error so it is not mistaken for a launch failure below.
        (void)cudaGetLastError();
    }

    bool ok = ex2CudaOk(cudaMalloc(&d_arr, N * sizeof(int)), "cudaMalloc(d_arr)")
           && ex2CudaOk(cudaMalloc(&d_temp, N * sizeof(int)), "cudaMalloc(d_temp)")
           && ex2CudaOk(cudaMemcpy(d_arr, h_arr, N * sizeof(int), cudaMemcpyHostToDevice),
                        "copy input to device");

    if (ok) {
        mergeSort<<<1, 1>>>(d_arr, 0, N - 1, d_temp, 0);
        ok = ex2CudaOk(cudaGetLastError(), "mergeSort launch")
          && ex2CudaOk(cudaDeviceSynchronize(), "mergeSort execution")
          && ex2CudaOk(cudaMemcpy(h_arr, d_arr, N * sizeof(int), cudaMemcpyDeviceToHost),
                       "copy result to host");
    }

    if (ok) {
        printf("After:  ");
        for (int i = 0; i < N; i++) printf("%d ", h_arr[i]);
        printf("\n\n");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_arr);
    cudaFree(d_temp);
}

// ============================================================
// Exercise 3: Recursive Tree Height with CDP
// ============================================================

// Sequential height of the subtree rooted at nodeIdx (0 for an empty subtree).
// Only used as a fallback when scratch memory for the child grids cannot be
// allocated.
static __device__ int treeHeightSerial(const TreeNode* tree, int nodeIdx) {
    if (nodeIdx == -1) {
        return 0;
    }
    const int leftHeight = treeHeightSerial(tree, tree[nodeIdx].left);
    const int rightHeight = treeHeightSerial(tree, tree[nodeIdx].right);
    return 1 + max(leftHeight, rightHeight);
}

// Writes the height of the subtree rooted at nodeIdx to heights[0]
// (a leaf has height 1). Nothing is written when nodeIdx is -1.
//
// Launched as <<<1, 1>>> throughout this file; the body does no thread
// indexing. Each existing child subtree is measured by a child grid on its
// own non-blocking stream, and the kernel waits for both before combining
// the results.
//
// The children report back through a small device-heap allocation that is
// written and read with plain loads/stores: the synchronous cudaMemset /
// cudaMemcpy calls are host APIs, and a device-to-host copy has no meaning
// inside a kernel.
// NOTE(review): waiting for the children relies on calling
// cudaDeviceSynchronize() from device code; confirm the toolkit version and
// build flags in use still support it.
__global__ void treeHeight(TreeNode* tree, int nodeIdx, int* heights) {
    if (nodeIdx == -1) {
        return;
    }

    const int leftIdx = tree[nodeIdx].left;
    const int rightIdx = tree[nodeIdx].right;
    int leftHeight = 0, rightHeight = 0;

    // Get heights from children using CDP
    if (leftIdx != -1 || rightIdx != -1) {
        // A child grid cannot write into this thread's local variables, so
        // the results go through heap memory: [0] = left, [1] = right.
        int* childHeights = (int*)malloc(2 * sizeof(int));
        if (childHeights == NULL) {
            heights[0] = treeHeightSerial(tree, nodeIdx);
            return;
        }
        // A missing child contributes height 0.
        childHeights[0] = 0;
        childHeights[1] = 0;

        cudaStream_t s1, s2;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

        if (leftIdx != -1) {
            treeHeight<<<1, 1, 0, s1>>>(tree, leftIdx, &childHeights[0]);
        }
        if (rightIdx != -1) {
            treeHeight<<<1, 1, 0, s2>>>(tree, rightIdx, &childHeights[1]);
        }

        // Both children must finish before their results are read.
        cudaDeviceSynchronize();

        leftHeight = childHeights[0];
        rightHeight = childHeights[1];

        free(childHeights);
        cudaStreamDestroy(s1);
        cudaStreamDestroy(s2);
    }

    heights[0] = 1 + max(leftHeight, rightHeight);
}

// Reports a failed CUDA runtime call made by exercise 3 on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool ex3CudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "Exercise 3: CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Exercise 3 driver: uploads a 7-node tree, runs the treeHeight kernel from
// the root and prints the computed height next to the expected one. CUDA
// failures are reported on stderr and no height is printed in that case.
void exercise3_tree_height() {
    printf("=== Exercise 3: Recursive Tree Height ===\n");

    // Same BST as Exercise 1 (height = 3)
    TreeNode h_tree[7] = {
        {4, 1, 2},
        {2, 3, 4},
        {6, 5, 6},
        {1, -1, -1},
        {3, -1, -1},
        {5, -1, -1},
        {7, -1, -1}
    };

    TreeNode* d_tree = NULL;
    int* d_height = NULL;
    int h_height = 0;

    // Allocate, upload the tree and zero the output; the chain stops at the
    // first failing call.
    bool ok = ex3CudaOk(cudaMalloc(&d_tree, sizeof(h_tree)), "cudaMalloc(d_tree)")
           && ex3CudaOk(cudaMalloc(&d_height, sizeof(int)), "cudaMalloc(d_height)")
           && ex3CudaOk(cudaMemcpy(d_tree, h_tree, sizeof(h_tree), cudaMemcpyHostToDevice),
                        "copy tree to device")
           && ex3CudaOk(cudaMemset(d_height, 0, sizeof(int)), "reset height");

    if (ok) {
        treeHeight<<<1, 1>>>(d_tree, 0, d_height);
        ok = ex3CudaOk(cudaGetLastError(), "treeHeight launch")
          && ex3CudaOk(cudaDeviceSynchronize(), "treeHeight execution")
          && ex3CudaOk(cudaMemcpy(&h_height, d_height, sizeof(int), cudaMemcpyDeviceToHost),
                       "copy height to host");
    }

    if (ok) {
        printf("Tree height: %d\n", h_height);
        printf("Expected: 3\n\n");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_tree);
    cudaFree(d_height);
}

// Program entry: prints a banner, runs the three exercises in their numbered
// order, then prints a closing line. Always returns 0.
int main() {
    fputs("CDP Recursive Algorithms Exercises\n", stdout);
    fputs("===================================\n\n", stdout);

    // Exercises in the order they are run.
    void (*const exercises[])() = {
        exercise1_bst_traversal,
        exercise2_merge_sort,
        exercise3_tree_height
    };
    const int exerciseCount = (int)(sizeof(exercises) / sizeof(exercises[0]));
    for (int which = 0; which < exerciseCount; ++which) {
        exercises[which]();
    }

    fputs("All exercises completed!\n", stdout);
    return 0;
}

In [None]:
!nvcc -arch=sm_75 -rdc=true -lcudadevrt -o recursive_exercises recursive_exercises.cu && ./recursive_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises explore similar recursive concepts using Python. Note that Numba CUDA does not support dynamic parallelism directly, so these focus on alternative approaches.

**Exercise A:** Implement an iterative quicksort using a stack-based approach with Numba CUDA kernels.

**Exercise B:** Compare the performance of CPU recursive merge sort vs. GPU parallel reduction-based sorting.

## Key Takeaways

1. **Recursive algorithms** map naturally to CDP
2. **Use base cases** to limit recursion depth and switch to an iterative algorithm for small inputs
3. **Streams enable concurrent** left/right processing
4. **Monitor limits** with `cudaDeviceGetLimit`
5. **Consider hybrid**: CDP for structure, parallel kernels for data

## Next: Day 3 - Adaptive Algorithms