## GPU Quicksort with Dynamic Parallelism

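Quicksort is a classic recursive algorithm that benefits from CUDA Dynamic Parallelism (CDP), which lets a running kernel launch further kernels directly from the device: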
1. Partition array around pivot
2. Recursively sort left and right subarrays
3. Base case: small arrays are sorted with a simple algorithm (insertion sort)

In [None]:
%%writefile cdp_quicksort.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// Subarrays whose (right - left) span is below this value are not partitioned
// further; quicksortKernel sorts them in place with insertionSort instead.
#define INSERTION_SORT_THRESHOLD 32
// Recursion cut-off: once a kernel's depth argument reaches this value,
// quicksortKernel stops launching children and falls back to insertionSort.
#define MAX_DEPTH 16

// In-place ascending insertion sort of the inclusive range arr[left..right].
// Runs entirely in the calling thread; a range with fewer than two elements
// (right <= left) is left untouched.
__device__ void insertionSort(int* arr, int left, int right) {
    int next = left + 1;
    while (next <= right) {
        const int value = arr[next];

        // Slide every larger element of the already-sorted prefix one slot to
        // the right, stopping at the position where `value` belongs.
        int slot = next;
        for (; slot > left && arr[slot - 1] > value; --slot) {
            arr[slot] = arr[slot - 1];
        }

        arr[slot] = value;
        ++next;
    }
}

// Lomuto partition of the inclusive range arr[left..right] around the pivot
// value taken from arr[right].
// On return, elements <= pivot precede the returned index, the pivot sits at
// the returned index, and the elements after it are > pivot.
__device__ int partition(int* arr, int left, int right) {
    const int pivotValue = arr[right];

    // `boundary` is the first index not yet claimed by the "<= pivot" region.
    int boundary = left;
    for (int scan = left; scan < right; ++scan) {
        if (arr[scan] > pivotValue) {
            continue;
        }
        const int held = arr[boundary];
        arr[boundary] = arr[scan];
        arr[scan] = held;
        ++boundary;
    }

    // Drop the pivot between the two regions.
    const int displaced = arr[boundary];
    arr[boundary] = arr[right];
    arr[right] = displaced;

    return boundary;
}

// Recursive quicksort kernel (CUDA Dynamic Parallelism).
//
// Sorts the inclusive range arr[left..right] in ascending order. `depth` is
// the current recursion level; the host passes 0 and every child launch
// passes depth + 1.
//
// Launch layout: the partitioning is sequential and every launch in this file
// uses <<<1, 1>>>; launching with more threads would make each thread repeat
// the same work on the same range.
//
// Synchronisation: the kernel never waits for its children. A parent grid is
// not considered complete until all of its child grids have completed, so the
// host-side synchronisation after the top-level launch already covers the
// whole recursion tree. Device-side cudaDeviceSynchronize() is deliberately
// not used: it was deprecated in CUDA 11.6 and is rejected by CUDA 12
// toolchains.
__global__ void quicksortKernel(int* arr, int left, int right, int depth) {
    // Base case: small array or max depth reached
    if (right - left < INSERTION_SORT_THRESHOLD || depth >= MAX_DEPTH) {
        if (left < right) {
            insertionSort(arr, left, right);
        }
        return;
    }

    // Partition the array; the pivot is already in its final position.
    const int pivotIdx = partition(arr, left, right);

    // The two subranges on either side of the pivot.
    const int subLeft[2]  = { left,         pivotIdx + 1 };
    const int subRight[2] = { pivotIdx - 1, right        };

    for (int s = 0; s < 2; ++s) {
        // Ranges with fewer than two elements are already sorted.
        if (subLeft[s] >= subRight[s]) {
            continue;
        }

        // Each child gets its own non-blocking stream so that the left and
        // right halves can run concurrently; launches into the block's
        // implicit stream would be serialised.
        bool launched = false;
        cudaStream_t stream;
        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) == cudaSuccess) {
            quicksortKernel<<<1, 1, 0, stream>>>(arr, subLeft[s], subRight[s], depth + 1);
            launched = (cudaGetLastError() == cudaSuccess);
            // Destroying the stream right away is fine: work already queued
            // on it still runs to completion.
            cudaStreamDestroy(stream);
        }

        if (!launched) {
            // Stream creation or the launch itself failed (e.g. the pending
            // launch limit was exhausted), so no child is working on this
            // range. Clear the recorded error and sort it here instead of
            // silently leaving it unsorted.
            cudaGetLastError();
            insertionSort(arr, subLeft[s], subRight[s]);
        }
    }
}

// Host entry point: sorts the `n` ints at device pointer `d_arr` in ascending
// order and blocks until the sort (including every kernel launched from the
// device) has finished.
//
// Launch and execution failures are reported on stderr; the function keeps
// its void signature, so callers that need to react to a failure should
// validate the result themselves.
void gpuQuicksort(int* d_arr, int n) {
    // Zero or one element: nothing to sort, so skip the launch entirely.
    if (n < 2) {
        return;
    }
    if (d_arr == nullptr) {
        fprintf(stderr, "gpuQuicksort: null device pointer for %d elements\n", n);
        return;
    }

    quicksortKernel<<<1, 1>>>(d_arr, 0, n - 1, 0);

    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "gpuQuicksort: kernel launch failed: %s\n",
                cudaGetErrorString(err));
        return;
    }

    // Faults raised while the kernels execute surface at the next sync point.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "gpuQuicksort: kernel execution failed: %s\n",
                cudaGetErrorString(err));
    }
}

// Demo driver: fills an array with reproducible pseudo-random values, sorts
// it on the GPU via gpuQuicksort, and verifies the result on the host.
int main() {
    const int N = 1024;
    thrust::host_vector<int> h_arr(N);

    // Fixed seed so every run sorts the same data.
    srand(42);
    int fill = 0;
    while (fill < N) {
        h_arr[fill] = rand() % 10000;
        ++fill;
    }

    printf("Before sort: [%d, %d, %d, ... , %d]\n",
           h_arr[0], h_arr[1], h_arr[2], h_arr[N-1]);

    // Round trip: host -> device, sort in place, device -> host.
    thrust::device_vector<int> d_arr(h_arr);
    int* rawDevicePtr = thrust::raw_pointer_cast(d_arr.data());
    gpuQuicksort(rawDevicePtr, N);
    h_arr = d_arr;

    printf("After sort:  [%d, %d, %d, ... , %d]\n",
           h_arr[0], h_arr[1], h_arr[2], h_arr[N-1]);

    // The array is sorted iff no element is smaller than its predecessor.
    const char* verdict = "SORTED";
    for (int k = N - 1; k > 0; --k) {
        if (h_arr[k] < h_arr[k - 1]) {
            verdict = "NOT SORTED";
            break;
        }
    }
    printf("Array is %s\n", verdict);

    return 0;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_quicksort.cu -o cdp_quicksort && ./cdp_quicksort

## Recursion Depth Limits

CDP has practical limits:
- **Maximum nesting depth**: 24 levels (the default *synchronization* depth, `cudaLimitDevRuntimeSyncDepth`, is only 2)
- **Pending kernel limit**: 2048 by default
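- **Device memory for sync**: on the order of 150 MB reserved per synchronization level (legacy CDP), so raise the sync depth only as far as needed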

Configure with `cudaDeviceSetLimit()`:

In [None]:
%%writefile cdp_limits.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Queries one device limit and prints it as "  <label>: <value><suffix>".
// If the query fails, the CUDA error is reported on stderr instead of printing
// an indeterminate value, and false is returned.
static bool printLimit(const char* label, cudaLimit which, const char* suffix) {
    size_t value = 0;
    const cudaError_t err = cudaDeviceGetLimit(&value, which);
    if (err != cudaSuccess) {
        fprintf(stderr, "  %s: unavailable (%s)\n", label, cudaGetErrorString(err));
        return false;
    }
    printf("  %s: %zu%s\n", label, value, suffix);
    return true;
}

// Requests a new value for one device limit. A rejected request is reported on
// stderr and false is returned; the caller can still re-query the limit to see
// which value is actually in effect.
static bool raiseLimit(const char* label, cudaLimit which, size_t value) {
    const cudaError_t err = cudaDeviceSetLimit(which, value);
    if (err != cudaSuccess) {
        fprintf(stderr, "  could not set %s to %zu: %s\n",
                label, value, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Prints the device-runtime limits relevant to dynamic parallelism, tries to
// raise two of them, and prints the values in effect afterwards.
// Returns 0 when every query and update succeeded, 1 otherwise.
int main() {
    bool ok = true;

    printf("CDP Limits:\n");
    ok = printLimit("Max sync depth", cudaLimitDevRuntimeSyncDepth, "") && ok;
    ok = printLimit("Max pending launches", cudaLimitDevRuntimePendingLaunchCount, "") && ok;
    ok = printLimit("Stack size per thread", cudaLimitStackSize, " bytes") && ok;

    // Increase limits if needed
    ok = raiseLimit("Max sync depth", cudaLimitDevRuntimeSyncDepth, 32) && ok;
    ok = raiseLimit("Max pending launches", cudaLimitDevRuntimePendingLaunchCount, 4096) && ok;

    // Re-query rather than echo the requested numbers, so the output shows
    // what the runtime really applied.
    printf("\nAfter increasing:\n");
    ok = printLimit("Max sync depth", cudaLimitDevRuntimeSyncDepth, "") && ok;
    ok = printLimit("Max pending launches", cudaLimitDevRuntimePendingLaunchCount, "") && ok;

    return ok ? 0 : 1;
}

In [None]:
!nvcc cdp_limits.cu -o cdp_limits && ./cdp_limits

## Binary Tree Traversal

In [None]:
%%writefile cdp_tree.cu
#include <stdio.h>
#include <cuda_runtime.h>

// One node of a binary tree stored as a flat array of nodes.
// Children are referenced by array index rather than by pointer, so the same
// array can be copied between host and device unchanged.
struct TreeNode {
    int value;  // Payload written to the traversal output
    int left;   // Array index of the left child, or -1 when there is none
    int right;  // Array index of the right child, or -1 when there is none
};

// Helper kernel: appends the value of tree[nodeIdx] to `result`, using
// *resultIdx as the shared write cursor. processNode launches it when a
// node's own value must be emitted only after its left subtree has finished.
__global__ void emitNodeValue(const TreeNode* tree, int nodeIdx, int* result, int* resultIdx) {
    const int idx = atomicAdd(resultIdx, 1);
    result[idx] = tree[nodeIdx].value;
}

// In-order traversal of the subtree rooted at tree[nodeIdx].
//
// Values are appended to `result` in left-subtree, node, right-subtree order;
// *resultIdx is the next free slot and must be zero before the root launch.
// `result` needs room for every node in the subtree (no bounds check is done).
// A negative nodeIdx means "no node" and is a no-op.
//
// Launch layout: <<<1, 1>>>. Ordering depends on all child work being queued
// by a single thread on the block's implicit stream, where launches run one
// after another and a grid only counts as complete once its own children have
// completed. That makes the sequence
//     left subtree -> emit this node -> right subtree
// run in order without device-side cudaDeviceSynchronize(), which was
// deprecated in CUDA 11.6 and is rejected by CUDA 12 toolchains.
__global__ void processNode(TreeNode* tree, int nodeIdx, int* result, int* resultIdx) {
    if (nodeIdx < 0) return;

    TreeNode node = tree[nodeIdx];

    if (node.left >= 0) {
        // The left subtree goes first; this node's value is queued behind it
        // so it is written only after the whole subtree has been emitted.
        processNode<<<1, 1>>>(tree, node.left, result, resultIdx);
        emitNodeValue<<<1, 1>>>(tree, nodeIdx, result, resultIdx);
    } else {
        // Nothing precedes this node, so emit it directly.
        int idx = atomicAdd(resultIdx, 1);
        result[idx] = node.value;
    }

    // Queued after everything above, hence processed last.
    if (node.right >= 0) {
        processNode<<<1, 1>>>(tree, node.right, result, resultIdx);
    }
}

// Reports a failed CUDA call on stderr. Returns true when `err` is
// cudaSuccess, so calls can be chained with && and stop at the first failure.
static bool treeCudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Builds a 7-node binary search tree, runs the device-side in-order traversal
// and prints the visited values. Returns 0 on success and 1 if any CUDA call
// fails (in which case no traversal line is printed).
int main() {
    // Tree layout (array index in parentheses); an in-order walk visits
    // the values 1 2 3 4 5 6 7:
    //
    //   4 (0)
    //   +-- 2 (1)
    //   |   +-- 1 (3)
    //   |   +-- 3 (4)
    //   +-- 6 (2)
    //       +-- 5 (5)
    //       +-- 7 (6)

    const int kNumNodes = 7;

    TreeNode h_tree[kNumNodes] = {
        {4, 1, 2},   // 0: root
        {2, 3, 4},   // 1: left of root
        {6, 5, 6},   // 2: right of root
        {1, -1, -1}, // 3: leaf
        {3, -1, -1}, // 4: leaf
        {5, -1, -1}, // 5: leaf
        {7, -1, -1}  // 6: leaf
    };

    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    TreeNode* d_tree = nullptr;
    int* d_result = nullptr;
    int* d_resultIdx = nullptr;
    int h_result[kNumNodes] = {0};

    bool ok =
        treeCudaOk(cudaMalloc(&d_tree, kNumNodes * sizeof(TreeNode)), "cudaMalloc(d_tree)") &&
        treeCudaOk(cudaMalloc(&d_result, kNumNodes * sizeof(int)), "cudaMalloc(d_result)") &&
        treeCudaOk(cudaMalloc(&d_resultIdx, sizeof(int)), "cudaMalloc(d_resultIdx)") &&
        treeCudaOk(cudaMemcpy(d_tree, h_tree, kNumNodes * sizeof(TreeNode),
                              cudaMemcpyHostToDevice), "cudaMemcpy(tree -> device)") &&
        // The traversal uses *d_resultIdx as its write cursor; start at slot 0.
        treeCudaOk(cudaMemset(d_resultIdx, 0, sizeof(int)), "cudaMemset(d_resultIdx)");

    if (ok) {
        processNode<<<1, 1>>>(d_tree, 0, d_result, d_resultIdx);
        ok = treeCudaOk(cudaGetLastError(), "processNode launch") &&
             // Waits for the root grid and every grid launched from the device.
             treeCudaOk(cudaDeviceSynchronize(), "processNode execution") &&
             treeCudaOk(cudaMemcpy(h_result, d_result, kNumNodes * sizeof(int),
                                   cudaMemcpyDeviceToHost), "cudaMemcpy(result -> host)");
    }

    if (ok) {
        printf("In-order traversal: ");
        for (int i = 0; i < kNumNodes; i++) printf("%d ", h_result[i]);
        printf("\n");
    }

    cudaFree(d_tree);
    cudaFree(d_result);
    cudaFree(d_resultIdx);

    return ok ? 0 : 1;
}

In [None]:
!nvcc -rdc=true -lcudadevrt cdp_tree.cu -o cdp_tree && ./cdp_tree

## Key Takeaways

1. **Recursive algorithms** map naturally to CDP
2. **Use base cases** to limit recursion depth and switch to an iterative algorithm for small inputs
3. **Streams enable concurrent** left/right processing
4. **Monitor limits** with `cudaDeviceGetLimit`
5. **Consider a hybrid approach**: CDP for the recursive structure, ordinary parallel kernels for the bulk data work

## Next: Day 3 - Adaptive Algorithms