In [None]:
# ⚙️ Setup
import subprocess, sys

# On Google Colab the probe import succeeds and numba is installed via pip;
# anywhere else the import raises ImportError and the install step is skipped.
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import math
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: The Large Array Problem

### Block Size Limitation

```
Single-block scan is limited:
• Max threads per block: 1024
• Blelloch uses n/2 threads → max 2048 elements
• Real arrays have millions of elements!

Solution: Multi-block scan with 3 phases
```

### Three-Pass Algorithm

```
Array: [B0 elements][B1 elements][B2 elements][B3 elements]

Pass 1: Block-level scan
  Each block scans its portion independently
  Save each block's total in auxiliary array

Pass 2: Scan the auxiliary array
  aux = [sum_B0, sum_B1, sum_B2, sum_B3]
  scan(aux) → [0, sum_B0, sum_B0+sum_B1, sum_B0+sum_B1+sum_B2]

Pass 3: Add auxiliary sums to each block
  Block i adds aux[i] to all its elements
```

### CUDA C++ Implementation (Primary)

In [None]:
%%writefile large_scan.cu
// large_scan.cu - Multi-block scan
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256
#define ELEMENTS_PER_BLOCK (2 * BLOCK_SIZE)  // Blelloch uses n/2 threads

// Pass 1: exclusive Blelloch scan of one ELEMENTS_PER_BLOCK-sized tile per block.
//
//   data        in/out; each tile is replaced by its own exclusive scan
//   block_sums  out; block_sums[blockIdx.x] receives the tile's total
//   n           number of valid elements in data
//
// Every thread handles two slots of the tile, so the indexing below expects
// BLOCK_SIZE threads per block (which is how scan_recursive launches it).
__global__ void scan_blocks(int* data, int* block_sums, int n) {
    __shared__ int temp[ELEMENTS_PER_BLOCK];

    const int tid  = threadIdx.x;
    const int base = blockIdx.x * ELEMENTS_PER_BLOCK;

    // The two slots owned by this thread sit BLOCK_SIZE apart.
    const int slot_lo = tid;
    const int slot_hi = tid + BLOCK_SIZE;
    const bool lo_valid = (base + slot_lo) < n;
    const bool hi_valid = (base + slot_hi) < n;

    // Stage the tile in shared memory; slots past the end of the input hold 0
    // so they do not contribute to any sum.
    temp[slot_lo] = lo_valid ? data[base + slot_lo] : 0;
    temp[slot_hi] = hi_valid ? data[base + slot_hi] : 0;

    // Up-sweep: build partial sums, doubling the stride at every level.
    int stride = 1;
    for (int pairs = ELEMENTS_PER_BLOCK / 2; pairs >= 1; pairs /= 2) {
        __syncthreads();
        if (tid < pairs) {
            const int left  = stride * (2 * tid + 1) - 1;
            const int right = left + stride;
            temp[right] += temp[left];
        }
        stride *= 2;
    }

    // The last slot now holds the tile total: publish it, then clear the slot
    // so the down-sweep yields an exclusive scan.
    if (tid == 0) {
        const int last = ELEMENTS_PER_BLOCK - 1;
        block_sums[blockIdx.x] = temp[last];
        temp[last] = 0;
    }

    // Down-sweep: walk back down, swapping and accumulating at each level.
    for (int pairs = 1; pairs < ELEMENTS_PER_BLOCK; pairs *= 2) {
        stride /= 2;
        __syncthreads();
        if (tid < pairs) {
            const int left  = stride * (2 * tid + 1) - 1;
            const int right = left + stride;
            const int carried = temp[left];
            temp[left] = temp[right];
            temp[right] += carried;
        }
    }
    __syncthreads();

    // Copy the scanned tile back, skipping the zero padding.
    if (lo_valid) data[base + slot_lo] = temp[slot_lo];
    if (hi_valid) data[base + slot_hi] = temp[slot_hi];
}

// Pass 3: add each tile's offset to every element of that tile.
//
//   data        in/out; per-tile scan results to be adjusted
//   block_sums  value added to the elements of each tile, indexed by tile
//   n           number of valid elements in data
//
// One thread per element. Elements of tile 0 are left unchanged.
__global__ void add_block_sums(int* data, int* block_sums, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;

    // Tile index is derived from the element index, not from blockIdx, because
    // this kernel's launch geometry differs from scan_blocks'.
    const int tile = gid / ELEMENTS_PER_BLOCK;
    if (tile <= 0) return;

    data[gid] += block_sums[tile];
}

// In-place exclusive scan of a device array of any length.
//
//   d_data  device pointer to n ints; overwritten with the exclusive scan
//   n       element count; n <= 0 is a no-op
//
// Pass 1 scans each ELEMENTS_PER_BLOCK tile and records its total. If there is
// more than one tile, the totals are scanned by a recursive call (pass 2) and
// then added back to the elements of each tile (pass 3).
//
// If the scratch allocation fails, a message is printed to stderr and d_data
// is left untouched.
void scan_recursive(int* d_data, int n) {
    // Without this guard n <= 0 gives num_blocks == 0, which is not the
    // single-block case and would recurse with n == 0 forever.
    if (n <= 0) return;

    const int num_blocks = (n + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;

    // One total per tile. In the single-tile case this is a one-element
    // scratch buffer that scan_blocks writes to and nobody reads.
    int* d_block_sums = NULL;
    cudaError_t err = cudaMalloc(&d_block_sums, num_blocks * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "scan_recursive: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        return;
    }

    // Pass 1: Scan each block
    scan_blocks<<<num_blocks, BLOCK_SIZE>>>(d_data, d_block_sums, n);

    if (num_blocks > 1) {
        // Pass 2: Recursively scan block sums
        scan_recursive(d_block_sums, num_blocks);

        // Pass 3: Add block sums to elements (one thread per element)
        const int threads = BLOCK_SIZE;
        const int blocks = (n + threads - 1) / threads;
        add_block_sums<<<blocks, threads>>>(d_data, d_block_sums, n);
    }

    cudaFree(d_block_sums);
}

// Demo driver: scans an array of ones on the GPU and checks that the result
// is 0, 1, 2, ... Returns 0 when the check passes, 1 on a CUDA error or a
// mismatch.
int main() {
    const int n = 10000;  // Large array: spans several ELEMENTS_PER_BLOCK tiles
    const size_t bytes = n * sizeof(int);

    int* h_data = new int[n];
    for (int i = 0; i < n; i++) h_data[i] = 1;  // All ones

    // Each step runs only if the previous one succeeded, so the first failure
    // is the one that gets reported.
    int* d_data = NULL;
    cudaError_t err = cudaMalloc(&d_data, bytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        scan_recursive(d_data, n);
        err = cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess) {
        // Picks up launch errors that the copy itself does not return.
        err = cudaGetLastError();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(d_data);
        delete[] h_data;
        return 1;
    }

    // Verify: exclusive scan of all 1s should be [0, 1, 2, 3, ...]
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (h_data[i] != i) {
            printf("Error at %d: got %d, expected %d\n", i, h_data[i], i);
            correct = false;
            break;
        }
    }

    printf("Large scan (n=%d): %s\n", n, correct ? "PASSED" : "FAILED");

    delete[] h_data;
    cudaFree(d_data);
    return correct ? 0 : 1;
}

In [None]:
!nvcc -arch=sm_75 -o large_scan large_scan.cu
!./large_scan

---

## Part 2: Stream Compaction Application

### What is Stream Compaction?

```
Remove unwanted elements from an array:

Input:  [3, 0, 5, 0, 0, 2, 0, 1]
Keep:   non-zero elements
Output: [3, 5, 2, 1]

The challenge: Do it in parallel!
```

### Stream Compaction with Scan

```
Step 1: Create predicate array (1 if keep, 0 if discard)
  Input:     [3, 0, 5, 0, 0, 2, 0, 1]
  Predicate: [1, 0, 1, 0, 0, 1, 0, 1]

Step 2: Exclusive scan of predicates = output indices
  Indices: [0, 1, 1, 2, 2, 2, 3, 3]

Step 3: Scatter kept elements to their indices
  If predicate[i] == 1:
    output[indices[i]] = input[i]
  
  Output: [3, 5, 2, 1]

Total output count: last scan value + last predicate (indices[n-1] + predicate[n-1])
```

### CUDA C++ Stream Compaction (Primary)

### Python/Numba Implementation (Optional)

In [None]:
%%writefile stream_compaction.cu
// stream_compaction.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256
#define ELEMENTS_PER_BLOCK (2 * BLOCK_SIZE)

// Forward declarations of the multi-block scan functions from large_scan.cu.
// NOTE(review): nothing in this file defines or calls them (the demo below
// uses simple_exclusive_scan instead), so they only matter if definitions are
// supplied at link time — confirm whether they are still needed.
__global__ void scan_blocks(int* data, int* block_sums, int n);
__global__ void add_block_sums(int* data, int* block_sums, int n);
void scan_recursive(int* d_data, int n);

// Build the keep/discard mask for stream compaction.
//
//   input       n source values
//   predicates  out; predicates[i] is 1 when input[i] is non-zero, else 0
//   n           element count
//
// One thread per element; threads with an index of n or more do nothing.
__global__ void create_predicates(const int* input, int* predicates, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    predicates[idx] = (input[idx] == 0) ? 0 : 1;
}

// Copy every kept element to its compacted position.
//
//   input       n source values
//   predicates  keep mask; only entries equal to 1 are copied
//   indices     destination slot in output for each source element
//   output      out; receives the kept elements
//   n           element count
//
// One thread per source element.
__global__ void scatter(const int* input, const int* predicates,
                        const int* indices, int* output, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    if (predicates[idx] != 1) return;

    output[indices[idx]] = input[idx];
}

// Simple in-place exclusive scan (Blelloch) for demonstration.
//
//   data  in/out; n ints, overwritten with their exclusive scan
//   n     element count, 1..512; any other value leaves data untouched
//
// The input is padded with zeros to the next power of two in shared memory,
// so n does not have to be a power of two. All loops stride by blockDim.x, so
// any thread count works, including the n/2 used by main().
// Written for a single-block launch: every block would scan the same data.
__global__ void simple_exclusive_scan(int* data, int n) {
    const int kCapacity = 512;
    __shared__ int temp[kCapacity];

    // n is the same for every thread, so the whole block leaves together and
    // no thread is stranded at a later __syncthreads().
    if (n <= 0 || n > kCapacity) return;

    // Smallest power of two >= n; the tree sweeps need a power-of-two length.
    int padded = 1;
    while (padded < n) padded <<= 1;

    const int tid = threadIdx.x;
    const int step = blockDim.x;

    // Load, zero-filling the padding so it does not affect any sum.
    for (int i = tid; i < padded; i += step) {
        temp[i] = (i < n) ? data[i] : 0;
    }

    // UP-SWEEP
    int offset = 1;
    for (int d = padded >> 1; d > 0; d >>= 1) {
        __syncthreads();
        for (int t = tid; t < d; t += step) {
            int ai = offset * (2*t + 1) - 1;
            int bi = offset * (2*t + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    // Clear the root so the down-sweep produces an exclusive scan.
    __syncthreads();
    if (tid == 0) temp[padded - 1] = 0;

    // DOWN-SWEEP
    for (int d = 1; d < padded; d *= 2) {
        offset >>= 1;
        __syncthreads();
        for (int t = tid; t < d; t += step) {
            int ai = offset * (2*t + 1) - 1;
            int bi = offset * (2*t + 2) - 1;
            int carried = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += carried;
        }
    }
    __syncthreads();

    // Store only the n real elements.
    for (int i = tid; i < n; i += step) {
        data[i] = temp[i];
    }
}

// Demo driver: compacts the non-zero entries of a small array in three steps
// (predicates, exclusive scan, scatter), printing each intermediate array.
// Returns 0 on success and 1 if a CUDA call or kernel launch failed.
int main() {
    const int h_input[] = {3, 0, 5, 0, 0, 2, 0, 1};
    // Derived from the array so the host buffers below cannot drift out of sync.
    const int n = (int)(sizeof(h_input) / sizeof(h_input[0]));
    const size_t bytes = n * sizeof(int);

    printf("Input: ");
    for (int i = 0; i < n; i++) printf("%d ", h_input[i]);
    printf("\n");

    int *d_input = NULL, *d_predicates = NULL, *d_indices = NULL, *d_output = NULL;
    cudaMalloc(&d_input, bytes);
    cudaMalloc(&d_predicates, bytes);
    cudaMalloc(&d_indices, bytes);
    cudaMalloc(&d_output, bytes);

    cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice);

    // Step 1: Create predicates
    create_predicates<<<1, n>>>(d_input, d_predicates, n);

    // Step 2: Exclusive scan (on a copy, so the predicates survive for step 3)
    cudaMemcpy(d_indices, d_predicates, bytes, cudaMemcpyDeviceToDevice);
    simple_exclusive_scan<<<1, n/2>>>(d_indices, n);

    int h_predicates[n], h_indices[n];
    cudaMemcpy(h_predicates, d_predicates, bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_indices, d_indices, bytes, cudaMemcpyDeviceToHost);

    int status = 0;
    // Check before trusting the host copies: on failure they are uninitialized
    // and output_count would be garbage.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    } else {
        // Get output count: last exclusive-scan value plus the last predicate
        const int output_count = h_indices[n-1] + h_predicates[n-1];

        printf("Predicates: ");
        for (int i = 0; i < n; i++) printf("%d ", h_predicates[i]);
        printf("\nIndices:    ");
        for (int i = 0; i < n; i++) printf("%d ", h_indices[i]);
        printf("\n");

        // Step 3: Scatter
        scatter<<<1, n>>>(d_input, d_predicates, d_indices, d_output, n);

        int h_output[n];
        err = cudaMemcpy(h_output, d_output, output_count * sizeof(int),
                         cudaMemcpyDeviceToHost);
        if (err == cudaSuccess) err = cudaGetLastError();

        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
            status = 1;
        } else {
            printf("Output: ");
            for (int i = 0; i < output_count; i++) printf("%d ", h_output[i]);
            printf("\nCount: %d\n", output_count);
        }
    }

    cudaFree(d_input);
    cudaFree(d_predicates);
    cudaFree(d_indices);
    cudaFree(d_output);
    return status;
}

In [None]:
!nvcc -arch=sm_75 -o stream_compaction stream_compaction.cu
!./stream_compaction

In [None]:
@cuda.jit
def create_predicates(input_arr, predicates, n):
    """Fill ``predicates`` with 1 where ``input_arr`` is non-zero, else 0.

    One thread per element; threads with an index of ``n`` or more do nothing.
    """
    idx = cuda.grid(1)
    if idx >= n:
        return
    if input_arr[idx] != 0:
        predicates[idx] = 1
    else:
        predicates[idx] = 0

@cuda.jit
def scatter(input_arr, predicates, indices, output, n):
    """Copy each kept element of ``input_arr`` to ``output[indices[i]]``.

    An element is kept when its predicate equals 1. One thread per source
    element; threads with an index of ``n`` or more do nothing.
    """
    idx = cuda.grid(1)
    if idx >= n:
        return
    if predicates[idx] != 1:
        return
    output[indices[idx]] = input_arr[idx]

def cpu_exclusive_scan(arr):
    """Sequential exclusive prefix sum.

    Returns an array shaped and typed like ``arr`` where element ``i`` is the
    sum of ``arr[0:i]``, so element 0 is always 0. Deliberately a plain Python
    loop: it is the reference implementation the rest of the notebook builds on.
    """
    result = np.zeros_like(arr)
    # Walk every element except the last; each one feeds the slot after it.
    for pos, previous in enumerate(arr[:-1], start=1):
        result[pos] = result[pos - 1] + previous
    return result

In [None]:
def stream_compact_cpu(data):
    """Stream compaction using CPU (for comparison).

    Keeps the non-zero elements of ``data`` in their original order, using the
    same predicate / exclusive-scan / scatter steps as the GPU version.

    Args:
        data: 1-D NumPy array.

    Returns:
        ``(output, predicates, indices)`` where ``output`` holds the kept
        elements, ``predicates`` is the int32 0/1 keep mask and ``indices`` is
        the exclusive scan of ``predicates``. Empty input gives three empty
        arrays.
    """
    # Step 1: Predicates
    predicates = (data != 0).astype(np.int32)

    # Step 2: Exclusive scan
    indices = cpu_exclusive_scan(predicates)

    # Nothing to count or scatter; indices[-1] below would raise on empty input.
    if len(data) == 0:
        return np.zeros(0, dtype=data.dtype), predicates, indices

    # Step 3: Count and scatter
    output_count = indices[-1] + predicates[-1]
    output = np.zeros(output_count, dtype=data.dtype)

    # Kept elements have distinct destination indices, so one vectorized
    # assignment replaces the per-element loop.
    keep = predicates == 1
    output[indices[keep]] = data[keep]

    return output, predicates, indices

# Test: run the worked example from the text and show each intermediate array
test_data = np.array([3, 0, 5, 0, 0, 2, 0, 1], dtype=np.int32)
output, predicates, indices = stream_compact_cpu(test_data)

# Fail loudly if the result ever stops matching a plain boolean-mask filter.
assert np.array_equal(output, test_data[test_data != 0]), "stream compaction mismatch"

print(f"Input:      {test_data}")
print(f"Predicates: {predicates}")
print(f"Indices:    {indices}")
print(f"Output:     {output}")
print(f"Count:      {len(output)}")

---

## Part 3: Radix Sort with Scan

### Radix Sort Algorithm

```
Sort by processing one bit at a time (LSB first):

Input: [5, 3, 7, 2] = [101, 011, 111, 010]

Bit 0 (LSB):
  0-bit: [2] (010)      → goes first
  1-bit: [5, 3, 7]      → goes after
  Result: [2, 5, 3, 7]

Bit 1:
  0-bit: [5] (101)      → goes first
  1-bit: [2, 3, 7]      → goes after
  Result: [5, 2, 3, 7]

Bit 2:
  0-bit: [2, 3] (0xx)   → goes first
  1-bit: [5, 7] (1xx)   → goes after
  Result: [2, 3, 5, 7] ✓
```

### Scan in Radix Sort

```
For each bit position:
1. Create predicate: bit == 0
2. Exclusive scan → positions for 0-bit elements
3. Create predicate: bit == 1
4. Exclusive scan → positions for 1-bit elements (offset by count of 0s)
5. Scatter to new positions
```

In [None]:
def radix_sort_cpu(data, num_bits=8):
    """Simple radix sort using scan.

    Makes one stable pass per bit, least-significant first. In each pass the
    elements whose current bit is 0 go to the front and those whose bit is 1
    follow; exclusive scans of the two predicates give the destinations.

    Args:
        data: 1-D NumPy integer array. It is not modified.
        num_bits: number of low-order bits to sort on. Only these bits are
            examined, so values that differ only in higher bits are not
            ordered relative to each other.
            NOTE(review): negative values are presumably not ordered
            correctly by a bitwise pass — confirm before relying on it.

    Returns:
        A new array with the elements of ``data`` in sorted order. Empty input
        gives an empty array.
    """
    n = len(data)
    current = data.copy()
    # zero_scan[-1] below would raise IndexError on an empty array.
    if n == 0:
        return current

    output = np.zeros_like(data)

    for bit in range(num_bits):
        # Extract bit
        bits = (current >> bit) & 1
        is_zero = bits == 0
        is_one = ~is_zero

        # Count 0s (will go first)
        zero_pred = is_zero.astype(np.int32)
        zero_scan = cpu_exclusive_scan(zero_pred)
        num_zeros = zero_scan[-1] + zero_pred[-1]

        # Count 1s (will go after 0s)
        one_pred = is_one.astype(np.int32)
        one_scan = cpu_exclusive_scan(one_pred)

        # Scatter: destinations are distinct, so two vectorized assignments
        # replace the per-element loop.
        output[zero_scan[is_zero]] = current[is_zero]
        output[num_zeros + one_scan[is_one]] = current[is_one]

        # Ping-pong the buffers; the next pass overwrites every output slot.
        current, output = output, current

    return current

# Test: sort a small array and compare against NumPy's own sort
test_data = np.array([5, 3, 7, 2, 8, 1, 4, 6], dtype=np.int32)
sorted_data = radix_sort_cpu(test_data, num_bits=4)

print(f"Input:  {test_data}")
print(f"Sorted: {sorted_data}")
print(f"Correct: {'✓' if np.array_equal(sorted_data, np.sort(test_data)) else '✗'}")

---

## Part 4: Performance Comparison

In [None]:
# Compare scan performance on the CPU: pure-Python loop vs. vectorized NumPy
# (nothing in this cell runs on the GPU)
def benchmark_scan(n):
    """Time an exclusive scan of ``n`` random int32 values two ways.

    Args:
        n: number of elements to scan.

    Returns:
        ``(cpu_ms, numpy_ms)``: wall-clock milliseconds for the Python-loop
        ``cpu_exclusive_scan`` and for the NumPy ``cumsum`` formulation.

    Raises:
        AssertionError: if the two implementations produce different results.
    """
    data = np.random.randint(0, 100, n).astype(np.int32)

    # CPU
    start = time.perf_counter()
    cpu_result = cpu_exclusive_scan(data)
    cpu_time = time.perf_counter() - start

    # NumPy (optimized)
    start = time.perf_counter()
    numpy_result = np.concatenate([[0], np.cumsum(data[:-1])])
    numpy_time = time.perf_counter() - start

    # A timing comparison only means something if both paths agree. The slice
    # drops the extra leading 0 that the NumPy form produces for n == 0, and
    # the cast brings it to the loop result's dtype before comparing.
    if not np.array_equal(cpu_result, numpy_result[:n].astype(cpu_result.dtype)):
        raise AssertionError("exclusive scan implementations disagree")

    return cpu_time * 1000, numpy_time * 1000

print(f"{'N':<12} {'CPU (ms)':<15} {'NumPy (ms)':<15} {'Speedup':<10}")
print("=" * 55)

for n in [1000, 10000, 100000, 1000000]:
    cpu_ms, numpy_ms = benchmark_scan(n)
    # If the NumPy timing reads exactly zero the ratio is undefined; report
    # inf instead of raising ZeroDivisionError.
    speedup = cpu_ms / numpy_ms if numpy_ms > 0 else float("inf")
    print(f"{n:<12} {cpu_ms:<15.4f} {numpy_ms:<15.4f} {speedup:<10.1f}x")

---

## Exercises

### Exercise 1: Filter Positive Numbers

In [None]:
# TODO: Use stream compaction to filter only positive numbers
# Input:  [-3, 1, -5, 2, 0, -1, 4, 3]
# Output: [1, 2, 4, 3]

def filter_positive(data):
    """Keep only positive numbers using scan.

    Exercise stub: currently returns None. Per the example above, the finished
    version should return the strictly positive elements of ``data`` in their
    original order (0 is dropped too).
    """
    pass  # Your implementation

# Test
test = np.array([-3, 1, -5, 2, 0, -1, 4, 3], dtype=np.int32)
# Uncomment once filter_positive is implemented:
# result = filter_positive(test)
# print(f"Input:  {test}")
# print(f"Output: {result}")

### Exercise 2: Remove Duplicates

In [None]:
# TODO: Remove consecutive duplicates using scan
# Input:  [1, 1, 2, 2, 2, 3, 1, 1]
# Output: [1, 2, 3, 1]

def remove_consecutive_duplicates(data):
    """Remove consecutive duplicates.

    Exercise stub: currently returns None. Per the example above, the finished
    version should keep the first element of every run of equal values, so a
    value may reappear later in the output (the trailing 1 in the example).
    """
    pass  # Your implementation

# Hint: predicate[i] = 1 if data[i] != data[i-1]
#       (the example output keeps the first element, so predicate[0] = 1)

### Exercise 3: Segment Scan

In [None]:
# TODO: Implement segmented scan
# Data:    [1, 2, 3 | 4, 5 | 6]
# Flags:   [1, 0, 0,  1, 0,  1]  (1 = start of segment)
# Output:  [1, 3, 6,  4, 9,  6]

def segmented_scan(data, flags):
    """Scan within segments only.

    Exercise stub: currently returns None. Per the example above, ``flags[i]``
    equal to 1 marks the first element of a segment, and the expected output
    is an inclusive running sum that restarts at every segment start.
    """
    pass  # Your implementation

---

## Summary

### Large Array Scan

| Pass | Purpose |
|------|---------|
| 1 | Block-level scan, save block sums |
| 2 | Scan block sums (recursive) |
| 3 | Add block sums to elements |

### Scan Applications

| Application | How Scan Helps |
|-------------|---------------|
| Stream compaction | Compute output indices |
| Radix sort | Position elements in buckets |
| Histograms | Compute bin boundaries |
| Tree traversal | Compute subtree sizes |

### CUDA C++ Key Pattern

```cpp
// Stream compaction with scan
create_predicates<<<...>>>(input, predicates, n);
scan(predicates, indices, n);  // Exclusive scan
scatter<<<...>>>(input, predicates, indices, output, n);
```

### Week 5 Complete! 🎉

**What You Learned:**
- Inclusive vs Exclusive scan
- Hillis-Steele algorithm
- Blelloch work-efficient algorithm
- Multi-block scan for large arrays
- Stream compaction and radix sort