In [None]:
# Colab/Local Setup - Run this first!
# Python/Numba is OPTIONAL - for quick interactive testing only
import subprocess
import sys

# Importing google.colab succeeds only inside Colab; the ImportError branch is
# therefore the "running locally" path.
try:
    import google.colab
    print("🔧 Running on Google Colab - Installing dependencies...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
    print("✅ Setup complete!")
except ImportError:
    print("💻 Running locally - make sure you have: pip install numba numpy")

# These imports must stay below the install step above.
import numpy as np
from numba import cuda
import math
import time

print("\n⚠️  Remember: CUDA C++ code is the PRIMARY learning material!")
cuda_available = cuda.is_available()  # query once, reuse for the message and the branch
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # The device name may come back as bytes; decode it so the line does not
    # print with a b'...' wrapper.
    device_name = cuda.get_current_device().name
    if isinstance(device_name, bytes):
        device_name = device_name.decode("utf-8", errors="replace")
    print(f"Device: {device_name}")

---

## Part 1: What is Scan (Prefix Sum)?

### Definition

**Scan** computes running totals across an array:

```
Input:     [3, 1, 7, 0, 4, 1, 6, 3]

Inclusive: [3, 4, 11, 11, 15, 16, 22, 25]  ← Includes current element
Exclusive: [0, 3, 4, 11, 11, 15, 16, 22]   ← Excludes current element
```

### Mathematical Definition

```
Inclusive scan:  out[i] = in[0] ⊕ in[1] ⊕ ... ⊕ in[i]
Exclusive scan:  out[i] = in[0] ⊕ in[1] ⊕ ... ⊕ in[i-1]  (out[0] = identity)

where ⊕ is any associative operator (add, multiply, max, min, etc.)
```

### Why Scan Matters

```
Scan is a FUNDAMENTAL parallel primitive:
• Stream compaction (filtering arrays)
• Radix sort (key distribution)
• Polynomial evaluation
• Histogram computation
• Sparse matrix operations
• Tree traversal
```

### 🔷 CUDA C++ Implementation (Primary)

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
%%writefile scan_basics.cu
// scan_basics.cu - Sequential scan for reference
#include <stdio.h>
#include <cuda_runtime.h>

// CPU inclusive scan - O(n)
// Writes output[i] = input[0] + ... + input[i] for every i in [0, n).
//   input  - n source values (read only)
//   output - destination for the n running totals
//   n      - element count; when n <= 0 nothing is read or written
void cpu_inclusive_scan(const int* input, int* output, int n) {
    if (n <= 0) return;  // Empty range: there is no output[0] to seed
    output[0] = input[0];
    for (int i = 1; i < n; i++) {
        // Each total extends the previous one by the current element
        output[i] = output[i-1] + input[i];
    }
}

// CPU exclusive scan - O(n)
// Writes output[i] = input[0] + ... + input[i-1], with output[0] = 0.
//   input  - n source values (read only)
//   output - destination for the n running totals
//   n      - element count; when n <= 0 nothing is read or written
void cpu_exclusive_scan(const int* input, int* output, int n) {
    if (n <= 0) return;  // Empty range: there is no output[0] to seed
    output[0] = 0;  // Identity element for addition
    for (int i = 1; i < n; i++) {
        // Extend the previous total by the element just before position i
        output[i] = output[i-1] + input[i-1];
    }
}

// Verify scan result
// Compares the first n entries of result against expected. On the first
// difference it prints the index and both values, then returns false.
// Returns true when all n entries match (including n == 0).
bool verify_scan(const int* result, const int* expected, int n) {
    int idx = 0;
    while (idx < n) {
        const int got = result[idx];
        const int want = expected[idx];
        if (got != want) {
            printf("Mismatch at %d: got %d, expected %d\n", idx, got, want);
            return false;
        }
        ++idx;
    }
    return true;
}

// Runs both CPU reference scans on the notebook's eight-element example and
// prints the input followed by the inclusive and exclusive results.
int main() {
    int input[] = {3, 1, 7, 0, 4, 1, 6, 3};
    int n = 8;
    int inclusive[8], exclusive[8];

    cpu_inclusive_scan(input, inclusive, n);
    cpu_exclusive_scan(input, exclusive, n);

    // One label per printed row; the later labels carry the newline that
    // terminates the previous row.
    const char* labels[] = {"Input:     ", "\nInclusive: ", "\nExclusive: "};
    const int* rows[] = {input, inclusive, exclusive};
    for (int r = 0; r < 3; r++) {
        printf("%s", labels[r]);
        for (int i = 0; i < n; i++) {
            printf("%d ", rows[r][i]);
        }
    }
    printf("\n");

    return 0;
}

In [None]:
# Compile scan_basics.cu for compute capability 7.5 (-arch=sm_75), then run the binary.
# NOTE(review): sm_75 presumably matches the GPU this notebook targets - adjust -arch for other devices.
!nvcc -arch=sm_75 -o scan_basics scan_basics.cu
!./scan_basics

In [None]:
# Sequential scan implementations (CPU reference)
def cpu_inclusive_scan(arr):
    """Inclusive prefix sum: out[i] = sum(arr[0:i+1]).

    Sequential O(n) reference used to check the GPU kernels.

    Args:
        arr: 1-D array (or sequence) of numbers.

    Returns:
        Array created with ``np.zeros_like(arr)`` holding the running totals.
        An empty input yields an empty result.
    """
    result = np.zeros_like(arr)
    if len(arr) == 0:
        # Nothing to seed result[0] with; return the empty array as-is.
        return result
    result[0] = arr[0]
    for i in range(1, len(arr)):
        result[i] = result[i-1] + arr[i]
    return result

def cpu_exclusive_scan(arr):
    """Exclusive prefix sum: out[i] = sum(arr[0:i]).

    Sequential O(n) reference; position 0 always holds the additive identity.

    Args:
        arr: 1-D array (or sequence) of numbers.

    Returns:
        Array created with ``np.zeros_like(arr)`` holding the totals of all
        elements strictly before each position. An empty input yields an
        empty result.
    """
    result = np.zeros_like(arr)
    if len(arr) == 0:
        # No position 0 to write the identity into.
        return result
    result[0] = 0  # Identity element
    for i in range(1, len(arr)):
        result[i] = result[i-1] + arr[i-1]
    return result

# Test
test_input = np.array([3, 1, 7, 0, 4, 1, 6, 3], dtype=np.int32)

# Pair each label with the array it introduces, then print the rows in order.
scan_rows = (
    ("Input:     ", test_input),
    ("Inclusive: ", cpu_inclusive_scan(test_input)),
    ("Exclusive: ", cpu_exclusive_scan(test_input)),
)
for label, values in scan_rows:
    print(f"{label}{values}")
print(f"\nNote: Exclusive[i] + Input[i] = Inclusive[i]")

---

## Part 2: The Parallel Scan Challenge

### Why Sequential Doesn't Parallelize

```
Sequential scan has DATA DEPENDENCIES:

out[0] = in[0]
out[1] = out[0] + in[1]    ← Depends on out[0]
out[2] = out[1] + in[2]    ← Depends on out[1]
out[3] = out[2] + in[3]    ← Depends on out[2]
...

Each step depends on the previous! Can't run in parallel.
```

### Key Insight: Associativity

```
Addition is ASSOCIATIVE:
(a + b) + c = a + (b + c)

This means we can reorganize computation!

out[3] = in[0] + in[1] + in[2] + in[3]
       = (in[0] + in[1]) + (in[2] + in[3])  ← Parallel!
```

### 🔷 CUDA C++ Implementation (Primary)

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
%%writefile naive_parallel_scan.cu
// naive_parallel_scan.cu - Simple but inefficient
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

// Naive parallel scan - O(n log n) work
// Each block stages one element per thread in a shared buffer of BLOCK_SIZE
// ints, scans that buffer with a doubling stride, and writes the totals back
// over data. Elements at or beyond n are treated as 0 and never written.
__global__ void naive_inclusive_scan(int* data, int n) {
    __shared__ int temp[BLOCK_SIZE];
    const int lane = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + lane;
    const bool in_range = idx < n;

    // Stage this thread's element; out-of-range slots hold 0
    if (in_range) {
        temp[lane] = data[idx];
    } else {
        temp[lane] = 0;
    }
    __syncthreads();

    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
        // Read the partner's value before any thread overwrites its slot
        const int addend = (lane >= stride) ? temp[lane - stride] : 0;
        __syncthreads();

        // All reads are done; now it is safe to update in place
        temp[lane] = temp[lane] + addend;
        __syncthreads();
    }

    // Store the scanned value back to global memory
    if (in_range) {
        data[idx] = temp[lane];
    }
}

// Scans the notebook's eight-element example on the GPU and prints the result.
// Returns 0 on success, or 1 after printing the CUDA error string if any
// runtime call or the kernel launch fails.
int main() {
    int h_data[] = {3, 1, 7, 0, 4, 1, 6, 3};
    int n = 8;
    size_t bytes = n * sizeof(int);

    int* d_data = NULL;
    cudaError_t err = cudaMalloc(&d_data, bytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        // One block, one thread per element
        naive_inclusive_scan<<<1, 8>>>(d_data, n);
        // A failed launch is only reported through cudaGetLastError()
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // The copy back also surfaces errors raised while the kernel ran
        err = cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_data);  // d_data is still NULL if cudaMalloc failed

    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Scan result: ");
    for (int i = 0; i < n; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");

    return 0;
}

In [None]:
# Compile naive_parallel_scan.cu for compute capability 7.5 (-arch=sm_75), then run the binary.
# NOTE(review): sm_75 presumably matches the GPU this notebook targets - adjust -arch for other devices.
!nvcc -arch=sm_75 -o naive_parallel_scan naive_parallel_scan.cu
!./naive_parallel_scan

In [None]:
@cuda.jit
def naive_inclusive_scan(data, n):
    """Scan ``data`` in place, one block-sized tile per block (O(n log n) work).

    Every thread stages one element in a 256-entry int32 shared buffer, so the
    launch must use at most 256 threads per block. Positions at or beyond
    ``n`` are treated as 0 and are never written back.
    """
    tile = cuda.shared.array(256, dtype=np.int32)
    lane = cuda.threadIdx.x
    idx = cuda.blockIdx.x * cuda.blockDim.x + lane
    in_range = idx < n

    # Stage this thread's element; out-of-range slots hold 0
    if in_range:
        tile[lane] = data[idx]
    else:
        tile[lane] = 0
    cuda.syncthreads()

    # Doubling stride: 1, 2, 4, ... while smaller than the block size
    stride = 1
    while stride < cuda.blockDim.x:
        # Read the partner's value before any thread overwrites its slot
        addend = 0
        if lane >= stride:
            addend = tile[lane - stride]
        cuda.syncthreads()

        # All reads are done; now it is safe to update in place
        tile[lane] = tile[lane] + addend
        cuda.syncthreads()

        stride = stride * 2

    # Store the scanned value back to global memory
    if in_range:
        data[idx] = tile[lane]

In [None]:
# Test naive parallel scan
test_data = np.array([3, 1, 7, 0, 4, 1, 6, 3], dtype=np.int32)
expected = cpu_inclusive_scan(test_data)

# The kernel overwrites its argument, so hand it a private device copy.
d_data = cuda.to_device(test_data.copy())
# One block with one thread per element; the size is derived from the data so
# the launch and the length argument cannot drift apart.
num_elements = len(test_data)
naive_inclusive_scan[1, num_elements](d_data, num_elements)
result = d_data.copy_to_host()

print(f"Input:    {test_data}")
print(f"Result:   {result}")
print(f"Expected: {expected}")
print(f"Correct:  {'✓' if np.array_equal(result, expected) else '✗'}")

---

## Part 3: Visualizing Parallel Scan

### How It Works

```
Input: [3, 1, 7, 0, 4, 1, 6, 3]

Stride=1: Each element adds its left neighbor
  [3, 3+1, 1+7, 7+0, 0+4, 4+1, 1+6, 6+3]
= [3,  4,   8,   7,   4,   5,   7,   9]

Stride=2: Each element adds element 2 positions left
  [3, 4, 3+8, 4+7, 8+4, 7+5, 4+7, 5+9]
= [3, 4, 11,  11,  12,  12,  11,  14]

Stride=4: Each element adds element 4 positions left
  [3, 4, 11, 11, 3+12, 4+12, 11+11, 11+14]
= [3, 4, 11, 11,  15,   16,   22,    25]

Done! (stride >= n)
```

In [None]:
def visualize_naive_scan(arr):
    """Print each pass of the stride-doubling scan and return the final array.

    Works on a copy of ``arr``, so the caller's data is left untouched. One
    line is printed for the input, one per stride, and a closing line with the
    number of passes.
    """
    current = arr.copy()
    n = len(current)

    print(f"Input:      {current}")
    print()

    # Collect the strides up front: 1, 2, 4, ... while still smaller than n
    strides = []
    next_stride = 1
    while next_stride < n:
        strides.append(next_stride)
        next_stride *= 2

    for stride in strides:
        # Read only from the previous pass and write into a fresh copy,
        # mirroring the read-then-write split of the parallel kernel
        updated = current.copy()
        for i in range(stride, n):
            updated[i] = current[i] + current[i - stride]

        print(f"Stride={stride}: {updated}")
        current = updated

    print(f"\nTotal steps: {len(strides)} = log2({n})")
    return current

# Walk through the scan on the notebook's eight-element example;
# `result` keeps the final array returned by the visualisation.
test = np.array([3, 1, 7, 0, 4, 1, 6, 3], dtype=np.int32)
result = visualize_naive_scan(test)

---

## Part 4: Work Efficiency Analysis

### Problem with Naive Approach

```
Sequential scan: O(n) work, O(n) steps
Naive parallel:  O(n log n) work, O(log n) steps

For n = 1,000,000:
  Sequential: 1,000,000 operations
  Naive:      20,000,000 operations (20x more!)

The naive approach does TOO MUCH WORK!
```

### Why More Work is Bad

```
GPUs have finite compute resources.

If we do 20x more operations:
• Need 20x more threads to maintain speed
• Or run 20x slower

We want WORK-EFFICIENT algorithms:
• O(n) work total
• O(log n) parallel steps
```

### Upcoming: Better Algorithms

| Algorithm | Work | Steps | Work-Efficient? |
|-----------|------|-------|------------------|
| Sequential | O(n) | O(n) | Yes |
| Hillis-Steele | O(n log n) | O(log n) | No |
| Blelloch | O(n) | O(2 log n) | Yes! |

In [None]:
# Compare work done
def analyze_work(n):
    """Return operation counts for an n-element scan.

    Returns:
        Tuple ``(sequential, naive_parallel, blelloch)`` where the sequential
        count is ``n``, the naive count sums ``n - stride`` over the strides
        1, 2, 4, ... below ``n``, and the Blelloch count is approximated as
        ``2 * n``.
    """
    # Strides used by the naive scan: 1, 2, 4, ... while smaller than n
    strides = []
    stride = 1
    while stride < n:
        strides.append(stride)
        stride *= 2

    # At each stride, (n - stride) threads perform one operation
    naive_parallel_work = sum(n - s for s in strides)

    return n, naive_parallel_work, 2 * n

# Header: every column title left-aligned in a 12-character field
column_titles = ('N', 'Sequential', 'Naive', 'Blelloch', 'Naive/Seq')
print(" ".join(f"{title:<12}" for title in column_titles))
print("=" * 60)

# One row per problem size; the last column shows how much extra work the
# naive scan does compared with the sequential one
for n in (8, 256, 1024, 65536, 1048576):
    seq, naive, blelloch = analyze_work(n)
    ratio = naive / seq
    row = f"{n:<12} {seq:<12} {naive:<12} {blelloch:<12} {ratio:.1f}x"
    print(row)

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile scan_exercises.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Error checking macro
// Evaluates `call` once and stores the resulting cudaError_t. If it is not
// cudaSuccess, prints the CUDA error string and the current source line to
// stdout and terminates the process with exit status 1.
// The do { ... } while(0) wrapper makes the expansion a single statement, so
// the macro can be used safely in unbraced if/else bodies.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            printf("CUDA Error: %s at line %d\n", cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// ============================================================
// Exercise 1: Exclusive Scan (Naive Parallel)
// ============================================================
// Exclusive scan: output[i] = sum(input[0..i-1])
// output[0] = 0 (identity element)
//
// Single-block kernel: elements are indexed by threadIdx.x only, so the scan
// covers data[0 .. min(n, blockDim.x) - 1] and overwrites it in place.
// Needs at least min(n, blockDim.x) ints of dynamic shared memory.
//
// Threads with tid >= n stay in the kernel and only take part in the
// barriers. Returning early would leave __syncthreads() unreached by part of
// the block, which is undefined behaviour whenever blockDim.x > n.

__global__ void naiveExclusiveScan(int* data, int n) {
    extern __shared__ int temp[];

    const int tid = threadIdx.x;
    const bool active = tid < n;  // only active threads touch data/temp

    // Load data to shared memory
    if (active) {
        temp[tid] = data[tid];
    }
    __syncthreads();

    // First do inclusive scan. The loop bound depends only on n, so every
    // thread executes the same number of barriers.
    for (int stride = 1; stride < n; stride *= 2) {
        int val = 0;
        if (active && tid >= stride) {
            val = temp[tid - stride];
        }
        __syncthreads();  // all reads finished before any slot is updated
        if (active) {
            temp[tid] += val;
        }
        __syncthreads();  // all updates visible before the next stride
    }

    // Convert to exclusive by shifting right
    if (active) {
        data[tid] = (tid == 0) ? 0 : temp[tid - 1];
    }
}

// ============================================================
// Exercise 2: Max Scan (Running Maximum)
// ============================================================
// Output[i] = max(input[0..i])
//
// Single-block kernel: elements are indexed by threadIdx.x only, so the scan
// covers data[0 .. min(n, blockDim.x) - 1] and overwrites it in place.
// Needs at least min(n, blockDim.x) ints of dynamic shared memory.
//
// Threads with tid >= n stay in the kernel and only take part in the
// barriers. Returning early would leave __syncthreads() unreached by part of
// the block, which is undefined behaviour whenever blockDim.x > n.

__global__ void maxScan(int* data, int n) {
    extern __shared__ int temp[];

    const int tid = threadIdx.x;
    const bool active = tid < n;  // only active threads touch data/temp

    if (active) {
        temp[tid] = data[tid];
    }
    __syncthreads();

    // The loop bound depends only on n, so every thread executes the same
    // number of barriers.
    for (int stride = 1; stride < n; stride *= 2) {
        int val = 0;  // unused by inactive threads
        if (active) {
            val = temp[tid];
            if (tid >= stride) {
                val = max(val, temp[tid - stride]);
            }
        }
        __syncthreads();  // all reads finished before any slot is updated
        if (active) {
            temp[tid] = val;
        }
        __syncthreads();  // all updates visible before the next stride
    }

    if (active) {
        data[tid] = temp[tid];
    }
}

// ============================================================
// Exercise 3: Min-Max Scan (Track Both)
// ============================================================
// Track running min and max simultaneously
//
// Single-block kernel: elements are indexed by threadIdx.x only, so
// outMin/outMax are written for indices 0 .. min(n, blockDim.x) - 1.
// The dynamic shared buffer is split in two halves of blockDim.x ints each
// (running minima first, running maxima second), so the launch should
// provide 2 * blockDim.x ints.
//
// Threads with tid >= n stay in the kernel and only take part in the
// barriers. Returning early would leave __syncthreads() unreached by part of
// the block, which is undefined behaviour whenever blockDim.x > n.

__global__ void minMaxScan(const int* input, int* outMin, int* outMax, int n) {
    extern __shared__ int shared[];
    int* sMin = shared;
    int* sMax = shared + blockDim.x;

    const int tid = threadIdx.x;
    const bool active = tid < n;  // only active threads touch global/shared data

    if (active) {
        sMin[tid] = input[tid];
        sMax[tid] = input[tid];
    }
    __syncthreads();

    // The loop bound depends only on n, so every thread executes the same
    // number of barriers.
    for (int stride = 1; stride < n; stride *= 2) {
        int minVal = 0;  // unused by inactive threads
        int maxVal = 0;
        if (active) {
            minVal = sMin[tid];
            maxVal = sMax[tid];
            if (tid >= stride) {
                minVal = min(minVal, sMin[tid - stride]);
                maxVal = max(maxVal, sMax[tid - stride]);
            }
        }
        __syncthreads();  // all reads finished before any slot is updated
        if (active) {
            sMin[tid] = minVal;
            sMax[tid] = maxVal;
        }
        __syncthreads();  // all updates visible before the next stride
    }

    if (active) {
        outMin[tid] = sMin[tid];
        outMax[tid] = sMax[tid];
    }
}

// ============================================================
// Test Functions
// ============================================================
// Runs naiveExclusiveScan on the eight-element example, prints the input,
// result and expected rows, and reports PASSED/FAILED. Any CUDA failure
// (including a failed kernel launch) aborts through CHECK_CUDA.
void testExclusiveScan() {
    printf("=== Exercise 1: Exclusive Scan ===\n");

    int h_data[] = {3, 1, 7, 0, 4, 1, 6, 3};
    int expected[] = {0, 3, 4, 11, 11, 15, 16, 22};
    int n = 8;

    int* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, n * sizeof(int)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice));

    // One block of n threads with n ints of dynamic shared memory
    naiveExclusiveScan<<<1, n, n * sizeof(int)>>>(d_data, n);
    CHECK_CUDA(cudaGetLastError());       // launch/configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran

    int result[8];
    CHECK_CUDA(cudaMemcpy(result, d_data, n * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Input:    ");
    for (int i = 0; i < n; i++) printf("%2d ", h_data[i]);
    printf("\nExclusive:");
    for (int i = 0; i < n; i++) printf("%2d ", result[i]);
    printf("\nExpected: ");
    for (int i = 0; i < n; i++) printf("%2d ", expected[i]);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (result[i] != expected[i]) correct = false;
    }
    printf("\nTest %s\n\n", correct ? "PASSED ✓" : "FAILED ✗");

    CHECK_CUDA(cudaFree(d_data));
}

// Runs maxScan on the eight-element example, prints the input, result and
// expected rows, and reports PASSED/FAILED. Any CUDA failure (including a
// failed kernel launch) aborts through CHECK_CUDA.
void testMaxScan() {
    printf("=== Exercise 2: Max Scan ===\n");

    int h_data[] = {3, 1, 7, 0, 4, 1, 6, 3};
    int expected[] = {3, 3, 7, 7, 7, 7, 7, 7};
    int n = 8;

    int* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, n * sizeof(int)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice));

    // One block of n threads with n ints of dynamic shared memory
    maxScan<<<1, n, n * sizeof(int)>>>(d_data, n);
    CHECK_CUDA(cudaGetLastError());       // launch/configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran

    int result[8];
    CHECK_CUDA(cudaMemcpy(result, d_data, n * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Input:   ");
    for (int i = 0; i < n; i++) printf("%2d ", h_data[i]);
    printf("\nMax Scan:");
    for (int i = 0; i < n; i++) printf("%2d ", result[i]);
    printf("\nExpected:");
    for (int i = 0; i < n; i++) printf("%2d ", expected[i]);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (result[i] != expected[i]) correct = false;
    }
    printf("\nTest %s\n\n", correct ? "PASSED ✓" : "FAILED ✗");

    CHECK_CUDA(cudaFree(d_data));
}

// Runs minMaxScan on an eight-element example, prints the input and both
// result rows, and reports PASSED only if the running minima and maxima both
// match. Any CUDA failure (including a failed kernel launch) aborts through
// CHECK_CUDA.
void testMinMaxScan() {
    printf("=== Exercise 3: Min-Max Scan ===\n");

    int h_data[] = {5, 2, 8, 1, 9, 3, 7, 4};
    int expectedMin[] = {5, 2, 2, 1, 1, 1, 1, 1};
    int expectedMax[] = {5, 5, 8, 8, 9, 9, 9, 9};
    int n = 8;

    int *d_data, *d_min, *d_max;
    CHECK_CUDA(cudaMalloc(&d_data, n * sizeof(int)));
    CHECK_CUDA(cudaMalloc(&d_min, n * sizeof(int)));
    CHECK_CUDA(cudaMalloc(&d_max, n * sizeof(int)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice));

    // One block of n threads; shared memory holds two n-int halves (min, max)
    minMaxScan<<<1, n, 2 * n * sizeof(int)>>>(d_data, d_min, d_max, n);
    CHECK_CUDA(cudaGetLastError());       // launch/configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran

    int resultMin[8], resultMax[8];
    CHECK_CUDA(cudaMemcpy(resultMin, d_min, n * sizeof(int), cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(resultMax, d_max, n * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Input:    ");
    for (int i = 0; i < n; i++) printf("%2d ", h_data[i]);
    printf("\nMin Scan: ");
    for (int i = 0; i < n; i++) printf("%2d ", resultMin[i]);
    printf("\nMax Scan: ");
    for (int i = 0; i < n; i++) printf("%2d ", resultMax[i]);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (resultMin[i] != expectedMin[i] || resultMax[i] != expectedMax[i]) {
            correct = false;
        }
    }
    printf("\nTest %s\n\n", correct ? "PASSED ✓" : "FAILED ✗");

    CHECK_CUDA(cudaFree(d_data));
    CHECK_CUDA(cudaFree(d_min));
    CHECK_CUDA(cudaFree(d_max));
}

// Prints `left`, then `width` copies of the horizontal box-drawing character,
// then `right` and a newline.
static void printRule(const char* left, int width, const char* right) {
    printf("%s", left);
    for (int i = 0; i < width; i++) printf("═");
    printf("%s\n", right);
}

// Prints a banner and the properties of device 0, runs the three exercise
// tests in order, and prints a closing banner. Exits through CHECK_CUDA if
// the device properties cannot be queried.
int main() {
    const int BOX_WIDTH = 62;  // inner width of the banner, in characters

    printRule("╔", BOX_WIDTH, "╗");
    // Left-justified in a BOX_WIDTH field so the right border lines up
    printf("║%-*s║\n", BOX_WIDTH, "              CUDA Scan Basics Exercises");
    printRule("╚", BOX_WIDTH, "╝");
    printf("\n");

    // Without the check, a failed query would leave `prop` uninitialised and
    // the two lines below would print garbage.
    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
    printf("Device: %s\n", prop.name);
    printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);

    testExclusiveScan();
    testMaxScan();
    testMinMaxScan();

    printRule("", BOX_WIDTH, "");
    printf("                    All exercises completed!\n");
    printRule("", BOX_WIDTH, "");

    return 0;
}

In [None]:
# Compile scan_exercises.cu for compute capability 7.5 (-arch=sm_75); the binary runs only if compilation succeeds (&&).
# NOTE(review): sm_75 presumably matches the GPU this notebook targets - adjust -arch for other devices.
!nvcc -arch=sm_75 -o scan_exercises scan_exercises.cu && ./scan_exercises

### 🔶 Python/Numba Exercises (Optional)

### Exercise 1: Exclusive Scan Kernel

Modify the naive scan to produce exclusive scan output.

In [None]:
# TODO: Implement exclusive scan
# Hint: Run inclusive scan, then shift result right and prepend 0

@cuda.jit
def naive_exclusive_scan(data, n):
    """Naive parallel exclusive scan.

    Exercise stub: the body is intentionally left as ``pass`` for the reader
    to fill in. Presumably takes the same arguments as the inclusive kernel
    (array to scan and element count) - confirm when implementing.
    """
    pass  # Your implementation

# Test
# Expected: [0, 3, 4, 11, 11, 15, 16, 22]

### Exercise 2: Max Scan

Implement a scan using `max` instead of `+`.

In [None]:
# TODO: Implement max scan
# Input:  [3, 1, 7, 0, 4, 1, 6, 3]
# Output: [3, 3, 7, 7, 7, 7, 7, 7]

@cuda.jit
def max_scan(data, n):
    """Running maximum scan.

    Exercise stub: the body is intentionally left as ``pass`` for the reader
    to fill in. Presumably takes the same arguments as the inclusive kernel
    (array to scan and element count) - confirm when implementing.
    """
    pass  # Your implementation

### Exercise 3: Count Work Operations

Modify the visualization to count exact operations at each step.

In [None]:
# TODO: Count additions at each step
def count_scan_work(n):
    """Count total additions in naive parallel scan.

    Exercise stub: the body is intentionally left as ``pass`` for the reader
    to fill in.
    """
    pass  # Your implementation

# Verify: For n=8, should be 7+6+4 = 17 (strides 1, 2, 4 each do n - stride additions; grows like n*log2(n))

---

## Summary

### Key Concepts

| Concept | Description |
|---------|-------------|
| Inclusive Scan | out[i] = sum(in[0..i]) |
| Exclusive Scan | out[i] = sum(in[0..i-1]) |
| Associativity | (a+b)+c = a+(b+c) enables parallelism |
| Work | Total operations performed |
| Steps | Parallel depth (synchronization points) |

### CUDA C++ Key Patterns

```cpp
// Shared memory for block-level scan
__shared__ int temp[BLOCK_SIZE];

// Load data (slots past the end of the array hold the identity, 0)
temp[tid] = (gid < n) ? data[gid] : 0;
__syncthreads();

// Scan loop with doubling stride
for (int stride = 1; stride < blockDim.x; stride *= 2) {
    int val = (tid >= stride) ? temp[tid - stride] : 0;
    __syncthreads();
    temp[tid] += val;
    __syncthreads();
}
```

### Next: Hillis-Steele Algorithm
Tomorrow we'll study the Hillis-Steele algorithm in depth - simple but not work-efficient.