In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import math

# Remind the reader that the Numba cells are secondary to the CUDA C++ cells.
print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Hillis-Steele Algorithm

### Algorithm Description

```
Hillis-Steele (1986):
• Simple parallel prefix algorithm
• O(n log n) work
• O(log n) steps
• NOT work-efficient, but simple to implement

Key idea: At step d, element i adds element (i - 2^d)
```

### Visualization

```
Input:  [a, b, c, d, e, f, g, h]

Step 0 (stride=1):
  out[i] = in[i] + in[i-1]
  [a, a+b, b+c, c+d, d+e, e+f, f+g, g+h]

Step 1 (stride=2):
  out[i] = in[i] + in[i-2]
  [a, a+b, a+b+c, a+b+c+d, b+c+d+e, c+d+e+f, d+e+f+g, e+f+g+h]

Step 2 (stride=4):
  out[i] = in[i] + in[i-4]
  [a, a+b, a+b+c, a+b+c+d, a+b+c+d+e, a+b+c+d+e+f, a+b+c+d+e+f+g, a+b+c+d+e+f+g+h]

Done! All prefixes computed.
```

### 🔷 CUDA C++ Implementation (Primary)

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
%%writefile hillis_steele.cu
// hillis_steele.cu - Hillis-Steele parallel scan
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

// Inclusive Hillis-Steele scan, performed in place on each block's slice of
// `data`. Two shared-memory rows are alternated: every step reads the previous
// step's values from one row and writes the new values into the other, so no
// thread can observe a partially updated row.
//
// data: device array of at least n ints; overwritten with the per-block scan.
// n:    number of valid elements; threads with gid >= n contribute 0 and do
//       not write back.
// The launch must use blockDim.x <= BLOCK_SIZE (the shared rows hold
// BLOCK_SIZE ints each).
__global__ void hillis_steele_scan(int* data, int n) {
    __shared__ int buffer[2][BLOCK_SIZE];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;
    const bool in_range = gid < n;

    // `cur` is the row holding the most recent values; row 0 starts with the
    // raw input (zero-padded past the end of the array).
    int cur = 0;
    if (in_range) {
        buffer[cur][tid] = data[gid];
    } else {
        buffer[cur][tid] = 0;
    }
    __syncthreads();

    // Doubling stride: 1, 2, 4, ... while it is still below the block width.
    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
        const int prev = cur;  // row written during the previous step
        cur ^= 1;              // flip to the other row for this step's output

        int value = buffer[prev][tid];
        if (tid >= stride) {
            // Add the partner `stride` slots to the left.
            value += buffer[prev][tid - stride];
        }
        buffer[cur][tid] = value;

        // Every write of this step must land before the next step reads it.
        __syncthreads();
    }

    // Copy the final row back to global memory.
    if (in_range) {
        data[gid] = buffer[cur][tid];
    }
}

// Exclusive Hillis-Steele scan, performed in place on each block's slice of
// `data`. The input is loaded shifted one slot to the right (slot 0 of the
// block gets 0), after which the usual inclusive doubling steps produce the
// exclusive result.
//
// data: device array of at least n ints; overwritten with the per-block scan.
// n:    number of valid elements; threads with gid >= n do not write back.
// The launch must use blockDim.x <= BLOCK_SIZE (the shared rows hold
// BLOCK_SIZE ints each).
__global__ void hillis_steele_exclusive(int* data, int n) {
    __shared__ int buffer[2][BLOCK_SIZE];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;

    // Shifted load: thread tid takes the element one position to its left.
    // Thread 0 of the block, and any thread whose left neighbour lies past
    // the end of the array, starts from 0.
    int shifted = 0;
    if (tid != 0 && gid - 1 < n) {
        shifted = data[gid - 1];
    }

    int cur = 0;  // row holding the most recent values
    buffer[cur][tid] = shifted;
    __syncthreads();

    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
        const int prev = cur;  // row written during the previous step
        cur ^= 1;              // flip to the other row for this step's output

        int value = buffer[prev][tid];
        if (tid >= stride) {
            value += buffer[prev][tid - stride];
        }
        buffer[cur][tid] = value;

        // Every write of this step must land before the next step reads it.
        __syncthreads();
    }

    if (gid < n) {
        data[gid] = buffer[cur][tid];
    }
}

// Print a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Upload `h_in`, run `kernel` as a single block of n threads, download the
// result into `h_out` and print it as "<label>: v0 v1 ...".
// Returns false (after reporting the error) if any CUDA step fails.
static bool run_scan(void (*kernel)(int*, int), const char* label,
                     const int* h_in, int* h_out, int* d_data, int n) {
    const size_t bytes = n * sizeof(int);

    if (!cuda_ok(cudaMemcpy(d_data, h_in, bytes, cudaMemcpyHostToDevice),
                 "host-to-device copy")) {
        return false;
    }

    kernel<<<1, n>>>(d_data, n);
    // Launch-configuration errors are only visible through cudaGetLastError.
    if (!cuda_ok(cudaGetLastError(), "kernel launch")) {
        return false;
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if (!cuda_ok(cudaMemcpy(h_out, d_data, bytes, cudaMemcpyDeviceToHost),
                 "device-to-host copy")) {
        return false;
    }

    printf("%s: ", label);
    for (int i = 0; i < n; i++) printf("%d ", h_out[i]);
    printf("\n");
    return true;
}

// Demo driver: runs the inclusive and the exclusive scan on a small fixed
// input and prints both results. Returns 0 on success, 1 on any CUDA failure.
int main() {
    int h_data[] = {3, 1, 7, 0, 4, 1, 6, 3};
    // Derive the element count from the array so the two cannot drift apart.
    const int n = (int)(sizeof(h_data) / sizeof(h_data[0]));
    int result[sizeof(h_data) / sizeof(h_data[0])];

    // Both kernels scan within one block and keep BLOCK_SIZE ints per shared row.
    if (n > BLOCK_SIZE) {
        fprintf(stderr, "Input of %d elements exceeds BLOCK_SIZE (%d)\n", n, BLOCK_SIZE);
        return 1;
    }

    printf("Input: ");
    for (int i = 0; i < n; i++) printf("%d ", h_data[i]);
    printf("\n");

    int* d_data = NULL;
    if (!cuda_ok(cudaMalloc(&d_data, n * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    // Each run re-uploads the original input because the kernels work in place.
    bool ok = run_scan(hillis_steele_scan, "Inclusive", h_data, result, d_data, n) &&
              run_scan(hillis_steele_exclusive, "Exclusive", h_data, result, d_data, n);

    cudaFree(d_data);
    return ok ? 0 : 1;
}

In [None]:
# Compile the hillis_steele.cu file written by the %%writefile cell, then run the binary.
# NOTE(review): -arch=sm_75 hard-codes the target GPU architecture - confirm it matches the GPU in this runtime.
!nvcc -arch=sm_75 -o hillis_steele hillis_steele.cu
!./hillis_steele

In [None]:
# Capacity of each shared buffer; a launch must not use more threads per block.
BLOCK_SIZE = 256

@cuda.jit
def hillis_steele_inclusive(data, n):
    """Hillis-Steele inclusive scan with double buffering.

    Each block scans its own slice of ``data`` in place.

    Args:
        data: device array of int32 values; overwritten with the result.
        n: number of valid elements. Threads with a global index >= n load 0
            and do not write back.

    The kernel must be launched with at most BLOCK_SIZE threads per block,
    because each shared buffer holds BLOCK_SIZE elements.
    """
    # Double buffer, sized from the module constant so the two cannot drift apart
    buffer0 = cuda.shared.array(BLOCK_SIZE, dtype=np.int32)
    buffer1 = cuda.shared.array(BLOCK_SIZE, dtype=np.int32)

    tid = cuda.threadIdx.x
    gid = cuda.blockIdx.x * cuda.blockDim.x + tid

    # Load into buffer0 (zero-padded past the end of the array)
    if gid < n:
        buffer0[tid] = data[gid]
    else:
        buffer0[tid] = 0
    cuda.syncthreads()

    # Track which buffer is output
    stride = 1
    use_buffer1 = True  # Next write goes to buffer1

    while stride < cuda.blockDim.x:
        if use_buffer1:
            # Read from buffer0, write to buffer1
            if tid >= stride:
                buffer1[tid] = buffer0[tid] + buffer0[tid - stride]
            else:
                buffer1[tid] = buffer0[tid]
        else:
            # Read from buffer1, write to buffer0
            if tid >= stride:
                buffer0[tid] = buffer1[tid] + buffer1[tid - stride]
            else:
                buffer0[tid] = buffer1[tid]

        # All writes of this step must finish before the next step reads them
        cuda.syncthreads()
        use_buffer1 = not use_buffer1
        stride *= 2

    # The flag names the NEXT write target, so the last written buffer is the other one
    if gid < n:
        if use_buffer1:
            data[gid] = buffer0[tid]
        else:
            data[gid] = buffer1[tid]

In [None]:
# Test Hillis-Steele
def cpu_inclusive_scan(arr):
    """Reference inclusive prefix sum computed on the CPU.

    Args:
        arr: array-like of numbers; may be empty.

    Returns:
        A NumPy array with the same shape and dtype as ``arr`` in which
        element i is the sum of arr[0..i] (accumulated along the first axis).
    """
    arr = np.asarray(arr)
    # Pin the accumulator dtype to the input dtype so the result matches the
    # input array's dtype instead of NumPy's default integer promotion.
    return np.cumsum(arr, axis=0, dtype=arr.dtype)

test_data = np.array([3, 1, 7, 0, 4, 1, 6, 3], dtype=np.int32)
expected = cpu_inclusive_scan(test_data)

# One block with one thread per element; the launch width follows the data
# instead of repeating its length as a literal.
d_data = cuda.to_device(test_data)
hillis_steele_inclusive[1, len(test_data)](d_data, len(test_data))
result = d_data.copy_to_host()

print(f"Input:    {test_data}")
print(f"Result:   {result}")
print(f"Expected: {expected}")
print(f"Correct:  {'✓' if np.array_equal(result, expected) else '✗'}")

---

## Part 2: Why Double Buffering?

### The Race Condition Problem

```cpp
// WRONG: Race condition!
for (int stride = 1; stride < n; stride *= 2) {
    if (tid >= stride) {
        temp[tid] = temp[tid] + temp[tid - stride];  // BUG!
    }
    __syncthreads();
}

Problem:
Thread 2 reads temp[1] and temp[2]
Thread 1 writes temp[1] (might happen first!)
Thread 2 reads stale or updated value - undefined!
```

### Solution: Double Buffering

```cpp
// CORRECT: Double buffering
__shared__ int buffer[2][BLOCK_SIZE];
int pout = 0, pin = 1;

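for (int stride = 1; stride < n; stride *= 2) {
    pout = 1 - pout;  // Swap
    pin = 1 - pin;
    
    // Read from pin, write to pout - no conflict!
    if (tid >= stride)
        buffer[pout][tid] = buffer[pin][tid] + buffer[pin][tid - stride];
    else
        buffer[pout][tid] = buffer[pin][tid];  // No partner to the left: carry the value forward
    __syncthreads();
}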
```

In [None]:
# Demonstrate the race condition (CPU simulation)
def simulate_race_condition():
    """Show what can go wrong without double buffering.

    Replays the stride-1 step for elements 1 and 2 of a fixed 8-element array
    on the CPU, once updating the array in place (thread 1 before thread 2)
    and once reading from an untouched copy, and prints both outcomes.
    The printed numbers are taken from the simulated arrays, so the messages
    cannot drift from what the code actually computes.

    Returns:
        None. All output goes to stdout.
    """
    data = np.array([3, 1, 7, 0, 4, 1, 6, 3], dtype=np.int32)

    print("Simulating without double buffering:")
    print(f"Initial: {data}")

    # Step 1: stride = 1
    # If thread 1 updates data[1] before thread 2 reads it...
    print("\nStride=1 execution order matters:")

    # Scenario A: in-place update, thread 1 goes first
    in_place = data.copy()
    in_place[1] = in_place[1] + in_place[0]  # Thread 1: 1+3=4
    in_place[2] = in_place[2] + in_place[1]  # Thread 2: 7+4=11 (uses NEW value!)
    print(
        f"Thread 1 first: data[2] = {data[2]} + {in_place[1]} = {in_place[2]} "
        "(WRONG - used updated value)"
    )

    # Scenario B: double buffering - read only from `previous`, write to `current`
    previous = data.copy()
    current = previous.copy()
    current[1] = previous[1] + previous[0]  # 1+3=4
    current[2] = previous[2] + previous[1]  # 7+1=8 (uses OLD value)
    print(
        f"Double buffer: data[2] = {previous[2]} + {previous[1]} = {current[2]} "
        "(CORRECT - used original value)"
    )

simulate_race_condition()

---

## Part 3: Work and Step Analysis

### Complexity Analysis

```
Hillis-Steele Algorithm:

Steps: log₂(n)
  - stride doubles each iteration: 1, 2, 4, 8, ... n/2
  
Work per step: O(n)
  - Almost all threads do an addition
  
Total work: O(n log n)
  - log(n) steps × O(n) work each

Compare to sequential: O(n) work, O(n) steps

Hillis-Steele trades extra work for parallelism.
```

In [None]:
def hillis_steele_complexity(n):
    """Count the steps and additions of a Hillis-Steele scan over n elements.

    The stride starts at 1 and doubles while it is smaller than n; at each
    step the n - stride elements that have a partner perform one addition.

    Args:
        n: number of elements (non-negative integer).

    Returns:
        (steps, total_work): number of doubling steps and total additions.
        Both are 0 for n <= 1.

    Raises:
        ValueError: if n is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    steps = 0
    total_work = 0
    stride = 1
    # Integer doubling gives ceil(log2(n)) steps without floating-point log,
    # which also makes n = 0 well defined.
    while stride < n:
        total_work += n - stride  # Threads that do addition
        steps += 1
        stride *= 2

    return steps, total_work

# Table header followed by a rule of '=' characters
header = f"{'N':<12} {'Steps':<10} {'Work':<15} {'n*log(n)':<15} {'Efficiency':<10}"
print(header)
print("=" * 60)

# One row per problem size: measured work next to the n*log2(n) estimate
for n in (8, 256, 1024, 65536, 1048576):
    steps, work = hillis_steele_complexity(n)
    n_log_n = n * np.log2(n)
    # Ratio of element count to additions performed (share of "useful" work)
    efficiency = n / work
    row = f"{n:<12} {steps:<10} {work:<15} {n_log_n:<15.0f} {efficiency:<10.3f}"
    print(row)

---

## Part 4: Larger Block Sizes

In [None]:
# Test with larger arrays
import time

def benchmark_hillis_steele(n, iterations=100):
    """Time the single-block Hillis-Steele kernel on n random int32 values.

    Only the kernel launch plus the synchronize that waits for it is timed;
    re-uploading the input (needed because the scan works in place) happens
    outside the timed region.

    Args:
        n: number of elements, also used as the thread count of the single
            block. Must satisfy 1 <= n <= BLOCK_SIZE because the kernel's
            shared buffers hold BLOCK_SIZE elements.
        iterations: number of timed launches to average over (>= 1).

    Returns:
        (elapsed, correct): mean time per launch in microseconds, and whether
        the last result equals np.cumsum of the input.

    Raises:
        ValueError: if n or iterations is out of range.
    """
    if not 1 <= n <= BLOCK_SIZE:
        raise ValueError(f"n must be between 1 and {BLOCK_SIZE}, got {n}")
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    data = np.random.randint(0, 10, n).astype(np.int32)
    expected = np.cumsum(data)

    # Warmup launch so one-time startup cost is not part of the measurement
    d_data = cuda.to_device(data)
    hillis_steele_inclusive[1, n](d_data, n)
    cuda.synchronize()

    # Benchmark
    kernel_seconds = 0.0
    for _ in range(iterations):
        # Fresh copy of the input each round: the previous launch overwrote it
        d_data = cuda.to_device(data)
        cuda.synchronize()

        start = time.perf_counter()
        hillis_steele_inclusive[1, n](d_data, n)
        cuda.synchronize()
        kernel_seconds += time.perf_counter() - start
    elapsed = kernel_seconds / iterations * 1e6

    result = d_data.copy_to_host()
    correct = np.array_equal(result, expected)

    return elapsed, correct

# Table header followed by a rule of '=' characters
print(f"{'N':<10} {'Time (μs)':<15} {'Correct':<10}")
print("=" * 35)

# Every size fits in one block (the kernel's shared buffers hold 256 elements)
for n in [8, 32, 64, 128, 256]:
    time_us, correct = benchmark_hillis_steele(n)
    print(f"{n:<10} {time_us:<15.2f} {'✓' if correct else '✗':<10}")

---

## Exercises

### Exercise 1: Hillis-Steele Exclusive Scan

In [None]:
# TODO: Implement exclusive scan using Hillis-Steele
# Hint: Shift input right and insert 0 at beginning

@cuda.jit
def hillis_steele_exclusive(data, n):
    """Hillis-Steele exclusive scan (exercise stub - not implemented yet).

    Intended behavior, per the exercise notes: shift the input right by one
    position, insert 0 at the beginning, then scan. For the input
    [3, 1, 7, 0, 4, 1, 6, 3] the expected result is
    [0, 3, 4, 11, 11, 15, 16, 22].

    NOTE(review): the parameters presumably mirror hillis_steele_inclusive
    (device array plus element count) - confirm when implementing.
    """
    pass  # Your implementation

# Test
# Input:    [3, 1, 7, 0, 4, 1, 6, 3]
# Expected: [0, 3, 4, 11, 11, 15, 16, 22]

### Exercise 2: Generalize to Any Operator

In [None]:
# TODO: Implement max-scan using Hillis-Steele
# Each element should contain the max of all elements up to that point

@cuda.jit
def hillis_steele_max_scan(data, n):
    """Running maximum using Hillis-Steele (exercise stub - not implemented yet).

    Intended behavior, per the exercise notes: each element should end up
    holding the maximum of all elements up to and including that position.
    For the input [3, 1, 7, 0, 4, 1, 6, 3] the expected result is
    [3, 3, 7, 7, 7, 7, 7, 7].

    NOTE(review): the parameters presumably mirror hillis_steele_inclusive
    (device array plus element count) - confirm when implementing.
    """
    pass  # Your implementation

# Test
# Input:    [3, 1, 7, 0, 4, 1, 6, 3]
# Expected: [3, 3, 7, 7, 7, 7, 7, 7]

---

## Summary

### Hillis-Steele Algorithm

| Property | Value |
|----------|-------|
| Work | O(n log n) |
| Steps | O(log n) |
| Work-efficient | No |
| Simple to implement | Yes |

### CUDA C++ Key Pattern

```cpp
// Double buffering for race-free scan
__shared__ int buffer[2][BLOCK_SIZE];
int pout = 0, pin = 1;

for (int stride = 1; stride < n; stride *= 2) {
    pout = 1 - pout;
    pin = 1 - pin;
    
    if (tid >= stride)
        buffer[pout][tid] = buffer[pin][tid] + buffer[pin][tid - stride];
    else
        buffer[pout][tid] = buffer[pin][tid];
        
    __syncthreads();
}
```

### When to Use Hillis-Steele

- Small arrays where simplicity matters
- When step count is more important than work
- As building block for more complex algorithms

### Next: Blelloch Algorithm
Tomorrow we'll learn the work-efficient Blelloch scan with O(n) work.