In [None]:
# ⚙️ Setup
# Installs Numba when running on Google Colab, then reports which GPU (if any)
# the Numba CUDA runtime can see.
import subprocess, sys
try:
    import google.colab  # Imported only to detect the Colab runtime
    # On Colab: install Numba into the interpreter that runs this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: nothing is installed; Numba must already be available.
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Multi-GPU patterns require multiple physical GPUs!")
if cuda.is_available():
    # Numba may report the device name as bytes; decode it so the output
    # does not show up as b'...'.
    gpu_name = cuda.get_current_device().name
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of printing nothing.
    print("No CUDA GPU detected - the CUDA cells below will not run here.")

---

## Part 1: Domain Decomposition

### Splitting Work Across GPUs

```
1D Domain Decomposition:
━━━━━━━━━━━━━━━━━━━━━━━━━

Full Array: [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
                    ↓
GPU 0:      [0 1 2 3 4 5 6 7]
GPU 1:      [8 9 10 11 12 13 14 15]

2D Domain Decomposition:
━━━━━━━━━━━━━━━━━━━━━━━━━

┌─────────┬─────────┐
│  GPU 0  │  GPU 1  │
├─────────┼─────────┤
│  GPU 2  │  GPU 3  │
└─────────┴─────────┘
```

### CUDA C++ Domain Decomposition (Primary)

```cpp
// domain_decomp.cu - 1D domain decomposition
#include <stdio.h>
#include <cuda_runtime.h>

// Fill one GPU's chunk with sin(0.01 * global index).
//   data         - buffer holding only this chunk, indexed locally from 0
//   n            - number of elements in the chunk
//   globalOffset - index of the chunk's first element within the full array
__global__ void processChunk(float* data, int n, int globalOffset) {
    const int localIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (localIdx >= n) return;

    // The value depends on the global position; the store uses the local one.
    const float x = (float)(globalOffset + localIdx) * 0.01f;
    data[localIdx] = sinf(x);
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Splits TOTAL_N elements across up to MAX_GPUS devices, launches processChunk
// on every device's chunk (one stream per device), then waits for all of them.
// Returns 0 on success, 1 if no device is found or any CUDA call fails.
int main() {
    const int MAX_GPUS = 4;       // Cap on devices used; also sizes the per-GPU arrays
    const int TOTAL_N = 1 << 24;  // 16M elements

    // Without this guard NUM_GPUS could be 0 and the chunk-size division
    // below would divide by zero.
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount < 1) {
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    const int NUM_GPUS = deviceCount < MAX_GPUS ? deviceCount : MAX_GPUS;

    printf("Using %d GPUs for %d elements\n", NUM_GPUS, TOTAL_N);

    // Calculate chunk sizes
    const int baseChunkSize = TOTAL_N / NUM_GPUS;
    const int remainder = TOTAL_N % NUM_GPUS;

    // Per-GPU state. Fixed-size arrays replace variable-length arrays, which
    // are a compiler extension rather than standard C++.
    float* d_data[MAX_GPUS];
    int chunkSizes[MAX_GPUS];
    int offsets[MAX_GPUS];
    cudaStream_t streams[MAX_GPUS];

    // ============================================
    // Calculate Per-GPU Chunks (Handle Remainder)
    // ============================================
    int currentOffset = 0;
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        // The first `remainder` GPUs take one extra element each, so the
        // chunk sizes always add up to TOTAL_N.
        chunkSizes[gpu] = baseChunkSize + (gpu < remainder ? 1 : 0);
        offsets[gpu] = currentOffset;
        currentOffset += chunkSizes[gpu];

        printf("GPU %d: offset=%d, size=%d\n",
               gpu, offsets[gpu], chunkSizes[gpu]);
    }

    // ============================================
    // Allocate on Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        // Allocations and streams belong to the device that is current
        // when they are created.
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_data[gpu], chunkSizes[gpu] * sizeof(float)));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));
    }

    // ============================================
    // Launch Kernels on Each GPU
    // ============================================
    const int blockSize = 256;

    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        const int numBlocks = (chunkSizes[gpu] + blockSize - 1) / blockSize;

        // Launches are asynchronous, so all GPUs end up working concurrently.
        processChunk<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_data[gpu], chunkSizes[gpu], offsets[gpu]);
        CUDA_CHECK(cudaGetLastError());  // Catches invalid launch configurations
    }

    // ============================================
    // Synchronize All GPUs
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        // Also surfaces errors raised while the kernel was executing.
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    printf("All GPUs finished processing!\n");

    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_data[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}
```

In [None]:
%%writefile domain_decomp.cu
// domain_decomp.cu - 1D domain decomposition
#include <stdio.h>
#include <cuda_runtime.h>

// Fill one GPU's chunk with sin(0.01 * global index).
//   data         - buffer holding only this chunk, indexed locally from 0
//   n            - number of elements in the chunk
//   globalOffset - index of the chunk's first element within the full array
__global__ void processChunk(float* data, int n, int globalOffset) {
    const int localIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (localIdx >= n) return;

    // The value depends on the global position; the store uses the local one.
    const float x = (float)(globalOffset + localIdx) * 0.01f;
    data[localIdx] = sinf(x);
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Splits TOTAL_N elements across up to MAX_GPUS devices, launches processChunk
// on every device's chunk (one stream per device), then waits for all of them.
// Returns 0 on success, 1 if no device is found or any CUDA call fails.
int main() {
    const int MAX_GPUS = 4;       // Cap on devices used; also sizes the per-GPU arrays
    const int TOTAL_N = 1 << 24;  // 16M elements

    // Without this guard NUM_GPUS could be 0 and the chunk-size division
    // below would divide by zero.
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount < 1) {
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    const int NUM_GPUS = deviceCount < MAX_GPUS ? deviceCount : MAX_GPUS;

    printf("Using %d GPUs for %d elements\n", NUM_GPUS, TOTAL_N);

    // Calculate chunk sizes
    const int baseChunkSize = TOTAL_N / NUM_GPUS;
    const int remainder = TOTAL_N % NUM_GPUS;

    // Per-GPU state. Fixed-size arrays replace variable-length arrays, which
    // are a compiler extension rather than standard C++.
    float* d_data[MAX_GPUS];
    int chunkSizes[MAX_GPUS];
    int offsets[MAX_GPUS];
    cudaStream_t streams[MAX_GPUS];

    // ============================================
    // Calculate Per-GPU Chunks (Handle Remainder)
    // ============================================
    int currentOffset = 0;
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        // The first `remainder` GPUs take one extra element each, so the
        // chunk sizes always add up to TOTAL_N.
        chunkSizes[gpu] = baseChunkSize + (gpu < remainder ? 1 : 0);
        offsets[gpu] = currentOffset;
        currentOffset += chunkSizes[gpu];

        printf("GPU %d: offset=%d, size=%d\n",
               gpu, offsets[gpu], chunkSizes[gpu]);
    }

    // ============================================
    // Allocate on Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        // Allocations and streams belong to the device that is current
        // when they are created.
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_data[gpu], chunkSizes[gpu] * sizeof(float)));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));
    }

    // ============================================
    // Launch Kernels on Each GPU
    // ============================================
    const int blockSize = 256;

    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        const int numBlocks = (chunkSizes[gpu] + blockSize - 1) / blockSize;

        // Launches are asynchronous, so all GPUs end up working concurrently.
        processChunk<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_data[gpu], chunkSizes[gpu], offsets[gpu]);
        CUDA_CHECK(cudaGetLastError());  // Catches invalid launch configurations
    }

    // ============================================
    // Synchronize All GPUs
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        // Also surfaces errors raised while the kernel was executing.
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    printf("All GPUs finished processing!\n");

    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_data[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}

In [None]:
# Build domain_decomp.cu for the sm_75 GPU architecture (change -arch if your GPU differs), then run it.
!nvcc -arch=sm_75 -o domain_decomp domain_decomp.cu
!./domain_decomp

---

## Part 2: Halo Exchange

### Stencil Operations Require Boundary Data

```
Problem: Stencil needs neighbors
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

GPU 0 data:  [a b c d e | f g h]   ← f,g,h need i,j,k (stored on GPU 1)
GPU 1 data:  [i j k | l m n o p]   ← i,j,k need f,g,h (stored on GPU 0)

Solution: Halo/Ghost cells
━━━━━━━━━━━━━━━━━━━━━━━━━━

GPU 0: [a b c d e | f g h] + [i j k]  ← halo from GPU 1
GPU 1: [f g h] + [i j k | l m n o p]  ← halo from GPU 0
        ↑                     
   halo from GPU 0
```

### CUDA C++ Halo Exchange (Primary)

```cpp
// halo_exchange.cu - Exchange boundary data between GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Halo (ghost-cell) width on each side of a chunk, in elements. It must be at
// least the stencil radius; the 5-point stencil below has radius 2.
#define HALO_SIZE 3

// 5-point moving average of in[], written to out[].
// n is the full buffer length including both halos. The first and last
// HALO_SIZE entries of out[] are never written.
__global__ void stencil(float* out, float* in, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Halo cells and threads beyond the buffer produce no output.
    if (i < HALO_SIZE || i >= n - HALO_SIZE) return;

    out[i] = 0.2f * (in[i-2] + in[i-1] + in[i] +
                     in[i+1] + in[i+2]);
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Two-GPU halo exchange followed by one stencil pass on each GPU.
// Per-GPU buffer layout, in elements:
//   [0, HALO_SIZE)                        left halo
//   [HALO_SIZE, HALO_SIZE + CHUNK_SIZE)   owned data
//   [HALO_SIZE + CHUNK_SIZE, TOTAL_SIZE)  right halo
// Returns 0 on success, 1 if fewer than 2 GPUs exist or a CUDA call fails.
int main() {
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        deviceCount = 0;  // Treat a failed query like "no GPUs" so the message below is printed
    }

    if (deviceCount < 2) {
        printf("Need 2 GPUs for halo exchange\n");
        return 1;
    }

    const int CHUNK_SIZE = 1024;  // Data per GPU
    const int TOTAL_SIZE = CHUNK_SIZE + 2 * HALO_SIZE;  // + halos
    const size_t BUFFER_BYTES = TOTAL_SIZE * sizeof(float);
    const size_t HALO_BYTES = HALO_SIZE * sizeof(float);

    float *d_in[2], *d_out[2];
    cudaStream_t streams[2];

    // Enable P2P where supported. Capability is queried separately for each
    // direction and each direction is enabled only if it was reported.
    // Without peer access the peer copies below still work; the runtime is
    // expected to stage them through host memory instead.
    int canAccess0to1 = 0, canAccess1to0 = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess0to1, 0, 1));
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess1to0, 1, 0));
    if (canAccess0to1) {
        CUDA_CHECK(cudaSetDevice(0));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));
    }
    if (canAccess1to0) {
        CUDA_CHECK(cudaSetDevice(1));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(0, 0));
    }

    // Allocate with halo space
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_in[gpu], BUFFER_BYTES));
        CUDA_CHECK(cudaMalloc(&d_out[gpu], BUFFER_BYTES));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));

        // Initialize. d_out is cleared too because the stencil never writes
        // its halo cells.
        CUDA_CHECK(cudaMemset(d_in[gpu], 0, BUFFER_BYTES));
        CUDA_CHECK(cudaMemset(d_out[gpu], 0, BUFFER_BYTES));

        // The memsets run on this device's default stream, which is not
        // ordered against the other device's stream. Finish them now so they
        // cannot overwrite a halo that a peer copy below has already filled.
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // ============================================
    // Halo Exchange
    // ============================================
    // GPU 0's right boundary -> GPU 1's left halo
    // GPU 1's left boundary -> GPU 0's right halo

    // GPU 0: copy its last HALO_SIZE owned elements to GPU 1's left halo.
    // Owned data ends at HALO_SIZE + CHUNK_SIZE, so the last HALO_SIZE owned
    // elements start at (HALO_SIZE + CHUNK_SIZE) - HALO_SIZE = CHUNK_SIZE.
    CUDA_CHECK(cudaMemcpyPeerAsync(
        d_in[1],                    // dst: GPU 1's left halo
        1,                          // dst device
        d_in[0] + CHUNK_SIZE,       // src: GPU 0's right boundary
        0,                          // src device
        HALO_BYTES,
        streams[0]));

    // GPU 1: copy its first HALO_SIZE owned elements to GPU 0's right halo.
    CUDA_CHECK(cudaMemcpyPeerAsync(
        d_in[0] + HALO_SIZE + CHUNK_SIZE,  // dst: GPU 0's right halo
        0,                                 // dst device
        d_in[1] + HALO_SIZE,               // src: GPU 1's left boundary
        1,                                 // src device
        HALO_BYTES,
        streams[1]));

    // Sync before compute: each GPU's input is written by the *other* GPU's
    // stream, so both copies must be complete before either stencil starts.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaStreamSynchronize(streams[0]));
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaStreamSynchronize(streams[1]));

    // ============================================
    // Compute Stencil
    // ============================================
    const int blockSize = 256;
    const int numBlocks = (TOTAL_SIZE + blockSize - 1) / blockSize;

    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        stencil<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_out[gpu], d_in[gpu], TOTAL_SIZE);
        CUDA_CHECK(cudaGetLastError());  // Catches invalid launch configurations
    }

    // Sync
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    printf("Halo exchange and stencil complete!\n");

    // Cleanup
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_in[gpu]));
        CUDA_CHECK(cudaFree(d_out[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}
```

In [None]:
%%writefile halo_exchange.cu
// halo_exchange.cu - Exchange boundary data between GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Halo (ghost-cell) width on each side of a chunk, in elements. It must be at
// least the stencil radius; the 5-point stencil below has radius 2.
#define HALO_SIZE 3

// 5-point moving average of in[], written to out[].
// n is the full buffer length including both halos. The first and last
// HALO_SIZE entries of out[] are never written.
__global__ void stencil(float* out, float* in, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Halo cells and threads beyond the buffer produce no output.
    if (i < HALO_SIZE || i >= n - HALO_SIZE) return;

    out[i] = 0.2f * (in[i-2] + in[i-1] + in[i] +
                     in[i+1] + in[i+2]);
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Two-GPU halo exchange followed by one stencil pass on each GPU.
// Per-GPU buffer layout, in elements:
//   [0, HALO_SIZE)                        left halo
//   [HALO_SIZE, HALO_SIZE + CHUNK_SIZE)   owned data
//   [HALO_SIZE + CHUNK_SIZE, TOTAL_SIZE)  right halo
// Returns 0 on success, 1 if fewer than 2 GPUs exist or a CUDA call fails.
int main() {
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        deviceCount = 0;  // Treat a failed query like "no GPUs" so the message below is printed
    }

    if (deviceCount < 2) {
        printf("Need 2 GPUs for halo exchange\n");
        return 1;
    }

    const int CHUNK_SIZE = 1024;  // Data per GPU
    const int TOTAL_SIZE = CHUNK_SIZE + 2 * HALO_SIZE;  // + halos
    const size_t BUFFER_BYTES = TOTAL_SIZE * sizeof(float);
    const size_t HALO_BYTES = HALO_SIZE * sizeof(float);

    float *d_in[2], *d_out[2];
    cudaStream_t streams[2];

    // Enable P2P where supported. Capability is queried separately for each
    // direction and each direction is enabled only if it was reported.
    // Without peer access the peer copies below still work; the runtime is
    // expected to stage them through host memory instead.
    int canAccess0to1 = 0, canAccess1to0 = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess0to1, 0, 1));
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess1to0, 1, 0));
    if (canAccess0to1) {
        CUDA_CHECK(cudaSetDevice(0));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));
    }
    if (canAccess1to0) {
        CUDA_CHECK(cudaSetDevice(1));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(0, 0));
    }

    // Allocate with halo space
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_in[gpu], BUFFER_BYTES));
        CUDA_CHECK(cudaMalloc(&d_out[gpu], BUFFER_BYTES));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));

        // Initialize. d_out is cleared too because the stencil never writes
        // its halo cells.
        CUDA_CHECK(cudaMemset(d_in[gpu], 0, BUFFER_BYTES));
        CUDA_CHECK(cudaMemset(d_out[gpu], 0, BUFFER_BYTES));

        // The memsets run on this device's default stream, which is not
        // ordered against the other device's stream. Finish them now so they
        // cannot overwrite a halo that a peer copy below has already filled.
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // ============================================
    // Halo Exchange
    // ============================================
    // GPU 0's right boundary -> GPU 1's left halo
    // GPU 1's left boundary -> GPU 0's right halo

    // GPU 0: copy its last HALO_SIZE owned elements to GPU 1's left halo.
    // Owned data ends at HALO_SIZE + CHUNK_SIZE, so the last HALO_SIZE owned
    // elements start at (HALO_SIZE + CHUNK_SIZE) - HALO_SIZE = CHUNK_SIZE.
    CUDA_CHECK(cudaMemcpyPeerAsync(
        d_in[1],                    // dst: GPU 1's left halo
        1,                          // dst device
        d_in[0] + CHUNK_SIZE,       // src: GPU 0's right boundary
        0,                          // src device
        HALO_BYTES,
        streams[0]));

    // GPU 1: copy its first HALO_SIZE owned elements to GPU 0's right halo.
    CUDA_CHECK(cudaMemcpyPeerAsync(
        d_in[0] + HALO_SIZE + CHUNK_SIZE,  // dst: GPU 0's right halo
        0,                                 // dst device
        d_in[1] + HALO_SIZE,               // src: GPU 1's left boundary
        1,                                 // src device
        HALO_BYTES,
        streams[1]));

    // Sync before compute: each GPU's input is written by the *other* GPU's
    // stream, so both copies must be complete before either stencil starts.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaStreamSynchronize(streams[0]));
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaStreamSynchronize(streams[1]));

    // ============================================
    // Compute Stencil
    // ============================================
    const int blockSize = 256;
    const int numBlocks = (TOTAL_SIZE + blockSize - 1) / blockSize;

    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        stencil<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_out[gpu], d_in[gpu], TOTAL_SIZE);
        CUDA_CHECK(cudaGetLastError());  // Catches invalid launch configurations
    }

    // Sync
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    printf("Halo exchange and stencil complete!\n");

    // Cleanup
    for (int gpu = 0; gpu < 2; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_in[gpu]));
        CUDA_CHECK(cudaFree(d_out[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}

In [None]:
# Build halo_exchange.cu for the sm_75 GPU architecture (change -arch if your GPU differs), then run it.
# The program prints a message and exits with status 1 when fewer than 2 GPUs are present.
!nvcc -arch=sm_75 -o halo_exchange halo_exchange.cu
!./halo_exchange

---

## Part 3: Multi-GPU Reduction

### Combining Results Across GPUs

```cpp
// multi_gpu_reduction.cu - Reduce across multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Block-level sum reduction.
// Each block adds up to blockDim.x consecutive elements of in[] (indices >= n
// count as 0) and stores its total in out[blockIdx.x].
// Launch requirements visible in the code: blockDim.x * sizeof(float) bytes of
// dynamic shared memory, and a power-of-two blockDim.x (the halving loop
// would otherwise skip elements).
// NOTE: blocks are not synchronized with each other, so when more than one
// block is launched `out` must not overlap the part of `in` other blocks read.
__global__ void reduce(float* out, float* in, int n) {
    extern __shared__ float partials[];

    const int lane = threadIdx.x;
    const int globalIdx = blockIdx.x * blockDim.x + lane;

    // Stage one element per thread in shared memory, padding the tail with zeros.
    if (globalIdx < n) {
        partials[lane] = in[globalIdx];
    } else {
        partials[lane] = 0.0f;
    }
    __syncthreads();

    // Tree reduction: halve the number of active threads every step.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (lane < stride) {
            partials[lane] += partials[lane + stride];
        }
        __syncthreads();  // Every addition of this step must land before the next step
    }

    // partials[0] now holds the block total.
    if (lane == 0) {
        out[blockIdx.x] = partials[0];
    }
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
// On such an early return, resources are left for process teardown to release.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Sums TOTAL_N ones across up to MAX_GPUS devices:
//   Phase 1 - each GPU reduces its share to a single value,
//   Phase 2 - the per-GPU sums are copied to the host,
//   Phase 3 - the host adds them up.
// Returns 0 on success, 1 if no device is found or any CUDA call fails.
int main() {
    const int MAX_GPUS = 4;  // Cap on devices used; also sizes the per-GPU arrays
    const int TOTAL_N = 1 << 24;
    const int BLOCK_SIZE = 256;
    const size_t SMEM_BYTES = BLOCK_SIZE * sizeof(float);

    // Without this guard NUM_GPUS could be 0 and the division below would
    // divide by zero.
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount < 1) {
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    const int NUM_GPUS = deviceCount < MAX_GPUS ? deviceCount : MAX_GPUS;

    // Share TOTAL_N out so that no element is dropped when it is not
    // divisible by NUM_GPUS: the first `remainder` GPUs get one extra element.
    const int baseCount = TOTAL_N / NUM_GPUS;
    const int remainder = TOTAL_N % NUM_GPUS;
    const int maxCount = baseCount + (remainder > 0 ? 1 : 0);

    // Per-GPU data (fixed-size arrays; variable-length arrays are not standard C++)
    float* d_data[MAX_GPUS];
    float* d_partial[MAX_GPUS];  // Output of the first reduction pass
    float* d_scratch[MAX_GPUS];  // Second buffer so later passes never reduce in place
    float* d_result[MAX_GPUS];   // Whichever buffer ends up holding the GPU's sum
    int counts[MAX_GPUS];        // Elements assigned to each GPU
    float h_partial[MAX_GPUS];   // Final partial sums
    cudaStream_t streams[MAX_GPUS];

    // One host buffer of ones, sized for the largest share, reused for every GPU.
    float* h_ones = new float[maxCount];
    for (int i = 0; i < maxCount; i++) h_ones[i] = 1.0f;

    // Allocate and initialize
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        counts[gpu] = baseCount + (gpu < remainder ? 1 : 0);
        const int numBlocks = (counts[gpu] + BLOCK_SIZE - 1) / BLOCK_SIZE;
        // The largest result ever written to the scratch buffer is the
        // output of the second pass.
        const int scratchLen = (numBlocks + BLOCK_SIZE - 1) / BLOCK_SIZE;

        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_data[gpu], counts[gpu] * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_partial[gpu], numBlocks * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_scratch[gpu], scratchLen * sizeof(float)));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));

        // Init with 1s
        CUDA_CHECK(cudaMemcpy(d_data[gpu], h_ones, counts[gpu] * sizeof(float),
                              cudaMemcpyHostToDevice));
    }
    delete[] h_ones;

    // ============================================
    // Phase 1: Reduce Within Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));

        int remaining = (counts[gpu] + BLOCK_SIZE - 1) / BLOCK_SIZE;
        reduce<<<remaining, BLOCK_SIZE, SMEM_BYTES, streams[gpu]>>>(
            d_partial[gpu], d_data[gpu], counts[gpu]);
        CUDA_CHECK(cudaGetLastError());

        // Further passes until one value is left. Input and output alternate
        // between two buffers: reducing in place would let one block
        // overwrite out[k] while another block is still reading in[k].
        float* src = d_partial[gpu];
        float* dst = d_scratch[gpu];
        while (remaining > 1) {
            const int newBlocks = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
            reduce<<<newBlocks, BLOCK_SIZE, SMEM_BYTES, streams[gpu]>>>(
                dst, src, remaining);
            CUDA_CHECK(cudaGetLastError());

            float* swapTmp = src;
            src = dst;
            dst = swapTmp;
            remaining = newBlocks;
        }
        d_result[gpu] = src;  // After the final swap, src holds the sum at index 0
    }

    // ============================================
    // Phase 2: Collect Partial Sums to Host
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        // Same stream as the kernels, so the copy runs after the reduction.
        CUDA_CHECK(cudaMemcpyAsync(&h_partial[gpu], d_result[gpu], sizeof(float),
                                   cudaMemcpyDeviceToHost, streams[gpu]));
    }

    // Sync all
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    // ============================================
    // Phase 3: Final Sum on Host
    // ============================================
    float total = 0.0f;
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        printf("GPU %d partial sum: %.0f\n", gpu, h_partial[gpu]);
        total += h_partial[gpu];
    }

    printf("Total sum: %.0f (expected %d)\n", total, TOTAL_N);

    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_data[gpu]));
        CUDA_CHECK(cudaFree(d_partial[gpu]));
        CUDA_CHECK(cudaFree(d_scratch[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}
```

In [None]:
%%writefile multi_gpu_reduction.cu
// multi_gpu_reduction.cu - Reduce across multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Block-level sum reduction.
// Each block adds up to blockDim.x consecutive elements of in[] (indices >= n
// count as 0) and stores its total in out[blockIdx.x].
// Launch requirements visible in the code: blockDim.x * sizeof(float) bytes of
// dynamic shared memory, and a power-of-two blockDim.x (the halving loop
// would otherwise skip elements).
// NOTE: blocks are not synchronized with each other, so when more than one
// block is launched `out` must not overlap the part of `in` other blocks read.
__global__ void reduce(float* out, float* in, int n) {
    extern __shared__ float partials[];

    const int lane = threadIdx.x;
    const int globalIdx = blockIdx.x * blockDim.x + lane;

    // Stage one element per thread in shared memory, padding the tail with zeros.
    if (globalIdx < n) {
        partials[lane] = in[globalIdx];
    } else {
        partials[lane] = 0.0f;
    }
    __syncthreads();

    // Tree reduction: halve the number of active threads every step.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (lane < stride) {
            partials[lane] += partials[lane + stride];
        }
        __syncthreads();  // Every addition of this step must land before the next step
    }

    // partials[0] now holds the block total.
    if (lane == 0) {
        out[blockIdx.x] = partials[0];
    }
}

// Report a failed CUDA runtime call and leave main() with exit code 1.
// Expands to `return 1`, so use it only inside functions returning int.
// On such an early return, resources are left for process teardown to release.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(status_), __FILE__, __LINE__); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Sums TOTAL_N ones across up to MAX_GPUS devices:
//   Phase 1 - each GPU reduces its share to a single value,
//   Phase 2 - the per-GPU sums are copied to the host,
//   Phase 3 - the host adds them up.
// Returns 0 on success, 1 if no device is found or any CUDA call fails.
int main() {
    const int MAX_GPUS = 4;  // Cap on devices used; also sizes the per-GPU arrays
    const int TOTAL_N = 1 << 24;
    const int BLOCK_SIZE = 256;
    const size_t SMEM_BYTES = BLOCK_SIZE * sizeof(float);

    // Without this guard NUM_GPUS could be 0 and the division below would
    // divide by zero.
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount < 1) {
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    const int NUM_GPUS = deviceCount < MAX_GPUS ? deviceCount : MAX_GPUS;

    // Share TOTAL_N out so that no element is dropped when it is not
    // divisible by NUM_GPUS: the first `remainder` GPUs get one extra element.
    const int baseCount = TOTAL_N / NUM_GPUS;
    const int remainder = TOTAL_N % NUM_GPUS;
    const int maxCount = baseCount + (remainder > 0 ? 1 : 0);

    // Per-GPU data (fixed-size arrays; variable-length arrays are not standard C++)
    float* d_data[MAX_GPUS];
    float* d_partial[MAX_GPUS];  // Output of the first reduction pass
    float* d_scratch[MAX_GPUS];  // Second buffer so later passes never reduce in place
    float* d_result[MAX_GPUS];   // Whichever buffer ends up holding the GPU's sum
    int counts[MAX_GPUS];        // Elements assigned to each GPU
    float h_partial[MAX_GPUS];   // Final partial sums
    cudaStream_t streams[MAX_GPUS];

    // One host buffer of ones, sized for the largest share, reused for every GPU.
    float* h_ones = new float[maxCount];
    for (int i = 0; i < maxCount; i++) h_ones[i] = 1.0f;

    // Allocate and initialize
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        counts[gpu] = baseCount + (gpu < remainder ? 1 : 0);
        const int numBlocks = (counts[gpu] + BLOCK_SIZE - 1) / BLOCK_SIZE;
        // The largest result ever written to the scratch buffer is the
        // output of the second pass.
        const int scratchLen = (numBlocks + BLOCK_SIZE - 1) / BLOCK_SIZE;

        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaMalloc(&d_data[gpu], counts[gpu] * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_partial[gpu], numBlocks * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_scratch[gpu], scratchLen * sizeof(float)));
        CUDA_CHECK(cudaStreamCreate(&streams[gpu]));

        // Init with 1s
        CUDA_CHECK(cudaMemcpy(d_data[gpu], h_ones, counts[gpu] * sizeof(float),
                              cudaMemcpyHostToDevice));
    }
    delete[] h_ones;

    // ============================================
    // Phase 1: Reduce Within Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));

        int remaining = (counts[gpu] + BLOCK_SIZE - 1) / BLOCK_SIZE;
        reduce<<<remaining, BLOCK_SIZE, SMEM_BYTES, streams[gpu]>>>(
            d_partial[gpu], d_data[gpu], counts[gpu]);
        CUDA_CHECK(cudaGetLastError());

        // Further passes until one value is left. Input and output alternate
        // between two buffers: reducing in place would let one block
        // overwrite out[k] while another block is still reading in[k].
        float* src = d_partial[gpu];
        float* dst = d_scratch[gpu];
        while (remaining > 1) {
            const int newBlocks = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
            reduce<<<newBlocks, BLOCK_SIZE, SMEM_BYTES, streams[gpu]>>>(
                dst, src, remaining);
            CUDA_CHECK(cudaGetLastError());

            float* swapTmp = src;
            src = dst;
            dst = swapTmp;
            remaining = newBlocks;
        }
        d_result[gpu] = src;  // After the final swap, src holds the sum at index 0
    }

    // ============================================
    // Phase 2: Collect Partial Sums to Host
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        // Same stream as the kernels, so the copy runs after the reduction.
        CUDA_CHECK(cudaMemcpyAsync(&h_partial[gpu], d_result[gpu], sizeof(float),
                                   cudaMemcpyDeviceToHost, streams[gpu]));
    }

    // Sync all
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaStreamSynchronize(streams[gpu]));
    }

    // ============================================
    // Phase 3: Final Sum on Host
    // ============================================
    float total = 0.0f;
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        printf("GPU %d partial sum: %.0f\n", gpu, h_partial[gpu]);
        total += h_partial[gpu];
    }

    printf("Total sum: %.0f (expected %d)\n", total, TOTAL_N);

    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaFree(d_data[gpu]));
        CUDA_CHECK(cudaFree(d_partial[gpu]));
        CUDA_CHECK(cudaFree(d_scratch[gpu]));
        CUDA_CHECK(cudaStreamDestroy(streams[gpu]));
    }

    return 0;
}

In [None]:
# Build multi_gpu_reduction.cu for the sm_75 GPU architecture (change -arch if your GPU differs), then run it.
!nvcc -arch=sm_75 -o multi_gpu_reduction multi_gpu_reduction.cu
!./multi_gpu_reduction

---

## Part 4: Load Balancing

### Handling Non-Uniform Work

```
Static Partitioning (Simple but can be unbalanced):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GPU 0: [0, N/4)   → May be easy work
GPU 1: [N/4, N/2) → May be hard work
GPU 2: [N/2, 3N/4) → Variable
GPU 3: [3N/4, N)  → Variable

Dynamic Load Balancing:
━━━━━━━━━━━━━━━━━━━━━━━━
• Work queue with atomic counter
• GPUs grab chunks as they finish
• Better for non-uniform work
```

```cpp
// Simple dynamic work-queue pattern: thread blocks claim chunks from a shared
// counter until none are left.
// NOTE: a __device__ variable exists separately on each GPU, so this counter
// balances the blocks of one device only. A queue shared between GPUs needs a
// counter that every device can reach - not shown here.
__device__ int workCounter = 0;

// Each block repeatedly claims one chunk of `chunkSize` elements and applies
// processElement() to every element of it, all threads of the block cooperating.
//   data        - totalChunks * chunkSize elements, processed in place
//   totalChunks - number of chunks in the queue
//   chunkSize   - elements per chunk
// workCounter must be 0 before the launch.
__global__ void dynamicWork(float* data, int totalChunks, int chunkSize) {
    // One claim per block. If every thread called atomicAdd itself, each
    // thread would receive a different chunk while the strided loop below
    // assumes the whole block works on the same one.
    __shared__ int claimedChunk;

    while (true) {
        // Atomically get next chunk (one thread claims, everyone reads it)
        if (threadIdx.x == 0) {
            claimedChunk = atomicAdd(&workCounter, 1);
        }
        __syncthreads();
        const int myChunk = claimedChunk;

        // Every thread read the same value, so the whole block exits together.
        if (myChunk >= totalChunks) break;

        // Process chunk; size_t keeps the offset from overflowing int.
        const size_t start = (size_t)myChunk * chunkSize;
        for (int i = threadIdx.x; i < chunkSize; i += blockDim.x) {
            data[start + i] = processElement(data[start + i]);
        }

        // All threads must have read claimedChunk before thread 0 overwrites
        // it in the next iteration.
        __syncthreads();
    }
}
```

---

## Exercises

### Exercise 1: 2D Decomposition
Implement 2D domain decomposition for a matrix operation.

### Exercise 2: Ring AllReduce
Implement ring allreduce pattern for multi-GPU sum.

### Exercise 3: Async Halo Exchange
Overlap halo exchange with interior computation.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│              MULTI-GPU PATTERNS                         │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Domain Decomposition:                                  │
│  • Split data into chunks per GPU                       │
│  • Handle remainder for non-divisible sizes             │
│  • Each GPU processes its chunk independently           │
│                                                         │
│  Halo Exchange:                                         │
│  • For stencil operations needing neighbors             │
│  • Allocate extra space for ghost cells                 │
│  • Exchange boundaries before compute                   │
│                                                         │
│  Reduction:                                             │
│  • Reduce within each GPU first                         │
│  • Collect partial sums to host or one GPU              │
│  • Final reduction on collected values                  │
│                                                         │
│  Load Balancing:                                        │
│  • Static: simple, good for uniform work                │
│  • Dynamic: work queues for variable work               │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Advanced Optimization Review