In [None]:
# ⚙️ Setup
# On Google Colab, make sure numba is installed; elsewhere the install step is
# skipped and numba is expected to be available already.
import subprocess
import sys

try:
    import google.colab  # noqa: F401 -- imported only to detect the Colab runtime
except ImportError:
    pass  # not running on Colab: nothing to install
else:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Grid-wide sync is a CUDA C++ feature requiring cooperative launch!")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Numba may report the device name as bytes; decode it so the output reads
    # "GPU: Tesla T4" rather than "GPU: b'Tesla T4'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of printing nothing when no GPU is usable.
    print("No CUDA-capable GPU detected.")

---

## Part 1: Why Grid-Wide Sync?

### The Problem

```
Traditional CUDA - Multiple Kernel Launches:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

kernel1<<<blocks, threads>>>(data);  // Pass 1
cudaDeviceSynchronize();              // Host overhead!
kernel2<<<blocks, threads>>>(data);  // Pass 2
cudaDeviceSynchronize();              // Host overhead!
kernel3<<<blocks, threads>>>(data);  // Pass 3

Problems:
• Each launch has ~5-15μs overhead
• Data goes through L2 cache multiple times
• Can't keep data in L1/registers between passes

With Grid Sync - Single Kernel:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

__global__ void fusedKernel(data) {
    // Pass 1
    process1(data);
    grid.sync();      // GPU-only sync, ~1μs!
    
    // Pass 2
    process2(data);
    grid.sync();
    
    // Pass 3
    process3(data);
}
```

---

## Part 2: Basic Grid Synchronization

### CUDA C++ Grid Group (Primary)

```cpp
// grid_sync.cu - Basic grid-wide synchronization
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

/**
 * Three-pass element-wise demo fused into one kernel: every element x of
 * `data` becomes sqrtf(2*x + 1), with a grid-wide barrier between passes.
 *
 * Launch requirements:
 *   - Must be launched with cudaLaunchCooperativeKernel(); grid.sync() is
 *     only valid for cooperative launches.
 *   - Uses no shared memory.
 *
 * Each pass is a grid-stride loop, so all n elements are processed even when
 * the grid has fewer threads than n. That is the normal situation here,
 * because a cooperative grid is capped at the number of co-resident blocks.
 *
 * @param data  Device array of at least n floats, updated in place.
 * @param n     Number of elements to process; n <= 0 processes nothing.
 */
__global__ void gridSyncDemo(float* data, int n) {
    // ============================================
    // Get Grid Group
    // ============================================
    cg::grid_group grid = cg::this_grid();

    // 64-bit indices: `i += stride` cannot overflow for any int n.
    const long long tid = (long long)grid.thread_rank();  // Global thread index
    const long long stride = (long long)grid.size();      // Total threads in grid

    if (tid == 0) {
        printf("Grid has %d threads\n", (int)stride);
    }

    // Pass 1: Multiply
    for (long long i = tid; i < n; i += stride) {
        data[i] *= 2.0f;
    }

    // ============================================
    // Grid-Wide Synchronization
    // ============================================
    // Every thread reaches the barrier: the loops above never return early.
    grid.sync();  // ALL blocks wait here!

    // Pass 2: Add (safe - all threads done with pass 1)
    for (long long i = tid; i < n; i += stride) {
        data[i] += 1.0f;
    }

    grid.sync();  // Sync again

    // Pass 3: Sqrt
    for (long long i = tid; i < n; i += stride) {
        data[i] = sqrtf(data[i]);
    }
}

/**
 * Host driver for gridSyncDemo: sizes the grid so that all blocks can be
 * co-resident, launches the kernel cooperatively and prints the first result.
 *
 * Returns 0 on success and 1 if the device lacks cooperative-launch support
 * or any CUDA runtime call fails.
 */
int main() {
    const int N = 1 << 20;
    const int BLOCK_SIZE = 256;

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // ============================================
    // Query Maximum Blocks for Cooperative Launch
    // ============================================
    int device = 0;
    cudaDeviceProp prop;
    if (!ok(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties")) {
        return 1;
    }
    if (!prop.cooperativeLaunch) {
        fprintf(stderr, "Device %d does not support cooperative launch\n", device);
        return 1;
    }

    int numBlocksPerSM = 0;
    if (!ok(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                &numBlocksPerSM, gridSyncDemo, BLOCK_SIZE, 0),
            "cudaOccupancyMaxActiveBlocksPerMultiprocessor")) {
        return 1;
    }

    // A cooperative grid may not exceed the number of co-resident blocks.
    const int maxBlocks = numBlocksPerSM * prop.multiProcessorCount;
    const int wantedBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int numBlocks = (maxBlocks < wantedBlocks) ? maxBlocks : wantedBlocks;
    if (numBlocks < 1) {
        fprintf(stderr, "No block of %d threads fits on this device\n", BLOCK_SIZE);
        return 1;
    }

    printf("Using %d blocks (max %d)\n", numBlocks, maxBlocks);

    // Initialize
    float* h_data = new float[N];
    for (int i = 0; i < N; i++) h_data[i] = 4.0f;

    float* d_data = nullptr;
    const size_t bytes = (size_t)N * sizeof(float);

    // ============================================
    // Cooperative Kernel Launch
    // ============================================
    // args stores the ADDRESS of d_data; the pointer value is read at launch
    // time, i.e. after cudaMalloc below has filled it in.
    void* args[] = { &d_data, (void*)&N };

    // Short-circuit chain: the first failing step skips everything after it,
    // and the cleanup below still runs.
    const bool success =
        ok(cudaMalloc(&d_data, bytes), "cudaMalloc") &&
        ok(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
           "cudaMemcpy (host to device)") &&
        ok(cudaLaunchCooperativeKernel(
               (void*)gridSyncDemo,
               dim3(numBlocks),
               dim3(BLOCK_SIZE),
               args,
               0,       // Shared memory
               0),      // Stream (default)
           "cudaLaunchCooperativeKernel") &&
        ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
        ok(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
           "cudaMemcpy (device to host)");

    // Verify
    if (success) {
        printf("Result[0] = %.4f (expected %.4f)\n", h_data[0], sqrtf(4.0f * 2.0f + 1.0f));
    }

    delete[] h_data;
    cudaFree(d_data);  // no-op when d_data is still nullptr
    return success ? 0 : 1;
}
```

Compile: `nvcc -arch=sm_70 -o grid_sync grid_sync.cu`

---

## Part 3: Occupancy Requirements

### Why Limit Grid Size?

```
Grid Sync Requirement:
━━━━━━━━━━━━━━━━━━━━━━━

ALL blocks must be resident simultaneously!

If you launch too many blocks:
• Some blocks can't start
• Running blocks wait at sync()
• Waiting blocks never start
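• DEADLOCK!

Note: cudaLaunchCooperativeKernel() guards against this - an oversized
grid is rejected with cudaErrorCooperativeLaunchTooLarge instead of
hanging, so always check the launch's return code.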

Solution: Query max occupancy

cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel, blockSize, sharedMem);
maxBlocks = blocksPerSM × numSMs;
```

### Occupancy Calculator

```cpp
// occupancy_check.cu - Calculate safe grid size

#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <stdio.h>

namespace cg = cooperative_groups;

// Placeholder cooperative kernel used only as the subject of the occupancy
// query. `data` and `n` are not touched; the kernel's only action is a single
// grid-wide barrier.
__global__ void cooperativeKernel(float* data, int n) {
    // ... work ...
    cg::this_grid().sync();
}

/**
 * Computes the largest grid (in blocks) with which cooperativeKernel can be
 * launched cooperatively on the current device.
 *
 * @param blockSize  Threads per block of the intended launch.
 * @param sharedMem  Dynamic shared memory per block, in bytes (default 0).
 * @return blocks-per-SM occupancy times the SM count, or 0 when the device
 *         does not support cooperative launch or a CUDA runtime query fails
 *         (a diagnostic is printed in either case).
 */
int getMaxCooperativeBlocks(int blockSize, size_t sharedMem = 0) {
    int device = 0;
    cudaError_t err = cudaGetDevice(&device);
    if (err != cudaSuccess) {
        printf("cudaGetDevice failed: %s\n", cudaGetErrorString(err));
        return 0;
    }

    cudaDeviceProp prop;
    err = cudaGetDeviceProperties(&prop, device);
    if (err != cudaSuccess) {
        printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 0;
    }

    // Check if cooperative launch is supported
    if (!prop.cooperativeLaunch) {
        printf("Cooperative launch not supported!\n");
        return 0;
    }

    // Start from 0 so a failed query can never leak an indeterminate value.
    int blocksPerSM = 0;
    err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM,
        cooperativeKernel,
        blockSize,
        sharedMem);
    if (err != cudaSuccess) {
        printf("cudaOccupancyMaxActiveBlocksPerMultiprocessor failed: %s\n",
               cudaGetErrorString(err));
        return 0;
    }

    return blocksPerSM * prop.multiProcessorCount;
}

// Prints the cooperative-launch block limit for 256-, 512- and 1024-thread
// blocks. Each query runs immediately before its own report line.
int main() {
    const int limitFor256 = getMaxCooperativeBlocks(256);
    printf("Max cooperative blocks (256 threads): %d\n", limitFor256);

    const int limitFor512 = getMaxCooperativeBlocks(512);
    printf("Max cooperative blocks (512 threads): %d\n", limitFor512);

    const int limitFor1024 = getMaxCooperativeBlocks(1024);
    printf("Max cooperative blocks (1024 threads): %d\n", limitFor1024);

    return 0;
}
```

---

## Part 4: Practical Example - Multi-Pass Reduction

### CUDA C++ Grid Reduction (Primary)

```cpp
// grid_reduction.cu - Single-kernel full reduction
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Sums `val` across the 32-thread tile `warp` using shuffle-down steps with
// offsets 16, 8, 4, 2, 1. The complete sum ends up in the tile's thread of
// rank 0; the value returned to the other threads is only a partial sum.
__device__ float warpReduce(cg::thread_block_tile<32>& warp, float val) {
    int offset = warp.size() / 2;
    while (offset > 0) {
        val += warp.shfl_down(val, offset);
        offset /= 2;
    }
    return val;
}

// Block-wide sum built on warpReduce. Each warp reduces its values, lane 0 of
// every warp parks the warp total in `scratch`, and after a block barrier the
// first warp reduces those totals. The full block sum is only meaningful in
// the thread with block rank 0. `scratch` needs one float per warp.
static __device__ float blockReduceSum(cg::thread_block& block,
                                       cg::thread_block_tile<32>& warp,
                                       float* scratch, float value) {
    const int laneIdx = warp.thread_rank();
    const int warpIdx = block.thread_rank() / 32;

    value = warpReduce(warp, value);
    if (laneIdx == 0) {
        scratch[warpIdx] = value;
    }
    block.sync();

    const int warpCount = (block.size() + 31) / 32;
    if (block.thread_rank() < warpCount) {
        value = scratch[block.thread_rank()];
    } else {
        value = 0.0f;
    }
    if (warpIdx == 0) {
        value = warpReduce(warp, value);
    }
    return value;
}

// Sums input[0..n) in a single kernel using a grid-wide barrier.
//
//   Phase 1: every thread accumulates a grid-strided slice of `input`; each
//            block reduces its threads' totals and stores the block total in
//            output[blockIdx.x].
//   Barrier: grid.sync() - requires a cooperative launch.
//   Phase 2: block 0 sums the gridDim.x block totals and writes the final
//            result to output[0].
//
// `output` must hold at least gridDim.x floats. The dynamic shared memory
// must hold one float per warp of the block.
__global__ void gridReduce(float* input, float* output, int n) {
    extern __shared__ float sdata[];

    cg::grid_group grid = cg::this_grid();
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    // ---- Phase 1: per-thread partial sums, then one total per block ----
    const int stride = grid.size();
    float partial = 0.0f;
    int idx = grid.thread_rank();
    while (idx < n) {
        partial += input[idx];
        idx += stride;
    }

    partial = blockReduceSum(block, warp, sdata, partial);
    if (block.thread_rank() == 0) {
        output[blockIdx.x] = partial;
    }

    // ---- Wait until every block has published its total ----
    grid.sync();

    // ---- Phase 2: only block 0 continues past this point ----
    if (blockIdx.x != 0) {
        return;
    }

    const int partialCount = gridDim.x;
    float total = 0.0f;
    for (int b = block.thread_rank(); b < partialCount; b += block.size()) {
        total += output[b];
    }

    total = blockReduceSum(block, warp, sdata, total);

    // Final result
    if (block.thread_rank() == 0) {
        output[0] = total;
    }
}

/**
 * Host driver for gridReduce: sums N ones on the GPU with one cooperative
 * launch and prints the result.
 *
 * The per-block workspace is sized from the actual grid (one float per
 * block) rather than a fixed 1024 entries, so it cannot be overrun on
 * devices whose co-resident block count exceeds 1024.
 *
 * Returns 0 on success and 1 if the device lacks cooperative-launch support
 * or any CUDA runtime call fails.
 */
int main() {
    const int N = 1 << 24;  // 16M elements
    const int BLOCK_SIZE = 256;
    // The kernel stores one float per warp; 32 slots cover the 32 warps of a
    // 1024-thread block, so this is enough for any BLOCK_SIZE up to 1024.
    const size_t SMEM_BYTES = 32 * sizeof(float);

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // Get max blocks
    int device = 0;
    if (!ok(cudaGetDevice(&device), "cudaGetDevice")) return 1;

    cudaDeviceProp prop;
    if (!ok(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties")) {
        return 1;
    }
    if (!prop.cooperativeLaunch) {
        fprintf(stderr, "Device %d does not support cooperative launch\n", device);
        return 1;
    }

    int blocksPerSM = 0;
    if (!ok(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                &blocksPerSM, gridReduce, BLOCK_SIZE, SMEM_BYTES),
            "cudaOccupancyMaxActiveBlocksPerMultiprocessor")) {
        return 1;
    }

    const int numBlocks = blocksPerSM * prop.multiProcessorCount;
    if (numBlocks < 1) {
        fprintf(stderr, "No block of %d threads fits on this device\n", BLOCK_SIZE);
        return 1;
    }

    printf("Launching %d blocks\n", numBlocks);

    // Initialize with 1s
    float* h_input = new float[N];
    for (int i = 0; i < N; i++) h_input[i] = 1.0f;

    float* d_input = nullptr;
    float* d_output = nullptr;
    float result = 0.0f;

    const size_t inputBytes = (size_t)N * sizeof(float);
    const size_t workspaceBytes = (size_t)numBlocks * sizeof(float);  // one partial per block

    // args stores the ADDRESSES of the device pointers; their values are read
    // at launch time, i.e. after the cudaMalloc calls below have set them.
    void* args[] = { &d_input, &d_output, (void*)&N };

    // Short-circuit chain: the first failing step skips everything after it,
    // and the cleanup below still runs.
    const bool success =
        ok(cudaMalloc(&d_input, inputBytes), "cudaMalloc (input)") &&
        ok(cudaMalloc(&d_output, workspaceBytes), "cudaMalloc (workspace)") &&
        ok(cudaMemcpy(d_input, h_input, inputBytes, cudaMemcpyHostToDevice),
           "cudaMemcpy (host to device)") &&
        ok(cudaLaunchCooperativeKernel(
               (void*)gridReduce,
               dim3(numBlocks),
               dim3(BLOCK_SIZE),
               args,
               SMEM_BYTES,
               0),
           "cudaLaunchCooperativeKernel") &&
        ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
        ok(cudaMemcpy(&result, d_output, sizeof(float), cudaMemcpyDeviceToHost),
           "cudaMemcpy (device to host)");

    if (success) {
        printf("Sum = %.0f (expected %d)\n", result, N);
    }

    delete[] h_input;
    cudaFree(d_input);   // cudaFree(nullptr) is a no-op
    cudaFree(d_output);

    return success ? 0 : 1;
}
```

---

## Exercises

### Exercise 1: Grid Prefix Sum
Implement Blelloch scan using grid sync for multiple phases.

### Exercise 2: Iterative Solver
Implement Jacobi iteration with grid sync between iterations.

### Exercise 3: Histogram with Grid Sync
Build a global histogram in a single kernel using grid sync.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           GRID-WIDE SYNCHRONIZATION                     │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Get Grid Group:                                        │
│  cg::grid_group grid = cg::this_grid();                 │
│                                                         │
│  Synchronize:                                           │
│  grid.sync();  // All blocks wait here                  │
│                                                         │
│  Launch Requirements:                                   │
│  • Use cudaLaunchCooperativeKernel()                    │
│  • Limit blocks to max occupancy                        │
│  • Check prop.cooperativeLaunch                         │
│                                                         │
│  Benefits:                                              │
│  • Eliminate kernel launch overhead                     │
│  • Keep data in cache between phases                    │
│  • Single kernel for multi-pass algorithms              │
│                                                         │
│  Limitations:                                           │
│  • Grid size limited by occupancy                       │
│  • Requires compute capability 6.0+                     │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Dynamic Parallelism Basics