In [None]:
# ⚙️ Setup
import subprocess, sys

# numba is installed via pip only when the google.colab module can be
# imported; if that import fails, the installation step is skipped.
try:
    import google.colab
except ImportError:
    pass
else:
    pip_install = [sys.executable, "-m", "pip", "install", "-q", "numba"]
    subprocess.check_call(pip_install)

import numpy as np
from numba import cuda

# Reminder banners: the lesson itself is written in CUDA C++.
for banner in (
    "⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.",
    "⚠️  Cooperative Groups are a CUDA C++ feature!",
):
    print(banner)

# Report the current device name only when numba can see a CUDA GPU.
if cuda.is_available():
    device = cuda.get_current_device()
    print(f"GPU: {device.name}")

---

## Part 1: What Are Cooperative Groups?

### Traditional CUDA Synchronization

```
Traditional CUDA:
━━━━━━━━━━━━━━━━━
__syncthreads()  → Only syncs within block
Warp shuffle     → Only within warp

Problems:
• No grid-wide sync
• No sub-warp sync
• No dynamic grouping

Cooperative Groups Solution:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Flexible group abstraction
• Groups can be partitioned
• Sync at any granularity
• Works with divergent code
```

### Group Hierarchy

```
┌────────────────────────────────────────┐
│               Grid Group               │
│  ┌──────────────────────────────────┐  │
│  │         Thread Block             │  │
│  │  ┌────────────────────────────┐  │  │
│  │  │     Tile (32 threads)      │  │  │
│  │  │  ┌──────────────────────┐  │  │  │
│  │  │  │ Coalesced (active)   │  │  │  │
│  │  │  └──────────────────────┘  │  │  │
│  │  └────────────────────────────┘  │  │
│  └──────────────────────────────────┘  │
└────────────────────────────────────────┘
```

---

## Part 2: Thread Block Groups

### CUDA C++ Thread Block Group (Primary)

```cpp
// thread_block_group.cu - Basic cooperative groups
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Demonstrates the thread_block group: every in-range element of `data` is
// doubled, the block synchronizes through the group handle, and then 1.0f is
// added to the same element.
//
//   data : device array with at least `n` floats (read and written in place)
//   n    : number of valid elements; threads whose global index is >= n
//          skip both updates but still reach the barrier
//
// The global index is built from blockIdx.x / blockDim.x / threadIdx.x only,
// i.e. the kernel is written for a 1D launch.
__global__ void blockGroupDemo(float* data, int n) {
    // Handle representing all threads of the current block.
    cg::thread_block tb = cg::this_thread_block();

    // Rank of this thread inside the block and the block's thread count
    // (these match threadIdx.x and blockDim.x for a 1D block).
    const int rankInBlock = tb.thread_rank();
    const int threadsInBlock = tb.size();

    // One report line per block, printed by the rank-0 thread.
    if (rankInBlock == 0) {
        printf("Block %d: size=%d\n", blockIdx.x, threadsInBlock);
    }

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = gid < n;

    // Phase 1: scale.
    if (inRange) {
        data[gid] *= 2.0f;
    }

    // Block-wide barrier through the group object (equivalent to
    // __syncthreads()). It sits outside the bounds check so every thread of
    // the block reaches it.
    tb.sync();

    // Phase 2: offset.
    if (inRange) {
        data[gid] += 1.0f;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: allocates N floats on the device, zero-fills them, runs
// blockGroupDemo on 4 blocks of 256 threads and waits for completion.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
    const int N = 1024;
    const size_t bytes = N * sizeof(float);
    float* d_data = nullptr;

    bool ok = checkCuda(cudaMalloc(&d_data, bytes), "cudaMalloc");

    // cudaMalloc does not initialize memory and the kernel reads data[idx]
    // before writing it, so give the buffer a defined value first
    // (all-zero bytes == 0.0f).
    ok = ok && checkCuda(cudaMemset(d_data, 0, bytes), "cudaMemset");

    if (ok) {
        blockGroupDemo<<<4, 256>>>(d_data, N);
        // A kernel launch returns no status; configuration errors are
        // reported through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "blockGroupDemo launch");
    }

    // Waits for the kernel (and flushes its printf output); also surfaces
    // errors that occurred while the kernel was running.
    ok = ok && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Single exit path: cudaFree(nullptr) is a no-op, so this is safe even
    // when the allocation itself failed.
    cudaFree(d_data);
    return ok ? 0 : 1;
}
```

Compile: `nvcc -o block_group thread_block_group.cu`

---

## Part 3: Tile Groups

### CUDA C++ Tile Partitioning (Primary)

```cpp
// tile_groups.cu - Warp-level programming with tiles
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Sums the values handled by each 32-thread tile of the block with tile
// shuffles and prints one line per tile.
//
//   data : device array of floats, read only
//   n    : number of valid elements; threads with a global index >= n
//          contribute 0.0f to their tile's sum
//
// The global index uses only the x dimension (1D launch).
__global__ void tileGroupDemo(float* data, int n) {
    cg::thread_block tb = cg::this_thread_block();

    // Split the block into tiles of 32 threads.
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    const int lane = warp.thread_rank();          // position inside the tile
    const int warp_id = warp.meta_group_rank();   // index of this tile in the block

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    float acc = 0.0f;
    if (gid < n) {
        acc = data[gid];
    }

    // Shuffle-down tree: the distance halves each round (16, 8, 4, 2, 1);
    // after the last round the thread with rank 0 holds the tile's total.
    int offset = warp.size() / 2;
    while (offset > 0) {
        acc += warp.shfl_down(acc, offset);
        offset /= 2;
    }

    if (lane == 0) {
        printf("Block %d, Warp %d: sum = %.1f\n", 
               blockIdx.x, warp_id, acc);
    }

    // Barrier scoped to this tile only.
    warp.sync();
}

// ============================================
// Smaller Tiles (Sub-Warp)
// ============================================
// Partitions the block into tiles of 4, 8 and 16 threads, synchronizes each
// of them, then sums threadIdx.x over every 4-thread tile with shuffles and
// prints the result from the tile's rank-0 thread.
__global__ void smallTileDemo() {
    cg::thread_block tb = cg::this_thread_block();

    // Three independent partitions of the same block.
    cg::thread_block_tile<4> quad = cg::tiled_partition<4>(tb);
    cg::thread_block_tile<8> oct = cg::tiled_partition<8>(tb);
    cg::thread_block_tile<16> half = cg::tiled_partition<16>(tb);

    // Each barrier only involves the threads of the corresponding tile.
    quad.sync();
    oct.sync();
    half.sync();

    // Every thread starts from its own threadIdx.x; shuffle distances 2 and 1
    // leave the 4-thread total in the tile's rank-0 thread.
    float quadSum = static_cast<float>(threadIdx.x);
    int step = quad.size() / 2;
    while (step > 0) {
        quadSum += quad.shfl_down(quadSum, step);
        step /= 2;
    }

    if (quad.thread_rank() == 0) {
        printf("Thread %d: tile4 sum = %.0f\n", threadIdx.x, quadSum);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: uploads N floats set to 1.0f, runs tileGroupDemo on one block
// of 128 threads, then smallTileDemo on one block of 32 threads.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main() {
    const int N = 256;
    const size_t bytes = N * sizeof(float);
    float* d_data = nullptr;

    float h_data[N];
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;

    bool ok = checkCuda(cudaMalloc(&d_data, bytes), "cudaMalloc");
    ok = ok && checkCuda(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy host-to-device");

    if (ok) {
        printf("=== Warp Tiles ===\n");
        // 128 threads are launched, so only the first 128 of the N uploaded
        // elements are read by the kernel.
        tileGroupDemo<<<1, 128>>>(d_data, N);
        // Launches return no status; query it explicitly, then wait so the
        // kernel's printf output appears and execution errors are reported.
        ok = checkCuda(cudaGetLastError(), "tileGroupDemo launch");
        ok = ok && checkCuda(cudaDeviceSynchronize(), "tileGroupDemo execution");
    }

    if (ok) {
        printf("\n=== Small Tiles ===\n");
        smallTileDemo<<<1, 32>>>();
        ok = checkCuda(cudaGetLastError(), "smallTileDemo launch");
        ok = ok && checkCuda(cudaDeviceSynchronize(), "smallTileDemo execution");
    }

    // Single exit path: cudaFree(nullptr) is a no-op, so this is safe even
    // when the allocation itself failed.
    cudaFree(d_data);
    return ok ? 0 : 1;
}
```

---

## Part 4: Coalesced Groups

### Handling Divergent Code

```cpp
// coalesced_groups.cu - Groups for divergent code
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Demonstrates a coalesced group inside a divergent branch: threads whose
// flag is non-zero form a group, sum their `data` values with shuffles, and
// the group's rank-0 thread prints the group size and the sum.
//
//   flags : device array of at least `n` ints; non-zero marks a thread active
//   data  : device array of at least `n` floats; read by active threads and
//           overwritten with each thread's reduction value (see below)
//   n     : number of valid elements
//
// cg::coalesced_threads() groups the threads of one warp that are active at
// the call site, so the report is per warp, not per block.
__global__ void coalescedDemo(int* flags, float* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    // ============================================
    // Divergent Branch
    // ============================================
    if (tid < n && flags[tid]) {  // Only some threads active!
        
        // Get group of active threads
        cg::coalesced_group active = cg::coalesced_threads();
        
        int rank = active.thread_rank();  // Rank among active threads
        int size = active.size();         // How many active
        
        // Collective operation among active threads only
        float val = data[tid];
        
        // The number of active threads is data dependent and need not be a
        // power of two, so a plain "size / 2, size / 4, ..." loop would drop
        // elements (e.g. the third value of a 3-thread group). Start at the
        // largest power of two below `size` instead.
        int offset = 1;
        while (offset * 2 < size) {
            offset *= 2;
        }
        
        // Reduction among active threads
        for (; offset > 0; offset /= 2) {
            // Every thread of the group takes part in the shuffle ...
            float other = active.shfl_down(val, offset);
            // ... but only accumulates when the partner rank really exists.
            if (rank + offset < size) {
                val += other;
            }
        }
        
        // Rank 0 now holds the sum over the whole group.
        if (rank == 0) {
            printf("Block %d: %d active threads, sum = %.1f\n",
                   blockIdx.x, size, val);
        }
        
        // Sync among active threads
        active.sync();
        
        // Continue processing...
        // Note: only rank 0 stores the full sum; the other ranks store the
        // partial sum they accumulated during the reduction.
        data[tid] = val;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: marks every even index as active, sets all data to 1.0f and
// runs coalescedDemo on a single block of 32 threads.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
    const int N = 32;
    int* d_flags = nullptr;
    float* d_data = nullptr;
    
    // Set alternating flags (half threads active)
    int h_flags[N];
    float h_data[N];
    for (int i = 0; i < N; i++) {
        h_flags[i] = (i % 2 == 0) ? 1 : 0;  // Even threads active
        h_data[i] = 1.0f;
    }
    
    bool ok = checkCuda(cudaMalloc(&d_flags, N * sizeof(int)), "cudaMalloc d_flags");
    ok = ok && checkCuda(cudaMalloc(&d_data, N * sizeof(float)), "cudaMalloc d_data");
    
    ok = ok && checkCuda(cudaMemcpy(d_flags, h_flags, N * sizeof(int),
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy flags host-to-device");
    ok = ok && checkCuda(cudaMemcpy(d_data, h_data, N * sizeof(float),
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy data host-to-device");
    
    if (ok) {
        coalescedDemo<<<1, 32>>>(d_flags, d_data, N);
        // Launches return no status; configuration errors are reported
        // through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "coalescedDemo launch");
    }
    
    // Waits for the kernel (and flushes its printf output); also surfaces
    // errors that occurred while the kernel was running.
    ok = ok && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    
    // Single exit path: cudaFree(nullptr) is a no-op, so this is safe even
    // when an allocation failed.
    cudaFree(d_flags);
    cudaFree(d_data);
    return ok ? 0 : 1;
}
```

---

## Part 5: Practical Example - Block Reduction

### CUDA C++ Optimized Reduction (Primary)

```cpp
// cg_reduction.cu - Reduction using cooperative groups
#include <stdio.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Sums `val` across a 32-thread tile using shuffle-down exchanges.
//
//   warp : the 32-thread tile the calling thread belongs to
//   val  : this thread's contribution
//
// Returns the running value of the calling thread; the complete tile sum is
// the value returned in the thread with rank 0. All threads of the tile must
// call this function together.
__device__ float warpReduce(cg::thread_block_tile<32>& warp, float val) {
    // Exchange distance halves every round: 16, 8, 4, 2, 1.
    int distance = warp.size() / 2;
    while (distance > 0) {
        val += warp.shfl_down(val, distance);
        distance /= 2;
    }
    return val;
}

// Sums `val` over all threads of a block in two stages: each 32-thread tile
// reduces its own values, the per-tile results go through shared memory, and
// the first tile reduces those.
//
//   block : the calling thread's thread block group
//   val   : this thread's contribution
//
// Returns the block total in the thread with block rank 0; the value returned
// to every other thread is not the block total. All threads of the block must
// call this function, because it contains a block-wide barrier.
//
// NOTE(review): the shared scratch array is not protected by a trailing
// barrier, so calling this more than once per kernel looks unsafe without an
// extra block.sync() between calls — confirm before reusing it that way.
// NOTE(review): presumably expects the block size to be a multiple of 32 so
// that every tile is full — confirm against the launch configuration.
__device__ float blockReduce(cg::thread_block& block, float val) {
    __shared__ float warpSums[32];  // one slot per tile, up to 32 tiles

    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
    const int laneIdx = warp.thread_rank();
    const int warpIdx = warp.meta_group_rank();

    // Stage 1: per-tile sum, published by the tile's rank-0 thread.
    const float tileSum = warpReduce(warp, val);
    if (laneIdx == 0) {
        warpSums[warpIdx] = tileSum;
    }

    // Every tile must have written its slot before any slot is read.
    block.sync();

    // Stage 2: the first `warpCount` threads of the block each pick up one
    // tile sum; all other threads contribute zero.
    const int warpCount = (block.size() + 31) / 32;
    const int rankInBlock = block.thread_rank();
    float result = 0.0f;
    if (rankInBlock < warpCount) {
        result = warpSums[rankInBlock];
    }

    // Only the first tile performs the final reduction.
    if (warpIdx == 0) {
        result = warpReduce(warp, result);
    }

    return result;
}

// Writes one partial sum per block: output[blockIdx.x] receives the sum of
// the input elements covered by that block.
//
//   input  : device array of at least `n` floats
//   output : device array with one float per launched block
//   n      : number of valid input elements; threads with a global index
//            >= n contribute 0.0f
//
// The global index uses only the x dimension (1D launch). Every thread calls
// blockReduce, including out-of-range ones, because it contains a block-wide
// barrier.
__global__ void reduceKernel(float* input, float* output, int n) {
    cg::thread_block block = cg::this_thread_block();

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    float contribution = 0.0f;
    if (gid < n) {
        contribution = input[gid];
    }

    const float blockSum = blockReduce(block, contribution);

    // Only the block's rank-0 thread stores the result.
    if (block.thread_rank() != 0) {
        return;
    }
    output[blockIdx.x] = blockSum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: reduces N ones on the GPU to one partial sum per block, adds
// the partial sums on the host and prints the total next to the expected N.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed
// (in which case no sum is printed).
int main() {
    const int N = 1 << 20;
    const int BLOCK_SIZE = 256;
    const int NUM_BLOCKS = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    
    float *d_input = nullptr, *d_output = nullptr;
    
    // Initialize with 1s
    float* h_input = new float[N];
    float* h_output = new float[NUM_BLOCKS];
    for (int i = 0; i < N; i++) h_input[i] = 1.0f;
    
    bool ok = checkCuda(cudaMalloc(&d_input, N * sizeof(float)), "cudaMalloc d_input");
    ok = ok && checkCuda(cudaMalloc(&d_output, NUM_BLOCKS * sizeof(float)),
                         "cudaMalloc d_output");
    ok = ok && checkCuda(cudaMemcpy(d_input, h_input, N * sizeof(float),
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy input host-to-device");
    
    // Reduce
    if (ok) {
        reduceKernel<<<NUM_BLOCKS, BLOCK_SIZE>>>(d_input, d_output, N);
        // Launches return no status; configuration errors are reported
        // through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "reduceKernel launch");
    }
    
    // The copy back is issued after the kernel on the same (default) stream
    // and blocks the host, so it also reports kernel execution errors.
    ok = ok && checkCuda(cudaMemcpy(h_output, d_output, NUM_BLOCKS * sizeof(float),
                                    cudaMemcpyDeviceToHost),
                         "cudaMemcpy output device-to-host");
    
    // Final reduction on host; skipped on failure because h_output would
    // then hold uninitialized values.
    if (ok) {
        float total = 0;
        for (int i = 0; i < NUM_BLOCKS; i++) total += h_output[i];
        
        printf("Sum of %d elements: %.0f (expected %d)\n", N, total, N);
    }
    
    // Single exit path: cudaFree(nullptr) is a no-op, so this is safe even
    // when an allocation failed.
    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);
    
    return ok ? 0 : 1;
}
```

---

## Exercises

### Exercise 1: Tile Broadcast
Use `tile.shfl()` to broadcast a value from lane 0 to all threads in a tile.

### Exercise 2: Ballot and Count
Use the tile's `ballot()` member (e.g. `tile.ballot(pred)`) together with `__popc()` to count how many threads satisfy a condition.

### Exercise 3: Coalesced Histogram
Build a histogram using coalesced groups for divergent binning.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           COOPERATIVE GROUPS BASICS                     │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Include:                                               │
│  #include <cooperative_groups.h>                        │
│  namespace cg = cooperative_groups;                     │
│                                                         │
│  Group Types:                                           │
│  • cg::thread_block - all threads in block              │
│  • cg::thread_block_tile<N> - tile of N threads         │
│  • cg::coalesced_group - active threads only            │
│                                                         │
│  Operations:                                            │
│  • group.sync() - synchronize group                     │
│  • group.thread_rank() - thread index in group          │
│  • group.size() - number of threads                     │
│  • tile.shfl_down() - shuffle within tile               │
│                                                         │
│  Benefits:                                              │
│  • Cleaner code                                         │
│  • Sub-warp operations                                  │
│  • Divergent code support                               │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 2 - Grid-Wide Synchronization