In [None]:
# ⚙️ Setup
import subprocess, sys

# Install numba only when running on Google Colab: `import google.colab`
# raises ImportError everywhere else, in which case nothing is installed.
try:
    import google.colab
    # Invoke pip through the running interpreter so the package lands in the
    # same environment as this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: What is Occupancy?

### Definition

```
                    Active Warps per SM
Occupancy (%) = ─────────────────────────── × 100
                 Maximum Warps per SM

Example: 32 active warps / 64 max warps = 50% occupancy
```

### Why Occupancy Matters

```
GPU hides memory latency through parallelism:

Low Occupancy (25%):
  Warp 0: [COMPUTE]──[WAIT FOR MEMORY]────────────────[COMPUTE]
  Warp 1: ──[COMPUTE]──[WAIT FOR MEMORY]────────────────[COMPUTE]
  Warp 2: ────[COMPUTE]──[WAIT FOR MEMORY]────────────────[COMPUTE]
  Warp 3: ──────[COMPUTE]──[WAIT FOR MEMORY]────────────────[COMPUTE]
  SM:     ████░░░░░░░░░░░░░░░░░░░░░████░░░░░░░░░░░░░░░░░░░░░░
                          ^^ SM idle, waiting for memory

High Occupancy (100%):
  Warp 0:  [COMP][WAIT....][COMP]
  Warp 1:  [COMP][WAIT....][COMP]
  ...many more warps...
  Warp 63: [COMP][WAIT....][COMP]
  SM:      ██████████████████████████████████████████████████
                    ^^ SM always has work to do
```

### CUDA C++ Occupancy Query (Primary)

```cpp
// occupancy.cu - Query and optimize occupancy
#include <stdio.h>
#include <cuda_runtime.h>

// Element-wise kernel: replaces each data[i] with data[i] * data[i] + data[i].
// Each thread handles the single element at its global 1-D index; threads whose
// index falls at or beyond n exit without touching memory.
__global__ void sampleKernel(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;  // surplus thread in the last block
    }
    const float value = data[gid];
    data[gid] = value * value + value;
}

int main() {
    // Get device properties
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    
    printf("Device: %s\n", prop.name);
    printf("Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Registers per SM: %d\n", prop.regsPerMultiprocessor);
    printf("Shared memory per SM: %zu KB\n", prop.sharedMemPerMultiprocessor / 1024);
    printf("Number of SMs: %d\n", prop.multiProcessorCount);
    printf("\n");
    
    // Calculate occupancy for different block sizes
    printf("Block Size | Active Blocks/SM | Occupancy\n");
    printf("----------------------------------------\n");
    
    for (int blockSize = 64; blockSize <= 1024; blockSize *= 2) {
        int minGridSize, optBlockSize;
        
        // Query occupancy
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &minGridSize, sampleKernel, blockSize, 0);
        
        int maxWarpsPerSM = prop.maxThreadsPerMultiProcessor / 32;
        int activeWarps = minGridSize * (blockSize / 32);
        float occupancy = 100.0f * activeWarps / maxWarpsPerSM;
        
        printf("%10d | %16d | %6.1f%%\n", blockSize, minGridSize, occupancy);
    }
    
    // Get optimal block size
    int minGridSize, optBlockSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &optBlockSize, 
                                        sampleKernel, 0, 0);
    printf("\nOptimal block size: %d\n", optBlockSize);
    
    return 0;
}
```

### Python/Numba (Optional)

In [None]:
# Query device properties
device = cuda.get_current_device()

# Numba can report the device name as bytes; decode it so the output reads
# "Tesla T4" rather than "b'Tesla T4'". The isinstance guard keeps this safe
# on versions that already return str.
device_name = device.name.decode() if isinstance(device.name, bytes) else device.name

print("=== GPU Occupancy Properties ===")
print(f"Device: {device_name}")
print(f"Compute Capability: {device.compute_capability}")
print(f"Max threads per block: {device.MAX_THREADS_PER_BLOCK}")
print(f"Max block dimensions: {device.MAX_BLOCK_DIM_X} x {device.MAX_BLOCK_DIM_Y} x {device.MAX_BLOCK_DIM_Z}")
print(f"Max grid dimensions: {device.MAX_GRID_DIM_X} x {device.MAX_GRID_DIM_Y} x {device.MAX_GRID_DIM_Z}")
print(f"Max shared memory per block: {device.MAX_SHARED_MEMORY_PER_BLOCK / 1024:.0f} KB")
print(f"Multiprocessor count: {device.MULTIPROCESSOR_COUNT}")
print(f"Warp size: {device.WARP_SIZE}")

---

## Part 2: Occupancy Limiters

In [None]:
def explain_occupancy_limiters():
    """Print a reference card describing the three occupancy limiters.

    The card has a title followed by one section each for threads, registers
    and shared memory, with sections separated by a blank line. Returns None.
    """
    heading = [
        "The Three Occupancy Limiters",
        "=" * 60,
    ]
    threads_section = [
        "1. THREADS (Block Size)",
        "   Max threads per SM: typically 1536-2048",
        "   Max threads per block: 1024",
        "   Issue: Too small block → not enough threads",
        "          Too large block → can't fit enough blocks",
    ]
    registers_section = [
        "2. REGISTERS",
        "   Registers per SM: 65536 (typical)",
        "   Max per thread: 255",
        "   Issue: High register usage → fewer concurrent threads",
        "   Formula: max_threads = regs_per_SM / regs_per_thread",
    ]
    shared_section = [
        "3. SHARED MEMORY",
        "   Shared memory per SM: 48-164 KB",
        "   Configurable with L1 cache",
        "   Issue: Large shared blocks → fewer concurrent blocks",
        "   Formula: max_blocks = smem_per_SM / smem_per_block",
    ]

    sections = (heading, threads_section, registers_section, shared_section)
    # Lines within a section are newline-joined; the double newline between
    # sections yields exactly one blank line.
    print("\n\n".join("\n".join(lines) for lines in sections))

explain_occupancy_limiters()

In [None]:
def calculate_occupancy(threads_per_block, regs_per_thread, shared_per_block,
                        max_threads_per_sm=2048, max_regs_per_sm=65536,
                        max_shared_per_sm=49152, max_blocks_per_sm=32):
    """Estimate, print and return the theoretical occupancy of a kernel launch.

    The number of blocks that can be resident on one SM is the minimum of four
    limits: warp slots (threads), registers, shared memory, and the hardware
    cap on blocks per SM. Occupancy is the resulting number of active warps
    divided by the SM's warp capacity.

    This is a simplified model: it ignores the hardware's register and
    shared-memory allocation granularity, so real occupancy can be lower.

    Args:
        threads_per_block: Threads in each block; must be positive. A block
            whose size is not a multiple of 32 still occupies whole warps.
        regs_per_thread: Registers used by each thread. Values <= 0 mean
            registers impose no limit.
        shared_per_block: Shared memory per block in bytes. Values <= 0 mean
            shared memory imposes no limit.
        max_threads_per_sm: Thread capacity of one SM (at least one warp).
        max_regs_per_sm: Register file size of one SM.
        max_shared_per_sm: Shared memory available on one SM, in bytes.
        max_blocks_per_sm: Hardware cap on resident blocks per SM.

    Returns:
        Occupancy as a percentage (float, 0-100).

    Raises:
        ValueError: If threads_per_block is not positive or
            max_threads_per_sm is smaller than one warp.
    """
    warp_size = 32

    if threads_per_block <= 0:
        raise ValueError(f"threads_per_block must be positive, got {threads_per_block}")
    max_warps = max_threads_per_sm // warp_size
    if max_warps <= 0:
        raise ValueError(
            f"max_threads_per_sm must be at least {warp_size}, got {max_threads_per_sm}")

    # A partially filled warp still consumes a full warp slot, so round up.
    warps_per_block = -(-threads_per_block // warp_size)

    # Threads limit, expressed in warp slots so partial warps are counted.
    blocks_by_threads = max_warps // warps_per_block

    # Registers limit (unused resource -> not a constraint)
    regs_per_block = threads_per_block * regs_per_thread
    uses_registers = regs_per_block > 0
    if uses_registers:
        blocks_by_regs = max_regs_per_sm // regs_per_block
    else:
        blocks_by_regs = max_blocks_per_sm

    # Shared memory limit (unused resource -> not a constraint)
    uses_shared = shared_per_block > 0
    if uses_shared:
        blocks_by_shared = max_shared_per_sm // shared_per_block
    else:
        blocks_by_shared = max_blocks_per_sm

    # Block limit
    max_blocks = min(blocks_by_threads, blocks_by_regs, blocks_by_shared, max_blocks_per_sm)

    # Calculate occupancy
    active_warps = max_blocks * warps_per_block
    occupancy = 100 * active_warps / max_warps

    print("Occupancy Analysis")
    print("=" * 50)
    print(f"Input: {threads_per_block} threads/block, {regs_per_thread} regs/thread, {shared_per_block} bytes shared")
    print(f"\nLimiters:")
    print(f"  By threads:      {blocks_by_threads} blocks")
    print(f"  By registers:    {blocks_by_regs} blocks")
    print(f"  By shared mem:   {blocks_by_shared} blocks")
    print(f"  By block limit:  {max_blocks_per_sm} blocks")
    print(f"\nResult:")
    print(f"  Active blocks:   {max_blocks}")
    print(f"  Active warps:    {active_warps}/{max_warps}")
    print(f"  Occupancy:       {occupancy:.1f}%")

    # Identify limiter. Ties resolve in the order threads, registers, shared
    # memory, block cap. A resource the kernel does not use is never blamed:
    # its placeholder equals the block cap, so without this filter the cap
    # would be misreported as "registers" or "shared memory".
    candidates = [("threads", blocks_by_threads)]
    if uses_registers:
        candidates.append(("registers", blocks_by_regs))
    if uses_shared:
        candidates.append(("shared memory", blocks_by_shared))
    candidates.append(("blocks", max_blocks_per_sm))
    limiter = next(name for name, limit in candidates if limit == max_blocks)
    print(f"  Limiter:         {limiter}")

    return occupancy

In [None]:
# Example: High occupancy kernel
# With the calculator's default SM limits, both the thread limit and the
# register limit allow 8 blocks of 256 threads, giving 64/64 warps (100%).
print("Example 1: Simple kernel")
calculate_occupancy(threads_per_block=256, regs_per_thread=32, shared_per_block=0)

In [None]:
# Example: Register-limited kernel
# 256 threads x 128 registers = 32768 registers per block, so only 2 blocks
# fit in the default 65536-register SM: 16/64 warps (25%).
print("\nExample 2: Register-heavy kernel")
calculate_occupancy(threads_per_block=256, regs_per_thread=128, shared_per_block=0)

In [None]:
# Example: Shared-memory limited kernel
# 16384 bytes per block fits 3 times into the default 49152-byte SM budget,
# so 3 blocks are resident: 24/64 warps (37.5%).
print("\nExample 3: Shared memory heavy kernel")
calculate_occupancy(threads_per_block=256, regs_per_thread=32, shared_per_block=16384)

---

## Part 3: Occupancy vs Performance

In [None]:
def occupancy_performance_relationship():
    """Print guidance on when raising occupancy does and does not pay off.

    The output has a title, a key-insight line, two checklists (when high
    occupancy helps / may not help) and a rule of thumb, each separated by a
    blank line. Returns None.
    """
    paragraphs = (
        (
            "Occupancy vs Performance",
            "=" * 60,
        ),
        (
            "Key Insight: Higher occupancy ≠ always better performance",
        ),
        (
            "When HIGH OCCUPANCY helps:",
            "  ✓ Memory-bound kernels (need to hide latency)",
            "  ✓ Simple arithmetic operations",
            "  ✓ Irregular memory access patterns",
        ),
        (
            "When HIGH OCCUPANCY may NOT help:",
            "  ✗ Compute-bound kernels (already saturated)",
            "  ✗ High instruction-level parallelism (ILP)",
            "  ✗ When reducing occupancy enables better optimizations",
        ),
        (
            "Rule of Thumb:",
            "  - Start with ~50% occupancy",
            "  - Profile to determine if occupancy is the bottleneck",
            "  - Memory-bound: aim for higher occupancy",
            "  - Compute-bound: focus on instruction throughput",
        ),
    )
    # One blank line between paragraphs, none inside them.
    print("\n\n".join("\n".join(paragraph) for paragraph in paragraphs))

occupancy_performance_relationship()

---

## Part 4: Practical Occupancy Example

In [None]:
import time

# Test different block sizes
@cuda.jit
def simple_kernel(data, result):
    """Write data[i] * data[i] + data[i] into result[i], one element per thread.

    Threads whose global 1-D index is at or beyond data.size return without
    reading or writing anything.
    """
    tid = cuda.grid(1)
    if tid >= data.size:
        return
    value = data[tid]
    result[tid] = value * value + value

def benchmark_block_sizes(n=10_000_000, block_sizes=(32, 64, 128, 256, 512, 1024),
                          iterations=50):
    """Time simple_kernel at several block sizes and print a results table.

    For each block size the kernel is launched once as a warm-up (so JIT
    compilation is excluded from the measurement), then `iterations` times
    back to back. The device is synchronized before the timer stops, so the
    reported figure is the mean wall-clock time per launch.

    Args:
        n: Number of float32 elements to process; must be positive.
        block_sizes: Iterable of threads-per-block values to benchmark.
        iterations: Timed launches per block size; must be positive.

    Raises:
        ValueError: If n or iterations is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive element count, got {n}")
    if iterations <= 0:
        raise ValueError(f"iterations must be positive, got {iterations}")

    data = np.random.rand(n).astype(np.float32)

    d_data = cuda.to_device(data)
    # The kernel overwrites every output element, so allocate the buffer on
    # the device directly instead of building and copying a host array of zeros.
    d_result = cuda.device_array(n, dtype=np.float32)

    print(f"Benchmarking with {n:,} elements")
    print(f"{'Block Size':<12} {'Grid Size':<12} {'Time (ms)':<12} {'Throughput':<15}")
    print("=" * 55)

    for block_size in block_sizes:
        # Ceiling division: enough blocks to cover all n elements.
        grid_size = (n + block_size - 1) // block_size

        # Warmup
        simple_kernel[grid_size, block_size](d_data, d_result)
        cuda.synchronize()

        # Benchmark: launches are asynchronous, so synchronize before
        # reading the clock.
        start = time.perf_counter()
        for _ in range(iterations):
            simple_kernel[grid_size, block_size](d_data, d_result)
        cuda.synchronize()
        elapsed = (time.perf_counter() - start) / iterations * 1000

        throughput = n / (elapsed / 1000) / 1e9  # billion elements/sec
        print(f"{block_size:<12} {grid_size:<12} {elapsed:<12.3f} {throughput:.2f} B elem/s")

benchmark_block_sizes()

---

## Exercises

### Exercise 1: Occupancy Calculator

In [None]:
# TODO: Create an interactive occupancy calculator
# that takes kernel properties and outputs:
# 1. Theoretical occupancy
# 2. The limiting factor
# 3. Suggestions for improvement

def occupancy_advisor(threads, regs, shared):
    """Provide occupancy advice for given kernel parameters.

    Exercise stub: the body is intentionally unimplemented and currently
    returns None. Per the TODO preceding this function, a finished version
    should report the theoretical occupancy, the limiting factor, and
    suggestions for improvement.

    Args:
        threads: Presumably threads per block -- confirm when implementing.
        regs: Presumably registers per thread -- confirm when implementing.
        shared: Presumably shared memory per block in bytes -- confirm when
            implementing.
    """
    pass  # Your implementation

### Exercise 2: Occupancy Impact

In [None]:
# TODO: Create two kernels:
# 1. Memory-bound kernel (should benefit from high occupancy)
# 2. Compute-bound kernel (may not benefit as much)
# Compare performance at different block sizes

@cuda.jit
def memory_bound_kernel(data, result):
    """Memory-bound: simple read-modify-write.

    Exercise stub: the body is a placeholder, so the kernel does nothing
    until it is implemented. `data` and `result` are presumably the input
    and output device arrays -- confirm when implementing.
    """
    pass  # Your implementation

@cuda.jit
def compute_bound_kernel(data, result):
    """Compute-bound: heavy arithmetic.

    Exercise stub: the body is a placeholder, so the kernel does nothing
    until it is implemented. `data` and `result` are presumably the input
    and output device arrays -- confirm when implementing.
    """
    pass  # Your implementation

---

## Summary

### Occupancy Formula

```
Occupancy (%) = (Active Warps per SM / Max Warps per SM) × 100
```

### Three Limiters

| Limiter | Resource | Typical Limit |
|---------|----------|---------------|
| Threads | Threads per SM | 1536-2048 |
| Registers | Registers per SM | 65536 |
| Shared Memory | Bytes per SM | 48-164 KB |

### CUDA C++ APIs

```cpp
// Query occupancy
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    &numBlocks, kernel, blockSize, sharedMem);

// Get optimal block size
cudaOccupancyMaxPotentialBlockSize(
    &minGridSize, &blockSize, kernel, 0, 0);
```

### Key Takeaways

1. **50% occupancy is often sufficient** for good performance
2. **Memory-bound kernels** benefit most from high occupancy
3. **Profile first** - don't blindly maximize occupancy
4. **Balance resources** - sometimes less is more

### Tomorrow: Register Optimization
We'll dive into register pressure and launch bounds.