In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: GPU Cache Hierarchy

### Memory Hierarchy

```
GPU Memory Hierarchy:

┌─────────────────────────────────────────────────────────────┐
│                     Registers (per thread)                   │
│                     ~1 cycle, 255 max                        │
└─────────────────────────────────────────────────────────────┘
                              ↓
┌─────────────────────────────────────────────────────────────┐
│              Shared Memory / L1 Cache (per SM)               │
│              ~5-30 cycles, 48-228KB configurable             │
└─────────────────────────────────────────────────────────────┘
                              ↓
┌─────────────────────────────────────────────────────────────┐
│                    L2 Cache (shared)                         │
│                    ~100-200 cycles, 1.5-80MB                 │
└─────────────────────────────────────────────────────────────┘
                              ↓
┌─────────────────────────────────────────────────────────────┐
│                 Global Memory (DRAM)                         │
│                 ~400-800 cycles, 8-80GB                      │
└─────────────────────────────────────────────────────────────┘
```

### L1 Cache Details

```
L1 Cache (per SM):
  - Unified with shared memory (128-228KB total)
  - Configurable split: more shared OR more L1
  - Cache line: 128 bytes
  - Write-through (writes go to L2)
  - Automatic caching of global loads

Configuration options (cudaFuncSetCacheConfig):
  cudaFuncCachePreferNone     Default
  cudaFuncCachePreferShared   Prefer shared memory
  cudaFuncCachePreferL1       Prefer L1 cache
  cudaFuncCachePreferEqual    Equal split
```

### CUDA C++ Cache Configuration (Primary)

```cpp
// cache_config.cu - Configure L1/shared memory split
#include <stdio.h>
#include <cuda_runtime.h>

// Each in-range thread sums 16 elements of `data`, taken 1024 elements apart
// (wrapping modulo n) starting at its own index, and stores the sum back into
// data[idx].
//
// NOTE(review): for i >= 1 the loop reads elements that other threads overwrite
// at the end of this same kernel, and nothing here orders those reads against
// those writes, so the stored sums can depend on scheduling. Presumably fine
// for a cache-configuration demo; write to a separate output buffer if the
// numeric result matters.
__global__ void memoryIntensiveKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // Read many times from different locations
        float sum = 0;
        for (int i = 0; i < 16; i++) {
            // Wrap so the offset always stays inside [0, n)
            int offset = (idx + i * 1024) % n;
            sum += data[offset];
        }
        // In-place store: overwrites this thread's own input element
        data[idx] = sum;
    }
}

int main() {
    // Query current cache config
    cudaFuncCache currentConfig;
    cudaDeviceGetCacheConfig(&currentConfig);
    printf("Current config: %d\n", currentConfig);
    
    // Set cache preference for kernel
    cudaFuncSetCacheConfig(memoryIntensiveKernel, cudaFuncCachePreferL1);
    
    // For kernels that use shared memory heavily:
    // cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
    
    // Query device L1/shared config
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Shared memory per block: %zu KB\n", prop.sharedMemPerBlock / 1024);
    printf("Shared memory per SM: %zu KB\n", prop.sharedMemPerMultiprocessor / 1024);
    printf("L2 cache size: %d KB\n", prop.l2CacheSize / 1024);
    
    return 0;
}
```

In [None]:
# Query cache information
device = cuda.get_current_device()

# Some Numba versions expose the device name as bytes; decode it so the output
# reads "Tesla T4" rather than "b'Tesla T4'".
device_name = device.name
if isinstance(device_name, bytes):
    device_name = device_name.decode()

print("GPU Cache Information")
print("=" * 50)
print(f"Device: {device_name}")
print(f"Max shared memory per block: {device.MAX_SHARED_MEMORY_PER_BLOCK / 1024:.0f} KB")

# Retained so the name `ctx` stays defined for any cell that expects it.
ctx = cuda.current_context()

# Report the real per-device figures when the driver exposes them. getattr()
# with a default covers Numba builds that do not know an attribute, instead of
# hiding every possible failure behind a bare `except`.
shared_per_sm = getattr(device, "MAX_SHARED_MEMORY_PER_MULTIPROCESSOR", None)
l2_bytes = getattr(device, "L2_CACHE_SIZE", None)
if shared_per_sm is not None:
    print(f"Max shared memory per SM: {shared_per_sm / 1024:.0f} KB")
if l2_bytes is not None:
    print(f"L2 cache size: {l2_bytes / 1024**2:.1f} MB")

# General reference ranges, for comparison with the values printed above.
print("\nNote: L1/L2 cache sizes vary by GPU architecture")
print("Typical L1: 48-128 KB per SM")
print("Typical L2: 1.5-40 MB shared")

---

## Part 2: Memory Coalescing Deep Dive

In [None]:
def explain_coalescing():
    """Print a text walkthrough contrasting coalesced and strided warp accesses."""
    report = (
        "Memory Coalescing",
        "=" * 60,
        "",
        "Definition: Combining multiple memory requests into fewer",
        "            transactions at cache-line granularity (128 bytes)",
        "",
        # Best case: the warp touches one contiguous 128-byte span.
        "COALESCED ACCESS (GOOD):",
        "  Warp threads 0-31 access addresses 0-124 (consecutive floats)",
        "  → 1 cache line transaction (128 bytes = 32 floats)",
        "",
        "  Thread 0:  addr 0",
        "  Thread 1:  addr 4",
        "  Thread 2:  addr 8",
        "  ...",
        "  Thread 31: addr 124",
        "  ────────────────────",
        "  Result: ONE 128-byte transaction",
        "",
        # Worst-case illustration: the same warp spread over many lines.
        "STRIDED ACCESS (BAD):",
        "  Warp threads access every 16th element (stride = 64 bytes)",
        "  → Multiple cache line transactions",
        "",
        "  Thread 0:  addr 0    → cache line 0",
        "  Thread 1:  addr 64   → cache line 0",
        "  Thread 2:  addr 128  → cache line 1",
        "  Thread 3:  addr 192  → cache line 1",
        "  ...",
        "  Result: 16+ cache line transactions!",
    )
    # One print call; the trailing newline comes from print itself.
    print("\n".join(report))

explain_coalescing()

In [None]:
# Demonstrate coalescing impact

@cuda.jit
def coalesced_access(data, result, stride):
    """Copy data[i] to result[i], one element per thread (unit-stride read).

    `stride` is accepted but not used by this kernel.
    """
    tid = cuda.grid(1)
    # Threads past the end of the output have nothing to do.
    if tid >= result.size:
        return
    result[tid] = data[tid]

@cuda.jit
def strided_access(data, result, stride):
    """Gather result[i] = data[(i * stride) % data.size], one element per thread."""
    tid = cuda.grid(1)
    # Threads past the end of the output have nothing to do.
    if tid >= result.size:
        return
    # Neighbouring threads read `stride` elements apart; the modulo keeps the
    # source index inside `data`.
    result[tid] = data[(tid * stride) % data.size]

In [None]:
def benchmark_coalescing(n=1_000_000, strides=(1, 2, 4, 8, 16, 32), iterations=100):
    """Time `strided_access` for several strides and print the effective bandwidth.

    Args:
        n: Number of float32 elements each launch writes (must be positive).
        strides: Element strides to test; each must be >= 1.
        iterations: Timed launches per stride (must be positive).

    Raises:
        ValueError: If `n` or `iterations` is not positive, or `strides` is
            empty or contains a value below 1.

    The printed GB/s figure is *effective* bandwidth: it counts only the 4 bytes
    read and 4 bytes written per element, not the extra bytes the hardware
    fetches when accesses are strided.
    """
    strides = tuple(strides)
    if n <= 0:
        raise ValueError("n must be a positive element count")
    if iterations <= 0:
        raise ValueError("iterations must be positive")
    if not strides or min(strides) < 1:
        raise ValueError("strides must be a non-empty sequence of values >= 1")

    # Size the source from the largest stride so idx * stride never wraps
    # around; wrapped indices would alias and distort the measurement.
    # Generating float32 directly avoids a float64 temporary twice the size.
    rng = np.random.default_rng(0)
    data = rng.random(n * max(strides), dtype=np.float32)

    d_data = cuda.to_device(data)
    # The output is write-only, so allocate it on the device instead of copying zeros.
    d_result = cuda.device_array(n, dtype=np.float32)

    block = 256
    grid = (n + block - 1) // block

    bytes_moved = n * 4 * 2  # read + write

    print(f"Coalescing Benchmark ({n:,} elements)")
    print("=" * 50)

    for stride in strides:
        # Warmup (the first launch also triggers JIT compilation)
        strided_access[grid, block](d_data, d_result, stride)
        cuda.synchronize()

        # Benchmark: launches are asynchronous, so synchronize before stopping the clock
        start = time.perf_counter()
        for _ in range(iterations):
            strided_access[grid, block](d_data, d_result, stride)
        cuda.synchronize()
        elapsed = (time.perf_counter() - start) / iterations * 1000

        bandwidth = bytes_moved / (elapsed / 1000) / 1e9

        print(f"Stride {stride:2d}: {elapsed:.3f} ms, {bandwidth:.1f} GB/s")

benchmark_coalescing()

---

## Part 3: L2 Cache Optimization

In [None]:
def l2_cache_strategies():
    """Print a four-item checklist of L2 cache optimization strategies."""
    # Each entry is (heading, detail lines); details are indented on output.
    strategies = (
        ("1. DATA LOCALITY", (
            "Keep working set smaller than L2 cache",
            "Typical L2: 1.5-40 MB",
        )),
        ("2. PERSISTENCE (Ampere+)", (
            "cudaAccessPropertyPersisting: keep in L2",
            "cudaAccessPropertyStreaming: don't cache",
        )),
        ("3. CACHE PARTITIONING (Ampere+)", (
            "Reserve portion of L2 for specific data",
            "cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, bytes)",
        )),
        ("4. ACCESS PATTERNS", (
            "Sequential: Best cache utilization",
            "Random: Poor cache utilization",
            "Blocked/Tiled: Good cache reuse",
        )),
    )

    print("L2 Cache Optimization Strategies")
    print("=" * 60)
    for heading, details in strategies:
        # A blank line precedes every section, so none trails the last one.
        print()
        print(heading)
        for detail in details:
            print("   " + detail)

l2_cache_strategies()

### CUDA C++ L2 Cache Control (Ampere+)

```cpp
// l2_cache.cu - L2 cache persistence (Ampere+)
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    
    printf("L2 cache size: %d MB\n", prop.l2CacheSize / (1024 * 1024));
    
    // On Ampere+, you can control L2 persistence
    if (prop.major >= 8) {
        // Reserve some L2 cache for persistent data
        size_t persistingL2 = prop.l2CacheSize / 2;  // Reserve half
        cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, persistingL2);
        
        // Set access policy for a memory range
        float* d_persistent;
        size_t size = 1024 * 1024;  // 1MB
        cudaMalloc(&d_persistent, size);
        
        cudaStreamAttrValue attr;
        attr.accessPolicyWindow.base_ptr = d_persistent;
        attr.accessPolicyWindow.num_bytes = size;
        attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
        attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
        attr.accessPolicyWindow.hitRatio = 1.0f;
        
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
        
        // Now kernels on this stream will try to keep d_persistent in L2
        
        cudaStreamDestroy(stream);
        cudaFree(d_persistent);
    }
    
    return 0;
}
```

---

## Part 4: Texture and Constant Memory

In [None]:
def special_caches():
    """Print a summary of the constant, texture, and read-only (__ldg) caches."""
    # Each entry is (heading, detail lines); details are indented on output.
    cache_notes = (
        ("CONSTANT MEMORY", (
            "Size: 64 KB total, 8 KB cache per SM",
            "Access: Broadcast to all threads in warp",
            "Best for: Same value read by all threads",
            "Declare: __constant__ float data[1024];",
        )),
        ("TEXTURE MEMORY", (
            "Cache: ~48 KB per SM",
            "Access: Optimized for 2D spatial locality",
            "Features:",
            "  - Hardware interpolation",
            "  - Boundary handling (clamp/wrap)",
            "  - Normalized coordinates",
            "Best for: Image processing, lookup tables",
        )),
        ("READ-ONLY CACHE (via __ldg)", (
            "Uses texture cache for global loads",
            "float x = __ldg(&data[idx]);",
            "Automatic with 'const __restrict__' pointers",
        )),
    )

    print("Special GPU Caches")
    print("=" * 60)
    for heading, details in cache_notes:
        # A blank line precedes every section, so none trails the last one.
        print()
        print(heading)
        for detail in details:
            print("  " + detail)

special_caches()

### CUDA C++ Constant Memory (Primary)

```cpp
// constant_memory.cu - Using constant memory
#include <stdio.h>
#include <cuda_runtime.h>

// Declare constant memory (at file scope)
__constant__ float coefficients[256];

// Multiplies each in-range element of `data` by the constant-memory coefficient
// at index idx % 256 and writes the product to `result`.
__global__ void applyCoefficients(float* data, float* result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float x = data[idx];
        int coef_idx = idx % 256;
        
        // NOTE(review): coef_idx is idx % 256, so neighbouring threads read
        // *different* coefficients. That is not the "same value read by all
        // threads" case constant-memory broadcast is best for; a warp-uniform
        // index (e.g. one derived from blockIdx.x) would be needed to show it.
        result[idx] = x * coefficients[coef_idx];
    }
}

int main() {
    // Host-side staging table: entry k holds k * 0.1
    float h_coefs[256];
    int k = 0;
    while (k < 256) {
        h_coefs[k] = k * 0.1f;
        ++k;
    }

    // Upload the whole table into the __constant__ symbol
    cudaMemcpyToSymbol(coefficients, h_coefs, sizeof(h_coefs));

    // ... launch kernel ...

    return 0;
}
```

### Using __ldg for Read-Only Data

```cpp
// Option 1: Explicit __ldg intrinsic
__global__ void withLdg(const float* data, float* result, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the array have nothing to do
    if (tid >= n) {
        return;
    }
    // Explicit read-only load (texture cache path), then square the value
    const float value = __ldg(&data[tid]);
    result[tid] = value * value;
}

// Option 2: const __restrict__ (compiler may use __ldg)
__global__ void withRestrict(const float* __restrict__ data, 
                             float* __restrict__ result, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the array have nothing to do
    if (tid >= n) {
        return;
    }
    // const + __restrict__ tell the compiler `data` is read-only and does not
    // alias `result`, so it is free to choose the read-only load path itself
    result[tid] = data[tid] * data[tid];
}
```

---

## Part 5: Cache-Aware Algorithm Design

In [None]:
# Example: how the thread-to-element mapping decides whether a warp's loads coalesce

@cuda.jit
def row_major_sum(matrix, result, rows, cols):
    """Sum each row of `matrix`, one thread per row.

    Thread `idx` walks row `idx` from column 0 to cols - 1 and stores the total
    in result[idx]. Within one thread the reads are consecutive, but at any
    given `j` neighbouring threads read matrix[idx, j] and matrix[idx + 1, j].
    Assuming a C-ordered (row-major) array, those are a full row (`cols`
    elements) apart, so the warp's loads are strided rather than coalesced.
    """
    idx = cuda.grid(1)
    if idx < rows:
        total = 0.0
        for j in range(cols):
            total += matrix[idx, j]  # Consecutive per thread; `cols` apart across threads
        result[idx] = total

@cuda.jit
def col_major_sum(matrix, result, rows, cols):
    """Sum each column of `matrix`, one thread per column.

    Thread `idx` walks column `idx` from row 0 to rows - 1 and stores the total
    in result[idx]. Within one thread successive reads are a row apart, but at
    any given `i` neighbouring threads read matrix[i, idx] and matrix[i, idx + 1].
    Assuming a C-ordered (row-major) array, those are adjacent in memory, so
    the warp's loads are consecutive (the coalesced pattern).
    """
    idx = cuda.grid(1)
    if idx < cols:
        total = 0.0
        for i in range(rows):
            total += matrix[i, idx]  # Row-strided per thread; adjacent across threads
        result[idx] = total

In [None]:
def benchmark_matrix_access(rows=1024, cols=1024, iterations=50):
    """Time the one-thread-per-row and one-thread-per-column sum kernels.

    Args:
        rows: Number of matrix rows (must be positive).
        cols: Number of matrix columns (must be positive).
        iterations: Timed launches per kernel (must be positive).

    Raises:
        ValueError: If `rows`, `cols`, or `iterations` is not positive.

    The labels describe what a *warp* does on each load. With one thread per
    row, neighbouring threads are `cols` elements apart (strided). With one
    thread per column, they are adjacent (consecutive). The ratio is reported
    as measured, without assuming which mapping wins on a given GPU.
    """
    if rows <= 0 or cols <= 0:
        raise ValueError("rows and cols must be positive")
    if iterations <= 0:
        raise ValueError("iterations must be positive")

    # C-ordered float32 matrix, generated directly in float32.
    matrix = np.random.default_rng(0).random((rows, cols), dtype=np.float32)

    d_matrix = cuda.to_device(matrix)
    # Outputs are write-only, so allocate them on the device instead of copying zeros.
    d_result_row = cuda.device_array(rows, dtype=np.float32)
    d_result_col = cuda.device_array(cols, dtype=np.float32)

    block = 256
    grid_row = (rows + block - 1) // block
    grid_col = (cols + block - 1) // block

    def _mean_launch_ms(kernel, grid, d_out):
        """Warm up once (JIT compile), then return the mean time per launch in ms."""
        kernel[grid, block](d_matrix, d_out, rows, cols)
        cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iterations):
            kernel[grid, block](d_matrix, d_out, rows, cols)
        # Launches are asynchronous; wait for them before stopping the clock.
        cuda.synchronize()
        return (time.perf_counter() - start) / iterations * 1000

    row_time = _mean_launch_ms(row_major_sum, grid_row, d_result_row)
    col_time = _mean_launch_ms(col_major_sum, grid_col, d_result_col)

    print(f"Matrix Access Pattern Comparison ({rows}×{cols})")
    print("=" * 50)
    print(f"One thread per row    (warp loads strided by {cols}): {row_time:.3f} ms")
    print(f"One thread per column (warp loads consecutive): {col_time:.3f} ms")
    print(f"Row-per-thread / column-per-thread time: {row_time / col_time:.2f}x")

benchmark_matrix_access()

---

## Exercises

### Exercise 1: Cache Line Analysis

In [None]:
# TODO: Calculate cache line utilization for different access patterns

def cache_line_efficiency(element_size: int, stride: int, warp_size: int = 32, cache_line: int = 128):
    """
    Calculate cache line efficiency.
    
    Exercise stub: the body is still `pass`, so calling it currently returns
    None until the TODO below is implemented.
    
    Args:
        element_size: Size of each element in bytes
        stride: Access stride (1 = consecutive)
        warp_size: Threads per warp
        cache_line: Cache line size in bytes
    
    Returns:
        Efficiency (0.0 - 1.0)
    """
    # TODO: Implement
    # Hint: Calculate how many cache lines are needed
    #       vs how much data is actually used
    pass

# Test
# cache_line_efficiency(4, 1)   # float, stride 1 → should be 100%
# cache_line_efficiency(4, 32)  # float, stride 32 → should be low

---

## Summary

### Cache Hierarchy

| Level | Size | Latency | Scope |
|-------|------|---------|-------|
| L1 | 48-128 KB | ~30 cycles | Per SM |
| L2 | 1.5-40 MB | ~200 cycles | Shared |
| Texture | ~48 KB | ~100 cycles | Per SM |
| Constant | 8 KB | ~5 cycles | Per SM |

### Coalescing Rules

1. **Consecutive threads → Consecutive addresses** (ideal)
2. **Cache line = 128 bytes**
3. **Stride-1 access = 100% efficiency**
4. **Stride-32+ = ~3% efficiency**

### CUDA C++ Patterns

```cpp
// Set cache preference
cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1);

// Use read-only cache
float x = __ldg(&data[idx]);

// Constant memory
__constant__ float coefs[256];
cudaMemcpyToSymbol(coefs, h_coefs, sizeof(coefs));
```

### Tomorrow: Unified Memory
We'll explore simplified memory management with unified memory.