In [None]:
# ⚙️ Setup
import subprocess, sys

# On Google Colab the `google.colab` import succeeds and numba is installed
# quietly with pip. Anywhere else the import raises ImportError and the
# install step is skipped.
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    print(f"GPU: {cuda.get_current_device().name}")
else:
    # Say so explicitly instead of printing nothing, so a missing GPU is
    # not mistaken for a cell that silently succeeded.
    print("No CUDA-capable GPU detected; Numba CUDA examples will not run.")

---

## The Optimization Hierarchy

```
┌─────────────────────────────────────────────────────────────┐
│              OPTIMIZATION PRIORITY ORDER                    │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  1. ALGORITHM CHOICE          ███████████████████ ~10-100x  │
│     • Choose right algorithm                                │
│     • Reduce algorithmic complexity                         │
│                                                             │
│  2. MEMORY ACCESS PATTERNS    ████████████████    ~5-20x    │
│     • Coalesced access                                      │
│     • Minimize global memory traffic                        │
│     • Use shared memory                                     │
│                                                             │
│  3. OCCUPANCY & PARALLELISM   ████████████        ~2-5x     │
│     • Enough threads to hide latency                        │
│     • Balance resources per block                           │
│                                                             │
│  4. INSTRUCTION OPTIMIZATION  ████████            ~1.5-3x   │
│     • Fast math functions                                   │
│     • Avoid divergence                                      │
│                                                             │
│  5. CONCURRENCY               ██████              ~1.2-2x   │
│     • Streams, graphs                                       │
│     • Overlap compute/transfer                              │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

---

## Part 1: Memory Optimization Techniques

### 1.1 Coalesced Memory Access

```cpp
// GOOD: Coalesced - neighbouring threads touch neighbouring elements,
// i.e. thread i reads and writes element i.
__global__ void coalesced(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;          // threads past the end of the array do nothing
    data[i] *= 2.0f;
}

// BAD: Strided - thread i touches element i * stride, so neighbouring
// threads access addresses that are `stride` elements apart.
__global__ void strided(float* data, int n, int stride) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int idx = tid * stride;
    if (idx >= n) return;        // element for this thread lies past the end
    data[idx] *= 2.0f;
}
```

### 1.2 Shared Memory Usage

```cpp
// Pattern: Load to shared -> Sync -> Process -> Sync -> Store
//
// Each thread writes out[gid] = in[gid - 1] + in[gid] + in[gid + 1], where the
// neighbours are taken from this block's shared-memory copy only: the first
// and last thread of a block simply skip the neighbour that belongs to
// another block, and elements at or beyond n count as 0.
//
// Launch requirements: 1D blocks with blockDim.x <= 256 (the size of the
// static shared buffer). Uses 256 * sizeof(float) bytes of shared memory.
__global__ void withShared(float* out, float* in, int n) {
    __shared__ float smem[256];

    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;

    // Load to shared memory (zero-fill past the end of the input)
    smem[tid] = (gid < n) ? in[gid] : 0.0f;
    __syncthreads();

    // Process using shared memory
    float result = smem[tid];
    if (tid > 0) result += smem[tid - 1];
    // Bound the right neighbour by the actual block size rather than a
    // hard-coded 255: only smem[0 .. blockDim.x - 1] was written above, so
    // with a block smaller than 256 threads the last thread would otherwise
    // read an uninitialized slot.
    if (tid + 1 < (int)blockDim.x) result += smem[tid + 1];
    // No shared-memory write follows, so this barrier is not needed for
    // correctness here; it is kept to show the full pattern named above.
    __syncthreads();

    // Write result
    if (gid < n) out[gid] = result;
}
```

### 1.3 Bank Conflict Avoidance

```cpp
// BAD: Bank conflicts (stride of 32)
// Shared memory is split into 32 banks of 4-byte words. With a row length of
// 32 floats, element [t][0] sits at word offset 32 * t, so every thread of
// the warp lands in the same bank and the accesses are serialized.
__shared__ float smem[32][32];
smem[threadIdx.x][0] = value;  // All access bank 0!

// GOOD: Padding to avoid conflicts
// With a row length of 33 floats, element [t][0] sits at word offset 33 * t,
// and (33 * t) % 32 == t for t in 0..31, so each thread hits its own bank.
// (The two declarations are alternatives; use one or the other.)
__shared__ float smem[32][33];  // +1 padding
smem[threadIdx.x][0] = value;   // Different banks
```

---

## Part 2: Compute Optimization

### 2.1 Instruction Throughput

```cpp
// Fast math intrinsics (less accurate, much faster)
// The `__`-prefixed intrinsics trade accuracy for speed; use them only where
// the reduced precision is acceptable. The speed-up figures below are rough
// guidance — measure on the target GPU.

// Approximate sine of x via the __sinf intrinsic.
__device__ float fast_sin(float x) {
    return __sinf(x);      // ~10x faster than sinf()
}

// Approximate e^x via the __expf intrinsic.
__device__ float fast_exp(float x) {
    return __expf(x);      // ~10x faster than expf()
}

// Reciprocal square root of x. Note that rsqrtf is a regular CUDA math
// function, not a `__`-prefixed reduced-accuracy intrinsic.
__device__ float fast_rsqrt(float x) {
    return rsqrtf(x);      // 1/sqrt(x), very fast
}
```

### 2.2 Warp Divergence

```cpp
// BAD: Divergent branches
// Even and odd threads of the SAME warp take different branches, so every
// warp has to execute both paths one after the other.
// Applies expensive_op1 to even-indexed and expensive_op2 to odd-indexed
// elements of data[0 .. n-1], in place.
__global__ void divergent(float* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;    // grid may be larger than n: never index past the end

    if (tid % 2 == 0) {      // Even lanes of each warp do one thing
        data[tid] = expensive_op1(data[tid]);
    } else {                 // Odd lanes do another
        data[tid] = expensive_op2(data[tid]);
    }
}

// BETTER: Separate into different warps
// The branch is chosen per group of 32 consecutive threads, so all threads of
// a warp agree on it. This holds when blockDim.x is a multiple of 32;
// otherwise tid / 32 can differ inside one warp.
// Note that the element-to-operation mapping differs from the even/odd
// version: op1 is applied to elements of even-numbered 32-thread groups.
__global__ void nondivergent(float* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;    // grid may be larger than n: never index past the end

    int warpId = tid / 32;

    if (warpId % 2 == 0) {   // Whole warp does one thing
        data[tid] = expensive_op1(data[tid]);
    } else {                 // Other warp does another
        data[tid] = expensive_op2(data[tid]);
    }
}
```

### 2.3 Loop Unrolling

```cpp
// Compiler-unrolled loop (#pragma unroll): every thread doubles up to four
// elements, each one whole grid (blockDim.x * gridDim.x threads) apart.
__global__ void unrolled(float* data, int n) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int gridSpan = blockDim.x * gridDim.x;

    #pragma unroll 4
    for (int step = 0; step < 4; ++step) {
        const int pos = first + step * gridSpan;
        if (pos >= n) continue;   // this element lies past the end
        data[pos] = data[pos] * 2.0f;
    }
}
```

---

## Part 3: Occupancy Optimization

### 3.1 Resource Balancing

```cpp
// Query optimal block size
// Asks the runtime for the block size that maximizes occupancy for myKernel,
// and for the smallest grid size that reaches that occupancy.
// The last two arguments are the dynamic shared memory per block in bytes
// (0 = none) and an upper limit on the block size (0 = no limit).
// The returned cudaError_t is ignored here; check it in real code.
int minGridSize, optBlockSize;
cudaOccupancyMaxPotentialBlockSize(
    &minGridSize, &optBlockSize, 
    myKernel, 0, 0);

printf("Optimal block size: %d\n", optBlockSize);
printf("Min grid size: %d\n", minGridSize);
```

### 3.2 Register Pressure

```cpp
// Limit registers to increase occupancy
// __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) tells the
// compiler the largest block this kernel will be launched with and how many
// blocks should be resident per SM, so it can cap register use accordingly.
// Launching with more than 256 threads per block will then fail.
__global__ __launch_bounds__(256, 4)  // 256 threads, 4 blocks/SM
void limitedRegisters(float* data) {
    // Kernel code
}

// Alternative: cap registers for every kernel in the file at compile time.
// Compile with: nvcc -maxrregcount=32 kernel.cu
```

### 3.3 Shared Memory Configuration

```cpp
// Prefer more shared memory over L1 cache
// This sets a per-kernel preference for how on-chip memory is split between
// shared memory and L1. It is a hint: the runtime honours it where possible,
// and it has no effect on devices whose split is fixed.
cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);

// Options:
// cudaFuncCachePreferNone   - No preference
// cudaFuncCachePreferShared - Prefer shared memory
// cudaFuncCachePreferL1     - Prefer L1 cache
// cudaFuncCachePreferEqual  - Equal split
```

---

## Part 4: Concurrency Optimization

### 4.1 Stream Overlap Pattern

```cpp
// Chunk and overlap pattern
// Splits N elements into NUM_STREAMS chunks and issues copy-in, kernel and
// copy-out for each chunk on its own stream, so that one chunk's transfers
// can overlap another chunk's compute.
// NOTE: h_in / h_out should be pinned host memory (cudaMallocHost /
// cudaHostAlloc) for the async copies to actually overlap.
const int NUM_STREAMS = 4;
cudaStream_t streams[NUM_STREAMS];

for (int i = 0; i < NUM_STREAMS; i++) {
    cudaStreamCreate(&streams[i]);
}

// Round the chunk size UP so that no tail elements are dropped when N is not
// a multiple of NUM_STREAMS; the last chunk is simply shorter.
// `blocks` must be large enough to cover chunkSize elements,
// e.g. (chunkSize + threads - 1) / threads.
int chunkSize = (N + NUM_STREAMS - 1) / NUM_STREAMS;

for (int i = 0; i < NUM_STREAMS; i++) {
    int offset = i * chunkSize;
    if (offset >= N) break;              // fewer elements than streams

    // Number of elements this stream really owns (shorter for the last chunk)
    int remaining = N - offset;
    int count = remaining < chunkSize ? remaining : chunkSize;

    // H2D for chunk i
    cudaMemcpyAsync(d_in + offset, h_in + offset,
                    count * sizeof(float),
                    cudaMemcpyHostToDevice, streams[i]);

    // Compute chunk i
    kernel<<<blocks, threads, 0, streams[i]>>>(
        d_out + offset, d_in + offset, count);

    // D2H for chunk i
    cudaMemcpyAsync(h_out + offset, d_out + offset,
                    count * sizeof(float),
                    cudaMemcpyDeviceToHost, streams[i]);
}

// Everything above is asynchronous: wait for each stream before the host
// reads h_out, then release the stream.
for (int i = 0; i < NUM_STREAMS; i++) {
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
}
```

### 4.2 CUDA Graphs for Repeated Patterns

```cpp
// Capture repeating pattern
// Records the work submitted to `stream` into a graph once, then replays the
// whole graph with a single launch call per iteration.
cudaGraph_t graph;
cudaGraphExec_t instance;

cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

// Pattern to repeat
// Only work submitted to `stream` is captured, so each launch must pass
// `stream` as its stream argument.
kernelA<<<...>>>(...);
kernelB<<<...>>>(...);
kernelC<<<...>>>(...);

cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

// Execute efficiently many times
for (int iter = 0; iter < 1000; iter++) {
    cudaGraphLaunch(instance, stream);
}

// Graph launches are asynchronous: wait for them to finish, then release
// the executable graph and the graph it was instantiated from.
cudaStreamSynchronize(stream);
cudaGraphExecDestroy(instance);
cudaGraphDestroy(graph);
```

---

## Optimization Checklist

```
┌─────────────────────────────────────────────────────────────┐
│              CUDA OPTIMIZATION CHECKLIST                    │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  □ MEMORY ACCESS                                            │
│    □ Coalesced global memory access                         │
│    □ Minimize global memory transactions                    │
│    □ Use shared memory for reused data                      │
│    □ Avoid bank conflicts in shared memory                  │
│    □ Use __ldg() for read-only data                         │
│    □ Use pinned memory for host-device transfers            │
│                                                             │
│  □ COMPUTE                                                  │
│    □ Use fast math where precision allows                   │
│    □ Minimize warp divergence                               │
│    □ Use appropriate data types (float vs double)           │
│    □ Unroll loops where beneficial                          │
│    □ Use warp-level primitives                              │
│                                                             │
│  □ OCCUPANCY                                                │
│    □ Use occupancy calculator for block size                │
│    □ Balance registers vs occupancy                         │
│    □ Balance shared memory vs occupancy                     │
│    □ Ensure enough blocks to saturate GPU                   │
│                                                             │
│  □ CONCURRENCY                                              │
│    □ Overlap compute and memory transfers                   │
│    □ Use multiple streams for independent work              │
│    □ Use CUDA Graphs for repeated patterns                  │
│    □ Enable peer access for multi-GPU                       │
│                                                             │
│  □ PROFILING                                                │
│    □ Profile with Nsight Compute                            │
│    □ Check roofline position                                │
│    □ Identify bottlenecks before optimizing                 │
│    □ Measure improvement after each change                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

---

## Complete Optimized Example: Matrix Transpose

```cpp
// optimized_transpose.cu - Fully optimized matrix transpose
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8

// Naive transpose (for comparison)
// One thread per element: reads in[row][col] of a height x width matrix and
// writes it to out[col][row] of the width x height result.
__global__ void transposeNaive(float* out, float* in, int width, int height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= width || row >= height) return;   // outside the matrix

    const int src = row * width + col;
    const int dst = col * height + row;
    out[dst] = in[src];
}

// Optimized transpose with shared memory and bank conflict avoidance
//
// Transposes a row-major `height` x `width` matrix `in` into the row-major
// `width` x `height` matrix `out`, one TILE_DIM x TILE_DIM tile per block.
//
// Launch requirements:
//   block = (TILE_DIM, BLOCK_ROWS); each thread handles TILE_DIM / BLOCK_ROWS
//           rows of the tile, stepping by BLOCK_ROWS.
//   grid  = (ceil(width / TILE_DIM), ceil(height / TILE_DIM)).
// Uses TILE_DIM * (TILE_DIM + 1) floats of static shared memory.
// Edge tiles are handled by the bounds checks, so width and height need not
// be multiples of TILE_DIM.
__global__ void transposeOptimized(float* out, float* in, int width, int height) {
    // +1 padding to avoid bank conflicts
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    
    // Coordinates of this thread's first element in the input matrix
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    
    // Load tile (coalesced read)
    // threadIdx.x runs along an input row, so a warp reads consecutive floats.
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < width && (y + j) < height) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
        }
    }
    
    // Every thread reaches this barrier (it sits outside the bounds checks);
    // the tile must be completely written before any thread reads it back.
    __syncthreads();
    
    // Transposed coordinates
    // Block indices are swapped so that threadIdx.x now runs along a row of
    // the OUTPUT matrix, whose row length is `height`.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    
    // Store transposed tile (coalesced write)
    // The tile is read column-wise (tile[threadIdx.x][...]); the padded row
    // length is what keeps those reads in different banks.
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < height && (y + j) < width) {
            out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Benchmarks transposeOptimized on a 4096 x 4096 float matrix and prints the
// effective bandwidth (one read plus one write of the matrix per run).
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main() {
    const int WIDTH = 4096, HEIGHT = 4096;
    // Widen before multiplying so larger sizes cannot overflow int.
    const size_t bytes = (size_t)WIDTH * HEIGHT * sizeof(float);

    float *d_in = NULL, *d_out = NULL;
    CUDA_CHECK(cudaMalloc(&d_in, bytes));
    CUDA_CHECK(cudaMalloc(&d_out, bytes));
    // Give the input defined contents; the kernel would otherwise read
    // uninitialized device memory.
    CUDA_CHECK(cudaMemset(d_in, 0, bytes));

    dim3 block(TILE_DIM, BLOCK_ROWS);
    dim3 grid((WIDTH + TILE_DIM - 1) / TILE_DIM,
              (HEIGHT + TILE_DIM - 1) / TILE_DIM);

    // Warmup (synchronized so that it is finished, and any launch or
    // execution error is reported, before timing starts)
    transposeOptimized<<<grid, block>>>(d_out, d_in, WIDTH, HEIGHT);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Benchmark
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    const int RUNS = 100;

    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < RUNS; i++) {
        transposeOptimized<<<grid, block>>>(d_out, d_in, WIDTH, HEIGHT);
    }
    CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    // 2 * bytes: every run reads the whole matrix once and writes it once.
    const double secondsPerRun = ms / 1000.0 / RUNS;
    const double bandwidth = 2.0 * (double)bytes / secondsPerRun / 1e9;
    printf("Optimized Transpose: %.2f GB/s\n", bandwidth);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    return 0;
}
```

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────────┐
│              OPTIMIZATION SUMMARY                           │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  1. Profile First                                           │
│     • Identify bottleneck before optimizing                 │
│     • Don't guess, measure                                  │
│                                                             │
│  2. Memory is Usually the Bottleneck                        │
│     • Coalescing is critical                                │
│     • Shared memory for data reuse                          │
│     • Minimize transfers                                    │
│                                                             │
│  3. Occupancy Matters (to a point)                          │
│     • Need enough parallelism to hide latency               │
│     • But higher isn't always better                        │
│                                                             │
│  4. Concurrency for Free Performance                        │
│     • Overlap compute and transfers                         │
│     • Use streams and graphs                                │
│                                                             │
│  5. Iterate and Measure                                     │
│     • One optimization at a time                            │
│     • Verify improvement after each change                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

## Next: Day 4 - Capstone Project