In [None]:
# ⚙️ Setup: install Numba when running on Colab, then report the visible GPU.
import subprocess
import sys

try:
    import google.colab  # noqa: F401 -- imported only to detect a Colab runtime
except ImportError:
    # Not on Colab: nothing is installed here.
    pass
else:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    # Some Numba releases expose the device name as bytes; decode it so the
    # message reads "GPU: Tesla T4" rather than "GPU: b'Tesla T4'".
    gpu_name = cuda.get_current_device().name
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")

---

## Part 1: When to Use Graphs

### Decision Matrix

```
╔══════════════════════════════════════════════════════════╗
║               WHEN TO USE CUDA GRAPHS                    ║
╠══════════════════════════════════════════════════════════╣
║                                                          ║
║  ✅ USE GRAPHS when:                                     ║
║  • Same sequence repeated many times (1000+)             ║
║  • Many small kernels (<0.1ms each)                      ║
║  • Fixed workflow with variable data                     ║
║  • Launch overhead is significant portion of time        ║
║  • Inference in production (fixed model)                 ║
║                                                          ║
║  ❌ AVOID GRAPHS when:                                   ║
║  • Dynamic control flow (if/else, variable loops)        ║
║  • One-shot execution                                    ║
║  • Large kernels where launch overhead negligible        ║
║  • Frequently changing graph topology                    ║
║  • Grid/block dimensions change per iteration            ║
║                                                          ║
╚══════════════════════════════════════════════════════════╝
```

### Breakeven Analysis

```cpp
// When does a graph pay off?

// Costs:
// - Capture:     ~100-500μs
// - Instantiate: ~10-50μs
// - Launch:      ~1-5μs (vs ~5-15μs regular)

// Breakeven example:
// Regular: 10 kernels × 10μs launch = 100μs per iteration
// Graph:   Setup 150μs + 5μs per iteration
// Breakeven at 150μs / (100 - 5)μs ≈ 2 iterations

// After 100 iterations:
// Regular: 100 × 100μs = 10,000μs
// Graph:   150μs + 100 × 5μs = 650μs
// → 15x improvement!
```

---

## Part 2: Graph Structure Optimization

### CUDA C++ Optimized Graph (Primary)

This example demonstrates techniques for optimizing graph structure including minimizing depth and fusing operations.

In [None]:
%%writefile graph_optimization.cu
// graph_optimization.cu - Graph structure optimization
#include <stdio.h>
#include <cuda_runtime.h>

// Technique 1: Minimize graph depth
// ─────────────────────────────────
// BAD: Sequential chain
//   A → B → C → D → E → F
//   Depth = 6, minimal parallelism

// GOOD: Parallel branches
//   A → B → C
//             \→ F
//   D → E →  /
//   Depth = 4, more parallelism

// Doubles each of the first n elements of d in place.
// One thread handles one element; threads indexed at or past n do nothing.
__global__ void kernel(float* d, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;
    }
    d[idx] = d[idx] * 2.0f;
}

// Technique 2: Fuse small sequential kernels
// ──────────────────────────────────────────
// Single-pass replacement for three element-wise kernels: for each index
// below n it reads a[i], doubles it, adds one, takes the square root and
// stores the result in c[i]. One thread per element.
// b is accepted for signature compatibility but is neither read nor written.
__global__ void fusedKernel(float* a, float* b, float* c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;
    }

    float value = a[idx];
    value = value * 2.0f;      // formerly kernel 1
    value = value + 1.0f;      // formerly kernel 2
    c[idx] = sqrtf(value);     // formerly kernel 3
}

// Captures a device-to-device copy plus the fused kernel into a CUDA graph,
// prints the node count, then times ITERATIONS replays of the instantiated
// graph with CUDA events. Returns 0 on success, 1 if a CUDA error was
// recorded along the way.
int main() {
    const int N = 1 << 20;
    const int THREADS = 256;
    // Ceil-div so the grid covers every element; a fixed 256-block grid would
    // reach only 65,536 of the 1,048,576 elements.
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    float *d_a = NULL, *d_b = NULL;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // cudaMalloc leaves memory uninitialised; zero the input in stream order
    // (before capture begins) so the captured work reads defined values.
    cudaMemsetAsync(d_a, 0, N * sizeof(float), stream);

    // ============================================
    // Capture Optimized Graph
    // ============================================
    cudaGraph_t graph = NULL;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

    // Device-to-device copy of the first half of d_a into d_b. It is captured
    // on the same stream as the kernel below, so the graph orders the two
    // nodes one after the other (copy, then kernel).
    cudaMemcpyAsync(d_b, d_a, N/2 * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);

    // Fused kernel instead of multiple small ones; it writes all N elements
    // of d_b from d_a.
    fusedKernel<<<BLOCKS, THREADS, 0, stream>>>(d_a, d_a, d_b, N);

    cudaStreamEndCapture(stream, &graph);

    // ============================================
    // Verify Graph Structure
    // ============================================
    size_t numNodes = 0;
    cudaGraphGetNodes(graph, NULL, &numNodes);
    printf("Optimized graph has %zu nodes\n", numNodes);

    cudaGraphExec_t graphExec = NULL;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

    // Warmup
    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);

    // Benchmark
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int ITERATIONS = 1000;
    cudaEventRecord(start, stream);

    for (int i = 0; i < ITERATIONS; i++) {
        cudaGraphLaunch(graphExec, stream);
    }

    cudaEventRecord(stop, stream);

    // The synchronize reports asynchronous execution failures; cudaGetLastError
    // reports any earlier runtime call that failed and was not inspected.
    cudaError_t status = cudaStreamSynchronize(stream);
    if (status == cudaSuccess) {
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // Events are recorded in-stream, so this is the elapsed stream time
        // per replay (launch plus execution), averaged over ITERATIONS.
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("Average launch time: %.2f μs\n", (ms * 1000) / ITERATIONS);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_a);
    cudaFree(d_b);

    return (status == cudaSuccess) ? 0 : 1;
}

---

## Part 3: Graphs with Streams

### Concurrent Graph Execution

```cpp
// concurrent_graphs.cu - Running multiple graphs concurrently
#include <stdio.h>
#include <cuda_runtime.h>

// Replaces each of the first n elements of data with its square root.
// One thread per element; threads indexed at or past n do nothing.
__global__ void process(float* data, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = sqrtf(data[idx]);
}

// Builds NUM_GRAPHS single-kernel graphs, each bound to its own buffer and
// stream, replays all of them 100 times and reports the total elapsed time.
// Returns 0 on success, 1 if a CUDA error was recorded along the way.
int main() {
    const int N = 1 << 18;
    const int NUM_GRAPHS = 4;
    const int THREADS = 256;
    // Ceil-div so every one of the N elements gets a thread; a fixed
    // 256-block grid would cover only 65,536 of the 262,144 elements.
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    float* d_data[NUM_GRAPHS];
    cudaStream_t streams[NUM_GRAPHS];
    cudaGraphExec_t graphExecs[NUM_GRAPHS];

    // ============================================
    // Create Multiple Graphs
    // ============================================
    for (int i = 0; i < NUM_GRAPHS; i++) {
        cudaMalloc(&d_data[i], N * sizeof(float));
        cudaStreamCreate(&streams[i]);

        // cudaMalloc leaves memory uninitialised; zero it in stream order
        // (before capture begins) so sqrtf operates on defined input.
        cudaMemsetAsync(d_data[i], 0, N * sizeof(float), streams[i]);

        // Capture graph for this stream
        cudaGraph_t graph = NULL;
        cudaStreamBeginCapture(streams[i], cudaStreamCaptureModeGlobal);

        process<<<BLOCKS, THREADS, 0, streams[i]>>>(d_data[i], N);

        cudaStreamEndCapture(streams[i], &graph);
        graphExecs[i] = NULL;
        cudaGraphInstantiate(&graphExecs[i], graph, NULL, NULL, 0);
        cudaGraphDestroy(graph);  // Can destroy after instantiate
    }

    // ============================================
    // Launch All Graphs Concurrently
    // ============================================
    // Different graphs on different streams are free to overlap on the GPU.

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);

    for (int iter = 0; iter < 100; iter++) {
        for (int i = 0; i < NUM_GRAPHS; i++) {
            // Each graph on its own stream
            cudaGraphLaunch(graphExecs[i], streams[i]);
        }
    }

    // Wait for every stream and keep the first failure, if any.
    cudaError_t status = cudaSuccess;
    for (int i = 0; i < NUM_GRAPHS; i++) {
        cudaError_t syncStatus = cudaStreamSynchronize(streams[i]);
        if (status == cudaSuccess) {
            status = syncStatus;
        }
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    // Pick up any earlier runtime call that failed and was not inspected.
    if (status == cudaSuccess) {
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("%d concurrent graphs, 100 iterations: %.2f ms\n", NUM_GRAPHS, ms);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    for (int i = 0; i < NUM_GRAPHS; i++) {
        cudaGraphExecDestroy(graphExecs[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_data[i]);
    }

    return (status == cudaSuccess) ? 0 : 1;
}
```

---

## Part 4: Real-World Pattern - Inference Pipeline

### CUDA C++ Inference Graph (Primary)

```cpp
// inference_graph.cu - Neural network inference pattern
#include <stdio.h>
#include <cuda_runtime.h>

// Simplified layer kernels
// Dense layer: out[r] = b[r] + sum over i of W[r * in_dim + i] * in[i].
// One thread computes one output row r; threads with r >= out_dim do nothing.
// W is indexed row-major as out_dim rows of in_dim values.
__global__ void linearLayer(float* out, float* in, float* W, float* b,
                            int out_dim, int in_dim) {
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    if (row >= out_dim) {
        return;
    }

    const float* rowWeights = W + row * in_dim;
    float acc = b[row];
    for (int col = 0; col < in_dim; ++col) {
        acc += rowWeights[col] * in[col];
    }
    out[row] = acc;
}

// In-place ReLU: clamps each of the first n elements of data to be >= 0.
// One thread per element; threads indexed at or past n do nothing.
__global__ void relu(float* data, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = fmaxf(0.0f, data[idx]);
}

// Numerically stable softmax of in[0..n) written to out[0..n).
//
// Launch layout: a single 1-D block (<<<1, T>>>, T <= 1024). Indexing uses
// threadIdx.x only, so any additional blocks would just repeat the same work.
// Uses 4 KB of static shared memory.
//
// Compared with a plain exp/sum/divide:
//  - the maximum is subtracted before expf, so large logits do not overflow;
//  - block-stride loops cover n larger than the block size;
//  - the sum uses a fixed-order tree reduction instead of float atomics, so
//    the result is deterministic for a given block size.
//
// out may alias in: every read of in needed for the max and the sum happens
// before a barrier, and in the final pass each thread reads in[i] before
// writing out[i] for the same i.
__global__ void softmax(float* out, float* in, int n) {
    __shared__ float scratch[1024];

    const int tid = threadIdx.x;
    const int nthreads = blockDim.x;

    // Pass 1: block-wide maximum of the inputs.
    float localMax = (n > 0) ? in[0] : 0.0f;
    for (int i = tid; i < n; i += nthreads) {
        localMax = fmaxf(localMax, in[i]);
    }
    scratch[tid] = localMax;
    __syncthreads();

    // Tree reduction that works for any block size (not only powers of two):
    // each step folds the upper part [half, active) onto the lower part.
    for (int active = nthreads; active > 1; ) {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            scratch[tid] = fmaxf(scratch[tid], scratch[tid + half]);
        }
        __syncthreads();
        active = half;
    }
    const float maxVal = scratch[0];
    __syncthreads();  // all threads have read scratch[0] before it is reused

    // Pass 2: block-wide sum of exp(x - max).
    float localSum = 0.0f;
    for (int i = tid; i < n; i += nthreads) {
        localSum += expf(in[i] - maxVal);
    }
    scratch[tid] = localSum;
    __syncthreads();

    for (int active = nthreads; active > 1; ) {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            scratch[tid] += scratch[tid + half];
        }
        __syncthreads();
        active = half;
    }
    const float total = scratch[0];

    // Pass 3: normalise.
    for (int i = tid; i < n; i += nthreads) {
        out[i] = expf(in[i] - maxVal) / total;
    }
}

// State shared by buildInferenceGraph and runInference for one three-layer
// network: activation buffers, per-layer parameters, the stream all work is
// issued on, the instantiated graph, and the layer sizes.
struct InferenceContext {
    // Activation buffers, in the order data flows through the layers.
    float* d_input;
    float* d_hidden1;
    float* d_hidden2;
    float* d_output;

    // Weights (W) and biases (b) passed to linearLayer for layers 1..3.
    float* d_W1;
    float* d_b1;
    float* d_W2;
    float* d_b2;
    float* d_W3;
    float* d_b3;

    // Stream used for capture, graph launch and the host<->device copies.
    cudaStream_t stream;
    // Executable graph produced by buildInferenceGraph.
    cudaGraphExec_t graphExec;

    // Layer widths (element counts).
    int input_dim;
    int hidden_dim;
    int output_dim;
};

// Captures the three-layer forward pass (linear+ReLU, linear+ReLU,
// linear+softmax) on ctx->stream and stores the instantiated graph in
// ctx->graphExec.
//
// Launch sizes are derived from ctx->hidden_dim / ctx->output_dim rather than
// fixed at one 256-thread block, so layers wider than 256 units are fully
// covered. Capture or instantiation failures are reported on stderr; in that
// case ctx->graphExec is not set by this call.
void buildInferenceGraph(InferenceContext* ctx) {
    const int kThreads = 256;

    // Ceil-div grid sizes, with a floor of one block so the launch
    // configuration stays valid even for a zero-sized layer.
    int hiddenBlocks = (ctx->hidden_dim + kThreads - 1) / kThreads;
    if (hiddenBlocks < 1) hiddenBlocks = 1;
    int outputBlocks = (ctx->output_dim + kThreads - 1) / kThreads;
    if (outputBlocks < 1) outputBlocks = 1;

    // softmax runs in a single block: one thread per output where possible,
    // rounded up to a whole warp and capped at the 1024-thread block limit.
    int softmaxThreads = ((ctx->output_dim + 31) / 32) * 32;
    if (softmaxThreads < 32) softmaxThreads = 32;
    if (softmaxThreads > 1024) softmaxThreads = 1024;

    cudaGraph_t graph = NULL;
    cudaError_t err = cudaStreamBeginCapture(ctx->stream,
                                             cudaStreamCaptureModeGlobal);
    if (err != cudaSuccess) {
        fprintf(stderr, "buildInferenceGraph: begin capture failed: %s\n",
                cudaGetErrorString(err));
        return;
    }

    // Layer 1: Linear + ReLU
    linearLayer<<<hiddenBlocks, kThreads, 0, ctx->stream>>>(
        ctx->d_hidden1, ctx->d_input,
        ctx->d_W1, ctx->d_b1,
        ctx->hidden_dim, ctx->input_dim);
    relu<<<hiddenBlocks, kThreads, 0, ctx->stream>>>(
        ctx->d_hidden1, ctx->hidden_dim);

    // Layer 2: Linear + ReLU
    linearLayer<<<hiddenBlocks, kThreads, 0, ctx->stream>>>(
        ctx->d_hidden2, ctx->d_hidden1,
        ctx->d_W2, ctx->d_b2,
        ctx->hidden_dim, ctx->hidden_dim);
    relu<<<hiddenBlocks, kThreads, 0, ctx->stream>>>(
        ctx->d_hidden2, ctx->hidden_dim);

    // Layer 3: Linear + Softmax (softmax is applied in place on d_output)
    linearLayer<<<outputBlocks, kThreads, 0, ctx->stream>>>(
        ctx->d_output, ctx->d_hidden2,
        ctx->d_W3, ctx->d_b3,
        ctx->output_dim, ctx->hidden_dim);
    softmax<<<1, softmaxThreads, 0, ctx->stream>>>(
        ctx->d_output, ctx->d_output, ctx->output_dim);

    // Always end the capture so the stream leaves capture mode, even if a
    // launch above invalidated it.
    err = cudaStreamEndCapture(ctx->stream, &graph);
    if (err != cudaSuccess) {
        fprintf(stderr, "buildInferenceGraph: capture failed: %s\n",
                cudaGetErrorString(err));
        if (graph != NULL) cudaGraphDestroy(graph);
        return;
    }

    err = cudaGraphInstantiate(&ctx->graphExec, graph, NULL, NULL, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "buildInferenceGraph: instantiate failed: %s\n",
                cudaGetErrorString(err));
    }

    // The executable graph is independent of the template once instantiated.
    cudaGraphDestroy(graph);
}

// Runs one forward pass: uploads input_dim floats from h_input, replays the
// pre-built graph, downloads output_dim floats into h_output, and blocks
// until all three steps have finished on ctx->stream.
void runInference(InferenceContext* ctx, float* h_input, float* h_output) {
    cudaStream_t s = ctx->stream;
    const size_t inputBytes = ctx->input_dim * sizeof(float);
    const size_t outputBytes = ctx->output_dim * sizeof(float);

    // Host -> device: stage the input.
    cudaMemcpyAsync(ctx->d_input, h_input, inputBytes,
                    cudaMemcpyHostToDevice, s);

    // Replay the captured network.
    cudaGraphLaunch(ctx->graphExec, s);

    // Device -> host: fetch the result.
    cudaMemcpyAsync(h_output, ctx->d_output, outputBytes,
                    cudaMemcpyDeviceToHost, s);

    // h_output is only valid once the stream has drained.
    cudaStreamSynchronize(s);
}

// Sets up a 784-256-256-10 network with zeroed parameters, builds the
// inference graph once, replays it 10000 times, then releases every resource.
// Returns 0 on success, 1 if a CUDA error was recorded along the way.
int main() {
    InferenceContext ctx;
    ctx.input_dim = 784;    // e.g., MNIST
    ctx.hidden_dim = 256;
    ctx.output_dim = 10;
    ctx.graphExec = NULL;

    const size_t inBytes  = ctx.input_dim  * sizeof(float);
    const size_t hidBytes = ctx.hidden_dim * sizeof(float);
    const size_t outBytes = ctx.output_dim * sizeof(float);
    const size_t w1Bytes  = (size_t)ctx.hidden_dim * ctx.input_dim  * sizeof(float);
    const size_t w2Bytes  = (size_t)ctx.hidden_dim * ctx.hidden_dim * sizeof(float);
    const size_t w3Bytes  = (size_t)ctx.output_dim * ctx.hidden_dim * sizeof(float);

    // Allocate activation buffers
    cudaStreamCreate(&ctx.stream);
    cudaMalloc(&ctx.d_input, inBytes);
    cudaMalloc(&ctx.d_hidden1, hidBytes);
    cudaMalloc(&ctx.d_hidden2, hidBytes);
    cudaMalloc(&ctx.d_output, outBytes);

    // Allocate weights and biases. The kernels dereference these pointers, so
    // they must be real device allocations before the graph is launched.
    cudaMalloc(&ctx.d_W1, w1Bytes);
    cudaMalloc(&ctx.d_b1, hidBytes);
    cudaMalloc(&ctx.d_W2, w2Bytes);
    cudaMalloc(&ctx.d_b2, hidBytes);
    cudaMalloc(&ctx.d_W3, w3Bytes);
    cudaMalloc(&ctx.d_b3, outBytes);

    // Placeholder parameters: all zeros (a real application would upload
    // trained weights here).
    cudaMemset(ctx.d_W1, 0, w1Bytes);
    cudaMemset(ctx.d_b1, 0, hidBytes);
    cudaMemset(ctx.d_W2, 0, w2Bytes);
    cudaMemset(ctx.d_b2, 0, hidBytes);
    cudaMemset(ctx.d_W3, 0, w3Bytes);
    cudaMemset(ctx.d_b3, 0, outBytes);
    // Make sure initialisation has finished before capture and replay start.
    cudaDeviceSynchronize();

    // Build graph ONCE
    buildInferenceGraph(&ctx);

    // Run inference 10000 times
    float h_input[784] = {0.0f};
    float h_output[10] = {0.0f};
    for (int i = 0; i < 10000; i++) {
        // The six kernels are submitted with a single graph launch; the
        // remaining per-iteration host work is the two copies and the sync.
        runInference(&ctx, h_input, h_output);
    }

    // Surface any runtime call that failed and was not inspected.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        printf("Completed 10000 inference runs\n");
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Cleanup
    if (ctx.graphExec != NULL) {
        cudaGraphExecDestroy(ctx.graphExec);
    }
    cudaFree(ctx.d_W1);
    cudaFree(ctx.d_b1);
    cudaFree(ctx.d_W2);
    cudaFree(ctx.d_b2);
    cudaFree(ctx.d_W3);
    cudaFree(ctx.d_b3);
    cudaFree(ctx.d_input);
    cudaFree(ctx.d_hidden1);
    cudaFree(ctx.d_hidden2);
    cudaFree(ctx.d_output);
    cudaStreamDestroy(ctx.stream);

    return (status == cudaSuccess) ? 0 : 1;
}
```

---

## Part 5: Best Practices Summary

### Graph Lifecycle

```cpp
// BEST PRACTICE: Proper lifecycle management
// Illustrative fragment (not compilable as-is): `stream`, `parametersChanged`
// and the SetParams arguments are placeholders supplied by the application.

// 1. Create/capture graph
cudaGraph_t graph;
// ... capture or build ...

// 2. Instantiate once
//    Instantiation is the expensive step; the executable is what gets replayed.
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

// 3. Destroy template (optional - saves memory)
//    NOTE(review): cudaGraphExecKernelNodeSetParams (step 4) identifies the
//    node by its handle in the original graph, so if parameters will be
//    updated this way the template presumably has to stay alive until the
//    last update - confirm against the CUDA Runtime API docs before
//    destroying it here.
cudaGraphDestroy(graph);

// 4. Launch many times
for (int i = 0; i < 1000000; i++) {
    // Update if needed
    // Patching the existing executable avoids re-capturing and
    // re-instantiating the graph.
    if (parametersChanged) {
        cudaGraphExecKernelNodeSetParams(...);
    }
    cudaGraphLaunch(graphExec, stream);
}

// 5. Cleanup at end
cudaGraphExecDestroy(graphExec);

### Common Mistakes

```
❌ MISTAKE 1: Rebuilding graphs every iteration
   → Build once, update parameters

❌ MISTAKE 2: Using graphs for single-shot work
   → Use regular launches for one-off tasks

❌ MISTAKE 3: Capturing allocations in graph
   → Allocate outside, only use inside

❌ MISTAKE 4: Forgetting to sync before reading results
   → Always cudaStreamSynchronize before host access

❌ MISTAKE 5: Using host-side conditionals in capture
   → All conditionals evaluated at capture time!
```

---

## Exercises

### Exercise 1: Breakeven Analysis
Measure at what iteration count graphs become faster than regular launches.

### Exercise 2: Multi-Stream Graphs
Create a graph that uses multiple internal streams.

### Exercise 3: Complete Inference Pipeline
Extend the inference example with memcpy nodes in the graph.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           GRAPH OPTIMIZATION BEST PRACTICES             │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  When to Use:                                           │
│  • Many small kernels (launch-bound)                    │
│  • Repeated execution (100+ times)                      │
│  • Fixed topology with variable data                    │
│                                                         │
│  Optimization Tips:                                     │
│  • Minimize graph depth                                 │
│  • Fuse small kernels                                   │
│  • Maximize parallelism (fork-join)                     │
│  • Use updates, not rebuilds                            │
│                                                         │
│  Lifecycle:                                             │
│  • Capture/build → Instantiate → Launch(N) → Destroy    │
│  • Can destroy template after instantiate               │
│                                                         │
│  Concurrency:                                           │
│  • Different graphs on different streams = parallel     │
│  • Same graphExec = serialized                          │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Week 10 Complete! Next: Week 11 - Cooperative Groups & Dynamic Parallelism