In [None]:
# ⚙️ Setup
# Installs Numba when running on Colab, imports the notebook's Python
# dependencies, and reports which GPU (if any) Numba can see.
import subprocess, sys
try:
    # Importing google.colab succeeds only on Colab, so it doubles as an
    # environment probe; elsewhere the ImportError skips the install step.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Depending on the Numba driver binding in use, the name may be returned
    # as bytes; decode it so the output does not read "b'...'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of staying silent when no device is found.
    print("No CUDA GPU detected; the GPU cells in this notebook need one.")

---

## Part 1: The Problem Graphs Solve

### Kernel Launch Overhead

```
Traditional Stream Execution:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CPU: [Launch K1][Launch K2][Launch K3][Launch K4]
          ↓          ↓          ↓          ↓
GPU:   [  K1  ]  [  K2  ]  [  K3  ]  [  K4  ]

Each launch: ~5-10 μs overhead
If kernels are fast (~10 μs each), launch overhead is roughly 33-50% of total time!

With CUDA Graph:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CPU: [Launch Graph]  (single call!)
          ↓
GPU: [K1][K2][K3][K4]  (all pre-planned)

Launch overhead: ~5-10 μs TOTAL
```

### When Graphs Help

```
✅ Good for Graphs:
• Repetitive workflows (training loops)
• Many small kernels
• Fixed computation pattern
• Inference pipelines

❌ Not Ideal:
• Dynamic control flow
• Frequently changing shapes
• Single large kernel
• One-time computations
```

---

## Part 2: Stream Capture

### CUDA C++ Stream Capture (Primary)

The following example demonstrates creating CUDA graphs via stream capture - the easiest way to create graphs from existing code.

In [None]:
%%writefile graph_capture.cu
// graph_capture.cu - Creating graphs via stream capture
#include <stdio.h>
#include <cuda_runtime.h>

// Multiplies data[i] by `scale` in place for every i < n.
// Each thread handles the single element at its global thread index.
__global__ void scaleKernel(float* data, float scale, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index falls past the end of the array have no work.
    if (idx >= n) return;
    data[idx] = data[idx] * scale;
}

// Adds `value` to data[i] in place for every i < n.
// Each thread handles the single element at its global thread index.
__global__ void addKernel(float* data, float value, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index falls past the end of the array have no work.
    if (idx >= n) return;
    data[idx] = data[idx] + value;
}

int main() {
    const int N = 1 << 20;
    const size_t BYTES = N * sizeof(float);
    
    // Allocate pinned host and device memory
    float *h_data, *d_data;
    cudaMallocHost(&h_data, BYTES);
    cudaMalloc(&d_data, BYTES);
    
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    // ============================================
    // STEP 1: Begin Stream Capture
    // ============================================
    cudaGraph_t graph;
    
    // Start capturing operations
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    
    // All operations on this stream are now recorded (not executed!)
    cudaMemcpyAsync(d_data, h_data, BYTES, cudaMemcpyHostToDevice, stream);
    scaleKernel<<<256, 256, 0, stream>>>(d_data, 2.0f, N);
    addKernel<<<256, 256, 0, stream>>>(d_data, 1.0f, N);
    scaleKernel<<<256, 256, 0, stream>>>(d_data, 0.5f, N);
    cudaMemcpyAsync(h_data, d_data, BYTES, cudaMemcpyDeviceToHost, stream);
    
    // ============================================
    // STEP 2: End Capture and Get Graph
    // ============================================
    cudaStreamEndCapture(stream, &graph);
    
    // ============================================
    // STEP 3: Instantiate Graph (compile it)
    // ============================================
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    
    // ============================================
    // STEP 4: Launch Graph (can do many times!)
    // ============================================
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    const int ITERATIONS = 100;
    
    // Time graph launches
    cudaEventRecord(start);
    for (int i = 0; i < ITERATIONS; i++) {
        cudaGraphLaunch(graphExec, stream);
    }
    cudaEventRecord(stop);
    cudaStreamSynchronize(stream);
    
    float graphTime;
    cudaEventElapsedTime(&graphTime, start, stop);
    
    printf("Graph: %d iterations in %.2f ms (%.2f us/iter)\n",
           ITERATIONS, graphTime, graphTime * 1000 / ITERATIONS);
    
    // ============================================
    // Compare with Stream (no graph)
    // ============================================
    cudaEventRecord(start);
    for (int i = 0; i < ITERATIONS; i++) {
        cudaMemcpyAsync(d_data, h_data, BYTES, cudaMemcpyHostToDevice, stream);
        scaleKernel<<<256, 256, 0, stream>>>(d_data, 2.0f, N);
        addKernel<<<256, 256, 0, stream>>>(d_data, 1.0f, N);
        scaleKernel<<<256, 256, 0, stream>>>(d_data, 0.5f, N);
        cudaMemcpyAsync(h_data, d_data, BYTES, cudaMemcpyDeviceToHost, stream);
    }
    cudaEventRecord(stop);
    cudaStreamSynchronize(stream);
    
    float streamTime;
    cudaEventElapsedTime(&streamTime, start, stop);
    
    printf("Stream: %d iterations in %.2f ms (%.2f us/iter)\n",
           ITERATIONS, streamTime, streamTime * 1000 / ITERATIONS);
    printf("Speedup: %.2fx\n", streamTime / graphTime);
    
    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFreeHost(h_data);
    cudaFree(d_data);
    
    return 0;
}

In [None]:
# Build graph_capture.cu for compute capability 7.5 (sm_75), then run the binary.
!nvcc -arch=sm_75 -o graph_capture graph_capture.cu
!./graph_capture

---

## Part 3: Graph Lifecycle

### The Three Objects

```
┌─────────────────────────────────────────────────────────┐
│                  CUDA GRAPH LIFECYCLE                   │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  1. cudaGraph_t (Template)                              │
│     └─ Definition of operations and dependencies       │
│     └─ Created by capture or explicit construction     │
│     └─ Can be inspected, modified, cloned              │
│                                                         │
│  2. cudaGraphExec_t (Executable)                        │
│     └─ Compiled/instantiated version of graph          │
│     └─ Ready for launch                                 │
│     └─ Some parameters can be updated                   │
│                                                         │
│  3. cudaStream_t (Where it runs)                        │
│     └─ Graph launches into a stream                     │
│     └─ Follows stream ordering rules                    │
│                                                         │
│  Workflow:                                              │
│  Capture/Build → Graph → Instantiate → GraphExec       │
│                                            ↓            │
│                              Launch (many times!)       │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

### Capture Modes

```cpp
// Capture mode options:

// cudaStreamCaptureModeGlobal
// - Strictest mode; the usual choice
// - While the capture is active, potentially unsafe API calls
//   (e.g. cudaMalloc) are prohibited in EVERY thread
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

// cudaStreamCaptureModeThreadLocal
// - Potentially unsafe API calls are prohibited only in the capturing thread;
//   other threads are not restricted by this capture
cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);

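// cudaStreamCaptureModeRelaxed
// - The capturing thread is not prohibited from potentially unsafe API calls;
//   calls that inherently conflict with capture (e.g. cudaEventQuery on an
//   event recorded inside the capture) still fail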
cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
```

---

## Part 4: Capture Rules and Restrictions

### What Can Be Captured?

```
✅ CAN be captured:
• Kernel launches
• cudaMemcpyAsync
• cudaMemsetAsync
• Events (record/wait)
• Child graph launches

❌ CANNOT be captured:
• cudaMemcpy (synchronous!)
• cudaMalloc/cudaFree
• cudaDeviceSynchronize
• CPU operations
• Cross-stream sync (without events)
```

### Multi-Stream Capture

The following example shows how to capture operations across multiple streams into a single graph, creating parallel execution branches.

In [None]:
%%writefile multi_stream_capture.cu
// multi_stream_capture.cu - Capturing multiple streams
#include <stdio.h>
#include <cuda_runtime.h>

// Doubles a[i] in place for every i < n; one element per thread.
__global__ void workA(float* a, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index falls past the end of the array have no work.
    if (idx >= n) return;
    a[idx] = a[idx] * 2.0f;
}

// Adds 1.0f to b[i] in place for every i < n; one element per thread.
__global__ void workB(float* b, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index falls past the end of the array have no work.
    if (idx >= n) return;
    b[idx] = b[idx] + 1.0f;
}

// Writes the element-wise sum c[i] = a[i] + b[i] for every i < n;
// one element per thread.
__global__ void combine(float* a, float* b, float* c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index falls past the end of the arrays have no work.
    if (idx >= n) return;
    const float lhs = a[idx];
    const float rhs = b[idx];
    c[idx] = lhs + rhs;
}

int main() {
    const int N = 1 << 20;
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_c, N * sizeof(float));
    
    cudaStream_t streamMain, streamA, streamB;
    cudaStreamCreate(&streamMain);
    cudaStreamCreate(&streamA);
    cudaStreamCreate(&streamB);
    
    cudaEvent_t forkEvent, joinA, joinB;
    cudaEventCreate(&forkEvent);
    cudaEventCreate(&joinA);
    cudaEventCreate(&joinB);
    
    cudaGraph_t graph;
    
    // Begin capture on main stream
    cudaStreamBeginCapture(streamMain, cudaStreamCaptureModeGlobal);
    
    // Record fork point
    cudaEventRecord(forkEvent, streamMain);
    
    // Stream A waits for fork, does work
    cudaStreamWaitEvent(streamA, forkEvent);
    workA<<<256, 256, 0, streamA>>>(d_a, N);
    cudaEventRecord(joinA, streamA);
    
    // Stream B waits for fork, does work
    cudaStreamWaitEvent(streamB, forkEvent);
    workB<<<256, 256, 0, streamB>>>(d_b, N);
    cudaEventRecord(joinB, streamB);
    
    // Main stream waits for both, combines
    cudaStreamWaitEvent(streamMain, joinA);
    cudaStreamWaitEvent(streamMain, joinB);
    combine<<<256, 256, 0, streamMain>>>(d_a, d_b, d_c, N);
    
    // End capture
    cudaStreamEndCapture(streamMain, &graph);
    
    // Graph now contains:
    //     fork
    //    /    \
    // workA  workB
    //    \    /
    //   combine
    
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    
    // Launch!
    cudaGraphLaunch(graphExec, streamMain);
    cudaStreamSynchronize(streamMain);
    
    printf("Multi-stream graph executed!\n");
    
    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(streamMain);
    cudaStreamDestroy(streamA);
    cudaStreamDestroy(streamB);
    cudaEventDestroy(forkEvent);
    cudaEventDestroy(joinA);
    cudaEventDestroy(joinB);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    
    return 0;
}

In [None]:
# Build multi_stream_capture.cu for compute capability 7.5 (sm_75), then run the binary.
!nvcc -arch=sm_75 -o multi_stream_capture multi_stream_capture.cu
!./multi_stream_capture

In [None]:
# Python/Numba Note (OPTIONAL)
# CUDA Graphs are not directly supported in Numba
# For graph functionality, use CUDA C++ or libraries like CuPy

# The code below is the no-graph baseline: it times repeated individual kernel launches.
@cuda.jit
def kernel1(data):
    """Double each element of ``data`` in place, one element per thread."""
    idx = cuda.grid(1)
    # Threads whose index falls past the end of the array have no work.
    if idx >= data.shape[0]:
        return
    data[idx] = data[idx] * 2.0

@cuda.jit
def kernel2(data):
    """Add 1.0 to each element of ``data`` in place, one element per thread."""
    idx = cuda.grid(1)
    # Threads whose index falls past the end of the array have no work.
    if idx >= data.shape[0]:
        return
    data[idx] = data[idx] + 1.0

# Without graphs: each call has overhead
n = 1 << 18
threads_per_block = 256
blocks = (n + threads_per_block - 1) // threads_per_block  # round up to cover all n
iterations = 100

if cuda.is_available():
    # Start from zeros rather than uninitialized device memory: each
    # kernel1 + kernel2 pass maps x to 2x + 1, so after the warmup and the
    # timed passes the values reach about 2**101, still finite in float32.
    d_data = cuda.to_device(np.zeros(n, dtype=np.float32))

    # Warmup (triggers JIT compilation so it is not part of the timing)
    kernel1[blocks, threads_per_block](d_data)
    kernel2[blocks, threads_per_block](d_data)
    cuda.synchronize()

    # Time with a monotonic, high-resolution clock; synchronize before
    # stopping it because kernel launches return before the GPU finishes.
    start = time.perf_counter()
    for _ in range(iterations):
        kernel1[blocks, threads_per_block](d_data)
        kernel2[blocks, threads_per_block](d_data)
    cuda.synchronize()
    elapsed = time.perf_counter() - start

    print(f"{iterations} iterations (no graphs): {elapsed*1000:.2f} ms")
else:
    # Skip cleanly instead of raising when no GPU is present.
    print("No CUDA GPU available - skipping the Numba timing demo")
print("Note: CUDA Graphs require CUDA C++ for direct usage")

---

## Exercises

### Exercise 1: Basic Capture
Capture a pipeline with 5 kernels and measure speedup.

### Exercise 2: Find Break-Even
At how many kernels per graph (or how many launches) does the one-time capture and instantiation cost pay off?

### Exercise 3: Multi-Stream Graph
Create a graph with fork-join pattern (parallel branches).

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│                 GRAPH BASICS                            │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Stream Capture:                                        │
│  1. cudaStreamBeginCapture(stream, mode)                │
│  2. ... operations ...                                  │
│  3. cudaStreamEndCapture(stream, &graph)                │
│                                                         │
│  Execution:                                             │
│  4. cudaGraphInstantiate(&exec, graph, ...)             │
│  5. cudaGraphLaunch(exec, stream)                       │
│                                                         │
│  Benefits:                                              │
│  • Reduced launch overhead                              │
│  • Pre-planned dependencies                             │
│  • Good for repetitive patterns                         │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 2 - Explicit Graph Construction