## Why CUDA Graphs?

**Problem**: Each kernel launch has CPU overhead (~5-10μs per launch)

**Solution**: CUDA Graphs capture entire workflows and replay them with minimal overhead

| Approach | Launch Overhead |
|----------|-----------------|
| Regular launches | ~5-10 μs per kernel |
| CUDA Graph launch | ~10 μs total (for the entire graph) |

In [None]:
%%writefile cuda_graphs_basics.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Adds 1.0f to each of the first n elements of data.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void kernel1(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    data[i] += 1.0f;
}

// Doubles each of the first n elements of data.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void kernel2(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    data[i] *= 2.0f;
}

// Subtracts 1.0f from each of the first n elements of data.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void kernel3(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    data[i] -= 1.0f;
}

// Evaluates a CUDA runtime call; on failure prints the error location and
// message to stderr and makes the enclosing function return 1.
// Only usable inside functions that return int (here: main).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t status_ = (call);                                     \
        if (status_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(status_));               \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Captures kernel1 -> kernel2 -> kernel3 into a CUDA graph via stream
// capture, replays the graph 10 times on a zero-filled buffer, and prints
// element 0 of the result.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main() {
    const int N = 1 << 20;
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;  // ceil-div so every element is covered

    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_data, 0, N * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // === METHOD 1: Stream Capture ===
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;

    // Begin capture
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    // These are captured, NOT executed yet.
    // cudaGetLastError() surfaces launch-configuration errors, which a
    // kernel launch cannot report directly.
    kernel1<<<blocks, threads, 0, stream>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    kernel2<<<blocks, threads, 0, stream>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    kernel3<<<blocks, threads, 0, stream>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());

    // End capture
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    // Create executable graph
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    // Launch graph multiple times
    printf("Launching graph 10 times...\n");
    for (int i = 0; i < 10; i++) {
        CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Verify result: one replay maps x to (x + 1) * 2 - 1 = 2x + 1.
    // Starting from 0 the sequence is 1, 3, 7, ..., so after 10 replays
    // the value is 2^10 - 1 = 1023.
    float h_result;
    CUDA_CHECK(cudaMemcpy(&h_result, d_data, sizeof(float), cudaMemcpyDeviceToHost));
    printf("Result after 10 graph launches: %.0f\n", h_result);

    // Cleanup
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}

In [None]:
!nvcc cuda_graphs_basics.cu -o cuda_graphs_basics && ./cuda_graphs_basics

## Manual Graph Construction

Build graphs programmatically with full control over node dependencies.

In [None]:
%%writefile cuda_graphs_manual.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Writes val into each of the first n elements of data.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void initKernel(float* data, int n, float val) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    data[i] = val;
}

// Replaces each of the first n elements of data with its square.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void squareKernel(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    const float v = data[i];
    data[i] = v * v;
}

// Evaluates a CUDA runtime call; on failure prints the error location and
// message to stderr and makes the enclosing function return 1.
// Only usable inside functions that return int (here: main).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t status_ = (call);                                     \
        if (status_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(status_));               \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Builds a two-node graph by hand (initKernel -> squareKernel), launches it
// once, and prints element 0 of the buffer.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main() {
    const int N = 1024;
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;  // ceil-div so every element is covered

    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(float)));

    // Create empty graph
    cudaGraph_t graph;
    CUDA_CHECK(cudaGraphCreate(&graph, 0));

    // Add kernel nodes manually
    cudaGraphNode_t initNode, squareNode;

    // Kernel parameters for initKernel.
    // kernelParams holds one pointer per kernel argument, in declaration
    // order: (float* data, int n, float val).
    float initVal = 3.0f;
    void* initArgs[] = {&d_data, (void*)&N, &initVal};
    cudaKernelNodeParams initParams = {};
    initParams.func = (void*)initKernel;
    initParams.gridDim = dim3(blocks);
    initParams.blockDim = dim3(threads);
    initParams.sharedMemBytes = 0;
    initParams.kernelParams = initArgs;
    initParams.extra = nullptr;

    // Add init node (no dependencies)
    CUDA_CHECK(cudaGraphAddKernelNode(&initNode, graph, nullptr, 0, &initParams));

    // Kernel parameters for squareKernel: (float* data, int n)
    void* squareArgs[] = {&d_data, (void*)&N};
    cudaKernelNodeParams squareParams = {};
    squareParams.func = (void*)squareKernel;
    squareParams.gridDim = dim3(blocks);
    squareParams.blockDim = dim3(threads);
    squareParams.sharedMemBytes = 0;
    squareParams.kernelParams = squareArgs;
    squareParams.extra = nullptr;

    // Add square node (depends on init)
    cudaGraphNode_t deps[] = {initNode};
    CUDA_CHECK(cudaGraphAddKernelNode(&squareNode, graph, deps, 1, &squareParams));

    // Instantiate and launch
    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    // Synchronizing also surfaces any error raised while the kernels ran.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    float h_result;
    CUDA_CHECK(cudaMemcpy(&h_result, d_data, sizeof(float), cudaMemcpyDeviceToHost));
    printf("Result: init(3.0) then square = %.0f (expected 9)\n", h_result);

    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}

In [None]:
!nvcc cuda_graphs_manual.cu -o cuda_graphs_manual && ./cuda_graphs_manual

## Graph Update (Parameter Changes)

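Update the parameters of an already-instantiated executable graph in place with `cudaGraphExecUpdate`, avoiding the cost of re-instantiating it. The example below re-captures the same work with a new `scale` value and applies the captured graph to the existing executable.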

In [None]:
%%writefile cuda_graphs_update.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Multiplies each of the first n elements of data by scale.
// Each thread derives one flat index from the x dimension of the launch
// configuration and handles at most one element.
__global__ void scaleKernel(float* data, int n, float scale) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) return;  // threads past the end of the buffer do nothing
    data[i] *= scale;
}

// Evaluates a CUDA runtime call; on failure prints the error location and
// message to stderr and makes the enclosing function return 1.
// Only usable inside functions that return int (here: main).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t status_ = (call);                                     \
        if (status_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(status_));               \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Captures scaleKernel with scale = 2.0 into a graph and launches it, then
// re-captures with scale = 3.0 and applies the new graph to the existing
// executable with cudaGraphExecUpdate before launching again.
// The buffer starts at 1.0, so element 0 is 2.0 after the first launch and
// 6.0 after the second.
// Returns 0 on success, 1 if any CUDA call reports an unrecoverable error.
int main() {
    const int N = 1024;
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;  // ceil-div so every element is covered

    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(float)));

    float h_data[N];
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Capture initial graph with scale = 2.0
    float scale = 2.0f;
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;

    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    scaleKernel<<<blocks, threads, 0, stream>>>(d_data, N, scale);
    CUDA_CHECK(cudaGetLastError());  // launch errors are not returned by <<<>>>
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    // Launch with scale = 2.0
    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    float h_result;
    CUDA_CHECK(cudaMemcpy(&h_result, d_data, sizeof(float), cudaMemcpyDeviceToHost));
    printf("After scale=2.0: %.1f\n", h_result);

    // Update graph with new scale = 3.0
    cudaGraph_t newGraph;
    scale = 3.0f;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    scaleKernel<<<blocks, threads, 0, stream>>>(d_data, N, scale);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamEndCapture(stream, &newGraph));

    // Update existing executable with new graph
    cudaGraphExecUpdateResultInfo updateResult;
    cudaError_t updateStatus = cudaGraphExecUpdate(graphExec, newGraph, &updateResult);

    if (updateStatus == cudaSuccess &&
        updateResult.result == cudaGraphExecUpdateSuccess) {
        printf("Graph updated successfully!\n");
    } else {
        // The executable could not be updated in place. Launching it now
        // would silently replay the old scale, so rebuild it from the new
        // graph instead.
        (void)cudaGetLastError();  // clear the recorded update failure
        fprintf(stderr, "Graph update failed (%s); re-instantiating.\n",
                cudaGetErrorString(updateStatus));
        CUDA_CHECK(cudaGraphExecDestroy(graphExec));
        CUDA_CHECK(cudaGraphInstantiate(&graphExec, newGraph, NULL, NULL, 0));
    }

    // Launch updated graph
    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(&h_result, d_data, sizeof(float), cudaMemcpyDeviceToHost));
    printf("After scale=3.0: %.1f\n", h_result);

    CUDA_CHECK(cudaGraphDestroy(newGraph));
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}

In [None]:
!nvcc cuda_graphs_update.cu -o cuda_graphs_update && ./cuda_graphs_update

## Performance Comparison

In [None]:
%%writefile cuda_graphs_benchmark.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Adds 1.0f to data[threadIdx.x].
// There is no bounds check and the block index is not used: the caller must
// provide at least blockDim.x elements, and every block of a launch touches
// the same elements.
__global__ void tinyKernel(float* data) {
    const unsigned int lane = threadIdx.x;
    data[lane] = data[lane] + 1.0f;
}

// Evaluates a CUDA runtime call; on failure prints the error location and
// message to stderr and makes the enclosing function return 1.
// Only usable inside functions that return int (here: main).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t status_ = (call);                                     \
        if (status_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(status_));               \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Times KERNELS x ITERATIONS launches of tinyKernel issued individually
// against the same work replayed as a captured graph, and prints both
// timings plus their ratio.
// Both variants run on the same stream and are bracketed by events recorded
// on that stream, so the measurement does not rely on default-stream
// synchronization semantics.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main() {
    const int THREADS = 256;
    const int KERNELS = 100;
    const int ITERATIONS = 1000;

    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, THREADS * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_data, 0, THREADS * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Warm-up: the first launch pays one-time initialization costs that
    // should not be attributed to either variant.
    tinyKernel<<<1, THREADS, 0, stream>>>(d_data);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Benchmark regular launches
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int iter = 0; iter < ITERATIONS; iter++) {
        for (int k = 0; k < KERNELS; k++) {
            tinyKernel<<<1, THREADS, 0, stream>>>(d_data);
        }
    }
    CUDA_CHECK(cudaEventRecord(stop, stream));
    // Checked once after the loop so the error query is not part of the
    // per-launch cost being measured.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));
    float regularMs;
    CUDA_CHECK(cudaEventElapsedTime(&regularMs, start, stop));

    // Capture graph
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;

    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int k = 0; k < KERNELS; k++) {
        tinyKernel<<<1, THREADS, 0, stream>>>(d_data);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    // Warm-up graph launch, kept outside the timed region.
    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Benchmark graph launches
    CUDA_CHECK(cudaEventRecord(start, stream));
    for (int iter = 0; iter < ITERATIONS; iter++) {
        CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    }
    // The stop event is ordered after the launches on the same stream, so no
    // host-side stream synchronize is needed inside the timed region.
    CUDA_CHECK(cudaEventRecord(stop, stream));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float graphMs;
    CUDA_CHECK(cudaEventElapsedTime(&graphMs, start, stop));

    printf("Launching %d tiny kernels x %d iterations:\n", KERNELS, ITERATIONS);
    printf("Regular launches: %.2f ms\n", regularMs);
    printf("Graph launches:   %.2f ms\n", graphMs);
    printf("Speedup: %.1fx\n", regularMs / graphMs);

    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

In [None]:
!nvcc cuda_graphs_benchmark.cu -o cuda_graphs_benchmark && ./cuda_graphs_benchmark

## CUDA Graph Node Types

| Node Type | Purpose |
|-----------|--------|
| `cudaGraphAddKernelNode` | Kernel execution |
| `cudaGraphAddMemcpyNode` | Memory copy |
| `cudaGraphAddMemsetNode` | Memory set |
| `cudaGraphAddHostNode` | CPU callback |
| `cudaGraphAddChildGraphNode` | Nested graph |
| `cudaGraphAddEventRecordNode` | Record event |
| `cudaGraphAddEventWaitNode` | Wait for event |

## Key Takeaways

1. **Stream Capture** - Easiest way to create graphs from existing code
2. **Manual Construction** - Full control over dependencies
3. **Graph Update** - Change parameters of an executable graph without re-instantiating it
4. **Best for** - Repetitive workflows with many small kernels
5. **Overhead reduction** - 10-100x for launch-bound workloads