In [None]:
# ⚙️ Setup
# On Google Colab, install Numba before importing it. Outside Colab nothing is
# installed here, so the environment must already provide Numba.
import subprocess, sys
try:
    import google.colab  # imported only to detect that we are running on Colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    # Depending on the Numba release, the device name may be bytes or str;
    # normalise it so the output never shows up as b'...'.
    gpu_name = cuda.get_current_device().name
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # The compiled CUDA examples in this notebook need a GPU, so report its
    # absence explicitly instead of staying silent.
    print("No CUDA-capable GPU detected; the CUDA C++ cells in this notebook will not run.")

---

## Part 1: Why Update Graphs?

### The Problem

```
Scenario: Training loop with changing data

Without Updates (slow):
━━━━━━━━━━━━━━━━━━━━━━━
for epoch in range(100):
    for batch in batches:
        # Rebuild graph every time! (expensive)
        capture_graph(batch)  # ~100ms
        instantiate()         # ~10ms
        launch()              # ~0.1ms

With Updates (fast):
━━━━━━━━━━━━━━━━━━━━━━━
# Build once
capture_graph(first_batch)
instantiate()

for epoch in range(100):
    for batch in batches:
        update_parameters(batch)  # ~1μs
        launch()                  # ~0.1ms
```

---

## Part 2: Updating Kernel Parameters

### CUDA C++ Kernel Node Updates (Primary)

This example shows how to efficiently update kernel parameters without rebuilding the graph - essential for training loops and batch processing.

In [None]:
%%writefile graph_update_kernel.cu
// graph_update_kernel.cu - Updating kernel parameters
#include <stdio.h>
#include <cuda_runtime.h>

// Element-wise affine transform: data[i] = data[i] * scale + add.
// Each thread handles the single element at its global index; threads whose
// index is not below n do nothing.
__global__ void scaleAdd(float* data, float scale, float add, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    const float current = data[idx];
    data[idx] = current * scale + add;
}

// Abort main() with a diagnostic when a CUDA runtime call fails.
// Only usable inside main(): it bails out with `return 1` and relies on
// process exit to release any resources that were already acquired.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Demonstrates two ways of changing kernel arguments in an instantiated graph:
//   1. cudaGraphExecKernelNodeSetParams on a single kernel node, and
//   2. cudaGraphExecUpdate with a freshly captured graph of the same topology.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int N = 1 << 20;
    // scaleAdd handles one element per thread, so the grid must cover all N
    // elements (a fixed 256x256 launch would only touch the first 65536).
    const int THREADS_PER_BLOCK = 256;
    const int NUM_BLOCKS = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(float)));
    // Give the kernel defined input instead of uninitialised device memory.
    CUDA_CHECK(cudaMemset(d_data, 0, N * sizeof(float)));

    // Initial parameters
    float scale = 2.0f;
    float add = 1.0f;
    int n = N;

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // ============================================
    // Capture Initial Graph
    // ============================================
    cudaGraph_t graph;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    scaleAdd<<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, stream>>>(d_data, scale, add, n);

    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    // ============================================
    // Method 1: Update via Node
    // ============================================

    // Get the nodes of the captured graph (first call only queries the count)
    size_t numNodes = 0;
    CUDA_CHECK(cudaGraphGetNodes(graph, NULL, &numNodes));
    cudaGraphNode_t* nodes = new cudaGraphNode_t[numNodes];
    CUDA_CHECK(cudaGraphGetNodes(graph, nodes, &numNodes));

    // Find the kernel node
    cudaGraphNode_t kernelNode = NULL;
    for (size_t i = 0; i < numNodes && kernelNode == NULL; i++) {
        cudaGraphNodeType type;
        CUDA_CHECK(cudaGraphNodeGetType(nodes[i], &type));
        if (type == cudaGraphNodeTypeKernel) {
            kernelNode = nodes[i];
        }
    }
    // The node handle stays valid as long as `graph` exists; the array that
    // listed it is no longer needed.
    delete[] nodes;

    if (kernelNode == NULL) {
        fprintf(stderr, "No kernel node found in captured graph\n");
        return 1;
    }

    // Update parameters
    float newScale = 3.0f;
    float newAdd = 2.0f;

    // Start from the node's current launch configuration so that only the
    // argument list changes.
    cudaKernelNodeParams params;
    CUDA_CHECK(cudaGraphKernelNodeGetParams(kernelNode, &params));

    // Update kernel arguments
    void* newArgs[] = { &d_data, &newScale, &newAdd, &n };
    params.kernelParams = newArgs;

    // Apply update to executable graph
    CUDA_CHECK(cudaGraphExecKernelNodeSetParams(graphExec, kernelNode, &params));

    // Launch with new parameters!
    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    printf("Updated kernel with scale=%.1f, add=%.1f\n", newScale, newAdd);

    // ============================================
    // Method 2: cudaGraphExecUpdate (whole graph)
    // ============================================

    // Capture a new graph with different parameters
    cudaGraph_t newGraph;
    float anotherScale = 4.0f;

    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    scaleAdd<<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, stream>>>(d_data, anotherScale, add, n);
    CUDA_CHECK(cudaStreamEndCapture(stream, &newGraph));

    // Update executable from new graph (must have same topology!)
    // CUDA 12 changed cudaGraphExecUpdate to report through a result-info
    // struct; older toolkits use separate error-node / result out-parameters.
#if CUDART_VERSION >= 12000
    cudaGraphExecUpdateResultInfo updateInfo;
    cudaError_t updateStatus = cudaGraphExecUpdate(graphExec, newGraph, &updateInfo);
    cudaGraphExecUpdateResult updateResult = updateInfo.result;
#else
    cudaGraphNode_t errorNode = NULL;
    cudaGraphExecUpdateResult updateResult = cudaGraphExecUpdateError;
    cudaError_t updateStatus =
        cudaGraphExecUpdate(graphExec, newGraph, &errorNode, &updateResult);
#endif

    if (updateStatus == cudaSuccess && updateResult == cudaGraphExecUpdateSuccess) {
        printf("Graph updated successfully!\n");
        CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        // A rejected update leaves a non-sticky error behind; clear it so the
        // cleanup calls below are not affected.
        cudaGetLastError();
        printf("Update failed, need to reinstantiate\n");
    }

    // Cleanup
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaGraphDestroy(newGraph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}

In [None]:
# Compile for compute capability 7.5 and run the binary only when the build
# succeeds, so a failed compile never executes a stale executable.
!nvcc -arch=sm_75 -o graph_update_kernel graph_update_kernel.cu && ./graph_update_kernel

---

## Part 3: Updating Memcpy Nodes

### CUDA C++ Memcpy Updates (Primary)

This example demonstrates updating data pointers in memcpy nodes - useful for processing multiple buffers with the same graph.

In [None]:
%%writefile graph_update_memcpy.cu
// graph_update_memcpy.cu - Updating data pointers
#include <stdio.h>
#include <cuda_runtime.h>

// Doubles data[i] in place for every i < n.
// Each thread handles the single element at its global index; threads whose
// index is not below n do nothing.
__global__ void process(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] *= 2.0f;
}

// Abort main() with a diagnostic when a CUDA runtime call fails.
// Only usable inside main(): it bails out with `return 1` and relies on
// process exit to release any resources that were already acquired.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Captures an H2D copy -> kernel -> D2H copy pipeline once, then reuses the
// instantiated graph for several host buffers by re-pointing the two memcpy
// nodes before each launch. Returns 0 on success and 1 on any failure.
int main() {
    const int N = 1 << 20;
    const size_t BYTES = N * sizeof(float);
    const int NUM_BUFFERS = 4;
    // process handles one element per thread, so the grid must cover all N
    // elements (a fixed 256x256 launch would only touch the first 65536).
    const int THREADS_PER_BLOCK = 256;
    const int NUM_BLOCKS = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    // Multiple input/output buffers; buffer i is filled with the value i + 1
    float* h_inputs[NUM_BUFFERS];
    float* h_outputs[NUM_BUFFERS];
    for (int i = 0; i < NUM_BUFFERS; i++) {
        CUDA_CHECK(cudaMallocHost(&h_inputs[i], BYTES));
        CUDA_CHECK(cudaMallocHost(&h_outputs[i], BYTES));
        for (int j = 0; j < N; j++) h_inputs[i][j] = (float)(i + 1);
    }

    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, BYTES));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Capture with first buffer
    cudaGraph_t graph;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    cudaMemcpyAsync(d_data, h_inputs[0], BYTES, cudaMemcpyHostToDevice, stream);
    process<<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, stream>>>(d_data, N);
    cudaMemcpyAsync(h_outputs[0], d_data, BYTES, cudaMemcpyDeviceToHost, stream);

    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    // ============================================
    // Get memcpy nodes
    // ============================================
    size_t numNodes = 0;
    CUDA_CHECK(cudaGraphGetNodes(graph, NULL, &numNodes));
    cudaGraphNode_t* nodes = new cudaGraphNode_t[numNodes];
    CUDA_CHECK(cudaGraphGetNodes(graph, nodes, &numNodes));

    cudaGraphNode_t h2dNode = NULL, d2hNode = NULL;
    for (size_t i = 0; i < numNodes; i++) {
        cudaGraphNodeType type;
        CUDA_CHECK(cudaGraphNodeGetType(nodes[i], &type));
        if (type != cudaGraphNodeTypeMemcpy) continue;

        // Classify by the recorded copy direction; match both directions
        // explicitly rather than treating "not H2D" as D2H.
        cudaMemcpy3DParms params;
        CUDA_CHECK(cudaGraphMemcpyNodeGetParams(nodes[i], &params));
        if (params.kind == cudaMemcpyHostToDevice) {
            h2dNode = nodes[i];
        } else if (params.kind == cudaMemcpyDeviceToHost) {
            d2hNode = nodes[i];
        }
    }
    // Node handles stay valid while `graph` exists; the listing array is done.
    delete[] nodes;

    if (h2dNode == NULL || d2hNode == NULL) {
        fprintf(stderr, "Could not locate both memcpy nodes in captured graph\n");
        return 1;
    }

    // ============================================
    // Process each buffer by updating graph
    // ============================================
    // The device side of both copies and the copy extent never change, so
    // build them once outside the loop.
    const cudaPitchedPtr devicePtr = make_cudaPitchedPtr(d_data, BYTES, N, 1);
    const cudaExtent copyExtent = make_cudaExtent(BYTES, 1, 1);

    for (int buf = 0; buf < NUM_BUFFERS; buf++) {
        // Update H2D source
        cudaMemcpy3DParms h2dParams = {0};
        h2dParams.srcPtr = make_cudaPitchedPtr(h_inputs[buf], BYTES, N, 1);
        h2dParams.dstPtr = devicePtr;
        h2dParams.extent = copyExtent;
        h2dParams.kind = cudaMemcpyHostToDevice;
        CUDA_CHECK(cudaGraphExecMemcpyNodeSetParams(graphExec, h2dNode, &h2dParams));

        // Update D2H destination
        cudaMemcpy3DParms d2hParams = {0};
        d2hParams.srcPtr = devicePtr;
        d2hParams.dstPtr = make_cudaPitchedPtr(h_outputs[buf], BYTES, N, 1);
        d2hParams.extent = copyExtent;
        d2hParams.kind = cudaMemcpyDeviceToHost;
        CUDA_CHECK(cudaGraphExecMemcpyNodeSetParams(graphExec, d2hNode, &d2hParams));

        // Launch with updated pointers
        CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        printf("Buffer %d: input=%.0f, output=%.0f\n", 
               buf, h_inputs[buf][0], h_outputs[buf][0]);

        // Spot-check the last element too: it is only correct when the
        // launch covers the whole buffer.
        if (h_outputs[buf][N - 1] != 2.0f * h_inputs[buf][N - 1]) {
            fprintf(stderr, "Buffer %d: last element was not processed\n", buf);
            return 1;
        }
    }

    // Cleanup
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));
    for (int i = 0; i < NUM_BUFFERS; i++) {
        CUDA_CHECK(cudaFreeHost(h_inputs[i]));
        CUDA_CHECK(cudaFreeHost(h_outputs[i]));
    }

    return 0;
}

In [None]:
# Compile for compute capability 7.5 and run the binary only when the build
# succeeds, so a failed compile never executes a stale executable.
!nvcc -arch=sm_75 -o graph_update_memcpy graph_update_memcpy.cu && ./graph_update_memcpy

---

## Part 4: Update Limitations

### What Can Be Updated

```
✅ CAN update:
• Kernel arguments (different values)
• Block/grid dimensions
• Memcpy source/destination pointers
• Memcpy size (same or smaller)
• Memset values

❌ CANNOT update:
• Graph topology (add/remove nodes)
• Kernel function (different kernel) — on CUDA older than 11.2;
  newer versions allow it if the kernel's owning context is unchanged
• Dependency structure
• Node types
• Memory type or owning device/context of memcpy/memset operands

If you need to change these → Reinstantiate!
```

### Update Error Handling

```cpp
cudaGraphNode_t errorNode;
cudaGraphExecUpdateResult result;

cudaGraphExecUpdate(graphExec, newGraph, &errorNode, &result);

switch (result) {
    case cudaGraphExecUpdateSuccess:
        // Good to go!
        break;
    case cudaGraphExecUpdateError:
        // General error
        break;
    case cudaGraphExecUpdateErrorTopologyChanged:
        // Graph structure changed - must reinstantiate
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiate(&graphExec, newGraph, NULL, NULL, 0);
        break;
    case cudaGraphExecUpdateErrorNodeTypeChanged:
        // Node type changed - must reinstantiate
        break;
    case cudaGraphExecUpdateErrorFunctionChanged:
        // Kernel function changed - must reinstantiate
        break;
    case cudaGraphExecUpdateErrorParametersChanged:
        // Parameters incompatible
        break;
}
```

---

## Exercises

### Exercise 1: Iterative Updates
Create a graph and update kernel parameters 1000 times, measuring update overhead.

### Exercise 2: Buffer Rotation
Implement double-buffering with graph updates.

### Exercise 3: Graceful Fallback
Implement update logic with fallback to reinstantiation when needed.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│                 GRAPH UPDATES                           │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Per-Node Updates:                                      │
│  • cudaGraphExecKernelNodeSetParams()                   │
│  • cudaGraphExecMemcpyNodeSetParams()                   │
│  • cudaGraphExecMemsetNodeSetParams()                   │
│                                                         │
│  Whole-Graph Update:                                    │
│  • cudaGraphExecUpdate(exec, newGraph, ...)             │
│  • Check result for success/failure                     │
│                                                         │
│  Limitations:                                           │
│  • Topology must stay the same                          │
│  • Node types must stay the same                        │
│  • Kernel function must stay the same (CUDA < 11.2)     │
│                                                         │
│  Performance: Updates are ~1000x faster than rebuild    │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 4 - Graph Optimization