In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Install numba only when running on Colab (google.colab importable);
    # outside Colab the ImportError is expected and deliberately ignored.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Some Numba versions report the device name as bytes; decode it so the
    # output reads "GPU: Tesla T4" rather than "GPU: b'Tesla T4'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of staying silent: the nvcc cells below
    # build and run CUDA programs and cannot succeed without a GPU.
    print("No CUDA GPU detected - the CUDA C++ cells below require a GPU runtime.")

---

## Part 1: Why Explicit Construction?

### Capture vs Explicit

```
Stream Capture:
━━━━━━━━━━━━━━━
✅ Easy - just run code in capture mode
✅ Natural for converting existing code
❌ Limited control over structure
❌ Can't build graphs dynamically

Explicit Construction:
━━━━━━━━━━━━━━━━━━━━━
✅ Full control over graph structure
✅ Can build programmatically
✅ More flexible dependencies
❌ More verbose code
❌ Need to manage node handles
```

---

## Part 2: Building a Simple Graph

### CUDA C++ Explicit Graph (Primary)

This example demonstrates building a graph node by node, giving you full control over the graph structure.

In [None]:
%%writefile explicit_graph.cu
// explicit_graph.cu - Building graphs node by node
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread doubles the single element at its global index, provided that
// index is below n; threads at or past n do nothing.
__global__ void kernelA(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = data[idx] * 2.0f;
}

// Each thread adds 1.0f to the single element at its global index, provided
// that index is below n; threads at or past n do nothing.
__global__ void kernelB(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = data[idx] + 1.0f;
}

int main() {
    const int N = 1 << 20;
    const size_t BYTES = N * sizeof(float);
    
    float *h_data, *d_data;
    cudaMallocHost(&h_data, BYTES);
    cudaMalloc(&d_data, BYTES);
    
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    
    // ============================================
    // Create Empty Graph
    // ============================================
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);  // 0 = flags (none)
    
    // ============================================
    // Add Memcpy Node (H2D)
    // ============================================
    cudaGraphNode_t h2dNode;
    cudaMemcpy3DParms h2dParams = {0};
    h2dParams.srcPtr = make_cudaPitchedPtr((void*)h_data, BYTES, N, 1);
    h2dParams.dstPtr = make_cudaPitchedPtr((void*)d_data, BYTES, N, 1);
    h2dParams.extent = make_cudaExtent(BYTES, 1, 1);
    h2dParams.kind = cudaMemcpyHostToDevice;
    
    cudaGraphAddMemcpyNode(&h2dNode, graph, 
                           NULL, 0,  // No dependencies
                           &h2dParams);
    
    // ============================================
    // Add Kernel Node A (depends on H2D)
    // ============================================
    cudaGraphNode_t kernelANode;
    
    cudaKernelNodeParams kernelAParams = {0};
    void* argsA[] = { &d_data, (void*)&N };
    
    kernelAParams.func = (void*)kernelA;
    kernelAParams.gridDim = dim3(256);
    kernelAParams.blockDim = dim3(256);
    kernelAParams.sharedMemBytes = 0;
    kernelAParams.kernelParams = argsA;
    kernelAParams.extra = NULL;
    
    cudaGraphNode_t depA[] = { h2dNode };  // Depends on H2D
    cudaGraphAddKernelNode(&kernelANode, graph, 
                           depA, 1,  // 1 dependency
                           &kernelAParams);
    
    // ============================================
    // Add Kernel Node B (depends on Kernel A)
    // ============================================
    cudaGraphNode_t kernelBNode;
    
    cudaKernelNodeParams kernelBParams = {0};
    void* argsB[] = { &d_data, (void*)&N };
    
    kernelBParams.func = (void*)kernelB;
    kernelBParams.gridDim = dim3(256);
    kernelBParams.blockDim = dim3(256);
    kernelBParams.sharedMemBytes = 0;
    kernelBParams.kernelParams = argsB;
    kernelBParams.extra = NULL;
    
    cudaGraphNode_t depB[] = { kernelANode };  // Depends on A
    cudaGraphAddKernelNode(&kernelBNode, graph, 
                           depB, 1,
                           &kernelBParams);
    
    // ============================================
    // Add Memcpy Node (D2H, depends on Kernel B)
    // ============================================
    cudaGraphNode_t d2hNode;
    cudaMemcpy3DParms d2hParams = {0};
    d2hParams.srcPtr = make_cudaPitchedPtr((void*)d_data, BYTES, N, 1);
    d2hParams.dstPtr = make_cudaPitchedPtr((void*)h_data, BYTES, N, 1);
    d2hParams.extent = make_cudaExtent(BYTES, 1, 1);
    d2hParams.kind = cudaMemcpyDeviceToHost;
    
    cudaGraphNode_t depD2H[] = { kernelBNode };
    cudaGraphAddMemcpyNode(&d2hNode, graph, 
                           depD2H, 1,
                           &d2hParams);
    
    // ============================================
    // Instantiate and Execute
    // ============================================
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);
    
    // Verify
    printf("Result[0] = %.1f (expected 3.0)\n", h_data[0]);
    
    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFreeHost(h_data);
    cudaFree(d_data);
    
    return 0;
}

In [None]:
# Compile explicit_graph.cu with nvcc for the sm_75 architecture, then run the binary.
!nvcc -arch=sm_75 -o explicit_graph explicit_graph.cu
!./explicit_graph

---

## Part 3: Node Types

### Available Node Types

```cpp
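// Commonly used graph node types (the CUDA runtime defines additional ones,
// e.g. memory allocation/free and external semaphore nodes):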

// 1. Kernel Node
cudaGraphAddKernelNode(&node, graph, deps, numDeps, &kernelParams);

// 2. Memcpy Node
cudaGraphAddMemcpyNode(&node, graph, deps, numDeps, &memcpyParams);

// 3. Memset Node
cudaGraphAddMemsetNode(&node, graph, deps, numDeps, &memsetParams);

// 4. Host Node (CPU callback)
cudaGraphAddHostNode(&node, graph, deps, numDeps, &hostParams);

// 5. Child Graph Node (nested graph)
cudaGraphAddChildGraphNode(&node, graph, deps, numDeps, childGraph);

// 6. Empty Node (synchronization point)
cudaGraphAddEmptyNode(&node, graph, deps, numDeps);

// 7. Event Record Node
cudaGraphAddEventRecordNode(&node, graph, deps, numDeps, event);

// 8. Event Wait Node
cudaGraphAddEventWaitNode(&node, graph, deps, numDeps, event);
```

### Empty Nodes for Synchronization

```cpp
// Use empty nodes as synchronization barriers
//
//    A1    A2    A3
//     \    |    /
//      [Empty]     <- Sync point
//         |
//         B

cudaGraphNode_t syncNode;
cudaGraphNode_t deps[] = { nodeA1, nodeA2, nodeA3 };
cudaGraphAddEmptyNode(&syncNode, graph, deps, 3);

// B depends on sync point
cudaGraphNode_t depB[] = { syncNode };
cudaGraphAddKernelNode(&nodeB, graph, depB, 1, &paramsB);
```

---

## Part 4: Complex DAG Patterns

### Fork-Join Pattern

This example demonstrates creating a graph with parallel branches that merge - a common pattern for concurrent operations.

In [None]:
%%writefile fork_join_graph.cu
// fork_join_graph.cu - Parallel branches that merge
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread doubles the single element of `a` at its global index, provided
// that index is below n; threads at or past n do nothing.
__global__ void processA(float* a, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    a[idx] *= 2.0f;
}

// Each thread adds 1.0f to the single element of `b` at its global index,
// provided that index is below n; threads at or past n do nothing.
__global__ void processB(float* b, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    b[idx] += 1.0f;
}

// Each thread writes c[i] = a[i] + b[i] for its global index i, provided
// i is below n; threads at or past n do nothing.
__global__ void combine(float* a, float* b, float* c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    const float lhs = a[idx];
    const float rhs = b[idx];
    c[idx] = lhs + rhs;
}

int main() {
    const int N = 1 << 20;
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_c, N * sizeof(float));
    
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);
    
    // ============================================
    // Fork: Two independent parallel kernels
    // ============================================
    cudaGraphNode_t nodeA, nodeB;
    
    cudaKernelNodeParams paramsA = {0};
    void* argsA[] = { &d_a, (void*)&N };
    paramsA.func = (void*)processA;
    paramsA.gridDim = dim3(256);
    paramsA.blockDim = dim3(256);
    paramsA.kernelParams = argsA;
    
    cudaKernelNodeParams paramsB = {0};
    void* argsB[] = { &d_b, (void*)&N };
    paramsB.func = (void*)processB;
    paramsB.gridDim = dim3(256);
    paramsB.blockDim = dim3(256);
    paramsB.kernelParams = argsB;
    
    // No dependencies - they can run in parallel!
    cudaGraphAddKernelNode(&nodeA, graph, NULL, 0, &paramsA);
    cudaGraphAddKernelNode(&nodeB, graph, NULL, 0, &paramsB);
    
    // ============================================
    // Join: Combine depends on both A and B
    // ============================================
    cudaGraphNode_t nodeC;
    
    cudaKernelNodeParams paramsC = {0};
    void* argsC[] = { &d_a, &d_b, &d_c, (void*)&N };
    paramsC.func = (void*)combine;
    paramsC.gridDim = dim3(256);
    paramsC.blockDim = dim3(256);
    paramsC.kernelParams = argsC;
    
    cudaGraphNode_t depsC[] = { nodeA, nodeB };  // Depends on BOTH
    cudaGraphAddKernelNode(&nodeC, graph, depsC, 2, &paramsC);
    
    /*
    Graph structure:
    
    [processA]    [processB]
           \      /
          [combine]
    */
    
    // Instantiate and run
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);
    
    printf("Fork-join graph executed!\n");
    
    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    
    return 0;
}

In [None]:
# Compile fork_join_graph.cu with nvcc for the sm_75 architecture, then run the binary.
!nvcc -arch=sm_75 -o fork_join_graph fork_join_graph.cu
!./fork_join_graph

---

## Part 5: Graph Inspection

### Querying Graph Structure

This example demonstrates how to examine a graph's properties, including the number of nodes, node types, and dependencies.

In [None]:
%%writefile graph_inspection.cu
// graph_inspection.cu - Examining graph properties
#include <stdio.h>
#include <cuda_runtime.h>

// Placeholder workload for the captured graph: each thread doubles the single
// element at its global index, provided that index is below n.
__global__ void dummyKernel(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = data[idx] * 2.0f;
}

int main() {
    const int N = 1 << 20;
    float *d_data;
    cudaMalloc(&d_data, N * sizeof(float));
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    // Capture a sample graph
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    
    dummyKernel<<<256, 256, 0, stream>>>(d_data, N);
    dummyKernel<<<256, 256, 0, stream>>>(d_data, N);
    dummyKernel<<<256, 256, 0, stream>>>(d_data, N);
    
    cudaStreamEndCapture(stream, &graph);
    
    // Get number of nodes
    size_t numNodes;
    cudaGraphGetNodes(graph, NULL, &numNodes);
    printf("Graph has %zu nodes\n", numNodes);
    
    // Get all nodes
    cudaGraphNode_t* nodes = new cudaGraphNode_t[numNodes];
    cudaGraphGetNodes(graph, nodes, &numNodes);
    
    // For each node, get type
    for (size_t i = 0; i < numNodes; i++) {
        cudaGraphNodeType type;
        cudaGraphNodeGetType(nodes[i], &type);
        
        switch (type) {
            case cudaGraphNodeTypeKernel:
                printf("Node %zu: Kernel\n", i);
                break;
            case cudaGraphNodeTypeMemcpy:
                printf("Node %zu: Memcpy\n", i);
                break;
            case cudaGraphNodeTypeMemset:
                printf("Node %zu: Memset\n", i);
                break;
            case cudaGraphNodeTypeHost:
                printf("Node %zu: Host callback\n", i);
                break;
            case cudaGraphNodeTypeGraph:
                printf("Node %zu: Child graph\n", i);
                break;
            case cudaGraphNodeTypeEmpty:
                printf("Node %zu: Empty (sync)\n", i);
                break;
            default:
                printf("Node %zu: Other\n", i);
        }
        
        // Get dependencies
        size_t numDeps;
        cudaGraphNodeGetDependencies(nodes[i], NULL, &numDeps);
        printf("  Has %zu dependencies\n", numDeps);
    }
    
    delete[] nodes;
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_data);
    
    return 0;
}

In [None]:
# Compile graph_inspection.cu with nvcc for the sm_75 architecture, then run the binary.
!nvcc -arch=sm_75 -o graph_inspection graph_inspection.cu
!./graph_inspection

---

## Exercises

### Exercise 1: Pipeline Graph
Build a 4-stage pipeline graph explicitly (H2D → K1 → K2 → D2H).

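### Exercise 2: Diamond Pattern
Build a graph in which node A fans out to B and C (which may run in parallel), and D waits for both: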
```
    A
   / \
  B   C
   \ /
    D
```

### Exercise 3: Graph Cloning
Use `cudaGraphClone` to copy a graph, then modify the clone (for example, add a node) and confirm that the original graph is unchanged.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           EXPLICIT GRAPH CONSTRUCTION                   │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Create Graph:                                          │
│  • cudaGraphCreate(&graph, 0)                           │
│                                                         │
│  Add Nodes:                                             │
│  • cudaGraphAddKernelNode(&node, graph, deps, n, &p)    │
│  • cudaGraphAddMemcpyNode(...)                          │
│  • cudaGraphAddEmptyNode(...)  // sync barrier          │
│                                                         │
│  Dependencies:                                          │
│  • Pass array of dependency nodes                       │
│  • NULL, 0 = no dependencies                            │
│  • Multiple deps = waits for ALL                        │
│                                                         │
│  Fork-Join:                                             │
│  • No deps = parallel (fork)                            │
│  • Multiple deps = sync (join)                          │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Graph Updates