In [None]:
# ⚙️ Setup
# Imports for the optional Python/Numba cells. On Google Colab, Numba is
# installed first; elsewhere the import below expects it to be present already.
import subprocess, sys
try:
    import google.colab  # imported only to detect the Colab runtime
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: skip the install step.
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    # Numba can report the device name as bytes; decode it so the output is
    # "GPU: Tesla T4" rather than "GPU: b'Tesla T4'".
    gpu_name = cuda.get_current_device().name
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode("utf-8", errors="replace")
    print(f"GPU: {gpu_name}")
else:
    # Make the missing-GPU case visible instead of printing nothing.
    print("No CUDA-capable GPU detected; the CUDA cells below need a GPU runtime.")

---

## Part 1: Why Explicit Construction?

### Capture vs Explicit

```
Stream Capture:
━━━━━━━━━━━━━━━
✅ Easy - just run code in capture mode
✅ Natural for converting existing code
❌ Limited control over structure
❌ Can't build graphs dynamically

Explicit Construction:
━━━━━━━━━━━━━━━━━━━━━
✅ Full control over graph structure
✅ Can build programmatically
✅ More flexible dependencies
❌ More verbose code
❌ Need to manage node handles
```

---

## Part 2: Building a Simple Graph

### 🔷 CUDA C++ Implementation (Primary)

This example demonstrates building a graph node by node, giving you full control over the graph structure.

In [None]:
%%writefile explicit_graph.cu
// explicit_graph.cu - Building graphs node by node
#include <stdio.h>
#include <cuda_runtime.h>

// Doubles data[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void kernelA(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    data[idx] = data[idx] * 2.0f;
}

// Adds 1.0 to data[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void kernelB(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    data[idx] = data[idx] + 1.0f;
}

// Logs a failed CUDA runtime call to stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds the graph H2D -> kernelA -> kernelB -> D2H node by node, launches it
// once, and checks that every element went from 1.0 to 1.0 * 2 + 1 = 3.0.
// Returns 0 on success, 1 if a CUDA call fails or any element is wrong.
int main() {
    const int N = 1 << 20;
    const size_t BYTES = N * sizeof(float);
    const int THREADS = 256;
    // The kernels process one element per thread, so the grid has to cover all
    // N elements; a fixed 256-block grid would only touch the first 65,536.
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    float *h_data = NULL, *d_data = NULL;
    if (!cudaOk(cudaMallocHost(&h_data, BYTES), "cudaMallocHost")) return 1;
    if (!cudaOk(cudaMalloc(&d_data, BYTES), "cudaMalloc")) return 1;

    for (int i = 0; i < N; i++) h_data[i] = 1.0f;

    // ============================================
    // Create Empty Graph
    // ============================================
    cudaGraph_t graph;
    if (!cudaOk(cudaGraphCreate(&graph, 0), "cudaGraphCreate")) return 1;  // 0 = flags (none)

    // ============================================
    // Add Memcpy Node (H2D)
    // ============================================
    cudaGraphNode_t h2dNode;
    cudaMemcpy3DParms h2dParams = {0};
    h2dParams.srcPtr = make_cudaPitchedPtr((void*)h_data, BYTES, N, 1);
    h2dParams.dstPtr = make_cudaPitchedPtr((void*)d_data, BYTES, N, 1);
    h2dParams.extent = make_cudaExtent(BYTES, 1, 1);
    h2dParams.kind = cudaMemcpyHostToDevice;

    if (!cudaOk(cudaGraphAddMemcpyNode(&h2dNode, graph,
                                       NULL, 0,  // No dependencies
                                       &h2dParams),
                "cudaGraphAddMemcpyNode (H2D)")) return 1;

    // ============================================
    // Add Kernel Node A (depends on H2D)
    // ============================================
    cudaGraphNode_t kernelANode;

    cudaKernelNodeParams kernelAParams = {0};
    // kernelParams is an array of pointers to the individual kernel arguments.
    void* argsA[] = { &d_data, (void*)&N };

    kernelAParams.func = (void*)kernelA;
    kernelAParams.gridDim = dim3(BLOCKS);
    kernelAParams.blockDim = dim3(THREADS);
    kernelAParams.sharedMemBytes = 0;
    kernelAParams.kernelParams = argsA;
    kernelAParams.extra = NULL;

    cudaGraphNode_t depA[] = { h2dNode };  // Depends on H2D
    if (!cudaOk(cudaGraphAddKernelNode(&kernelANode, graph,
                                       depA, 1,  // 1 dependency
                                       &kernelAParams),
                "cudaGraphAddKernelNode (A)")) return 1;

    // ============================================
    // Add Kernel Node B (depends on Kernel A)
    // ============================================
    cudaGraphNode_t kernelBNode;

    cudaKernelNodeParams kernelBParams = {0};
    void* argsB[] = { &d_data, (void*)&N };

    kernelBParams.func = (void*)kernelB;
    kernelBParams.gridDim = dim3(BLOCKS);
    kernelBParams.blockDim = dim3(THREADS);
    kernelBParams.sharedMemBytes = 0;
    kernelBParams.kernelParams = argsB;
    kernelBParams.extra = NULL;

    cudaGraphNode_t depB[] = { kernelANode };  // Depends on A
    if (!cudaOk(cudaGraphAddKernelNode(&kernelBNode, graph,
                                       depB, 1,
                                       &kernelBParams),
                "cudaGraphAddKernelNode (B)")) return 1;

    // ============================================
    // Add Memcpy Node (D2H, depends on Kernel B)
    // ============================================
    cudaGraphNode_t d2hNode;
    cudaMemcpy3DParms d2hParams = {0};
    d2hParams.srcPtr = make_cudaPitchedPtr((void*)d_data, BYTES, N, 1);
    d2hParams.dstPtr = make_cudaPitchedPtr((void*)h_data, BYTES, N, 1);
    d2hParams.extent = make_cudaExtent(BYTES, 1, 1);
    d2hParams.kind = cudaMemcpyDeviceToHost;

    cudaGraphNode_t depD2H[] = { kernelBNode };
    if (!cudaOk(cudaGraphAddMemcpyNode(&d2hNode, graph,
                                       depD2H, 1,
                                       &d2hParams),
                "cudaGraphAddMemcpyNode (D2H)")) return 1;

    // ============================================
    // Instantiate and Execute
    // ============================================
    cudaGraphExec_t graphExec;
    if (!cudaOk(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0),
                "cudaGraphInstantiate")) return 1;

    cudaStream_t stream;
    if (!cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate")) return 1;

    if (!cudaOk(cudaGraphLaunch(graphExec, stream), "cudaGraphLaunch")) return 1;
    if (!cudaOk(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) return 1;

    // Verify every element, not just the first one, so that an under-sized
    // grid or a missing dependency shows up as a mismatch.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        if (h_data[i] != 3.0f) mismatches++;
    }
    printf("Result[0] = %.1f (expected 3.0)\n", h_data[0]);
    printf("Mismatched elements: %d of %d\n", mismatches, N);

    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFreeHost(h_data);
    cudaFree(d_data);

    return mismatches == 0 ? 0 : 1;
}

In [None]:
# Compile, then run only if the build succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o explicit_graph explicit_graph.cu && ./explicit_graph

---

## Part 3: Node Types

### Available Node Types

```cpp
// Commonly used graph node types (not exhaustive; the runtime also offers e.g. memory alloc/free and external-semaphore nodes):

// 1. Kernel Node
cudaGraphAddKernelNode(&node, graph, deps, numDeps, &kernelParams);

// 2. Memcpy Node
cudaGraphAddMemcpyNode(&node, graph, deps, numDeps, &memcpyParams);

// 3. Memset Node
cudaGraphAddMemsetNode(&node, graph, deps, numDeps, &memsetParams);

// 4. Host Node (CPU callback)
cudaGraphAddHostNode(&node, graph, deps, numDeps, &hostParams);

// 5. Child Graph Node (nested graph)
cudaGraphAddChildGraphNode(&node, graph, deps, numDeps, childGraph);

// 6. Empty Node (synchronization point)
cudaGraphAddEmptyNode(&node, graph, deps, numDeps);

// 7. Event Record Node
cudaGraphAddEventRecordNode(&node, graph, deps, numDeps, event);

// 8. Event Wait Node
cudaGraphAddEventWaitNode(&node, graph, deps, numDeps, event);
```

### Empty Nodes for Synchronization

```cpp
// Use empty nodes as synchronization barriers
//
//    A1    A2    A3
//     \    |    /
//      [Empty]     <- Sync point
//         |
//         B

cudaGraphNode_t syncNode;
cudaGraphNode_t deps[] = { nodeA1, nodeA2, nodeA3 };
cudaGraphAddEmptyNode(&syncNode, graph, deps, 3);

// B depends on sync point
cudaGraphNode_t depB[] = { syncNode };
cudaGraphAddKernelNode(&nodeB, graph, depB, 1, &paramsB);
```

---

## Part 4: Complex DAG Patterns

### 🔷 CUDA C++ Implementation (Primary)

This example demonstrates creating a graph with parallel branches that merge - a common pattern for concurrent operations.

In [None]:
%%writefile fork_join_graph.cu
// fork_join_graph.cu - Parallel branches that merge
#include <stdio.h>
#include <cuda_runtime.h>

// Doubles a[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void processA(float* a, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    const float value = a[idx];
    a[idx] = value * 2.0f;
}

// Adds 1.0 to b[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void processB(float* b, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    const float value = b[idx];
    b[idx] = value + 1.0f;
}

// Writes c[i] = a[i] + b[i]. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void combine(float* a, float* b, float* c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    const float lhs = a[idx];
    const float rhs = b[idx];
    c[idx] = lhs + rhs;
}

// Logs a failed CUDA runtime call to stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds a fork-join graph (processA and processB in parallel, then combine),
// launches it once and spot-checks the result.
// Returns 0 on success, 1 if a CUDA call fails or the check does not match.
int main() {
    const int N = 1 << 20;
    const size_t BYTES = N * sizeof(float);
    const int THREADS = 256;
    // The kernels process one element per thread, so the grid has to cover all
    // N elements; a fixed 256-block grid would only touch the first 65,536.
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    if (!cudaOk(cudaMalloc(&d_a, BYTES), "cudaMalloc (a)")) return 1;
    if (!cudaOk(cudaMalloc(&d_b, BYTES), "cudaMalloc (b)")) return 1;
    if (!cudaOk(cudaMalloc(&d_c, BYTES), "cudaMalloc (c)")) return 1;

    // Give the inputs defined contents (all-zero bytes == 0.0f) so the kernels
    // do not read uninitialized memory: a = 0 * 2 = 0, b = 0 + 1 = 1, c = 1.
    if (!cudaOk(cudaMemset(d_a, 0, BYTES), "cudaMemset (a)")) return 1;
    if (!cudaOk(cudaMemset(d_b, 0, BYTES), "cudaMemset (b)")) return 1;

    cudaGraph_t graph;
    if (!cudaOk(cudaGraphCreate(&graph, 0), "cudaGraphCreate")) return 1;

    // ============================================
    // Fork: Two independent parallel kernels
    // ============================================
    cudaGraphNode_t nodeA, nodeB;

    cudaKernelNodeParams paramsA = {0};
    void* argsA[] = { &d_a, (void*)&N };
    paramsA.func = (void*)processA;
    paramsA.gridDim = dim3(BLOCKS);
    paramsA.blockDim = dim3(THREADS);
    paramsA.kernelParams = argsA;

    cudaKernelNodeParams paramsB = {0};
    void* argsB[] = { &d_b, (void*)&N };
    paramsB.func = (void*)processB;
    paramsB.gridDim = dim3(BLOCKS);
    paramsB.blockDim = dim3(THREADS);
    paramsB.kernelParams = argsB;

    // No dependencies - they can run in parallel!
    if (!cudaOk(cudaGraphAddKernelNode(&nodeA, graph, NULL, 0, &paramsA),
                "cudaGraphAddKernelNode (processA)")) return 1;
    if (!cudaOk(cudaGraphAddKernelNode(&nodeB, graph, NULL, 0, &paramsB),
                "cudaGraphAddKernelNode (processB)")) return 1;

    // ============================================
    // Join: Combine depends on both A and B
    // ============================================
    cudaGraphNode_t nodeC;

    cudaKernelNodeParams paramsC = {0};
    void* argsC[] = { &d_a, &d_b, &d_c, (void*)&N };
    paramsC.func = (void*)combine;
    paramsC.gridDim = dim3(BLOCKS);
    paramsC.blockDim = dim3(THREADS);
    paramsC.kernelParams = argsC;

    cudaGraphNode_t depsC[] = { nodeA, nodeB };  // Depends on BOTH
    if (!cudaOk(cudaGraphAddKernelNode(&nodeC, graph, depsC, 2, &paramsC),
                "cudaGraphAddKernelNode (combine)")) return 1;

    /*
    Graph structure:

    [processA]    [processB]
           \      /
          [combine]
    */

    // Instantiate and run
    cudaGraphExec_t graphExec;
    if (!cudaOk(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0),
                "cudaGraphInstantiate")) return 1;

    cudaStream_t stream;
    if (!cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate")) return 1;

    if (!cudaOk(cudaGraphLaunch(graphExec, stream), "cudaGraphLaunch")) return 1;
    if (!cudaOk(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) return 1;

    // Spot-check the first and last element of c; the last one is only
    // correct when the grid really covers the whole array.
    float first = 0.0f, last = 0.0f;
    if (!cudaOk(cudaMemcpy(&first, d_c, sizeof(float), cudaMemcpyDeviceToHost),
                "cudaMemcpy (c[0])")) return 1;
    if (!cudaOk(cudaMemcpy(&last, d_c + (N - 1), sizeof(float), cudaMemcpyDeviceToHost),
                "cudaMemcpy (c[N-1])")) return 1;

    printf("Fork-join graph executed!\n");
    printf("c[0] = %.1f, c[%d] = %.1f (expected 1.0)\n", first, N - 1, last);

    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return (first == 1.0f && last == 1.0f) ? 0 : 1;
}

In [None]:
# Compile, then run only if the build succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o fork_join_graph fork_join_graph.cu && ./fork_join_graph

---

## Part 5: Graph Inspection

### 🔷 CUDA C++ Implementation (Primary)

This example demonstrates how to examine a graph's properties, including the number of nodes, node types, and dependencies.

In [None]:
%%writefile graph_inspection.cu
// graph_inspection.cu - Examining graph properties
#include <stdio.h>
#include <cuda_runtime.h>

// Placeholder workload: doubles data[i] in place. Each thread handles at most
// one element; threads whose global index is >= n do nothing.
__global__ void dummyKernel(float* data, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    data[idx] = data[idx] * 2.0f;
}

// Logs a failed CUDA runtime call to stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Captures three dummyKernel launches into a graph, then prints the node
// count and, for every node, its type and number of dependencies.
// The graph is only inspected, never launched.
// Returns 0 on success, 1 if a CUDA call fails.
int main() {
    const int N = 1 << 20;
    const int THREADS = 256;
    // One thread per element, so the recorded launch covers the whole array.
    const int BLOCKS = (N + THREADS - 1) / THREADS;

    float* d_data = NULL;
    if (!cudaOk(cudaMalloc(&d_data, N * sizeof(float)), "cudaMalloc")) return 1;

    cudaStream_t stream;
    if (!cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate")) return 1;

    // Capture a sample graph
    cudaGraph_t graph = NULL;
    if (!cudaOk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal),
                "cudaStreamBeginCapture")) return 1;

    dummyKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
    dummyKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
    dummyKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);

    if (!cudaOk(cudaStreamEndCapture(stream, &graph), "cudaStreamEndCapture")) return 1;

    // Get number of nodes (passing NULL for the array returns only the count)
    size_t numNodes = 0;
    if (!cudaOk(cudaGraphGetNodes(graph, NULL, &numNodes), "cudaGraphGetNodes (count)")) return 1;
    printf("Graph has %zu nodes\n", numNodes);

    // Get all nodes
    cudaGraphNode_t* nodes = new cudaGraphNode_t[numNodes];
    int exitCode = 0;
    if (!cudaOk(cudaGraphGetNodes(graph, nodes, &numNodes), "cudaGraphGetNodes")) exitCode = 1;

    // For each node, get type
    for (size_t i = 0; exitCode == 0 && i < numNodes; i++) {
        cudaGraphNodeType type;
        if (!cudaOk(cudaGraphNodeGetType(nodes[i], &type), "cudaGraphNodeGetType")) {
            exitCode = 1;
            break;
        }

        switch (type) {
            case cudaGraphNodeTypeKernel:
                printf("Node %zu: Kernel\n", i);
                break;
            case cudaGraphNodeTypeMemcpy:
                printf("Node %zu: Memcpy\n", i);
                break;
            case cudaGraphNodeTypeMemset:
                printf("Node %zu: Memset\n", i);
                break;
            case cudaGraphNodeTypeHost:
                printf("Node %zu: Host callback\n", i);
                break;
            case cudaGraphNodeTypeGraph:
                printf("Node %zu: Child graph\n", i);
                break;
            case cudaGraphNodeTypeEmpty:
                printf("Node %zu: Empty (sync)\n", i);
                break;
            default:
                printf("Node %zu: Other\n", i);
        }

        // Get dependencies (count only)
        size_t numDeps = 0;
        if (!cudaOk(cudaGraphNodeGetDependencies(nodes[i], NULL, &numDeps),
                    "cudaGraphNodeGetDependencies")) {
            exitCode = 1;
            break;
        }
        printf("  Has %zu dependencies\n", numDeps);
    }

    delete[] nodes;
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_data);

    return exitCode;
}

In [None]:
# Compile, then run only if the build succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o graph_inspection graph_inspection.cu && ./graph_inspection

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile explicit_graph_exercises.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call exactly once; if it did not return
// cudaSuccess, prints the error string with file and line to stderr and
// terminates the program with exit code 1.
// The argument is parenthesized and the temporary has a macro-specific name
// so that call expressions using a variable named `err` still work.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t check_cuda_err_ = (call); \
        if (check_cuda_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at %s:%d\n", \
                    cudaGetErrorString(check_cuda_err_), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while(0)

// Adds 1.0 to data[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void kernel1(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float value = data[i];
    data[i] = value + 1.0f;
}

// Doubles data[i] in place. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void kernel2(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float value = data[i];
    data[i] = value * 2.0f;
}

// ============================================================
// Exercise 1: Pipeline Graph (H2D -> K1 -> K2 -> D2H)
// ============================================================

// Builds an explicit H2D -> kernel1 -> kernel2 -> D2H pipeline graph, replays
// it 100 times and reports the elapsed time and the first result element.
//
// The graph uploads from `h_input` and downloads into `h_data`. Keeping the
// two host buffers separate makes every replay start from the same input, so
// the result stays (1.0 + 1.0) * 2.0 = 4.0 regardless of the replay count.
// (Uploading from the buffer that also receives the result would feed each
// replay's output into the next one.)
void exercise1_pipelineGraph() {
    printf("=== Exercise 1: Pipeline Graph ===\n");

    const int n = 1 << 18;
    size_t bytes = n * sizeof(float);

    float *h_input, *h_data, *d_data;
    CHECK_CUDA(cudaMallocHost(&h_input, bytes));
    CHECK_CUDA(cudaMallocHost(&h_data, bytes));
    CHECK_CUDA(cudaMalloc(&d_data, bytes));

    for (int i = 0; i < n; i++) {
        h_input[i] = 1.0f;
        h_data[i] = 0.0f;
    }

    int grid = (n + 255) / 256;

    cudaGraph_t graph;
    cudaGraphNode_t h2dNode, k1Node, k2Node, d2hNode;

    CHECK_CUDA(cudaGraphCreate(&graph, 0));

    // H2D memcpy node (source: pristine input buffer)
    cudaMemcpy3DParms h2dParams = {0};
    h2dParams.srcPtr = make_cudaPitchedPtr(h_input, bytes, n, 1);
    h2dParams.dstPtr = make_cudaPitchedPtr(d_data, bytes, n, 1);
    h2dParams.extent = make_cudaExtent(bytes, 1, 1);
    h2dParams.kind = cudaMemcpyHostToDevice;
    CHECK_CUDA(cudaGraphAddMemcpyNode(&h2dNode, graph, NULL, 0, &h2dParams));

    // Kernel 1 node
    cudaKernelNodeParams k1Params = {0};
    void* k1Args[] = {&d_data, (void*)&n};
    k1Params.func = (void*)kernel1;
    k1Params.gridDim = dim3(grid);
    k1Params.blockDim = dim3(256);
    k1Params.kernelParams = k1Args;
    CHECK_CUDA(cudaGraphAddKernelNode(&k1Node, graph, &h2dNode, 1, &k1Params));

    // Kernel 2 node
    cudaKernelNodeParams k2Params = {0};
    void* k2Args[] = {&d_data, (void*)&n};
    k2Params.func = (void*)kernel2;
    k2Params.gridDim = dim3(grid);
    k2Params.blockDim = dim3(256);
    k2Params.kernelParams = k2Args;
    CHECK_CUDA(cudaGraphAddKernelNode(&k2Node, graph, &k1Node, 1, &k2Params));

    // D2H memcpy node (destination: result buffer)
    cudaMemcpy3DParms d2hParams = {0};
    d2hParams.srcPtr = make_cudaPitchedPtr(d_data, bytes, n, 1);
    d2hParams.dstPtr = make_cudaPitchedPtr(h_data, bytes, n, 1);
    d2hParams.extent = make_cudaExtent(bytes, 1, 1);
    d2hParams.kind = cudaMemcpyDeviceToHost;
    CHECK_CUDA(cudaGraphAddMemcpyNode(&d2hNode, graph, &k2Node, 1, &d2hParams));

    // Instantiate and execute
    cudaGraphExec_t graphExec;
    CHECK_CUDA(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < 100; i++) {
        CHECK_CUDA(cudaGraphLaunch(graphExec, 0));
    }
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    printf("Pipeline: H2D -> K1 -> K2 -> D2H\n");
    printf("100 iterations: %.2f ms (%.3f ms/iter)\n", ms, ms / 100);
    printf("Result check: h_data[0] = %.1f (expected: 4.0)\n\n", h_data[0]);

    CHECK_CUDA(cudaGraphExecDestroy(graphExec));
    CHECK_CUDA(cudaGraphDestroy(graph));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaFreeHost(h_input));
    CHECK_CUDA(cudaFreeHost(h_data));
    CHECK_CUDA(cudaFree(d_data));
}

// ============================================================
// Exercise 2: Diamond Pattern
// ============================================================

// Replaces data[i] with sinf(data[i]). Each thread handles at most one
// element; threads whose global index is >= n do nothing.
__global__ void kernelB(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float value = data[i];
    data[i] = sinf(value);
}

// Replaces data[i] with cosf(data[i]). Each thread handles at most one
// element; threads whose global index is >= n do nothing.
__global__ void kernelC(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float value = data[i];
    data[i] = cosf(value);
}

// Writes out[i] = b[i] + c[i]. Each thread handles at most one element;
// threads whose global index is >= n do nothing.
__global__ void kernelD(const float* b, const float* c, float* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float fromB = b[i];
    const float fromC = c[i];
    out[i] = fromB + fromC;
}

// Builds and times a diamond-shaped graph: A fans out to B and C, D joins them.
//
// kernelB and kernelC work in place on their own buffers, so each branch
// starts with a device-to-device copy of A's result:
//
//              A (kernel1 on d_input)
//            /                        \
//   copy d_input -> d_b        copy d_input -> d_c
//           |                          |
//   B (kernelB on d_b)         C (kernelC on d_c)
//            \                        /
//              D (kernelD: d_output = d_b + d_c)
//
// Without the copies B and C would run on buffers that never receive A's
// output. d_input is zeroed first so that no kernel reads uninitialized memory.
void exercise2_diamondPattern() {
    printf("=== Exercise 2: Diamond Pattern ===\n");
    printf("    A\n   / \\\n  B   C\n   \\ /\n    D\n\n");

    const int n = 1 << 18;
    size_t bytes = n * sizeof(float);

    float *d_input, *d_b, *d_c, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, bytes));
    CHECK_CUDA(cudaMalloc(&d_b, bytes));
    CHECK_CUDA(cudaMalloc(&d_c, bytes));
    CHECK_CUDA(cudaMalloc(&d_output, bytes));

    // All-zero bytes == 0.0f: gives node A a defined starting value.
    CHECK_CUDA(cudaMemset(d_input, 0, bytes));

    int grid = (n + 255) / 256;

    cudaGraph_t graph;
    cudaGraphNode_t nodeA, copyBNode, copyCNode, nodeB, nodeC, nodeD;

    CHECK_CUDA(cudaGraphCreate(&graph, 0));

    // Node A
    cudaKernelNodeParams aParams = {0};
    void* aArgs[] = {&d_input, (void*)&n};
    aParams.func = (void*)kernel1;
    aParams.gridDim = dim3(grid);
    aParams.blockDim = dim3(256);
    aParams.kernelParams = aArgs;
    CHECK_CUDA(cudaGraphAddKernelNode(&nodeA, graph, NULL, 0, &aParams));

    // Branch B input: copy A's result into d_b (depends on A)
    cudaMemcpy3DParms toBParams = {0};
    toBParams.srcPtr = make_cudaPitchedPtr(d_input, bytes, n, 1);
    toBParams.dstPtr = make_cudaPitchedPtr(d_b, bytes, n, 1);
    toBParams.extent = make_cudaExtent(bytes, 1, 1);
    toBParams.kind = cudaMemcpyDeviceToDevice;
    CHECK_CUDA(cudaGraphAddMemcpyNode(&copyBNode, graph, &nodeA, 1, &toBParams));

    // Branch C input: copy A's result into d_c (depends on A)
    cudaMemcpy3DParms toCParams = {0};
    toCParams.srcPtr = make_cudaPitchedPtr(d_input, bytes, n, 1);
    toCParams.dstPtr = make_cudaPitchedPtr(d_c, bytes, n, 1);
    toCParams.extent = make_cudaExtent(bytes, 1, 1);
    toCParams.kind = cudaMemcpyDeviceToDevice;
    CHECK_CUDA(cudaGraphAddMemcpyNode(&copyCNode, graph, &nodeA, 1, &toCParams));

    // Node B (depends on its input copy, and therefore on A)
    cudaKernelNodeParams bParams = {0};
    void* bArgs[] = {&d_b, (void*)&n};
    bParams.func = (void*)kernelB;
    bParams.gridDim = dim3(grid);
    bParams.blockDim = dim3(256);
    bParams.kernelParams = bArgs;
    CHECK_CUDA(cudaGraphAddKernelNode(&nodeB, graph, &copyBNode, 1, &bParams));

    // Node C (depends on its input copy, and therefore on A)
    cudaKernelNodeParams cParams = {0};
    void* cArgs[] = {&d_c, (void*)&n};
    cParams.func = (void*)kernelC;
    cParams.gridDim = dim3(grid);
    cParams.blockDim = dim3(256);
    cParams.kernelParams = cArgs;
    CHECK_CUDA(cudaGraphAddKernelNode(&nodeC, graph, &copyCNode, 1, &cParams));

    // Node D (depends on B and C)
    cudaGraphNode_t bcDeps[] = {nodeB, nodeC};
    cudaKernelNodeParams dParams = {0};
    void* dArgs[] = {&d_b, &d_c, &d_output, (void*)&n};
    dParams.func = (void*)kernelD;
    dParams.gridDim = dim3(grid);
    dParams.blockDim = dim3(256);
    dParams.kernelParams = dArgs;
    CHECK_CUDA(cudaGraphAddKernelNode(&nodeD, graph, bcDeps, 2, &dParams));

    // Instantiate and execute
    cudaGraphExec_t graphExec;
    CHECK_CUDA(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < 100; i++) {
        CHECK_CUDA(cudaGraphLaunch(graphExec, 0));
    }
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    printf("Diamond graph 100 iterations: %.2f ms\n\n", ms);

    CHECK_CUDA(cudaGraphExecDestroy(graphExec));
    CHECK_CUDA(cudaGraphDestroy(graph));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_b));
    CHECK_CUDA(cudaFree(d_c));
    CHECK_CUDA(cudaFree(d_output));
}

// ============================================================
// Exercise 3: Graph Cloning
// ============================================================

// Creates a one-kernel graph, clones it, doubles the grid size of the kernel
// node in the clone, and instantiates both graphs. Nothing is launched; the
// exercise only shows that the clone can be modified independently.
// Any failing CUDA call aborts the program via CHECK_CUDA.
void exercise3_graphCloning() {
    printf("=== Exercise 3: Graph Cloning ===\n");

    const int n = 1 << 18;

    float *d_data;
    CHECK_CUDA(cudaMalloc(&d_data, n * sizeof(float)));

    int grid = (n + 255) / 256;

    // Create original graph
    cudaGraph_t originalGraph;
    cudaGraphNode_t k1Node;

    CHECK_CUDA(cudaGraphCreate(&originalGraph, 0));

    cudaKernelNodeParams k1Params = {0};
    void* k1Args[] = {&d_data, (void*)&n};
    k1Params.func = (void*)kernel1;
    k1Params.gridDim = dim3(grid);
    k1Params.blockDim = dim3(256);
    k1Params.kernelParams = k1Args;
    CHECK_CUDA(cudaGraphAddKernelNode(&k1Node, originalGraph, NULL, 0, &k1Params));

    // Clone the graph
    cudaGraph_t clonedGraph;
    CHECK_CUDA(cudaGraphClone(&clonedGraph, originalGraph));

    // Report the node count of the graph the message names (the original).
    size_t numOriginalNodes = 0;
    CHECK_CUDA(cudaGraphGetNodes(originalGraph, NULL, &numOriginalNodes));
    printf("Original graph has %zu nodes\n", numOriginalNodes);

    // Get nodes from cloned graph and modify
    size_t numNodes = 0;
    CHECK_CUDA(cudaGraphGetNodes(clonedGraph, NULL, &numNodes));
    if (numNodes == 0) {
        // nodes[0] is dereferenced below, so an empty clone is a hard error.
        fprintf(stderr, "Cloned graph has no nodes\n");
        exit(1);
    }

    cudaGraphNode_t* nodes = (cudaGraphNode_t*)malloc(numNodes * sizeof(cudaGraphNode_t));
    if (nodes == NULL) {
        fprintf(stderr, "Out of host memory while listing graph nodes\n");
        exit(1);
    }
    CHECK_CUDA(cudaGraphGetNodes(clonedGraph, nodes, &numNodes));

    // Modify the cloned node (change grid size)
    cudaKernelNodeParams modifiedParams;
    CHECK_CUDA(cudaGraphKernelNodeGetParams(nodes[0], &modifiedParams));
    modifiedParams.gridDim = dim3(grid * 2);  // Double the grid
    CHECK_CUDA(cudaGraphKernelNodeSetParams(nodes[0], &modifiedParams));

    printf("Cloned and modified graph (doubled grid size)\n");

    // Instantiate both
    cudaGraphExec_t origExec, cloneExec;
    CHECK_CUDA(cudaGraphInstantiate(&origExec, originalGraph, NULL, NULL, 0));
    CHECK_CUDA(cudaGraphInstantiate(&cloneExec, clonedGraph, NULL, NULL, 0));

    printf("Both graphs instantiated successfully\n\n");

    free(nodes);
    CHECK_CUDA(cudaGraphExecDestroy(origExec));
    CHECK_CUDA(cudaGraphExecDestroy(cloneExec));
    CHECK_CUDA(cudaGraphDestroy(originalGraph));
    CHECK_CUDA(cudaGraphDestroy(clonedGraph));
    CHECK_CUDA(cudaFree(d_data));
}

// Prints a banner and the name of device 0, then runs the three exercises in
// order. Aborts via CHECK_CUDA if the device properties cannot be queried
// (for example when no CUDA device is present).
int main() {
    printf("╔══════════════════════════════════════════════════════════════╗\n");
    printf("║           Explicit Graph Construction Exercises              ║\n");
    printf("╚══════════════════════════════════════════════════════════════╝\n\n");

    cudaDeviceProp prop;
    // Checked so a missing device is reported instead of printing an
    // uninitialized name and failing later inside an exercise.
    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
    printf("Device: %s\n\n", prop.name);

    exercise1_pipelineGraph();
    exercise2_diamondPattern();
    exercise3_graphCloning();

    printf("════════════════════════════════════════════════════════════════\n");
    printf("                    All exercises completed!\n");
    printf("════════════════════════════════════════════════════════════════\n");

    return 0;
}

In [None]:
# Compile the exercises; the `&&` runs the binary only if the build succeeded.
!nvcc -arch=sm_75 -o explicit_graph_exercises explicit_graph_exercises.cu && ./explicit_graph_exercises

### 🔶 Python/Numba Exercises (Optional)

### Exercise 1: Pipeline Graph
Build a 4-stage pipeline graph explicitly (H2D → K1 → K2 → D2H).

### Exercise 2: Diamond Pattern
```
    A
   / \
  B   C
   \ /
    D
```

### Exercise 3: Graph Cloning
Use `cudaGraphClone` to create a modified copy of a graph.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│           EXPLICIT GRAPH CONSTRUCTION                   │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Create Graph:                                          │
│  • cudaGraphCreate(&graph, 0)                           │
│                                                         │
│  Add Nodes:                                             │
│  • cudaGraphAddKernelNode(&node, graph, deps, n, &p)    │
│  • cudaGraphAddMemcpyNode(...)                          │
│  • cudaGraphAddEmptyNode(...)  // sync barrier          │
│                                                         │
│  Dependencies:                                          │
│  • Pass array of dependency nodes                       │
│  • NULL, 0 = no dependencies                            │
│  • Multiple deps = waits for ALL                        │
│                                                         │
│  Fork-Join:                                             │
│  • No deps = parallel (fork)                            │
│  • Multiple deps = sync (join)                          │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Graph Updates