In [1]:
import dace
import cupy as cp
import numpy as np
from IPython.display import Code
from typing import Optional

from dace import SDFG, properties
from dace.config import Config
from dace.transformation import pass_pipeline as ppl, transformation
from dace.sdfg import nodes
from dace import dtypes
from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync
from dace.sdfg.state import LoopRegion, ConditionalBlock


In [2]:

# Create the SDFG and the single state that holds the whole dataflow graph.
sdfg = dace.SDFG("asyn_cpy_sdfg")
state = sdfg.add_state("main")

# Data containers, all 128 x uint32:
#   A, B - non-transient arrays in GPU global memory
#   S    - transient array in GPU shared memory
ARRAY_SHAPE = (128,)
sdfg.add_array("A", shape=ARRAY_SHAPE, dtype=dace.uint32, storage=dtypes.StorageType.GPU_Global)
sdfg.add_array("B", shape=ARRAY_SHAPE, dtype=dace.uint32, storage=dtypes.StorageType.GPU_Global)
sdfg.add_array(
    "S",
    shape=ARRAY_SHAPE,
    dtype=dace.uint32,
    storage=dtypes.StorageType.GPU_Shared,
    transient=True,
)

# One access node per container; they are wired together further down.
a_acc = state.add_read("A")
b_acc = state.add_access("B")
s_acc = state.add_access("S")




# Map scopes: a GPU device-level map over `bid` (range 0:128 in steps of 128)
# and a GPU thread-block map over `tid` (range 0:128).
gpu_map_entry, gpu_map_exit = state.add_map(
    "gpu_map",
    {"bid": "0:128:128"},
    schedule=dtypes.ScheduleType.GPU_Device,
)
tb_map_entry, tb_map_exit = state.add_map(
    "tb_map",
    {"tid": "0:128"},
    schedule=dtypes.ScheduleType.GPU_ThreadBlock,
)

# Data-flow wiring. Edges that name no connector on either end go through
# add_nedge; only the tasklet edges need explicit connector names.

# A (global memory) enters the device-level map.
state.add_nedge(a_acc, gpu_map_entry, dace.Memlet("A[0:128]"))

# Inside the device scope A is copied into S, and S feeds the thread-block map.
state.add_nedge(gpu_map_entry, s_acc, dace.Memlet("A[0:128]->S[0:128]"))
state.add_nedge(s_acc, tb_map_entry, dace.Memlet("S[0:128]"))

# C++ tasklet that forwards its single input to its single output.
assign_tasklet = state.add_tasklet(
    "assign",
    inputs={"__in_S"},
    outputs={"__out_S"},
    code="__out_S = __in_S;",
    language=dace.dtypes.Language.CPP,
)

# Per `tid`: read S[tid], write B[tid]; B then leaves both map scopes.
state.add_edge(tb_map_entry, None, assign_tasklet, "__in_S", dace.Memlet("S[tid]"))
state.add_edge(assign_tasklet, "__out_S", tb_map_exit, None, dace.Memlet("B[tid]"))
state.add_nedge(tb_map_exit, gpu_map_exit, dace.Memlet("B[0:128]"))
state.add_nedge(gpu_map_exit, b_acc, dace.Memlet("B[0:128]"))


# Asynchronous-copy and pipeline configuration.

pipeline_name = "pipeline"

# Flag the access node of S as an asynchronous copy and attach the pipeline name to it.
s_acc.async_copy = True
s_acc.async_pipeline = pipeline_name

# SDFG-level metadata keyed by the GUID of S's access node; it declares the
# pipeline above with a depth of 1.
pipeline_settings = {"pipeline_depth": 1}
sdfg.metadata = {
    s_acc.guid: {
        "pipelines": {pipeline_name: pipeline_settings},
    },
}


def _add_pipeline_tasklet(target_state, pipeline, label, method):
    """Add a connector-less C++ tasklet that calls one method on a pipeline object.

    Args:
        target_state: State the tasklet is added to.
        pipeline: Name of the pipeline object as it appears in the emitted code.
        label: Tasklet name.
        method: Method invoked on the pipeline, without parentheses.

    Returns:
        The tasklet node created by ``target_state.add_tasklet``.
    """
    return target_state.add_tasklet(
        label,
        inputs={},
        outputs={},
        code=f"{pipeline}.{method}();",
        language=dace.dtypes.Language.CPP,
    )


# One tasklet per pipeline call, created in the order they are chained below.
acquire_pipeline_tasklet = _add_pipeline_tasklet(state, pipeline_name, "acquire", "producer_acquire")
commit_pipeline_tasklet = _add_pipeline_tasklet(state, pipeline_name, "commit", "producer_commit")
wait_pipeline_tasklet = _add_pipeline_tasklet(state, pipeline_name, "wait", "consumer_wait")
release_pipeline_tasklet = _add_pipeline_tasklet(state, pipeline_name, "release", "consumer_release")



# Chain the pipeline tasklets around the copy using empty memlets (no data
# subset), added in this order:
#   device-map entry -> acquire -> S
#   S -> commit -> wait -> thread-block-map entry
#   thread-block-map exit -> release -> device-map exit
ordering_edges = [
    (gpu_map_entry, acquire_pipeline_tasklet),
    (acquire_pipeline_tasklet, s_acc),
    (s_acc, commit_pipeline_tasklet),
    (commit_pipeline_tasklet, wait_pipeline_tasklet),
    (wait_pipeline_tasklet, tb_map_entry),
    (tb_map_exit, release_pipeline_tasklet),
    (release_pipeline_tasklet, gpu_map_exit),
]
for src_node, dst_node in ordering_edges:
    state.add_nedge(src_node, dst_node, dace.Memlet())

# Populate the connectors on the map entry/exit nodes from the edges added so far.
state.fill_scope_connectors()

# Last expression of the cell: render the SDFG.
sdfg

In [3]:

# Display the generated code with syntax highlighting.
# generate_code() returns a sequence of code objects; index 1 is kept from the
# original cell and is presumably the CUDA source for the GPU maps (TODO confirm).
# The lexer is named explicitly: without `language`, IPython's Code falls back to
# Pygments lexer guessing, which may pick the wrong language for this text.
Code(sdfg.generate_code()[1].clean_code, language="cuda")

In [4]:

# Device-side input (all ones) and output (all zeros).
A = cp.ones((128,), dtype=cp.uint32)
B = cp.zeros((128,), dtype=cp.uint32)

print(f"A before:\n{A}")
print(f"B before:\n{B}")

# Run the SDFG on the two GPU arrays.
sdfg(A=A, B=B)

print(f"A after:\n{A}")
print(f"B after:\n{B}")

# Check the result programmatically so a wrong copy fails the notebook under
# Restart & Run All instead of relying on visual inspection of the printout.
assert bool((A == 1).all()), "A must not be modified by the SDFG"
assert bool((B == A).all()), "B must equal A after the copy"


A before:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
B before:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
A after:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
B after:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 