In [1]:
import dace
import cupy as cp
import numpy as np
from IPython.display import Code


In [2]:

# SDFG and the main state
sdfg = dace.SDFG("asyn_cpy_sdfg")
state = sdfg.add_state("main")

# Arrays and access nodes
sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)
sdfg.add_array("B", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)
sdfg.add_array("S", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)

a_acc = state.add_read("A")
b_acc = state.add_access("B")
s_acc = state.add_access("S")

# Device and thread-block maps
gpu_map_entry, gpu_map_exit = state.add_map("gpu_map", dict(bid="0:128:128"), schedule=dace.dtypes.ScheduleType.GPU_Device)
tb_map_entry, tb_map_exit = state.add_map("tb_map", dict(tid="0:128"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)

# Edges with proper data flow
# Global to device scope
state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]", is_asynchronous=True))
# Device scope to thread-block scope
state.add_edge(gpu_map_entry, None, s_acc, None,  dace.Memlet("A[0:128]->S[0:128]", is_asynchronous=True))
state.add_edge(s_acc, None,  tb_map_entry, None, dace.Memlet("S[0:128]", is_asynchronous=True))

assign_tasklet = state.add_tasklet(
    "assign", inputs={"__in_S"}, outputs={"__out_S"},
    code="__out_S = __in_S;",
    language=dace.dtypes.Language.CPP
)

state.add_edge(tb_map_entry, None, assign_tasklet, "__in_S", dace.Memlet("S[tid]", is_asynchronous=True))
state.add_edge(assign_tasklet, "__out_S", tb_map_exit, None, dace.Memlet("B[tid]", is_asynchronous=True))
state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet("B[0:128]", is_asynchronous=True))
state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet("B[0:128]", is_asynchronous=True))

# Fill scope connectors
state.fill_scope_connectors()


# Display the SDFG
sdfg

TypeError: Memlet.__init__() got an unexpected keyword argument 'is_asynchronous'