In [1]:
import dace
import cupy as cp

from IPython.display import Code
from dace.config import Config


# Report which CUDA code generator implementation the DaCe configuration selects.
cuda_implementation = Config.get('compiler', 'cuda', 'implementation')
print(cuda_implementation)


experimental


In [2]:

# --- SDFG skeleton: one SDFG holding a single state -------------------------
sdfg = dace.SDFG("Warp_test_1")
state = sdfg.add_state("main")

# --- Data: two 32-element uint32 arrays in GPU global storage ---------------
a_dev = sdfg.add_array("A", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)
b_dev = sdfg.add_array("B", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)
a_acc = state.add_access("A")
b_acc = state.add_access("B")

# --- Outermost scope: GPU_Device map over i in 0:32 with step 32 ------------
gpu_map_entry, gpu_map_exit = state.add_map(
    name="GPU_Map",
    ndrange={'i': '0:32:32'},
    schedule=dace.dtypes.ScheduleType.GPU_Device,
)
# All scope edges in this cell are added without connectors (None on both ends).
state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet('A[0:32]'))

# --- Middle scope: GPU_ThreadBlock map over j in 0:32 -----------------------
tblock_map_entry, tblock_map_exit = state.add_map(
    name="Block_Map",
    ndrange={'j': '0:32'},
    schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,
)
state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.Memlet('A[0:32]'))

# --- Innermost scope: GPU_Warp map wrapping the tasklet ---------------------
# The tasklet reads element j of A and passes it to __reduce_add_sync with a
# full 32-bit mask; the result is written to B[j].
tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(
    name='WarpLevel_Operation',
    map_ranges={'_': '0:1'},
    inputs={'inp': dace.Memlet('A[0:32]', volume=32)},
    outputs={'out': dace.Memlet("B[j]")},
    code=(
        " \n"
        "value = inp[j]\n"
        "out = __reduce_add_sync(0xFFFFFFFF, value);\n"
    ),
    schedule=dace.dtypes.ScheduleType.GPU_Warp,
)
state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.Memlet('A[0:32]'))

# --- Output path: chain the scope exits back out to the B access node -------
state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.Memlet('B[j]'))
state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.Memlet('B[j]'))
state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet('B[0:32]'))

# sdfg.fill_scope_connectors() is left disabled; the scope edges above carry
# no connectors.

# Last expression of the cell: show the SDFG as the cell output.
sdfg

In [3]:
# Generate code for the SDFG and render the `clean_code` of the second
# returned object with C++ highlighting.
# NOTE(review): index 1 is presumably the GPU-side code object — confirm.
generated_objects = sdfg.generate_code()
Code(generated_objects[1].clean_code, language='cpp')

In [4]:
# Compile the SDFG and keep the returned handle in `call_it`.
call_it = sdfg.compile()

In [5]:
# Inputs: every one of the 32 elements of A is 1; B starts zeroed so any
# value seen afterwards was written by the kernel.
A = cp.ones(32, dtype=cp.uint32)
B = cp.zeros(32, dtype=cp.uint32)

# Invoke the handle compiled in the previous cell. Calling `sdfg(...)` directly
# would go through the SDFG's own compile-on-call path again and leave
# `call_it` unused.
call_it(A=A, B=B)

print(B)

# Each lane j writes the warp-wide sum into B[j]; with all-ones input that sum
# equals the number of elements.
assert bool(cp.all(B == A.size)), "warp reduction did not produce the expected sum in every lane"



[32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
 32 32 32 32 32 32 32 32]
