In [15]:
import dace
import cupy as cp
import numpy as np

from dace import registry

from dace.sdfg.scope import ScopeSubgraphView
from dace.codegen.prettycode import CodeIOStream
from dace.codegen.targets.target import TargetCodeGenerator
from dace.codegen.targets.framecode import DaCeCodeGenerator
from dace.codegen.targets.cpp import sym2cpp
from IPython.display import Code
from dace.config import Config

In [16]:
# DaCe program that nests three maps with explicit GPU schedules
# (GPU_Device -> GPU_ThreadBlock -> GPU_Warp) and stores a value derived from
# the warp-map indices into the 32x32 uint32 array `A` (GPU_Global storage).
@dace.program
def test(A: dace.uint32[32,32] @ dace.dtypes.StorageType.GPU_Global):
    # Range 0:32 with step 32 yields a single iteration per dimension (i = j = 0).
    for i, j in dace.map[0:32:32, 0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
        # One iteration per element of the 32x32 tile.
        for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
            # 4 x 8 = 32 iterations of the warp-scheduled map.
            for wx,wy in dace.map[0:4, 0:8] @ dace.dtypes.ScheduleType.GPU_Warp:
                # Copy the map indices into locals that the tasklet reads through
                # its `<<` input connectors.
                # NOTE(review): presumably needed because tasklet inputs must be
                # data containers rather than map symbols -- confirm.
                r = wx
                c = wy
                result = dace.define_local_scalar(dace.uint32)
                # Explicit C++ tasklet: encodes the two indices as wx * 100 + wy.
                with dace.tasklet(dace.Language.CPP):
                    iwx << r
                    iwy << c
                    out_result >> result
                    """
                    out_result = iwx * 100 + iwy;
                    """
                
                # NOTE(review): the target index uses only i, ii, j, jj, so every
                # (wx, wy) iteration writes the same element of A. Which value
                # survives is not determined by anything visible here -- confirm
                # this overlap is the intended probe of the warp schedule.
                A[i + ii, j + jj] = result

# Lower the DaCe program to an SDFG and invoke it on a zero-filled 32x32
# uint32 CuPy array.
sdfg = test.to_sdfg()
A = cp.zeros(shape=(32, 32), dtype=cp.uint32)
sdfg(A=A)

# Copy the result into a NumPy array, regroup the 1024 values into rows of 256
# (4 rows), and write them as integers to a text file.
A_cpu = cp.asnumpy(A)
A_reshaped = np.reshape(A_cpu, (-1, 256))
np.savetxt("A_output.txt", A_reshaped, fmt="%d")


In [7]:
# Hand-build a device -> thread-block -> warp scope nest through the SDFG API.
sdfg = dace.SDFG("Warp_test_1")
state = sdfg.add_state("main")

schedule_kinds = dace.dtypes.ScheduleType

# Register the 32x32 uint32 container "A" with GPU_Global storage and create
# the access node that receives the final write.
a_dev = sdfg.add_array("A", (32, 32), dace.uint32, dace.dtypes.StorageType.GPU_Global)
a_acc = state.add_access("A")

# Outer scope: range 0:32 with step 32 gives one iteration per dimension.
gpu_map_entry, gpu_map_exit = state.add_map(
    name="GPU_Map",
    ndrange={"i": "0:32:32", "j": "0:32:32"},
    schedule=schedule_kinds.GPU_Device,
)

# Middle scope: one iteration per element of the 32x32 tile.
tblock_map_entry, tblock_map_exit = state.add_map(
    name="Block_Map",
    ndrange={"ii": "0:32", "jj": "0:32"},
    schedule=schedule_kinds.GPU_ThreadBlock,
)

# Empty memlet: nests the thread-block scope inside the device scope without
# moving any data.
state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.Memlet())

# Innermost scope: a 32-iteration warp-scheduled map around a tasklet that
# writes its own map index `wi` to A[i+ii, j+jj].
tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(
    name="WarpLevel_Operation",
    map_ranges={"wi": "0:32"},
    inputs={},
    code=
"""
out = wi
""",
    outputs={"out": dace.Memlet("A[i+ii, j+jj]")},
    schedule=schedule_kinds.GPU_Warp,
)

# Remaining edges, in their original insertion order: nest the warp scope in
# the thread-block scope, then route the write outward through each map exit
# up to the access node, widening the subset at every level.
for edge_src, edge_dst, edge_memlet in (
    (tblock_map_entry, warp_scope_entry, dace.Memlet()),
    (warp_scope_exit, tblock_map_exit, dace.Memlet("A[i+ii, j+jj]")),
    (tblock_map_exit, gpu_map_exit, dace.Memlet("A[i:i+32,j:j+32]")),
    (gpu_map_exit, a_acc, dace.Memlet("A[0:32, 0:32]")),
):
    state.add_edge(edge_src, None, edge_dst, None, edge_memlet)

# All scope edges above were added with `None` connectors; let DaCe fill them in.
sdfg.fill_scope_connectors()
sdfg

In [None]:

# Generate code for the current `sdfg` and render the second generated code
# object with C++ highlighting.
# NOTE(review): `sdfg` is bound in two earlier cells (the `test.to_sdfg()` result
# and the hand-built "Warp_test_1"); this displays whichever cell ran last.
# NOTE(review): index 1 is presumably the GPU/CUDA code object -- confirm.
generated_code_objects = sdfg.generate_code()
Code(generated_code_objects[1].clean_code, language='cpp')