Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kernel detection flag #1061

Merged
merged 5 commits into from
Jul 20, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 23 additions & 23 deletions dace/codegen/targets/fpga.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,22 +486,26 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream
# Then, try to split these components further
subgraphs = dace.sdfg.concurrent_subgraphs(state)

start_kernel = 0
for sg in subgraphs:
# Determine kernels in state
num_kernels, dependencies = self.partition_kernels(sg, default_kernel=start_kernel)
if num_kernels > 1:
# For each kernel, derive the corresponding subgraphs
# and keep track of dependencies
kernels.extend(self._kernels_subgraphs(sg, dependencies))
self._kernels_dependencies.update(dependencies)
else:
kernels.append((sg, start_kernel))
start_kernel = start_kernel + num_kernels
if Config.get_bool("compiler", "fpga", "concurrent_kernels_detection"):
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
start_kernel = 0
for sg in subgraphs:
# Determine kernels in state
num_kernels, dependencies = self.partition_kernels(sg, default_kernel=start_kernel)
if num_kernels > 1:
# For each kernel, derive the corresponding subgraphs
# and keep track of dependencies
kernels.extend(self._kernels_subgraphs(sg, dependencies))
self._kernels_dependencies.update(dependencies)
else:
kernels.append((sg, start_kernel))
start_kernel = start_kernel + num_kernels

# There is no need to generate additional kernels if the number of found kernels
# is equal to the number of connected components: use PEs instead (only one kernel)
if len(subgraphs) == len(kernels):
# There is no need to generate additional kernels if the number of found kernels
# is equal to the number of connected components: use PEs instead (only one kernel)
if len(subgraphs) == len(kernels):
kernels = [(state, 0)]
else:
# Only one FPGA kernel (possibly with multiple PEs)
kernels = [(state, 0)]

self._num_kernels = len(kernels)
Expand Down Expand Up @@ -920,8 +924,7 @@ def make_parameters(self, sdfg: SDFG, state: SDFGState, subgraphs):
trace_type, trace_bank = parse_location_bank(trace_desc)
if (bank is not None and bank_type is not None
and (bank != trace_bank or bank_type != trace_type)):
raise cgx.CodegenError("Found inconsistent memory bank "
f"specifier for {trace_name}.")
raise cgx.CodegenError("Found inconsistent memory bank " f"specifier for {trace_name}.")
bank = trace_bank
bank_type = trace_type

Expand Down Expand Up @@ -1460,8 +1463,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag

if (not sum(copy_shape) == 1 and
(not isinstance(memlet.subset, subsets.Range) or any([step != 1 for _, _, step in memlet.subset]))):
raise NotImplementedError("Only contiguous copies currently "
"supported for FPGA codegen.")
raise NotImplementedError("Only contiguous copies currently " "supported for FPGA codegen.")

if host_to_device or device_to_device:
host_dtype = sdfg.data(src_node.data).dtype
Expand Down Expand Up @@ -1709,8 +1711,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag
@staticmethod
def make_opencl_parameter(name, desc):
if isinstance(desc, dt.Array):
return (f"hlslib::ocl::Buffer<{desc.dtype.ctype}, "
f"hlslib::ocl::Access::readWrite> &{name}")
return (f"hlslib::ocl::Buffer<{desc.dtype.ctype}, " f"hlslib::ocl::Access::readWrite> &{name}")
else:
return (desc.as_arg(with_types=True, name=name))

Expand Down Expand Up @@ -1970,8 +1971,7 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit
elif np.issubdtype(np.dtype(end_type.dtype.type), np.unsignedinteger):
loop_var_type = "size_t"
except (UnboundLocalError):
raise UnboundLocalError('Pipeline scopes require '
'specialized bound values')
raise UnboundLocalError('Pipeline scopes require ' 'specialized bound values')
except (TypeError):
# Raised when the evaluation of begin or skip fails.
# This could occur, for example, if they are defined in terms of other symbols, which
Expand Down
11 changes: 11 additions & 0 deletions dace/config_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,17 @@ required:
Target Xilinx ("xilinx") or Intel ("intel_fpga") FPGAs when
generating code.

concurrent_kernels_detection:
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
type: bool
default: false
title: Detect parts of an SDFG that can run in parallel
description: >
If set to false, DaCe will place each independent connected
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
component found in an SDFG state in a different Kernel/Processing Element.
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
If true, a heuristic will further inspect each independent component
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
for other parallelism opportunities (e.g., branches of the SDFG
that can be executed in parallel), creating the corresponding kernels.

#############################################
# FPGA (Xilinx) compiler flags
xilinx:
Expand Down
4 changes: 3 additions & 1 deletion tests/fpga/fpga_instrumentation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
import numpy as np
import re
from dace.config import set_temporary
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved


def make_sdfg(make_tmp_local: bool):
Expand Down Expand Up @@ -114,7 +115,8 @@ def test_instrumentation_single():
@fpga_test()
def test_instrumentation_multiple():
sdfg = make_sdfg(False)
run_program(sdfg)
with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
TizianoDeMatteis marked this conversation as resolved.
Show resolved Hide resolved
run_program(sdfg)
report = sdfg.get_latest_report()
# There should be five runtimes: One for each kernel, and two for the state
assert len(re.findall(r"[0-9\.]+\s+[0-9\.]+\s+[0-9\.]+\s+[0-9\.]+\s+", str(report))) == 6
Expand Down
27 changes: 16 additions & 11 deletions tests/fpga/kernel_detection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from dace.sdfg.utils import is_fpga_kernel
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.fpga_testing import fpga_test
from dace.config import set_temporary


def count_kernels(sdfg: dace.SDFG):
Expand Down Expand Up @@ -49,7 +50,6 @@ def test_kernels_inside_component_0():
The 4 maps should belong to three distinct kernels
:return:
'''

@dace.program
def kernels_inside_component_0(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
z: dace.float32[8]):
Expand All @@ -69,7 +69,8 @@ def kernels_inside_component_0(x: dace.float32[8], y: dace.float32[8], v: dace.f
if is_fpga_kernel(sdfg, state):
state.instrument = dace.InstrumentationType.FPGA

res = sdfg(x=x, y=y, v=v, w=w, z=z)
with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
res = sdfg(x=x, y=y, v=v, w=w, z=z)
assert count_kernels(sdfg) == 3
assert np.allclose(res, x + y + v + w + z)

Expand Down Expand Up @@ -103,7 +104,6 @@ def test_kernels_inside_component_1():
The five Maps should belong to 5 distinct kernels

'''

@dace.program
def kernels_inside_component_1(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
z: dace.float32[8], t: dace.float32[8], alpha: dace.float32, beta: dace.float32):
Expand All @@ -124,7 +124,9 @@ def kernels_inside_component_1(x: dace.float32[8], y: dace.float32[8], v: dace.f

sdfg = kernels_inside_component_1.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
program = sdfg.compile()
assert count_kernels(sdfg) == 5
program(x=x, y=y, v=v, w=w, z=z, t=t, alpha=alpha, beta=beta)
ref_z = alpha * (x + y + v + w)
Expand Down Expand Up @@ -154,7 +156,6 @@ def test_kernels_inside_component_2():

:return:
'''

@dace.program
def kernels_inside_component_2(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], z: dace.float32[8],
t: dace.float32[8]):
Expand All @@ -169,7 +170,8 @@ def kernels_inside_component_2(x: dace.float32[8], y: dace.float32[8], v: dace.f

sdfg = kernels_inside_component_2.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()
with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
program = sdfg.compile()

# NOTE: here we have only one kernel since subgraph detection already
# detects two PEs
Expand Down Expand Up @@ -216,7 +218,9 @@ def kernels_lns_inside_component(A: dace.float32[8, 8], x: dace.float32[8], B: d

sdfg = kernels_lns_inside_component.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
program = sdfg.compile()

assert count_kernels(sdfg) == 3
z = program(A=A, x=x, B=B, y=y)
Expand Down Expand Up @@ -246,7 +250,6 @@ def test_kernels_inside_components_0():
The three maps should belong to three distinct kernels

'''

@dace.program
def kernels_inside_components_0(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
xx: dace.float32[8], yy: dace.float32[8], vv: dace.float32[8], ww: dace.float32[8]):
Expand All @@ -265,7 +268,9 @@ def kernels_inside_components_0(x: dace.float32[8], y: dace.float32[8], v: dace.

sdfg = kernels_inside_components_0.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
program = sdfg.compile()

assert count_kernels(sdfg) == 6
z, zz = program(x=x, y=y, v=v, w=w, xx=xx, yy=yy, vv=vv, ww=ww)
Expand Down Expand Up @@ -294,7 +299,6 @@ def test_kernels_inside_components_multiple_states():
The three maps should belong to three distinct kernels
:return:
'''

def make_sdfg(dtype=dace.float32):
sdfg = dace.SDFG("multiple_kernels_multiple_states")
n = dace.symbol("size")
Expand Down Expand Up @@ -543,7 +547,8 @@ def make_sdfg(dtype=dace.float32):
zz = np.random.rand(8).astype(np.float32)

sdfg = make_sdfg()
program = sdfg.compile()
with set_temporary("compiler", "fpga", "concurrent_kernels_detection", value=True):
program = sdfg.compile()
assert count_kernels(sdfg) == 6
program(z=z, zz=zz, x=x, y=y, v=v, w=w, xx=xx, yy=yy, vv=vv, ww=ww, size=8)
assert np.allclose(z, x + y + v + w)
Expand Down