/
temporal_vectorization.py
166 lines (144 loc) · 8.31 KB
/
temporal_vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import dace
from dace import data, dtypes, nodes, properties
from dace.libraries.standard import Gearbox
from dace.memlet import Memlet
from dace.sdfg.graph import SubgraphView
from dace.sdfg.sdfg import SDFG
from dace.sdfg.state import SDFGState
from dace.transformation import transformation
from dace.transformation.subgraph import helpers
@properties.make_properties
class TemporalVectorization(transformation.SubgraphTransformation):
'''
This transformation applies the multi-pumping optimization to a subgraph targeting FPGAs, in turn packing more computations temporally rather than spatially as is done in traditional vectorization.
Currently, it can only be applied to applications targeting Xilinx FPGAs, and to subgraphs that purely communicate through streams. It can be applied in two ways:
1. Where the widths of the internal paths remain unchanged while the external paths are widened by the multi-pumping factor. This gives the benefit of increased throughput at the same critical-resource footprint.
2. Where the widths of the internal paths are divided by the multi-pumping factor, while the widths of the external paths remain unchanged. This gives the benefit of a reduced critical-resource footprint at the same throughput.
'''
# TODO 3rd approach: where the subgraph is just clocked faster without introducing gearboxing. This could help designs where the subgraph reaches II=2, then by clocking it two times faster, it essentially behaves as II=1 from the slow clock domain.
factor = properties.Property(dtype=int,
default=2,
desc='The multi-pumping factor. E.g. double-pumping is a factor of 2.')
approach = properties.Property(
dtype=int,
default=2,
desc=
'Which approach to use. Can be 1 (increased throughput, same resources) or 2 (same throughput, reduced resourced).'
)
def can_be_applied(self, sdfg: SDFG, subgraph: SubgraphView) -> bool:
'''
Temporal vectorization can be applied if:
1. There is one outermost map in the subgraph.
2. All of the non- source and sink nodes are only accessed within this subgraph.
3. All of the source and sink nodes are either streams or scalars.
4. If the approach is 1, then either:
- The elemental type of the streams must be a vector type.
- The elemental type must be convertible to a vector type.
- Data packers/issuers are allowed to be inserted at the cost of performance through additional data plumbing overhead.
5. If the approach is 2, all the elemental types of the streams must be a vector type that is integer divisable by the multi-pumping factor.
'''
# Extract all of the relevant components of the subgraph
graph = subgraph.graph
src_nodes = subgraph.source_nodes()
dst_nodes = subgraph.sink_nodes()
srcdst_nodes = src_nodes + dst_nodes
srcdst_arrays = [sdfg.arrays[node.data] for node in srcdst_nodes]
access_nodes = [
node for node in subgraph.nodes() if isinstance(node, nodes.AccessNode) and not node in srcdst_nodes
]
map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph)
map_exits = [graph.exit_node(map_entry) for map_entry in map_entries]
maps = [map_entry.map for map_entry in map_entries]
# Perform checks
# 1. There is at least one map.
if len(maps) < 1: return False
# TODO array of streams
# TODO map of computation (matmul)
# TODO scalars
# 2. All of the non- source and sink nodes only resides within this subgraph.
for sg in dace.sdfg.concurrent_subgraphs(graph):
if sg == subgraph: continue
for nd in sg.nodes():
if isinstance(nd, nodes.AccessNode) and nd in access_nodes:
return False
# 3. All of the source and sink nodes only are either streams or scalars.
for arr in srcdst_arrays:
if not (isinstance(arr, data.Stream) or isinstance(arr, data.Scalar)):
return False
# 4. If the approach is 1, then either the dataype must be a vector type or must be convertible to a vector type.
if self.approach == 1:
# TODO not implemented yet.
return False
# 5. If the approach is 2, then all the elemental datatype of the streams must be a vector type.
elif self.approach == 2:
for arr in srcdst_arrays:
if (isinstance(arr, data.Stream)
and not isinstance(arr.dtype, dtypes.vector)) or arr.veclen % self.factor != 0:
return False
# If the approach is wrong, then it should not be applied.
else:
return False
return True
def issuer(self, sdfg: SDFG, state: SDFGState, subgraph: SubgraphView, src):
arr = sdfg.arrays[src.data]
veclen = arr.dtype.veclen // self.factor
dtype = dace.vector(arr.dtype.base_type, veclen)
name = f'{src.data}_pumped'
new_src = sdfg.add_stream(name, dtype, storage=dtypes.StorageType.FPGA_Local, transient=True)
# Update the subgraph
old_edge = subgraph.out_edges(src)[0]
old_path = state.memlet_path(old_edge)
for edge in old_path[1:]:
edge.data = dace.Memlet(f'{name}[0]')
old_path[-1].dst.in_connectors[old_path[-1].dst_conn] = dtype
state.remove_edge(old_edge)
new_src = state.add_read(name)
state.add_edge(new_src, old_edge.src_conn, old_edge.dst, old_edge.dst_conn, memlet=dace.Memlet(f'{name}[0]'))
innermost_map = [n.src.map for n in old_path if isinstance(n.src, nodes.MapEntry)][-1]
# Insert gearboxing for converting stream widths
gearbox = Gearbox(innermost_map.range.ranges[0][1] + 1, schedule=dtypes.ScheduleType.FPGA_Multi_Pumped)
gearbox_src = state.add_write(name)
state.add_memlet_path(src, gearbox, dst_conn='from_memory', memlet=Memlet(f'{src.data}[0]'))
state.add_memlet_path(gearbox, gearbox_src, src_conn='to_kernel', memlet=Memlet(f'{name}[0]'))
return innermost_map
def packer(self, sdfg: SDFG, state: SDFGState, subgraph: SubgraphView, dst):
arr = sdfg.arrays[dst.data]
veclen = arr.dtype.veclen // self.factor
dtype = dace.vector(arr.dtype.base_type, veclen)
name = f'{dst.data}_pumped'
sdfg.add_stream(name, dtype, storage=dtypes.StorageType.FPGA_Local, transient=True)
# Update the subgraph
old_edge = subgraph.in_edges(dst)[0]
old_path = state.memlet_path(old_edge)
for edge in old_path[:-1]:
edge.data = dace.Memlet(f'{name}[0]')
old_path[0].src.out_connectors[old_path[0].src_conn] = dtype
state.remove_edge(old_edge)
new_dst = state.add_write(name)
state.add_edge(old_edge.src, old_edge.src_conn, new_dst, old_edge.dst_conn, memlet=dace.Memlet(f'{name}[0]'))
innermost_map = [n.dst.map for n in old_path if isinstance(n.dst, nodes.MapExit)][0]
# Insert gearbox for converting stream widths.
gearbox = Gearbox(innermost_map.range.ranges[0][1] + 1, schedule=dtypes.ScheduleType.FPGA_Multi_Pumped)
gearbox_dst = state.add_read(name)
state.add_memlet_path(gearbox_dst, gearbox, dst_conn='from_memory', memlet=Memlet(f'{name}[0]'))
state.add_memlet_path(gearbox, dst, src_conn='to_kernel', memlet=Memlet(f'{dst.data}[0]'))
return innermost_map
def apply(self, sdfg: SDFG, **kwargs):
# Get the graphs and the nodes
subgraph = self.subgraph_view(sdfg)
graph = subgraph.graph
src_nodes = subgraph.source_nodes()
dst_nodes = subgraph.sink_nodes()
affected_maps = set()
# Update all of the subgraph inputs
for src in src_nodes:
affected_maps.add(self.issuer(sdfg, graph, subgraph, src))
# Update all of the subgraph outputs
for dst in dst_nodes:
affected_maps.add(self.packer(sdfg, graph, subgraph, dst))
# Update the schedules of the innermost affected maps.
for map in affected_maps:
rng = list(map.range.ranges[0])
rng[1] = ((rng[1] + 1) * 2) - 1
map.range.ranges[0] = rng
map.schedule = dtypes.ScheduleType.FPGA_Multi_Pumped