In [1]:
import dace
import typing
import os
import numpy as np
from dace.transformation.dataflow import DoubleBuffering, MapTiling
from dace.transformation.soft_hier import SystolocTransformer, SystolicTransformer, SystolicSplitStore, SummaTransformer
from dace.soft_hier import generate_arg_cfg, make_preload_elf, make_preload_elf_hbm_interleaved_new, InterleaveHandler


In [2]:

def _my_gen_matmul_sdfg(hardware_matmul_mnk: typing.Tuple,
                     global_storage: dace.dtypes.StorageType,
                     local_storage: dace.dtypes.StorageType,
                     device_schedule: dace.dtypes.ScheduleType,
                     thread_group_schedule: dace.dtypes.ScheduleType,
                     thread_group_dims: typing.Tuple,
                     hbm_split_scheme: typing.List[typing.Tuple[int, int]],
                     hbm_placement_scheme: typing.List[typing.Tuple[int, int]],
                     input_float,
                     output_float,
                     coarsening_factor,
                     mmad_tasklet_str: str,
                     is_hbm_interleaved: bool = False):
    """Build the "GEMM" SDFG (C = A x B) as four nested maps around one tasklet.

    The single state "main" contains, from outermost to innermost:

    * ``gemm_entry``        - ``i``/``j`` step over C in strides of ``tM*gM`` / ``tN*gN``.
    * ``thread_group_mmad`` - ``gi``/``gj`` enumerate the ``gM x gN`` thread-group grid.
    * ``thread_coarsened``  - ``ci``/``cj`` step over the sub-tiles of one thread-group tile.
    * ``block_tiled``       - ``bK`` steps over the K dimension.

    Inside ``block_tiled`` the sub-tiles of A and B are copied into the transients
    ``local_A`` / ``local_B`` and fed, together with ``accumulator``, to the
    ``mmad_redmule`` tasklet. Finally ``SystolicTransformer`` and
    ``SystolicSplitStore`` are applied to the graph.

    NOTE: the problem sizes ``M``, ``N`` and ``K`` are not parameters; they are read
    from the enclosing module scope and must be defined before this is called.
    The memlets on A and B additionally use the literal text ``0:K``.

    Args:
        hardware_matmul_mnk: ``(tM, tN, tK)`` tile extents; each one is multiplied
            by ``coarsening_factor`` before use.
        global_storage: Storage type of the non-transient arrays A, B and C.
        local_storage: Storage type of the transients ``accumulator``,
            ``local_A`` and ``local_B``.
        device_schedule: Schedule of the outermost ``gemm_entry`` map.
        thread_group_schedule: Schedule of the ``thread_group_mmad`` map.
        thread_group_dims: ``(gM, gN)`` extents of the thread-group grid.
        hbm_split_scheme: Per-array split schemes, indexed in the order A, B, C
            and forwarded to ``sdfg.add_array``.
        hbm_placement_scheme: Per-array placement schemes, same indexing.
        input_float: Element type of A, B and of the local tiles.
        output_float: Element type of C and of the accumulator.
        coarsening_factor: Number of sub-tiles per dimension handled by one
            thread group.
        mmad_tasklet_str: C++ source of the matrix-multiply tasklet.
        is_hbm_interleaved: Forwarded to ``sdfg.add_array`` for A, B and C.

    Returns:
        The constructed and transformed ``dace.SDFG``.
    """
    sdfg = dace.SDFG("GEMM")
    tM, tN, tK = hardware_matmul_mnk
    tM *= coarsening_factor
    tN *= coarsening_factor
    tK *= coarsening_factor
    gM, gN = thread_group_dims

    # Extents of one sub-tile, i.e. of one tasklet invocation.
    sub_tM = tM // coarsening_factor
    sub_tN = tN // coarsening_factor
    sub_tK = tK // coarsening_factor

    # Index ranges that recur in the memlets below.
    row_tile = f"i + gi * {tM}:i + gi * {tM} + {tM}"
    col_tile = f"j + gj * {tN}:j + gj * {tN} + {tN}"
    row_sub = f"i + gi * {tM} + ci * {sub_tM}:i + gi * {tM} + ci * {sub_tM} + {sub_tM}"
    col_sub = f"j + gj * {tN} + cj * {sub_tN}:j + gj * {tN} + cj * {sub_tN} + {sub_tN}"
    k_sub = f"bK:bK+{sub_tK}"
    acc_full = ", ".join([f"0:{coarsening_factor}*{coarsening_factor}", f"0:{sub_tM}", f"0:{sub_tN}"])

    state = sdfg.add_state("main")

    # --- Data containers -------------------------------------------------
    arrs = dict()
    global_arrays = [("A", (M, K), input_float), ("B", (K, N), input_float), ("C", (M, N), output_float)]
    for idx, (arr_name, shape, ftype) in enumerate(global_arrays):
        # The scheme lists are indexed in the same A, B, C order.
        arrn, arr = sdfg.add_array(name=arr_name, shape=shape, dtype=ftype, storage=global_storage, transient=False,
                                   is_hbm_interleaved=is_hbm_interleaved,
                                   hbm_split_scheme=hbm_split_scheme[idx],
                                   hbm_placement_scheme=hbm_placement_scheme[idx])
        arrs[arrn] = arr
    # The accumulator holds partial results of C, hence the output element type.
    arrn, arr = sdfg.add_array(name="accumulator", shape=(coarsening_factor*coarsening_factor, sub_tM, sub_tN),
                               dtype=output_float, storage=local_storage, transient=True)
    arrs[arrn] = arr

    # --- Level 1: device map over thread-group-grid sized blocks of C -----
    dev_map_entry, dev_map_exit = state.add_map(
        name="gemm_entry",
        ndrange={"i" : dace.subsets.Range([(0, M-1, tM*gM)]),
                 "j" : dace.subsets.Range([(0, N-1, tN*gN)])},
        schedule=device_schedule
    )
    i = dace.symbol("i")
    j = dace.symbol("j")

    for name in ["A", "B"]:
        access_str = ", ".join([f"0:{n}" for n in arrs[name].shape])
        an = state.add_access(name)
        state.add_edge(an, None, dev_map_entry, f"IN_{name}", dace.memlet.Memlet(f"{name}[{access_str}]"))
        dev_map_entry.add_in_connector(f"IN_{name}")
    access_str = ", ".join([f"0:{n}" for n in arrs["C"].shape])
    dev_map_exit.add_out_connector("OUT_C")
    c_out_an = state.add_access("C")
    state.add_edge(dev_map_exit, "OUT_C", c_out_an, None, dace.memlet.Memlet(f"C[{access_str}]"))

    # --- Level 2: one iteration per thread group of the gM x gN grid ------
    thread_group_map_entry, thread_group_map_exit = state.add_map(
        name="thread_group_mmad",
        ndrange={"gi" : dace.subsets.Range([(0, gM-1, 1)]),
                 # gj spans the second grid dimension (gN), not gM.
                 "gj" : dace.subsets.Range([(0, gN-1, 1)])},
        schedule=thread_group_schedule
    )

    gi = dace.symbol("gi")
    gj = dace.symbol("gj")

    for name in ["A", "B"]:
        if name == "A":
            access_str = ", ".join([f"i:i + {tM} * {gM}", "0:K"])
        else:
            access_str = ", ".join(["0:K", f"j:j + {tN} * {gN}"])
        state.add_edge(dev_map_entry, f"OUT_{name}", thread_group_map_entry, f"IN_{name}", dace.memlet.Memlet(f"{name}[{access_str}]"))
        dev_map_entry.add_out_connector(f"OUT_{name}")
        thread_group_map_entry.add_in_connector(f"IN_{name}")
    access_str = ", ".join([f"i:i + {gM} * {tM}", f"j:j + {gN} * {tN}"])
    state.add_edge(thread_group_map_exit, "OUT_C", dev_map_exit, "IN_C", dace.memlet.Memlet(f"C[{access_str}]"))
    dev_map_exit.add_in_connector("IN_C")
    thread_group_map_exit.add_out_connector("OUT_C")

    # --- Level 3: sequential loop over the sub-tiles of one group tile ----
    thread_coarsened_map_entry, thread_coarsened_map_exit = state.add_map(
        name="thread_coarsened",
        ndrange={"ci" : dace.subsets.Range([(0, tM-1, sub_tM)]),
                 "cj" : dace.subsets.Range([(0, tN-1, sub_tN)])},
        schedule=dace.dtypes.ScheduleType.SoftHier_Sequential
    )

    for name in ["A", "B"]:
        if name == "A":
            access_str = ", ".join([row_tile, "0:K"])
        else:
            access_str = ", ".join(["0:K", col_tile])
        state.add_edge(thread_group_map_entry, f"OUT_{name}", thread_coarsened_map_entry, f"IN_{name}", dace.memlet.Memlet(f"{name}[{access_str}]"))
        thread_group_map_entry.add_out_connector(f"OUT_{name}")
        thread_coarsened_map_entry.add_in_connector(f"IN_{name}")
    # Rows of the C tile follow gi (as for A) and columns follow gj (as for B).
    access_str = ", ".join([row_tile, col_tile])
    state.add_edge(thread_coarsened_map_exit, "OUT_C", thread_group_map_exit, "IN_C", dace.memlet.Memlet(f"C[{access_str}]"))
    thread_group_map_exit.add_in_connector("IN_C")
    thread_coarsened_map_exit.add_out_connector("OUT_C")

    # --- Level 4: sequential loop over K ----------------------------------
    block_tiled_map_entry, block_tiled_map_exit = state.add_map(
        name="block_tiled",
        ndrange={"bK" : dace.subsets.Range([(0, K-1, sub_tK)])},
        schedule=dace.dtypes.ScheduleType.SoftHier_Sequential
    )

    # The accumulator is zero-initialised (setzero) before the K loop. The empty
    # memlet only ties the access node to the enclosing map scope.
    accumulator_an = state.add_access("accumulator")
    accumulator_an.setzero = True
    state.add_edge(thread_coarsened_map_entry, None, accumulator_an, None, dace.memlet.Memlet(None))
    state.add_edge(accumulator_an, None, block_tiled_map_entry, "IN_accumulator", dace.memlet.Memlet(f"accumulator[{acc_full}]"))
    block_tiled_map_entry.add_in_connector("IN_accumulator")

    for name in ["A", "B"]:
        if name == "A":
            access_str = ", ".join([row_sub, "0:K"])
        else:
            access_str = ", ".join(["0:K", col_sub])
        state.add_edge(thread_coarsened_map_entry, f"OUT_{name}", block_tiled_map_entry, f"IN_{name}", dace.memlet.Memlet(f"{name}[{access_str}]"))
        block_tiled_map_entry.add_in_connector(f"IN_{name}")
        thread_coarsened_map_entry.add_out_connector(f"OUT_{name}")

    # Load: copy the current A / B sub-tiles into local transients.
    local_access_nodes = dict()
    for name, shape in [("A", (sub_tM, sub_tK)), ("B", (sub_tK, sub_tN))]:
        block_tiled_map_entry.add_out_connector(f"OUT_{name}")
        arrn, arr = sdfg.add_array(name=f"local_{name}", shape=shape, dtype=input_float, storage=local_storage, transient=True)
        arrs[arrn] = arr
        an = state.add_access(f"local_{name}")
        local_access_nodes[f"local_{name}"] = an
        if name == "A":
            access_str = ", ".join([row_sub, k_sub])
        else:
            access_str = ", ".join([k_sub, col_sub])
        state.add_edge(block_tiled_map_entry, f"OUT_{name}", an, None, dace.memlet.Memlet(f"{name}[{access_str}]"))

    # Connect local_A + local_B + accumulator -> matmul tasklet -> accumulator.
    matmul_tasklet = state.add_tasklet(name="mmad_redmule", inputs={"_in_local_a", "_in_local_b", "_in_accumulator"}, outputs={"_out_accumulator"},
                                       code=mmad_tasklet_str, language=dace.dtypes.Language.CPP)

    for name, an in local_access_nodes.items():
        # "local_A" -> connector "_in_local_a", "local_B" -> "_in_local_b".
        state.add_edge(an, None, matmul_tasklet, "_in_" + name.lower(), dace.memlet.Memlet(name))
    state.add_edge(block_tiled_map_entry, "OUT_accumulator", matmul_tasklet, "_in_accumulator", dace.memlet.Memlet("accumulator"))
    state.add_edge(matmul_tasklet, "_out_accumulator", block_tiled_map_exit, "IN_accumulator", dace.memlet.Memlet(f"accumulator[{acc_full}]"))
    block_tiled_map_exit.add_in_connector("IN_accumulator")
    block_tiled_map_entry.add_out_connector("OUT_accumulator")
    block_tiled_map_exit.add_out_connector("OUT_accumulator")

    # Store: after the K loop the accumulator is written to its C sub-tile.
    accumulator_an3 = state.add_access("accumulator")
    state.add_edge(block_tiled_map_exit, "OUT_accumulator", accumulator_an3, None, dace.memlet.Memlet(f"accumulator[{acc_full}]"))
    access_str = ", ".join([row_sub, col_sub])
    state.add_edge(accumulator_an3, None, thread_coarsened_map_exit, "IN_C", dace.memlet.Memlet(f"C[{access_str}]"))
    thread_coarsened_map_exit.add_in_connector("IN_C")

    # Project-specific rewrites of the K loop and of the accumulator store; the
    # options carry the map symbols and the tile / problem sizes they read.
    SystolicTransformer.apply_to(sdfg, map_entry=block_tiled_map_entry, transient=local_access_nodes["local_A"],
                                 options={"npe": gM, "gi": gi, "gj": gj,
                                          "i": i, "j": j,
                                          "M": M, "N": N,
                                          "tM": tM, "tN": tN})
    SystolicSplitStore.apply_to(sdfg, map_entry=thread_coarsened_map_entry, accumulator=accumulator_an,
                                options={"npe": gM, "gi": gi, "gj": gj,
                                         "i": i, "j": j,
                                         "M": M, "N": N, "K": K,
                                         "tM": tM, "tN": tN, "tK": tK})

    return sdfg



In [3]:

# Cluster grid used for both the thread-group map and the HBM interleaving.
dim_x = 8
dim_y = 8
cluster_dims = (dim_x, dim_y)

# Concrete GEMM problem sizes: A is (M, K), B is (K, N), C is (M, N).
# These names are plain integers; they are also read as globals by
# _my_gen_matmul_sdfg, so this cell must run before the SDFG is built.
K = 2048
M = 4096
N = 4096

# Tile extents handed to the SDFG builder and used as interleave block shapes.
tM = 256
tN = 256
tK = 64

# Host-side operands: all-ones inputs and a zeroed output, in half precision.
A_host = np.ones((M, K), dtype=np.float16)
B_host = np.ones((K, N), dtype=np.float16)
C_host = np.zeros((M, N), dtype=np.float16)

# for i in range(M):
#     for j in range(K):
#         if np.random.rand() < 0.3:
#             A_host[i, j] = 0.0
# for i in range(K):
#     for j in range(N):
#         if np.random.rand() < 0.3:
#             B_host[i, j] = 0.0

# G = A_host@B_host



    

In [5]:
# Operand A: blocks of (tM, tK), split with split_horizental() and placed
# over place_range (24, 31, 1).
A_handler = InterleaveHandler(array=A_host, block_shape=(tM, tK), cluster_dims=cluster_dims)
A_handler.split_horizental()
A_handler.place_to_range(place_range=(24, 31, 1))
split_scheme_A, placement_scheme_A = A_handler.split_scheme, A_handler.placement_scheme

# Operand B: blocks of (tK, tN), split with split_vertical() and placed
# over place_range (0, 7, 1).
B_handler = InterleaveHandler(array=B_host, block_shape=(tK, tN), cluster_dims=cluster_dims)
B_handler.split_vertical()
B_handler.place_to_range(place_range=(0, 7, 1))
split_scheme_B, placement_scheme_B = B_handler.split_scheme, B_handler.placement_scheme

# Result C: blocks of (tM, tN), split into blocks and placed with the
# handler's systolic left-and-bottom policy.
C_handler = InterleaveHandler(array=C_host, block_shape=(tM, tN), cluster_dims=cluster_dims)
C_handler.split_to_blocks()
C_handler.systolic_place_to_left_and_bottom()
split_scheme_C, placement_scheme_C = C_handler.split_scheme, C_handler.placement_scheme

# Show the resulting placement of every operand.
for operand, scheme in (("A", placement_scheme_A), ("B", placement_scheme_B), ("C", placement_scheme_C)):
    print(f"placement_scheme_{operand}: {scheme}")

# Write the preload image; the call's return value is the cell output.
make_preload_elf_hbm_interleaved_new("./output.elf", [A_handler, B_handler, C_handler])


placement_scheme_A: (24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31)
placement_scheme_B: (0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)
placement_scheme_C: (24, 24, 2, 3, 24, 24, 6, 7, 24, 24, 2, 3, 24, 24, 6, 7, 0, 25, 25, 3, 4, 25, 25, 7, 0, 25, 25, 3, 4, 25, 25, 7, 0, 1, 26, 26, 4, 5, 26, 26, 0, 1, 26, 26, 4, 5, 26, 26, 27, 1, 2, 27, 27, 5, 6, 27, 27, 1, 2, 27, 27, 5, 6, 27, 28, 28, 2, 3, 28, 28, 6, 7, 28, 28, 2, 3, 28, 28, 6, 7, 0, 29, 29, 3, 4, 29, 29, 7, 0, 29, 29, 3, 4, 29, 29, 7, 0, 1, 30, 30, 4, 5, 30, 30, 0, 1, 30, 30, 4, 5, 30, 30, 31, 1, 2, 31, 31, 5, 6, 31, 31, 1, 2, 31, 31, 5, 6, 31, 24, 24, 2, 3, 24, 24, 6, 7, 24, 24, 2, 3, 24, 24, 6, 7, 0, 25, 25, 3, 4, 25, 25, 7, 0, 25, 25, 3, 4, 25, 25, 7, 0, 1, 26, 26, 4, 5, 26, 26, 0, 1, 26, 26, 4, 5, 26, 26, 27, 1, 2, 27, 27, 5, 6, 27, 27, 1, 2, 27, 27, 5, 6, 27, 28, 28, 2, 3, 28, 28, 6, 7, 28, 28, 2, 3, 28, 28, 6, 7, 0, 29, 29, 3, 4, 29, 29, 7, 0, 29, 29, 3, 4, 29, 29, 7, 0, 1, 30, 30, 4, 5, 30, 30, 0, 1, 30, 30,

array([2097216, 2097216,      64], dtype=uint32)

In [6]:
# Arguments for the GEMM SDFG builder, collected in one place so individual
# settings are easy to tweak between runs.
gemm_kwargs = dict(
    hardware_matmul_mnk=(tM, tN, tK),
    global_storage=dace.dtypes.StorageType.SoftHier_HBM,
    local_storage=dace.dtypes.StorageType.SoftHier_TCDM,
    device_schedule=dace.dtypes.ScheduleType.SoftHier_Device,
    thread_group_schedule=dace.dtypes.ScheduleType.SoftHier_Cluster,
    thread_group_dims=cluster_dims,
    # Scheme lists are ordered A, B, C.
    hbm_split_scheme=[split_scheme_A, split_scheme_B, split_scheme_C],
    hbm_placement_scheme=[placement_scheme_A, placement_scheme_B, placement_scheme_C],
    is_hbm_interleaved=False,
    input_float=dace.float16,
    output_float=dace.float16,
    coarsening_factor=1,
    mmad_tasklet_str="flex_redmule_trigger(_in_local_a, _in_local_b, _in_accumulator, REDMULE_NONE_16);",
)
sdfg = _my_gen_matmul_sdfg(**gemm_kwargs)

# The graph is written to disk before validation runs.
sdfg.save("my_gemm.sdfgz")
sdfg.validate()



In [None]:

# from IPython.display import Code
# Code(sdfg.generate_code()[1].clean_code, language='cpp')
# sdfg.compile()


In [None]:
# sdfg.compile()
# import numpy as np




# Lay the buffers out back to back: A, then B, then C, then G.
# The first buffer starts 64 bytes past start_address
# (presumably room for the argument block; confirm against the runtime).
start_address = 0x00000000
A_address = start_address + 64
B_address = A_address + A_host.nbytes
C_address = B_address + B_host.nbytes
G_address = C_address + C_host.nbytes
# Pack the three buffer addresses, the problem sizes and G's address as uint32.
args = np.array([A_address, B_address, C_address, K, M, N, G_address], dtype=np.uint32)
# args = make_preload_elf_hbm_interleaved("output.elf", [A_host, B_host, C_host], [split_scheme_A, split_scheme_B, split_scheme_C], [placement_scheme_A, placement_scheme_B, placement_scheme_C], [(64, 32), (32, 16), (64, 16)], start_addresses=[])
# G = A_host@B_host
# make_preload_elf("./output.elf", [args, A_host, B_host, C_host, G])

# make_preload_elf("/usr/scratch/badile111/dace4softhier/gvsoc/output.elf", [args, A_host, B_host, C_host])
# print(A_host@B_host)
# sdfg(A=A_host, B=B_host, C=C_host, M=M, N=N, K=K)

Waring: No `gpu_block_size` property specified on map gemm_entry. 
RedMule Dims [[1, 256, 64], [1, 64, 256]]
Generating NestedSDFG using SoftHierCodeGen
Generating Tasklet Using CPU Codegen
GVSOC_INSTALL_PATH: None
GVSOC_DIR: None
SOFTHIER_INSTALL_PATH: None


CompilationError: Compiler failure:
[ 20%] [34m[1mGenerating softhier.elf[0m
In file included from /scratch/dace4softhier/dace_devel/dace/soft_hier/test/new_interleave_test/.dacecache/GEMM/src/soft_hier/GEMM_soft_hier.shcc:5:0:
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_barrier_init':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:151:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier);
                                ^~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_global_barrier':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:169:94: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
         if ((flex_get_barrier_num_cluster() - flex_get_enable_value()) == flex_amo_fetch_add(barrier)) {
                                                                                              ^~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:170:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier);
                                ^~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_global_barrier_polling':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:191:94: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
         if ((flex_get_barrier_num_cluster() - flex_get_enable_value()) == flex_amo_fetch_add(barrier)) {
                                                                                              ^~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:192:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier);
                                ^~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:193:32: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_amo_fetch_add(barrier_iter);
                                ^~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_barrier_xy_init':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:214:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier_y);
                                ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:218:36: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
                 flex_reset_barrier(barrier_x);
                                    ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_global_barrier_xy':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:244:96: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
         if ((flex_get_barrier_num_cluster_x() - flex_get_enable_value()) == flex_amo_fetch_add(barrier_x)) {
                                                                                                ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:245:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier_x);
                                ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:248:100: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             if ((flex_get_barrier_num_cluster_y() - flex_get_enable_value()) == flex_amo_fetch_add(barrier_y))
                                                                                                    ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:250:36: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
                 flex_reset_barrier(barrier_y);
                                    ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h: In function 'flex_global_barrier_xy_polling':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:282:96: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
         if ((flex_get_barrier_num_cluster_x() - flex_get_enable_value()) == flex_amo_fetch_add(barrier_x)) {
                                                                                                ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:283:32: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_reset_barrier(barrier_x);
                                ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:286:100: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             if ((flex_get_barrier_num_cluster_y() - flex_get_enable_value()) == flex_amo_fetch_add(barrier_y))
                                                                                                    ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:288:36: warning: passing argument 1 of 'flex_reset_barrier' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
                 flex_reset_barrier(barrier_y);
                                    ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:130:6: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 void flex_reset_barrier(uint32_t* barrier){
      ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:289:36: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
                 flex_amo_fetch_add(barrier_iy);
                                    ^~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:294:32: warning: passing argument 1 of 'flex_amo_fetch_add' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
             flex_amo_fetch_add(barrier_ix);
                                ^~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:134:10: note: expected 'uint32_t * {aka long unsigned int *}' but argument is of type 'volatile uint32_t * {aka volatile long unsigned int *}'
 uint32_t flex_amo_fetch_add(uint32_t* barrier){
          ^~~~~~~~~~~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h: In function 'flex_dma_async_pattern_access_south_hbm':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:17:115: warning: integer overflow in expression [-Woverflow]
 #define hbm_south(nid,offset)       (ARCH_HBM_START_BASE+(nid)*ARCH_HBM_NODE_ADDR_SPACE+ARCH_HBM_NODE_ADDR_SPACE*2*ARCH_NUM_CLUSTER_Y+ARCH_HBM_NODE_ADDR_SPACE*ARCH_NUM_CLUSTER_X+offset)
                                                                                                                   ^
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h:311:43: note: in expansion of macro 'hbm_south'
     bare_dma_start_1d(local(local_offset),hbm_south(pos.x,remote_offset), transfer_size); //Start iDMA
                                           ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:17:159: warning: integer overflow in expression [-Woverflow]
 #define hbm_south(nid,offset)       (ARCH_HBM_START_BASE+(nid)*ARCH_HBM_NODE_ADDR_SPACE+ARCH_HBM_NODE_ADDR_SPACE*2*ARCH_NUM_CLUSTER_Y+ARCH_HBM_NODE_ADDR_SPACE*ARCH_NUM_CLUSTER_X+offset)
                                                                                                                                                               ^
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h:311:43: note: in expansion of macro 'hbm_south'
     bare_dma_start_1d(local(local_offset),hbm_south(pos.x,remote_offset), transfer_size); //Start iDMA
                                           ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h: In function 'flex_dma_pattern_systolic_shift_west_south':
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:17:115: warning: integer overflow in expression [-Woverflow]
 #define hbm_south(nid,offset)       (ARCH_HBM_START_BASE+(nid)*ARCH_HBM_NODE_ADDR_SPACE+ARCH_HBM_NODE_ADDR_SPACE*2*ARCH_NUM_CLUSTER_Y+ARCH_HBM_NODE_ADDR_SPACE*ARCH_NUM_CLUSTER_X+offset)
                                                                                                                   ^
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h:377:47: note: in expansion of macro 'hbm_south'
         bare_dma_start_1d(local(local_offset),hbm_south(pos.x,remote_offset), transfer_size);
                                               ^~~~~~~~~
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h:17:159: warning: integer overflow in expression [-Woverflow]
 #define hbm_south(nid,offset)       (ARCH_HBM_START_BASE+(nid)*ARCH_HBM_NODE_ADDR_SPACE+ARCH_HBM_NODE_ADDR_SPACE*2*ARCH_NUM_CLUSTER_Y+ARCH_HBM_NODE_ADDR_SPACE*ARCH_NUM_CLUSTER_X+offset)
                                                                                                                                                               ^
/usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/include/flex_dma_pattern.h:377:47: note: in expansion of macro 'hbm_south'
         bare_dma_start_1d(local(local_offset),hbm_south(pos.x,remote_offset), transfer_size);
                                               ^~~~~~~~~
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: warning: /usr/scratch/badile111/dace4softhier/gvsoc/soft_hier/flex_cluster_sdk/runtime/flex_memory.ld contains output sections; did you forget -T?
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_udivdi3.o): can't link hard-float modules with soft-float modules
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: failed to merge target specific data of file /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_udivdi3.o)
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_umoddi3.o): can't link hard-float modules with soft-float modules
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: failed to merge target specific data of file /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_umoddi3.o)
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_clz.o): can't link hard-float modules with soft-float modules
/scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/../../../../riscv32-unknown-elf/bin/ld: failed to merge target specific data of file /scratch/dace4softhier/gvsoc/third_party/toolchain/v1.0.16-pulp-riscv-gcc-centos-7/bin/../lib/gcc/riscv32-unknown-elf/7.1.1/libgcc.a(_clz.o)
collect2: error: ld returned 1 exit status
gmake[2]: *** [CMakeFiles/build_output.dir/build.make:73: softhier.elf] Error 1
gmake[1]: *** [CMakeFiles/Makefile2:87: CMakeFiles/build_output.dir/all] Error 2
gmake: *** [Makefile:91: all] Error 2


In [None]:
# NOTE(review): `G` is not defined in the portion of this notebook that was
# reviewed, and this cell has no execution count — confirm that `G` exists on a
# fresh kernel (Restart & Run All) or replace it with the intended value.
# NOTE(review): the build log recorded earlier in this notebook ends with a
# linker failure ("ld returned 1 exit status"); if `G` depends on that build,
# it may not be available — verify.
result = G


In [None]:
# Debugging aid, currently disabled:
# print(A_host)

# Print the element at index [0][0] of `result`.
# NOTE(review): assumes `result` supports two levels of indexing (e.g. a nested
# sequence or a 2-D array) — confirm against the cell that assigns it.
print(result[0][0])


            