# Scalar Multiplication 2

This notebook is quite similar to **Scalar Multiplication 1**, but instead of reusing shared memory because of a sequentially scheduled map, we reuse shared memory because it is accessed in the body of a **for loop**.
This notebook is shorter and does not explain everything all over again in detail.

Needed imports:

In [1]:
import dace
from dace.sdfg.state import LoopRegion
from IPython.display import Code
from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync

## Inspiration

As in **Scalar Multiplication 1**, we start with the frontend DaCe program that was used as an inspiration. I omit the different positions where the for loop can be and simply assume it is the innermost "scope".

In [2]:
# To next file
# Frontend reference program: multiplies every element of the 128-element
# int32 array A (stored in GPU global memory) by `scalar`, in place.
# It is only converted to an SDFG for display in this notebook.
@dace.program
def scalarMultiplication(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):
    # Device-level map; the range 0:32:32 contains the single index i = 0.
    for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
        # Thread-block-level map over j = 0..31.
        for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
            # Innermost sequential loop: each j handles the four elements
            # j, j + 32, j + 64 and j + 96, so all 128 entries are covered.
            for k in range(4):
                # Temporary local variable; the hand-written SDFG further
                # below stages this value through shared memory instead.
                tmp = A[k * 32 + j]
                A[k * 32 + j] = scalar * tmp
                

# Convert the frontend program into its SDFG representation.
sdfg = scalarMultiplication.to_sdfg()
# Bare expression as the last line of the cell so the notebook displays it.
sdfg

The SDFG we use, which uses shared memory instead of a temporary local variable:

In [3]:
def scalarMultiplication_smem() -> dace.SDFG:
    """Hand-build an SDFG that scales ``A`` by ``scalar`` through shared memory.

    The outer SDFG has one state containing a ``GPU_Device`` map and, inside
    it, a ``GPU_ThreadBlock`` map over ``j = 0..31``.  The thread-block map
    body is a nested SDFG whose only control-flow element is a loop region
    (``k = 0``; ``k < 4``; ``k = k + 1``).  In every loop iteration one
    element of ``A`` is written to the ``GPU_Shared`` transient ``S[j]``,
    read back, multiplied by ``scalar`` and written to ``tmp_ret``.  Because
    the same ``S[j]`` slot is accessed in every iteration, shared memory is
    reused across loop iterations.

    Returns:
        The outer SDFG named ``"scalarMultiplication_smem"``.
    """
    sdfg = dace.SDFG("scalarMultiplication_smem")
    state = sdfg.add_state("main")

    # Arrays and access nodes
    sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)
    sdfg.add_scalar("scalar", dace.uint32)
    a_acc = state.add_read("A")
    a_store = state.add_write("A")
    scalar_acc = state.add_access("scalar")

    # Device and thread-block maps
    # The range "0:32:32" contains the single index i = 0.
    gpu_map_entry, gpu_map_exit = state.add_map(
        "gpu_map", dict(i="0:32:32"), schedule=dace.dtypes.ScheduleType.GPU_Device
    )
    tb_map_entry, tb_map_exit = state.add_map(
        "tb", dict(j="0:32"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock
    )

    # Nested SDFG setup
    # NOTE(review): only "S" and "tmp_ret" are registered as data containers
    # in the inner SDFG below; "__inp_A" and "__inp_scalar" exist only as
    # connectors on the nested-SDFG node and as names inside the C++ tasklet
    # code -- confirm whether validation/code generation needs them declared.
    inner_sdfg = dace.SDFG('nested_sdfg')
    nested = state.add_nested_sdfg(inner_sdfg, sdfg, inputs={'__inp_A', '__inp_scalar'}, outputs={'tmp_ret'})

    # The loop region is the "for loop" whose body reuses shared memory.
    loopreg = LoopRegion("loop", "k < 4", "k", "k = 0", "k = (k + 1)", False, inner_sdfg)
    inner_sdfg.add_node(loopreg)
    inner_state = loopreg.add_state("use_smem")

    # Shared memory and result
    # NOTE(review): "tmp_ret" is a scalar, while the outer memlet leaving the
    # nested SDFG covers four elements of A -- confirm this is intended.
    inner_sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)
    inner_sdfg.add_scalar("tmp_ret", dace.uint32)
    s_acc = inner_state.add_access("S")
    ret = inner_state.add_write("tmp_ret")

    # Tasklets
    # Both tasklets are C++ tasklets, so each statement ends with ';'.
    # tasklet1 has no input connectors and names "__inp_A" directly in its code.
    tasklet1 = inner_state.add_tasklet(
        "assign_to_smem", inputs={}, outputs={"__out1"},
        code="__out1 = __inp_A[j + 32 * k];",
        language=dace.dtypes.Language.CPP
    )
    # tasklet2 reads the shared-memory value through "__inp2" and names
    # "__inp_scalar" directly in its code.
    tasklet2 = inner_state.add_tasklet(
        "addMult", inputs={"__inp2"}, outputs={"__out2"},
        code="__out2 = __inp2 * __inp_scalar;",
        language=dace.dtypes.Language.CPP
    )

    # Main SDFG edges
    # Map connectors are passed as None here and created at the end by
    # fill_scope_connectors().
    state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]"))
    state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]"))
    state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("A[0:128]"))
    state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]"))
    # "A[j : j + 97 : 32]" selects the elements j, j + 32, j + 64 and j + 96,
    # i.e. the indices j + 32 * k visited by the inner loop.
    state.add_edge(tb_map_entry, None, nested, "__inp_A", dace.Memlet("A[j : j + 97 : 32]"))
    state.add_edge(tb_map_entry, None, nested, "__inp_scalar", dace.Memlet("scalar[0]"))
    state.add_edge(nested, "tmp_ret", tb_map_exit, None, dace.Memlet("A[j : j + 97 : 32]"))
    state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]"))
    state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]"))

    # Inner SDFG edges: write S[j], read it back, then store the product.
    inner_state.add_edge(tasklet1, "__out1", s_acc, None, dace.Memlet("S[j]"))
    inner_state.add_edge(s_acc, None, tasklet2, "__inp2", dace.Memlet("S[j]"))
    # The memlet on an edge into an access node must name that node's data
    # container ("tmp_ret"), not the shared-memory array "S".
    inner_state.add_edge(tasklet2, "__out2", ret, None, dace.Memlet("tmp_ret[0]"))

    sdfg.fill_scope_connectors()
    return sdfg


# Build the hand-written shared-memory SDFG (replaces the frontend SDFG
# previously bound to `sdfg`).
sdfg = scalarMultiplication_smem()
# Bare expression as the last line of the cell so the notebook displays it.
sdfg


Observe how the synchronization tasklets are inserted using the DefaultSharedMemorySync pass:

In [4]:
# Insert synchronization barriers by running the DefaultSharedMemorySync pass
# on the SDFG built above. The return value is not used, so the pass is
# presumably applied in place -- TODO confirm. The second argument is passed
# as None.
DefaultSharedMemorySync().apply_pass(sdfg, None)
# Display the SDFG again to inspect the inserted synchronization tasklets.
sdfg