spcl · tbennun · Jul 23, 2022 · Jul 10, 2022 · Jul 20, 2022 · Jul 20, 2022
diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py
@@ -1,6 +1,7 @@
 # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
 """ Implements the LIKWID counter performance instrumentation provider.
-    Used for collecting CPU performance counters. """
+    Used for collecting CPU performance counters.
+"""
 
 import dace
 from dace import dtypes, registry
@@ -9,11 +10,14 @@
 
 from dace.transformation import helpers as xfh
 
+from pathlib import Path
+
 
 @registry.autoregister_params(type=dtypes.InstrumentationType.LIKWID_Counters)
 class LIKWIDInstrumentation(InstrumentationProvider):
     """ Instrumentation provider that reports CPU performance counters using
-        the Likwid tool. """
+        the Likwid tool.
+    """
 
     perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]
 
@@ -44,6 +48,8 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
         if not self._likwid_used:
             return
 
+        likwid_marker_file = Path(sdfg.build_folder) / "perf" / "likwid_marker.out"
+
         # Add instrumentation includes and initialize LIKWID
         header_code = '''
 #include <omp.h>
@@ -64,7 +70,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
     exit(1);
 }}
 
-setenv("LIKWID_FILEPATH", "/tmp/likwid_marker.out", 0);
+setenv("LIKWID_FILEPATH", "{likwid_marker_file.absolute()}", 0);
 setenv("LIKWID_MODE", "1", 0);
 setenv("LIKWID_FORCE", "1", 1);
 setenv("LIKWID_EVENTS", "{self._default_events}", 0);
@@ -106,6 +112,8 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
         local_stream.write(init_code)
 
     def on_sdfg_end(self, sdfg, local_stream, global_stream):
+        if not sdfg.parent is None:
+            return
         if not self._likwid_used:
             return
 

diff --git a/dace/codegen/instrumentation/report.py b/dace/codegen/instrumentation/report.py
@@ -48,10 +48,6 @@ def __init__(self, filename: str):
                     continue
 
                 phase = event["ph"]
-
-                # WARNING: Removed
-                event_name = event["name"]
-
                 tid = event["tid"]
                 if phase == 'X':
                     # Time

diff --git a/dace/runtime/include/dace/perf/reporting.h b/dace/runtime/include/dace/perf/reporting.h
@@ -32,7 +32,7 @@ namespace perf {
         char cat[DACE_REPORT_EVENT_CAT_LEN];
         unsigned long int tstart;
         unsigned long int tend;
-        int tid;
+        size_t tid;
         struct _element_id {
             int sdfg_id;
             int state_id;
@@ -69,15 +69,17 @@ namespace perf {
             const char *counter_name,
             unsigned long int counter_val
         ) {
-            add_counter(name, cat, counter_name, counter_val, -1, -1, -1, -1);
+            std::thread::id thread_id = std::this_thread::get_id();
+            size_t tid = std::hash<std::thread::id>{}(thread_id);
+            add_counter(name, cat, counter_name, counter_val, tid, -1, -1, -1);
         }
 
         void add_counter(
             const char *name,
             const char *cat,
             const char *counter_name,
             unsigned long int counter_val,
-            int tid,
+            size_t tid,
             int sdfg_id,
             int state_id,
             int el_id
@@ -124,15 +126,17 @@ namespace perf {
             int state_id,
             int el_id
         ) {
-            add_completion(name, cat, tstart, tend, -1, sdfg_id, state_id, el_id);
+            std::thread::id thread_id = std::this_thread::get_id();
+            size_t tid = std::hash<std::thread::id>{}(thread_id);
+            add_completion(name, cat, tstart, tend, tid, sdfg_id, state_id, el_id);
         }
 
         void add_completion(
             const char *name,
             const char *cat,
             unsigned long int tstart,
             unsigned long int tend,
-            int tid,
+            size_t tid,
             int sdfg_id,
             int state_id,
             int el_id

diff --git a/samples/instrumentation/matmul_likwid.py b/samples/instrumentation/matmul_likwid.py
@@ -1,6 +1,9 @@
+# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
 import dace
 import numpy as np
 
+import dace.transformation.helpers as xfh
+
 M = dace.symbol('M')
 K = dace.symbol('K')
 N = dace.symbol('N')
@@ -9,54 +12,88 @@
 @dace.program
 def matmul(A: dace.float32[M, K], B: dace.float32[K, N], C: dace.float32[M, N]):
     tmp = np.ndarray([M, N, K], dtype=A.dtype)
-
-    # Multiply every pair of values to a large 3D temporary array
     for i, j, k in dace.map[0:M, 0:N, 0:K]:
         with dace.tasklet:
             in_A << A[i, k]
             in_B << B[k, j]
             out >> tmp[i, j, k]
 
             out = in_A * in_B
-
-    # Sum last dimension of temporary array to obtain resulting matrix
     dace.reduce(lambda a, b: a + b, tmp, C, axis=2, identity=0)
 
 
+##### DaCe + Likwid: Matmul Instrumentation #####
+# This sample demonstrates the likwid instrumentation in DaCe.
+#
+# In order to run the sample, specific environment variables must be set
+# - OMP_NUM_THREADS: number of threads [1, num procs]
+# - LIKWID_EVENTS: set of counters to be measured [FLOPS_SP, CACHE, MEM, ...]
+#
+# Example: 'OMP_NUM_THREADS=2 LIKWID_EVENTS="FLOPS_SP" python matmul_likwid.py'
+#
+# The available event set for your architecture can be found in the likwid
+# groups folder: https://github.com/RRZE-HPC/likwid/tree/master/groups
+
+## 1. Setup: SDFG + data
+# Convert to SDFG
+sdfg = matmul.to_sdfg()
+sdfg.expand_library_nodes()
+sdfg.simplify()
+
+# Specialize SDFG for input sizes
 m = 512
 k = 512
 n = 512
+sdfg.specialize({M: m, N: n, K: k})
 
+# Create arrays
 A = np.random.rand(m, k).astype(np.float32)
 B = np.random.rand(k, n).astype(np.float32)
 C = np.zeros((m, n), dtype=np.float32)
 
-sdfg = matmul.to_sdfg()
-sdfg.simplify()
-sdfg.expand_library_nodes()
-sdfg.specialize({M: m, N: n, K: k})
-
+## 2. Instrumentation
+# We will now iterate through the SDFG and set the instrumentation
+# type to LIKWID_Counters for all states and top-level map entries.
+# Non-top-level map entries are currently not supported!
 for nsdfg in sdfg.all_sdfgs_recursive():
     for state in nsdfg.nodes():
         state.instrument = dace.InstrumentationType.LIKWID_Counters
         for node in state.nodes():
-            if isinstance(node, dace.nodes.MapEntry):
+            if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(state, node) is None:
                 node.instrument = dace.InstrumentationType.LIKWID_Counters
 
-with dace.config.set_temporary("instrumentation", "report_each_invocation", value=False):
-    csdfg = sdfg.compile()
-    for _ in range(1):
-        csdfg(A=A, B=B, C=C)
-
-    csdfg.finalize()
-
+## 3. Compile and execute
+# During execution, the counters for different parts of the SDFG and different
+# threads are measured by likwid and written into a performance report
+# in the form of events. This report is saved at .dacecache/matmul/perf.
+csdfg = sdfg.compile()
+csdfg(A=A, B=B, C=C)
+
+## 4. Report
+# We can now parse the performance report into a Python object
+# and read different counters or timers. Furthermore, the report
+# provides a table-like print.
 report = sdfg.get_latest_report()
+
+# Print human-readable table
+# Tip: Try this feature with only 1-2 instrumented states/nodes.
 print(report)
 
+# Access counters
+# We will now demonstrate how to access the raw values from the report,
+# using the number of single-precision (SP) FLOPS as an example. Those are
+# measured when executing the sample with LIKWID_EVENTS="FLOPS_SP".
+#
+# Counter values are grouped by the SDFG element which defines the scope
+# of the instrumentation. Those elements are identified by the triplet
+# (sdfg_id, state_id, node_id).
+
 measured_flops = 0
 flops_report = report.counters[(0, 0, -1)]["RETIRED_SSE_AVX_FLOPS_SINGLE_ALL"]
 for tid in flops_report:
     measured_flops += flops_report[tid][0]
 
-flops = m * k * (n * 2)
-print(f"Expected {flops} FLOPS, measured {measured_flops} FLOPS, diff: {measured_flops - flops}")
+# ~ expected FLOPS
+expected_flops = m * k * (n * 2)
+
+print(f"Expected {expected_flops} FLOPS, measured {measured_flops} FLOPS, diff: {measured_flops - expected_flops}")