Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Native LIKWID Instrumentation #1063

Merged
merged 19 commits into from
Jul 23, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 11 additions & 3 deletions dace/codegen/instrumentation/likwid.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
""" Implements the LIKWID counter performance instrumentation provider.
Used for collecting CPU performance counters. """
Used for collecting CPU performance counters.
"""

import dace
from dace import dtypes, registry
Expand All @@ -9,11 +10,14 @@

from dace.transformation import helpers as xfh

from pathlib import Path


@registry.autoregister_params(type=dtypes.InstrumentationType.LIKWID_Counters)
class LIKWIDInstrumentation(InstrumentationProvider):
""" Instrumentation provider that reports CPU performance counters using
lukastruemper marked this conversation as resolved.
Show resolved Hide resolved
the Likwid tool. """
the Likwid tool.
"""

perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]

Expand Down Expand Up @@ -44,6 +48,8 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
if not self._likwid_used:
return

likwid_marker_file = Path(sdfg.build_folder) / "perf" / "likwid_marker.out"

# Add instrumentation includes and initialize LIKWID
header_code = '''
#include <omp.h>
Expand All @@ -64,7 +70,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
exit(1);
}}

setenv("LIKWID_FILEPATH", "/tmp/likwid_marker.out", 0);
setenv("LIKWID_FILEPATH", "{likwid_marker_file.absolute()}", 0);
setenv("LIKWID_MODE", "1", 0);
setenv("LIKWID_FORCE", "1", 1);
setenv("LIKWID_EVENTS", "{self._default_events}", 0);
Expand Down Expand Up @@ -106,6 +112,8 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen):
local_stream.write(init_code)

def on_sdfg_end(self, sdfg, local_stream, global_stream):
if not sdfg.parent is None:
lukastruemper marked this conversation as resolved.
Show resolved Hide resolved
return
if not self._likwid_used:
return

Expand Down
4 changes: 0 additions & 4 deletions dace/codegen/instrumentation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,6 @@ def __init__(self, filename: str):
continue

phase = event["ph"]

# WARNING: Removed
event_name = event["name"]

tid = event["tid"]
if phase == 'X':
# Time
Expand Down
14 changes: 9 additions & 5 deletions dace/runtime/include/dace/perf/reporting.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace perf {
char cat[DACE_REPORT_EVENT_CAT_LEN];
unsigned long int tstart;
unsigned long int tend;
int tid;
size_t tid;
struct _element_id {
int sdfg_id;
int state_id;
Expand Down Expand Up @@ -69,15 +69,17 @@ namespace perf {
const char *counter_name,
unsigned long int counter_val
) {
add_counter(name, cat, counter_name, counter_val, -1, -1, -1, -1);
std::thread::id thread_id = std::this_thread::get_id();
size_t tid = std::hash<std::thread::id>{}(thread_id);
add_counter(name, cat, counter_name, counter_val, tid, -1, -1, -1);
}

void add_counter(
const char *name,
const char *cat,
const char *counter_name,
unsigned long int counter_val,
int tid,
size_t tid,
int sdfg_id,
int state_id,
int el_id
Expand Down Expand Up @@ -124,15 +126,17 @@ namespace perf {
int state_id,
int el_id
) {
add_completion(name, cat, tstart, tend, -1, sdfg_id, state_id, el_id);
std::thread::id thread_id = std::this_thread::get_id();
size_t tid = std::hash<std::thread::id>{}(thread_id);
add_completion(name, cat, tstart, tend, tid, sdfg_id, state_id, el_id);
}

void add_completion(
const char *name,
const char *cat,
unsigned long int tstart,
unsigned long int tend,
int tid,
size_t tid,
int sdfg_id,
int state_id,
int el_id
Expand Down
75 changes: 56 additions & 19 deletions samples/instrumentation/matmul_likwid.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
import dace
lukastruemper marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np

import dace.transformation.helpers as xfh

M = dace.symbol('M')
K = dace.symbol('K')
N = dace.symbol('N')
Expand All @@ -9,54 +12,88 @@
# Matrix multiplication (C[i, j] = sum over k of A[i, k] * B[k, j]) written as a
# DaCe program in two steps: an element-wise product into a 3D temporary,
# followed by a sum-reduction over the last (K) axis.
@dace.program
def matmul(A: dace.float32[M, K], B: dace.float32[K, N], C: dace.float32[M, N]):
# Temporary of symbolic shape [M, N, K] with the same dtype as A; it holds one
# product per (i, j, k) triple.
tmp = np.ndarray([M, N, K], dtype=A.dtype)

# Multiply every pair of values to a large 3D temporary array
for i, j, k in dace.map[0:M, 0:N, 0:K]:
with dace.tasklet:
# in_A / in_B are bound to A[i, k] / B[k, j]; out is bound to tmp[i, j, k]
in_A << A[i, k]
in_B << B[k, j]
out >> tmp[i, j, k]

out = in_A * in_B

# Sum last dimension of temporary array to obtain resulting matrix
# (axis=2 is the K axis of tmp; the sum starts from the identity value 0 and lands in C)
dace.reduce(lambda a, b: a + b, tmp, C, axis=2, identity=0)


##### DaCe + Likwid: Matmul Instrumentation #####
# This sample demonstrates the LIKWID instrumentation in DaCe.
lukastruemper marked this conversation as resolved.
Show resolved Hide resolved
#
# In order to run the sample, specific environment variables must be set
# - OMP_NUM_THREADS: number of threads [1, num procs]
# - LIKWID_EVENTS: set of counters to be measured [FLOPS_SP, CACHE, MEM, ...]
#
# Example: 'OMP_NUM_THREADS=2 LIKWID_EVENTS="FLOPS_SP" python matmul_likwid.py'
#
# The available event set for your architecture can be found in the likwid
# groups folder: https://github.com/RRZE-HPC/likwid/tree/master/groups

## 1. Setup: SDFG + data
# Convert to SDFG
sdfg = matmul.to_sdfg()
sdfg.expand_library_nodes()
sdfg.simplify()

# Specialize SDFG for input sizes
m = 512
k = 512
n = 512
sdfg.specialize({M: m, N: n, K: k})
lukastruemper marked this conversation as resolved.
Show resolved Hide resolved

# Create arrays
A = np.random.rand(m, k).astype(np.float32)
B = np.random.rand(k, n).astype(np.float32)
C = np.zeros((m, n), dtype=np.float32)

sdfg = matmul.to_sdfg()
sdfg.simplify()
sdfg.expand_library_nodes()
sdfg.specialize({M: m, N: n, K: k})

## 2. Instrumentation
# We will now iterate through the SDFG and set the instrumentation
# type to LIKWID_Counters for all states and top-level map entries.
# Non-top-level map entries are currently not supported!
for nsdfg in sdfg.all_sdfgs_recursive():
for state in nsdfg.nodes():
state.instrument = dace.InstrumentationType.LIKWID_Counters
for node in state.nodes():
if isinstance(node, dace.nodes.MapEntry):
if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(state, node) is None:
node.instrument = dace.InstrumentationType.LIKWID_Counters

with dace.config.set_temporary("instrumentation", "report_each_invocation", value=False):
csdfg = sdfg.compile()
for _ in range(1):
csdfg(A=A, B=B, C=C)

csdfg.finalize()

## 3. Compile and execute
# During execution, the counters for different parts of the SDFG and different
# threads are measured by likwid and written into a performance report
# in the form of events. This report is saved at .dacecache/matmul/perf.
csdfg = sdfg.compile()
csdfg(A=A, B=B, C=C)

## 4. Report
# We can now parse the performance report into a Python object
# and read different counters or timers. Furthermore, the report
# can be printed as a human-readable table.
report = sdfg.get_latest_report()

# Print human-readable table
# Tip: Try this feature with only one or two instrumented states/nodes.
print(report)

# Access counters
# We will now demonstrate how to access the raw values from the report
# on the example of number of SP FLOPS. Those are measured
# when executing the sample with LIKWID_EVENTS="FLOPS_SP".
#
# Counter values are grouped by the SDFG element which defines the scope
# of the instrumentation. Those elements are described as the triplet
# (sdfg_id, state_id, node_id).

measured_flops = 0
flops_report = report.counters[(0, 0, -1)]["RETIRED_SSE_AVX_FLOPS_SINGLE_ALL"]
for tid in flops_report:
measured_flops += flops_report[tid][0]

flops = m * k * (n * 2)
print(f"Expected {flops} FLOPS, measured {measured_flops} FLOPS, diff: {measured_flops - flops}")
# Approximate number of expected FLOPs: one multiplication and one addition per (i, j, k) triple
expected_flops = m * k * (n * 2)

print(f"Expected {expected_flops} FLOPS, measured {measured_flops} FLOPS, diff: {measured_flops - expected_flops}")