diff --git a/cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp b/cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp
new file mode 100644
index 0000000000..4e3f601888
--- /dev/null
+++ b/cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp
@@ -0,0 +1,203 @@
+// contributed by Sebastian Keller, CSCS, 2019-10
+
+#include <chrono>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <pthread.h>
+#include <sched.h>
+
+// Calibrate the number of iterations so that func(loops) runs for
+// approximately target_duration seconds.
+template<class Callable>
+size_t determine_loops(Callable const& func, double target_duration)
+{
+    // double the loop count until a run takes at least 0.1 s, then
+    // extrapolate linearly to the target duration
+    size_t loops = 1;
+    double elapsed = 0;
+    do {
+        loops *= 2;
+        auto t0 = std::chrono::high_resolution_clock::now();
+        func(loops);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> duration = t1 - t0;
+        elapsed = duration.count();
+    } while (elapsed < 0.1);
+    loops *= target_duration/elapsed;
+
+    return loops;
+}
+
+// Return the execution time of func() in seconds.
+template<class Callable>
+double timed_run(Callable const& func)
+{
+    auto t0 = std::chrono::high_resolution_clock::now();
+    func();
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<double> duration = t1 - t0;
+    return duration.count();
+}
+
+// RAII owner of a 32-byte aligned buffer of size_t words, all set to 1.
+class Buffer
+{
+public:
+    Buffer() : data_size(0), data_(nullptr, free) {}
+
+    explicit Buffer(size_t sz_) : data_size(sz_), data_(nullptr, free)
+    {
+        // round down to a whole number of size_t words
+        data_size = (sz_/sizeof(size_t)) * (sizeof(size_t));
+
+        void* buf;
+        if (posix_memalign(&buf, 32, data_size)) {
+            std::cout << "alloc failed\n"; exit(1);
+        }
+        // provide "free" as custom deleter
+        data_ = std::unique_ptr<size_t[], decltype(&free)>((size_t*)buf, free);
+
+        init();
+    }
+
+    Buffer(Buffer&& rhs) : data_size(rhs.data_size), data_(std::move(rhs.data_))
+    {
+        rhs.data_size = 0;
+    }
+
+    Buffer& operator=(Buffer&& rhs) {
+        data_ = std::move(rhs.data_);
+        data_size = rhs.data_size;
+        rhs.data_size = 0;
+        return *this;
+    }
+
+    size_t* get() const { return data_.get(); }
+
+private:
+
+    // set every word to 1; this is also the NUMA first touch
+    void init()
+    {
+        for (size_t i = 0; i < data_size/sizeof(size_t); i++)
+            data_[i] = 1;
+    }
+
+    // declared before data_ so the ctor init lists run in written order
+    size_t data_size;
+    std::unique_ptr<size_t[], decltype(&free)> data_;
+};
+
+
+// Pin thread t to logical CPU i.
+void set_affinity(std::thread& t, int i)
+{
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(i, &cpuset);
+    int rc = pthread_setaffinity_np(t.native_handle(),
+                                    sizeof(cpu_set_t), &cpuset);
+    if (rc != 0)
+        std::cerr << "Error calling pthread_setaffinity_np: " << rc << "\n";
+}
+
+// Return the first CPU in t's affinity mask, or -1 if the mask is empty.
+int get_affinity(std::thread& t)
+{
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    int rc = pthread_getaffinity_np(t.native_handle(),
+                                    sizeof(cpu_set_t), &cpuset);
+    if (rc != 0)
+        std::cerr << "Error calling pthread_getaffinity_np: " << rc << "\n";
+    for (int i = 0; i < CPU_SETSIZE; ++i)
+        if (CPU_ISSET(i, &cpuset)) return i;
+
+    return -1;
+}
+
+void update_stride(size_t*, size_t, size_t, size_t);
+
+
+int main(int argc, char ** argv)
+{
+    if (argc != 4)
+    {
+        std::cout << "Usage: ./<executable> buffer_size stride nthreads\n"
+                     "\n"
+                     "buffer_size: in bytes\n"
+                     "stride: in multiples of 8 bytes\n"
+                     "nthreads: number of worker threads\n";
+        exit(1);
+    }
+
+    size_t s = std::stoul(argv[1]);
+    size_t stride = std::stoul(argv[2]);
+    int nthreads = std::stoi(argv[3]);
+
+    double time_per_run = 2.0;
+
+    std::vector<Buffer> buf(nthreads);
+    // initialize buffers (first touch on the CPU the thread is pinned to)
+    for (int i = 0; i < nthreads; ++i)
+    {
+        std::thread t([=, &buf](){
+            // wait so that set_affinity() below takes effect before the
+            // allocation/first touch happens
+            std::this_thread::sleep_for(std::chrono::milliseconds(20));
+            buf[i] = Buffer(s);
+        });
+        set_affinity(t, i);
+        t.join();
+    }
+
+    // calibrate the iteration count with a single-threaded run
+    auto func = [buf = buf[0].get(), s, stride](int loops) { update_stride(buf, s, loops, stride); };
+    size_t loops = determine_loops(func, time_per_run);
+
+    std::vector<std::function<void()>> tasks(nthreads);
+    for (int i = 0; i < nthreads; ++i)
+        tasks[i] = std::bind(&update_stride, buf[i].get(), s, loops, stride);
+
+    std::vector<std::thread> workers(nthreads);
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < nthreads; ++i)
+    {
+        workers[i] = std::thread(tasks[i]);
+        set_affinity(workers[i], i);
+    }
+
+    for (auto& t: workers)
+        t.join();
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed = t1 - t0;
+    double time = elapsed.count();
+
+    size_t load_and_store = 2;
+    size_t bytes_moved = nthreads * s * load_and_store * loops / stride;
+    std::cout << "buffer size: " << s/1024 << "Kb, bandwidth: "
+              << double(bytes_moved)/(1024*1024*1024) / time
+              << " GB/s" << std::endl;
+}
+
+
+
+// Increment every stride-th size_t word of src, iterations times.
+void update_stride(size_t* src, size_t nbytes, size_t iterations, size_t stride)
+{
+    while (iterations--)
+    {
+        for (size_t i = 0; i < nbytes/sizeof(size_t); i+=stride)
+        {
+            src[i]++; // load & store
+        }
+    }
+}
diff --git a/cscs-checks/microbenchmarks/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/strided_bandwidth/strides.py
new file mode 100644
index 0000000000..c1cfb1190a
--- /dev/null
+++ b/cscs-checks/microbenchmarks/strided_bandwidth/strides.py
@@ -0,0 +1,137 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+class StridedBase(rfm.RegressionTest):
+    def __init__(self):
+        super().__init__()
+        self.sourcepath = 'strides.cpp'
+        self.build_system = 'SingleSource'
+        self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+
+        self.sanity_patterns = sn.assert_eq(
+            sn.count(sn.findall(r'bandwidth', self.stdout)),
+            self.num_tasks_assigned)
+
+        self.perf_patterns = {
+            'bandwidth': sn.extractsingle(
+                r'bandwidth: (?P<bw>\S+) GB/s',
+                self.stdout, 'bw', float)
+        }
+
+        self.system_num_cpus = {
+            'daint:mc': 72,
+            'daint:gpu': 24,
+            'dom:mc': 72,
+            'dom:gpu': 24,
+        }
+
+        self.maintainers = ['SK']
+        self.tags = {'benchmark', 'diagnostic'}
+
+    @property
+    @sn.sanity_function
+    def num_tasks_assigned(self):
+        return self.job.num_tasks
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (50, -0.1, 0.1, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (100, -0.1, 0.1, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (50, -0.1, 0.1, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (100, -0.1, 0.1, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 8-byte stride, using the full cacheline
+        self.executable_opts = ['100000000', '1', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest64(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (6, -0.1, 0.2, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (6, -0.05, 0.2, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 64-byte stride, using 1/8 of the cacheline
+        self.executable_opts = ['100000000', '8', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest128(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 128-byte stride, using 1/8 of every 2nd cacheline
+        self.executable_opts = ['100000000', '16', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)