reframe-hpc · vkarak · Nov 27, 2019 · Oct 7, 2019 · Nov 27, 2019 · Nov 27, 2019
diff --git a/cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp b/cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp
@@ -0,0 +1,188 @@
+// contributed by Sebastian Keller, CSCS, 2019-10
+
+#include <iostream>
+#include <cstdlib>
+#include <chrono>
+#include <stdlib.h>
+#include <functional>
+#include <vector>
+#include <thread>
+#include <memory>
+#include <string>
+#include <chrono>
+#include <functional>
+
+#include <sched.h>
+#include <pthread.h>
+
+/// Calibrate how many iterations of `func` are needed to run for roughly
+/// `target_duration` seconds.
+///
+/// `func(n)` is invoked with a doubling iteration count n = 2, 4, 8, ... until
+/// a single invocation takes at least 0.1 s, so the measurement is well above
+/// timer resolution. The last probed count is then scaled linearly by
+/// target_duration / elapsed and truncated towards zero.
+///
+/// @param func             callable taking the number of iterations to perform
+/// @param target_duration  desired run time in seconds
+/// @return estimated iteration count; if the count cannot be doubled any
+///         further without overflowing, the last probed count is returned
+///         unscaled
+template <class Callable>
+size_t determine_loops(Callable const& func, double target_duration)
+{
+    // steady_clock is monotonic; high_resolution_clock may be an alias of
+    // system_clock, which can be adjusted while we are measuring
+    using clock = std::chrono::steady_clock;
+
+    // shortest calibration run that is accepted as a measurement
+    constexpr double min_calibration_seconds = 0.1;
+    // largest count that can still be doubled without wrapping around to 0
+    constexpr size_t max_doublable = static_cast<size_t>(-1) / 2;
+
+    size_t loops = 1;
+    double elapsed = 0;
+    do {
+        loops *= 2;
+        auto t0 = clock::now();
+        func(loops);
+        auto t1 = clock::now();
+        elapsed = std::chrono::duration<double>(t1 - t0).count();
+    } while (elapsed < min_calibration_seconds && loops <= max_doublable);
+
+    // only reachable when the overflow guard stopped the loop: there is no
+    // usable timing to scale by
+    if (elapsed < min_calibration_seconds)
+        return loops;
+
+    // scale in floating point, then truncate explicitly
+    return static_cast<size_t>(loops * (target_duration / elapsed));
+}
+
+/// Invoke `func()` once and report how long the call took.
+///
+/// @param func  callable taking no arguments
+/// @return wall-clock duration of the call in seconds
+template <class Callable>
+double timed_run(Callable const& func)
+{
+    using clock = std::chrono::high_resolution_clock;
+
+    const auto start = clock::now();
+    func();
+    const auto stop = clock::now();
+
+    return std::chrono::duration<double>(stop - start).count();
+}
+
+/// Move-only owner of a 32-byte aligned heap buffer of size_t words.
+///
+/// The memory is obtained with posix_memalign() and released with free().
+/// Every word is set to 1 by the constructing thread.
+class Buffer
+{
+public:
+    /// Create an empty buffer that owns no memory; get() returns nullptr.
+    Buffer() : data_(nullptr, free), data_size(0) {}
+
+    /// Allocate and initialise a buffer.
+    ///
+    /// @param sz_  requested size in bytes; rounded down to a multiple of
+    ///             sizeof(size_t)
+    ///
+    /// Prints "alloc failed" to stderr and exits with status 1 if the
+    /// allocation fails.
+    explicit Buffer(size_t sz_)
+        // initialisers are listed in member declaration order
+        : data_(nullptr, free),
+          data_size((sz_/sizeof(size_t)) * (sizeof(size_t)))
+    {
+        void* buf = nullptr;
+        if (posix_memalign(&buf, 32, data_size)) {
+            std::cerr << "alloc failed\n"; exit(1);
+        }
+        // hand ownership to the unique_ptr, which releases it with free()
+        data_.reset(static_cast<size_t*>(buf));
+
+        init();
+    }
+
+    /// Take over the allocation of `rhs`, leaving `rhs` empty.
+    Buffer(Buffer&& rhs) noexcept
+        : data_(std::move(rhs.data_)), data_size(rhs.data_size)
+    {
+        rhs.data_size = 0;
+    }
+
+    /// Release the current allocation (if any) and take over that of `rhs`,
+    /// leaving `rhs` empty.
+    Buffer& operator=(Buffer&& rhs) noexcept {
+        if (this != &rhs) {
+            data_ = std::move(rhs.data_);
+            data_size = rhs.data_size;
+            rhs.data_size = 0;
+        }
+        return *this;
+    }
+
+    /// @return pointer to the first word, or nullptr for an empty buffer
+    size_t* get() const { return data_.get(); }
+
+private:
+
+    /// Write 1 into every word of the buffer.
+    void init()
+    {
+        for (size_t i = 0; i < data_size/sizeof(size_t); i++)
+            data_[i] = 1;
+    }
+
+    // owning pointer with free() as custom deleter
+    std::unique_ptr<size_t[], decltype(free)*> data_;
+    // usable size in bytes (multiple of sizeof(size_t)); 0 when empty
+    size_t data_size;
+};
+
+
+/// Restrict thread `t` to run on the single CPU with index `i`.
+///
+/// A failure is not fatal: the error code returned by
+/// pthread_setaffinity_np() is reported on stderr and execution continues.
+///
+/// @param t  thread whose affinity mask is replaced
+/// @param i  index of the CPU to pin the thread to
+void set_affinity(std::thread& t, int i)
+{
+    // affinity mask with only CPU i enabled
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(i, &mask);
+
+    const int err = pthread_setaffinity_np(t.native_handle(), sizeof(mask), &mask);
+    if (err == 0)
+        return;
+
+    std::cerr << "Error calling pthread_setaffinity_np: " << err << "\n";
+}
+
+/// Query the CPU affinity mask of thread `t`.
+///
+/// @param t  thread to inspect
+/// @return index of the lowest-numbered CPU in the thread's affinity mask, or
+///         -1 if the mask is empty or could not be queried (the error code is
+///         then reported on stderr)
+int get_affinity(std::thread& t)
+{
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    int rc = pthread_getaffinity_np(t.native_handle(),
+                                    sizeof(cpu_set_t), &cpuset);
+    if (rc != 0) {
+        std::cerr << "Error calling pthread_getaffinity_np: " << rc << "\n";
+        return -1;
+    }
+
+    // scan the whole mask rather than a hard-coded number of CPUs
+    for (int i = 0; i < CPU_SETSIZE; ++i)
+        if (CPU_ISSET(i, &cpuset)) return i;
+
+    return -1;
+}
+
+void update_stride(size_t*, size_t, size_t, size_t);
+
+
+/// Strided memory bandwidth benchmark.
+///
+/// Usage: ./<prog_name> buffer_size stride nthreads
+///
+/// Allocates one buffer of `buffer_size` bytes per thread, each from a thread
+/// pinned to CPU i, calibrates an iteration count on the first buffer so that
+/// one run takes about 2 seconds, then lets `nthreads` pinned threads update
+/// every `stride`-th word of their own buffer and prints the aggregate
+/// bandwidth. Exits with status 1 on wrong or invalid arguments.
+int main(int argc, char ** argv)
+{
+    if (argc != 4)
+    {
+        std::cout << "Usage: ./<prog_name> buffer_size stride nthreads\n"
+                     "\n"
+                     "buffer_size: in bytes\n"
+                     "stride: in multiples of 8 bytes\n"
+                     "nthreads: number of threads, thread i is pinned to CPU i\n";
+        exit(1);
+    }
+
+    size_t s = 0;
+    size_t stride = 0;
+    int nthreads = 0;
+    try {
+        // parse as signed 64-bit: buffers larger than INT_MAX bytes are
+        // accepted and negative input is rejected instead of wrapping around
+        const long long s_arg = std::stoll(argv[1]);
+        const long long stride_arg = std::stoll(argv[2]);
+        nthreads = std::stoi(argv[3]);
+
+        // stride 0 would never advance through the buffer (and divide by zero
+        // below); nthreads < 1 leaves no buffer to calibrate on
+        if (s_arg <= 0 || stride_arg <= 0 || nthreads <= 0) {
+            std::cerr << "buffer_size, stride and nthreads must be positive\n";
+            exit(1);
+        }
+        s = static_cast<size_t>(s_arg);
+        stride = static_cast<size_t>(stride_arg);
+    }
+    catch (const std::exception& e) {
+        // std::stoll/std::stoi throw on non-numeric or out-of-range input
+        std::cerr << "Invalid argument: " << e.what() << "\n";
+        exit(1);
+    }
+
+    double time_per_run = 2.0;
+
+    std::vector<Buffer> buf(nthreads);
+    // initialize buffers (first touch)
+    for (int i = 0; i < nthreads; ++i)
+    {
+        std::thread t([=, &buf](){
+                // give the parent time to pin this thread before it allocates
+                std::this_thread::sleep_for(std::chrono::milliseconds(20));
+                buf[i] = Buffer(s);
+            });
+        set_affinity(t, i);
+        t.join();
+    }
+
+    // calibrate the iteration count single-threaded on the first buffer
+    auto func = [first = buf[0].get(), s, stride](size_t loops) { update_stride(first, s, loops, stride); };
+    size_t loops = determine_loops(func, time_per_run);
+
+    std::vector<std::function<void()>> tasks(nthreads);
+    for (int i = 0; i < nthreads; ++i)
+        tasks[i] = std::bind(&update_stride, buf[i].get(), s, loops, stride);
+
+    // note: thread creation and pinning are part of the timed region
+    std::vector<std::thread> workers(nthreads);
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < nthreads; ++i)
+    {
+        workers[i] = std::thread(tasks[i]);
+        set_affinity(workers[i], i);
+    }
+
+    for (auto& t: workers)
+        t.join();
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed = t1 - t0;
+    double time = elapsed.count();
+
+    // every touched word is read and written once per iteration
+    size_t load_and_store = 2;
+    size_t bytes_moved = nthreads * s * load_and_store * loops / stride;
+    std::cout << "buffer size: " << s/1024 << "Kb, bandwidth: "
+              << double(bytes_moved)/(1024*1024*1024) / time
+              << " GB/s" << std::endl;
+}
+
+
+
+/// Increment every `stride`-th size_t word of a buffer, repeated `iterations`
+/// times.
+///
+/// @param src         buffer holding at least nbytes/sizeof(size_t) words
+/// @param nbytes      size of the buffer in bytes
+/// @param iterations  number of passes over the buffer
+/// @param stride      distance between updated words, in words; a stride of 0
+///                    would never advance, so the call then does nothing
+void update_stride(size_t* src, size_t nbytes, size_t iterations, size_t stride)
+{
+    if (stride == 0)
+        return;
+
+    // number of whole words in the buffer, constant across all passes
+    const size_t nwords = nbytes/sizeof(size_t);
+
+    while (iterations--)
+    {
+        for (size_t i = 0; i < nwords; i+=stride)
+        {
+            src[i]++; // load & store
+        }
+    }
+}
diff --git a/cscs-checks/microbenchmarks/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/strided_bandwidth/strides.py
@@ -0,0 +1,138 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+class StridedBase(rfm.RegressionTest):
+    def __init__(self):
+        super().__init__()
+        self.sourcepath = 'strides.cpp'
+        self.build_system = 'SingleSource'
+        self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+
+        self.sanity_patterns = sn.assert_eq(
+            sn.count(sn.findall(r'bandwidth', self.stdout)),
+            self.num_tasks_assigned)
+
+        self.perf_patterns = {
+            'bandwidth': sn.extractsingle(
+                r'bandwidth: (?P<bw>\S+) GB/s',
+                self.stdout, 'bw', float)
+        }
+
+        self.system_num_cpus = {
+            'daint:mc':  72,
+            'daint:gpu': 24,
+            'dom:mc':  72,
+            'dom:gpu': 24,
+        }
+
+        self.maintainers = ['SK']
+        self.tags = {'benchmark', 'diagnostic'}
+
+    @property
+    @sn.sanity_function
+    def num_tasks_assigned(self):
+        return self.job.num_tasks
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (50, -0.1, 0.1, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (100, -0.1, 0.1, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (50, -0.1, 0.1, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (100, -0.1, 0.1, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 8-byte stride, using the full cacheline
+        self.executable_opts = ['100000000', '1', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)
+
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest64(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (6, -0.1, 0.2, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (6, -0.05, 0.2, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 64-byte stride, using 1/8 of the cacheline
+        self.executable_opts = ['100000000', '8', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.simple_test
+class StridedBandwidthTest128(StridedBase):
+    def __init__(self):
+        super().__init__()
+
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+            },
+            'dom:mc': {
+                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+            },
+            'daint:mc': {
+                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+            },
+            '*': {
+                'bandwidth': (0, None, None, 'GB/s')
+            }
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        self.num_cpus = self.system_num_cpus[partition.fullname]
+
+        # 128-byte stride, using 1/8 of every 2nd cacheline
+        self.executable_opts = ['100000000', '16', '%s' % self.num_cpus]
+
+        super().setup(partition, environ, **job_opts)