Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 188 additions & 0 deletions cscs-checks/microbenchmarks/strided_bandwidth/src/strides.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
// contributed by Sebastian Keller, CSCS, 2019-10

#include <iostream>
#include <cstdlib>
#include <chrono>
#include <stdlib.h>
#include <functional>
#include <vector>
#include <thread>
#include <memory>
#include <string>
#include <chrono>
#include <functional>

#include <sched.h>
#include <pthread.h>

// Calibrate how many iterations of `func` are needed to fill roughly
// `target_duration` seconds.
//
// `func` is invoked as `func(loops)` with `loops` doubling (2, 4, 8, ...)
// until one invocation lasts at least 0.1 s. That count is then scaled
// linearly by `target_duration / elapsed`.
//
// func:            callable accepting the iteration count
// target_duration: desired run time in seconds
// returns:         the scaled iteration count, never less than 1
template <class Callable>
size_t determine_loops(Callable const& func, double target_duration)
{
    // a probe shorter than this is not used for the extrapolation
    const double min_probe_seconds = 0.1;

    // measure number of loops required
    size_t loops = 1;
    double elapsed = 0;
    do {
        loops *= 2;
        auto t0 = std::chrono::high_resolution_clock::now();
        func(loops);
        auto t1 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = t1 - t0;
        elapsed = duration.count();
    } while (elapsed < min_probe_seconds);

    // Scale in floating point and convert explicitly. A target much shorter
    // than the probe would otherwise truncate to 0 and the caller would run
    // no iterations at all; the negated comparison also catches NaN.
    const double scaled = static_cast<double>(loops) * (target_duration / elapsed);
    if (!(scaled >= 1.0))
        return 1;

    return static_cast<size_t>(scaled);
}

// Invoke `func()` exactly once and report how long the call took.
//
// func:    nullary callable to be timed
// returns: seconds elapsed between the two high_resolution_clock samples
//          taken immediately before and after the call
template <class Callable>
double timed_run(Callable const& func)
{
    using clock = std::chrono::high_resolution_clock;
    using seconds = std::chrono::duration<double>;

    const clock::time_point start = clock::now();
    func();
    const clock::time_point stop = clock::now();

    return seconds(stop - start).count();
}

// Move-only owner of a block of size_t words obtained from posix_memalign
// and released with free.
//
// The requested byte count is rounded down to a whole number of size_t
// words, and every word is set to 1 by the thread that runs the constructor.
class Buffer
{
public:
    // Empty buffer: owns no memory, size() == 0.
    Buffer() : data_(nullptr, free), data_size(0) {}

    // Allocate sz_ bytes (rounded down to a multiple of sizeof(size_t)) with
    // 32-byte alignment and fill them with 1. On allocation failure a message
    // is written to stderr and the process exits with status 1.
    explicit Buffer(size_t sz_)
        : data_(nullptr, free),
          data_size((sz_ / sizeof(size_t)) * sizeof(size_t))
    {
        const size_t alignment = 32;

        void* buf = nullptr;
        if (posix_memalign(&buf, alignment, data_size)) {
            std::cerr << "alloc failed\n";
            exit(1);
        }
        // hand ownership to data_, which already carries "free" as deleter
        data_.reset(static_cast<size_t*>(buf));

        init();
    }

    // Take over the storage of rhs; rhs is left empty (null pointer, size 0).
    Buffer(Buffer&& rhs) noexcept
        : data_(std::move(rhs.data_)), data_size(rhs.data_size)
    {
        rhs.data_size = 0;
    }

    // Release the current storage and take over that of rhs; rhs is left
    // empty. Self-assignment is a no-op.
    Buffer& operator=(Buffer&& rhs) noexcept {
        if (this != &rhs) {
            data_ = std::move(rhs.data_);
            data_size = rhs.data_size;
            rhs.data_size = 0;
        }
        return *this;
    }

    // Pointer to the first word, or nullptr for an empty buffer.
    size_t* get() const { return data_.get(); }

    // Usable size in bytes (always a multiple of sizeof(size_t)).
    size_t size() const noexcept { return data_size; }

private:

    // Write 1 into every word of the buffer.
    void init()
    {
        for (size_t i = 0; i < data_size/sizeof(size_t); i++)
            data_[i] = 1;
    }

    std::unique_ptr<size_t[], decltype(free)*> data_;
    size_t data_size;
};


// Restrict thread `t` to logical CPU `i` via pthread_setaffinity_np.
// A non-zero return code is reported on stderr; execution continues.
void set_affinity(std::thread& t, int i)
{
    cpu_set_t single_cpu;
    CPU_ZERO(&single_cpu);
    CPU_SET(i, &single_cpu);

    const int status =
        pthread_setaffinity_np(t.native_handle(), sizeof(single_cpu), &single_cpu);
    if (status == 0)
        return;

    std::cerr << "Error calling pthread_setaffinity_np: " << status << "\n";
}

// Return the lowest-numbered logical CPU that thread `t` is allowed to run
// on, or -1 if the affinity mask could not be read or contains no CPU.
int get_affinity(std::thread& t)
{
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    int rc = pthread_getaffinity_np(t.native_handle(),
                                    sizeof(cpu_set_t), &cpuset);
    if (rc != 0)
    {
        std::cerr << "Error calling pthread_getaffinity_np: " << rc << "\n";
        return -1;
    }

    // scan the whole mask rather than a fixed number of CPUs
    for (int i = 0; i < CPU_SETSIZE; ++i)
        if (CPU_ISSET(i, &cpuset)) return i;

    return -1;
}

void update_stride(size_t*, size_t, size_t, size_t);


int main(int argc, char ** argv)
{
if (argc != 4)
{
std::cout << "Usage: ./<prog_name> buffer_size stride nthreads\n"
"\n"
"buffer_size: in bytes\n"
"stride: in multiples of 8 bytes\n";
exit(1);
}

size_t s = std::stoi(argv[1]);
size_t stride = std::stoi(argv[2]);
int nthreads = std::stoi(argv[3]);

double time_per_run = 2.0;

std::vector<Buffer> buf(nthreads);
// initialize buffers (first touch)
for (int i = 0; i < nthreads; ++i)
{
std::thread t([=, &buf](){
std::this_thread::sleep_for(std::chrono::milliseconds(20));
buf[i] = Buffer(s);
});
set_affinity(t, i);
t.join();
}

auto func = [buf = buf[0].get(), s, stride](int loops) { update_stride(buf, s, loops, stride); };
size_t loops = determine_loops(func, time_per_run);

std::vector<std::function<void()>> tasks(nthreads);
for (int i = 0; i < nthreads; ++i)
tasks[i] = std::bind(&update_stride, buf[i].get(), s, loops, stride);

std::vector<std::thread> workers(nthreads);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nthreads; ++i)
{
workers[i] = std::thread(tasks[i]);
set_affinity(workers[i], i);
}

for (auto& t: workers)
t.join();

auto t1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = t1 - t0;
double time = elapsed.count();

size_t load_and_store = 2;
size_t bytes_moved = nthreads * s * load_and_store * loops / stride;
std::cout << "buffer size: " << s/1024 << "Kb, bandwidth: "
<< double(bytes_moved)/(1024*1024*1024) / time
<< " GB/s" << std::endl;
}



// Sweep over a buffer `iterations` times, incrementing every `stride`-th
// size_t word (one load and one store per touched word).
//
// src:        start of the buffer
// nbytes:     buffer length in bytes; nbytes/sizeof(size_t) words are in range
// iterations: number of complete sweeps
// stride:     distance between touched words, in words; a stride of 0 would
//             never advance, so the call returns without touching the buffer
void update_stride(size_t* src, size_t nbytes, size_t iterations, size_t stride)
{
    if (stride == 0)
        return;

    const size_t nwords = nbytes/sizeof(size_t);
    while (iterations--)
    {
        for (size_t i = 0; i < nwords; i+=stride)
        {
            src[i]++; // load & store
        }
    }
}
138 changes: 138 additions & 0 deletions cscs-checks/microbenchmarks/strided_bandwidth/strides.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import reframe as rfm
import reframe.utility.sanity as sn


class StridedBase(rfm.RegressionTest):
    """Shared configuration of the strided bandwidth checks.

    Builds ``strides.cpp`` as a single-source program, runs a single task on
    one node and reads the number printed after ``bandwidth:`` from stdout.
    Subclasses provide ``reference`` and ``executable_opts``.
    """

    def __init__(self):
        super().__init__()
        self.sourcepath = 'strides.cpp'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 1
        self.num_tasks_per_node = 1

        # sanity: 'bandwidth' must occur once per assigned task
        bandwidth_hits = sn.findall(r'bandwidth', self.stdout)
        self.sanity_patterns = sn.assert_eq(sn.count(bandwidth_hits),
                                            self.num_tasks_assigned)

        # performance: the figure reported in GB/s
        bandwidth_value = sn.extractsingle(r'bandwidth: (?P<bw>\S+) GB/s',
                                           self.stdout, 'bw', float)
        self.perf_patterns = {'bandwidth': bandwidth_value}

        # CPU count per partition, looked up by the subclasses in setup()
        cpus_per_partition = {'mc': 72, 'gpu': 24}
        self.system_num_cpus = {
            '%s:%s' % (system, part): ncpus
            for system in ('daint', 'dom')
            for part, ncpus in cpus_per_partition.items()
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic'}

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        """Task count of the job, wrapped as a sanity function."""
        return self.job.num_tasks


@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class StridedBandwidthTest(StridedBase):
    """Bandwidth check with a stride of one 8-byte word."""

    def __init__(self):
        super().__init__()

        def bandwidth_ref(value, lower, upper):
            # builds a fresh reference entry on every call
            return {'bandwidth': (value, lower, upper, 'GB/s')}

        self.reference = {
            'dom:gpu': bandwidth_ref(50, -0.1, 0.1),
            'dom:mc': bandwidth_ref(100, -0.1, 0.1),
            'daint:gpu': bandwidth_ref(50, -0.1, 0.1),
            'daint:mc': bandwidth_ref(100, -0.1, 0.1),
            '*': bandwidth_ref(0, None, None),
        }

    def setup(self, partition, environ, **job_opts):
        """Set the benchmark arguments for ``partition``, then run the base setup."""
        self.num_cpus = self.system_num_cpus[partition.fullname]

        # 8-byte stride, using the full cacheline
        buffer_bytes = '100000000'
        stride_words = '1'
        self.executable_opts = [buffer_bytes, stride_words,
                                str(self.num_cpus)]

        super().setup(partition, environ, **job_opts)



@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class StridedBandwidthTest64(StridedBase):
    """Bandwidth check with a stride of eight 8-byte words (64 bytes)."""

    def __init__(self):
        super().__init__()

        def bandwidth_ref(value, lower, upper):
            # builds a fresh reference entry on every call
            return {'bandwidth': (value, lower, upper, 'GB/s')}

        self.reference = {
            'dom:gpu': bandwidth_ref(6, -0.1, 0.2),
            'dom:mc': bandwidth_ref(12.5, -0.1, 0.2),
            'daint:gpu': bandwidth_ref(6, -0.05, 0.2),
            'daint:mc': bandwidth_ref(12.5, -0.1, 0.2),
            '*': bandwidth_ref(0, None, None),
        }

    def setup(self, partition, environ, **job_opts):
        """Set the benchmark arguments for ``partition``, then run the base setup."""
        self.num_cpus = self.system_num_cpus[partition.fullname]

        # 64-byte stride, using 1/8 of the cacheline
        buffer_bytes = '100000000'
        stride_words = '8'
        self.executable_opts = [buffer_bytes, stride_words,
                                str(self.num_cpus)]

        super().setup(partition, environ, **job_opts)


@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class StridedBandwidthTest128(StridedBase):
    """Bandwidth check with a stride of sixteen 8-byte words (128 bytes)."""

    def __init__(self):
        super().__init__()

        def bandwidth_ref(value, lower, upper):
            # builds a fresh reference entry on every call
            return {'bandwidth': (value, lower, upper, 'GB/s')}

        self.reference = {
            'dom:gpu': bandwidth_ref(4.5, -0.1, 0.2),
            'dom:mc': bandwidth_ref(9.1, -0.1, 0.2),
            'daint:gpu': bandwidth_ref(4.5, -0.1, 0.2),
            'daint:mc': bandwidth_ref(9.1, -0.1, 0.2),
            '*': bandwidth_ref(0, None, None),
        }

    def setup(self, partition, environ, **job_opts):
        """Set the benchmark arguments for ``partition``, then run the base setup."""
        self.num_cpus = self.system_num_cpus[partition.fullname]

        # 128-byte stride, using 1/8 of every 2nd cacheline
        buffer_bytes = '100000000'
        stride_words = '16'
        self.executable_opts = [buffer_bytes, stride_words,
                                str(self.num_cpus)]

        super().setup(partition, environ, **job_opts)