Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 71 additions & 27 deletions cscs-checks/microbenchmarks/osu/osu_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import reframe.utility.sanity as sn


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.parameterized_test(['production'])
class AlltoallTest(rfm.RegressionTest):
def __init__(self, variant):
Expand All @@ -21,16 +21,16 @@ def __init__(self, variant):
self.maintainers = ['RS', 'VK']
self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
self.perf_patterns = {
'perf': sn.extractsingle(r'^8\s+(?P<perf>\S+)',
self.stdout, 'perf', float)
'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
self.stdout, 'latency', float)
}
self.tags = {variant}
self.reference = {
'dom:gpu': {
'perf': (8.23, None, 0.1)
'latency': (8.23, None, 0.1, 'us')
},
'daint:gpu': {
'perf': (20.73, None, 2.0)
'latency': (20.73, None, 2.0, 'us')
},
}
self.num_tasks_per_node = 1
Expand Down Expand Up @@ -72,6 +72,50 @@ def __init__(self):
self.tags = {'diagnostic', 'ops'}


@rfm.required_version('>=2.16')
@rfm.simple_test
class AllreduceTest(rfm.RegressionTest):
    """OSU MPI_Allreduce latency microbenchmark at an 8-byte message size."""

    def __init__(self):
        super().__init__()
        self.descr = 'Allreduce OSU microbenchmark'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.strict_check = False
        self.build_system = 'Make'
        self.build_system.makefile = 'Makefile_allreduce'
        self.executable = './osu_allreduce'
        # -x sets the warm-up iterations, -i the measured iterations;
        # -m caps the message size at 8 bytes.
        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']

        # Sanity: the 8-byte row must appear; performance: its latency column.
        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
        self.perf_patterns = {
            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
                                        self.stdout, 'latency', float)
        }
        self.reference = {
            'dom:gpu': {'latency': (6.0, None, 0.1, 'us')},
            'daint:gpu': {'latency': (20.5, None, 2.0, 'us')},
        }

        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1
        # One rank per node; node count depends on the target system.
        tasks_per_system = {'dom': 6, 'daint': 16}
        system = self.current_system.name
        if system in tasks_per_system:
            self.num_tasks = tasks_per_system[system]

        self.extra_resources = {'switches': {'num_switches': 1}}
        self.maintainers = ['RS', 'VK']
        self.tags = {'production'}


# FIXME: This test is obsolete; it is kept only for reference.
@rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2)))
class AlltoallMonchAcceptanceTest(AlltoallTest):
Expand Down Expand Up @@ -125,7 +169,7 @@ def __init__(self):
}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class P2PCPUBandwidthTest(P2PBaseTest):
def __init__(self):
Expand All @@ -137,22 +181,22 @@ def __init__(self):

self.reference = {
'daint:gpu': {
'bw': (9798.29, -0.1, None)
'bw': (9798.29, -0.1, None, 'MB/s')
},
'daint:mc': {
'bw': (9865.00, -0.2, None)
'bw': (9865.00, -0.2, None, 'MB/s')
},
'dom:gpu': {
'bw': (9815.66, -0.1, None)
'bw': (9815.66, -0.1, None, 'MB/s')
},
'dom:mc': {
'bw': (9472.59, -0.20, None)
'bw': (9472.59, -0.20, None, 'MB/s')
},
'monch:compute': {
'bw': (6317.84, -0.15, None)
'bw': (6317.84, -0.15, None, 'MB/s')
},
'kesch:cn': {
'bw': (6311.48, -0.15, None)
'bw': (6311.48, -0.15, None, 'MB/s')
}
}
self.perf_patterns = {
Expand All @@ -162,7 +206,7 @@ def __init__(self):
self.tags |= {'monch_acceptance'}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class P2PCPULatencyTest(P2PBaseTest):
def __init__(self):
Expand All @@ -174,22 +218,22 @@ def __init__(self):
self.executable = './p2p_osu_latency'
self.reference = {
'daint:gpu': {
'latency': (1.16, None, 1.0)
'latency': (1.16, None, 1.0, 'us')
},
'daint:mc': {
'latency': (1.15, None, 0.6)
'latency': (1.15, None, 0.6, 'us')
},
'dom:gpu': {
'latency': (1.13, None, 0.1)
'latency': (1.13, None, 0.1, 'us')
},
'dom:mc': {
'latency': (1.27, None, 0.2)
'latency': (1.27, None, 0.2, 'us')
},
'monch:compute': {
'latency': (1.27, None, 0.1)
'latency': (1.27, None, 0.1, 'us')
},
'kesch:cn': {
'latency': (1.17, None, 0.1)
'latency': (1.17, None, 0.1, 'us')
}
}
self.perf_patterns = {
Expand All @@ -199,7 +243,7 @@ def __init__(self):
self.tags |= {'monch_acceptance'}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class G2GBandwidthTest(P2PBaseTest):
def __init__(self):
Expand All @@ -212,13 +256,13 @@ def __init__(self):

self.reference = {
'dom:gpu': {
'bw': (8897.86, -0.1, None)
'bw': (8897.86, -0.1, None, 'MB/s')
},
'daint:gpu': {
'bw': (8765.65, -0.1, None)
'bw': (8765.65, -0.1, None, 'MB/s')
},
'kesch:cn': {
'bw': (6288.98, -0.1, None)
'bw': (6288.98, -0.1, None, 'MB/s')
},
}
self.perf_patterns = {
Expand All @@ -236,7 +280,7 @@ def __init__(self):
self.build_system.cppflags = ['-D_ENABLE_CUDA_']


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class G2GLatencyTest(P2PBaseTest):
def __init__(self):
Expand All @@ -249,13 +293,13 @@ def __init__(self):

self.reference = {
'dom:gpu': {
'latency': (5.49, None, 0.1)
'latency': (5.49, None, 0.1, 'us')
},
'daint:gpu': {
'latency': (5.73, None, 1.0)
'latency': (5.73, None, 1.0, 'us')
},
'kesch:cn': {
'latency': (23.09, None, 0.1)
'latency': (23.09, None, 0.1, 'us')
},
}
self.perf_patterns = {
Expand Down
17 changes: 17 additions & 0 deletions cscs-checks/microbenchmarks/osu/src/Makefile_allreduce
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Build the OSU MPI_Allreduce latency microbenchmark.
EXECUTABLE := osu_allreduce

all: $(EXECUTABLE)

SRCS += osu_util.c \
	osu_allreduce.c

OBJS := $(SRCS:.c=.o)

# Static pattern rule: each object depends on its own source file, so
# editing a .c file triggers a rebuild.  (The previous rule listed no
# prerequisites, leaving stale objects in place after source changes.)
$(OBJS): %.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -I. -o $@ -c $<

$(EXECUTABLE): $(OBJS)
	$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $(OBJS) $(LDFLAGS)

clean:
	rm -f $(OBJS) $(EXECUTABLE)

# 'all' and 'clean' produce no files of those names.
.PHONY: all clean
143 changes: 143 additions & 0 deletions cscs-checks/microbenchmarks/osu/src/osu_allreduce.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#define BENCHMARK "OSU MPI%s Allreduce Latency Test"
/*
* Copyright (C) 2002-2018 the Network-Based Computing Laboratory
* (NBCL), The Ohio State University.
*
* Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
*
* For detailed copyright and licensing information, please refer to the
* copyright file COPYRIGHT in the top level OMB directory.
*/
#include <osu_util.h>

/*
 * Driver for the OSU MPI_Allreduce latency benchmark.  For each
 * power-of-two message size up to options.max_message_size it times
 * options.iterations MPI_Allreduce calls (after options.skip warm-ups)
 * and prints the avg/min/max per-call latency (in microseconds) reduced
 * across all ranks.  Command-line handling, the global `options` struct
 * and the helper routines come from osu_util.h.
 */
int main(int argc, char *argv[])
{
    int i, numprocs, rank, size;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double timer=0.0;
    double avg_time = 0.0, max_time = 0.0, min_time = 0.0;
    float *sendbuf, *recvbuf;
    int po_ret;
    size_t bufsize;
    options.bench = COLLECTIVE;
    options.subtype = LAT;

    set_header(HEADER);
    set_benchmark_name("osu_allreduce");
    po_ret = process_options(argc, argv);

    /* Accelerator devices (if requested via options) are initialized
     * before MPI_Init, matching the other OSU benchmarks. */
    if (PO_OKAY == po_ret && NONE != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));

    /* Option-parsing outcomes that terminate the run must still go
     * through MPI_Finalize, since MPI_Init has already happened. */
    switch (po_ret) {
        case PO_BAD_USAGE:
            print_bad_usage_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        case PO_HELP_MESSAGE:
            print_help_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_VERSION_MESSAGE:
            print_version_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_OKAY:
            break;
    }

    /* A collective over a single rank would be meaningless. */
    if(numprocs < 2) {
        if (rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_CHECK(MPI_Finalize());
        exit(EXIT_FAILURE);
    }

    /* Clamp the requested maximum message size to the memory limit
     * instead of failing outright; only rank 0 warns. */
    if (options.max_message_size > options.max_mem_limit) {
        if (rank == 0) {
            fprintf(stderr, "Warning! Increase the Max Memory Limit to be able to run up to %ld bytes.\n"
                            "Continuing with max message size of %ld bytes\n",
                            options.max_message_size, options.max_mem_limit);
        }
        options.max_message_size = options.max_mem_limit;
    }

    /* Message sizes are counted in floats from here on (the loop
     * variable `size` is an element count, not a byte count). */
    options.min_message_size /= sizeof(float);
    if (options.min_message_size < MIN_MESSAGE_SIZE) {
        options.min_message_size = MIN_MESSAGE_SIZE;
    }

    /* Round the buffer size down to a whole number of floats. */
    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
    if (allocate_memory_coll((void**)&sendbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(sendbuf, options.accel, 1, bufsize);

    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
    if (allocate_memory_coll((void**)&recvbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(recvbuf, options.accel, 0, bufsize);

    print_preamble(rank);

    /* Sweep message sizes: `size` is the element count, doubling each
     * pass while the byte size stays within the requested maximum. */
    for(size=options.min_message_size; size*sizeof(float) <= options.max_message_size; size *= 2) {

        /* Large messages use a reduced iteration/warm-up budget. */
        if(size > LARGE_MESSAGE_SIZE) {
            options.skip = options.skip_large;
            options.iterations = options.iterations_large;
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        timer=0.0;
        for(i=0; i < options.iterations + options.skip ; i++) {
            t_start = MPI_Wtime();
            MPI_CHECK(MPI_Allreduce(sendbuf, recvbuf, size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ));
            t_stop=MPI_Wtime();
            /* Only post-warm-up iterations count toward the timing. */
            if(i>=options.skip){

                timer+=t_stop-t_start;
            }
            /* Barrier between iterations keeps ranks in lockstep so a
             * slow rank cannot skew the next measurement. */
            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        }
        /* Mean per-call latency for this rank, in microseconds. */
        latency = (double)(timer * 1e6) / options.iterations;

        /* Reduce per-rank latencies to min/max/avg on rank 0. */
        MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
                MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
                MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
                MPI_COMM_WORLD));
        avg_time = avg_time/numprocs;

        /* print_stats expects the message size in bytes. */
        print_stats(rank, size * sizeof(float), avg_time, min_time, max_time);
        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    }

    free_buffer(sendbuf, options.accel);
    free_buffer(recvbuf, options.accel);

    MPI_CHECK(MPI_Finalize());

    if (NONE != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}