Skip to content

Commit

Permalink
Merge pull request #686 from ajocksch/benchmarks/osu_allreduce
Browse files Browse the repository at this point in the history
[test] Add OSU Allreduce test
  • Loading branch information
vkarak committed Mar 20, 2019
2 parents 7c63333 + d669518 commit 28fbb8f
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 27 deletions.
98 changes: 71 additions & 27 deletions cscs-checks/microbenchmarks/osu/osu_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import reframe.utility.sanity as sn


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.parameterized_test(['production'])
class AlltoallTest(rfm.RegressionTest):
def __init__(self, variant):
Expand All @@ -21,16 +21,16 @@ def __init__(self, variant):
self.maintainers = ['RS', 'VK']
self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
self.perf_patterns = {
'perf': sn.extractsingle(r'^8\s+(?P<perf>\S+)',
self.stdout, 'perf', float)
'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
self.stdout, 'latency', float)
}
self.tags = {variant}
self.reference = {
'dom:gpu': {
'perf': (8.23, None, 0.1)
'latency': (8.23, None, 0.1, 'us')
},
'daint:gpu': {
'perf': (20.73, None, 2.0)
'latency': (20.73, None, 2.0, 'us')
},
}
self.num_tasks_per_node = 1
Expand Down Expand Up @@ -72,6 +72,50 @@ def __init__(self):
self.tags = {'diagnostic', 'ops'}


@rfm.required_version('>=2.16')
@rfm.simple_test
class AllreduceTest(rfm.RegressionTest):
    """Latency check for the OSU MPI_Allreduce microbenchmark."""

    def __init__(self):
        super().__init__()
        self.descr = 'Allreduce OSU microbenchmark'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.strict_check = False

        self.build_system = 'Make'
        self.build_system.makefile = 'Makefile_allreduce'
        self.executable = './osu_allreduce'
        # -x sets the warm-up iteration count; -i the timed iteration count
        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']

        # The benchmark prints one row per message size; the 8-byte row
        # carries the latency figure we track.
        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
        self.perf_patterns = {
            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
                                        self.stdout, 'latency', float)
        }
        self.reference = {
            'dom:gpu': {
                'latency': (6.0, None, 0.1, 'us')
            },
            'daint:gpu': {
                'latency': (20.5, None, 2.0, 'us')
            },
        }

        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1
        # Scale the job with the target machine.
        sysname = self.current_system.name
        if sysname == 'dom':
            self.num_tasks = 6
        elif sysname == 'daint':
            self.num_tasks = 16

        self.extra_resources = {
            'switches': {
                'num_switches': 1
            }
        }
        self.maintainers = ['RS', 'VK']
        self.tags = {'production'}


# FIXME: This test is obsolete; it is kept only for reference.
@rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2)))
class AlltoallMonchAcceptanceTest(AlltoallTest):
Expand Down Expand Up @@ -125,7 +169,7 @@ def __init__(self):
}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class P2PCPUBandwidthTest(P2PBaseTest):
def __init__(self):
Expand All @@ -137,22 +181,22 @@ def __init__(self):

self.reference = {
'daint:gpu': {
'bw': (9798.29, -0.1, None)
'bw': (9798.29, -0.1, None, 'MB/s')
},
'daint:mc': {
'bw': (9865.00, -0.2, None)
'bw': (9865.00, -0.2, None, 'MB/s')
},
'dom:gpu': {
'bw': (9815.66, -0.1, None)
'bw': (9815.66, -0.1, None, 'MB/s')
},
'dom:mc': {
'bw': (9472.59, -0.20, None)
'bw': (9472.59, -0.20, None, 'MB/s')
},
'monch:compute': {
'bw': (6317.84, -0.15, None)
'bw': (6317.84, -0.15, None, 'MB/s')
},
'kesch:cn': {
'bw': (6311.48, -0.15, None)
'bw': (6311.48, -0.15, None, 'MB/s')
}
}
self.perf_patterns = {
Expand All @@ -162,7 +206,7 @@ def __init__(self):
self.tags |= {'monch_acceptance'}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class P2PCPULatencyTest(P2PBaseTest):
def __init__(self):
Expand All @@ -174,22 +218,22 @@ def __init__(self):
self.executable = './p2p_osu_latency'
self.reference = {
'daint:gpu': {
'latency': (1.16, None, 1.0)
'latency': (1.16, None, 1.0, 'us')
},
'daint:mc': {
'latency': (1.15, None, 0.6)
'latency': (1.15, None, 0.6, 'us')
},
'dom:gpu': {
'latency': (1.13, None, 0.1)
'latency': (1.13, None, 0.1, 'us')
},
'dom:mc': {
'latency': (1.27, None, 0.2)
'latency': (1.27, None, 0.2, 'us')
},
'monch:compute': {
'latency': (1.27, None, 0.1)
'latency': (1.27, None, 0.1, 'us')
},
'kesch:cn': {
'latency': (1.17, None, 0.1)
'latency': (1.17, None, 0.1, 'us')
}
}
self.perf_patterns = {
Expand All @@ -199,7 +243,7 @@ def __init__(self):
self.tags |= {'monch_acceptance'}


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class G2GBandwidthTest(P2PBaseTest):
def __init__(self):
Expand All @@ -212,13 +256,13 @@ def __init__(self):

self.reference = {
'dom:gpu': {
'bw': (8897.86, -0.1, None)
'bw': (8897.86, -0.1, None, 'MB/s')
},
'daint:gpu': {
'bw': (8765.65, -0.1, None)
'bw': (8765.65, -0.1, None, 'MB/s')
},
'kesch:cn': {
'bw': (6288.98, -0.1, None)
'bw': (6288.98, -0.1, None, 'MB/s')
},
}
self.perf_patterns = {
Expand All @@ -236,7 +280,7 @@ def __init__(self):
self.build_system.cppflags = ['-D_ENABLE_CUDA_']


@rfm.required_version('>=2.14')
@rfm.required_version('>=2.16')
@rfm.simple_test
class G2GLatencyTest(P2PBaseTest):
def __init__(self):
Expand All @@ -249,13 +293,13 @@ def __init__(self):

self.reference = {
'dom:gpu': {
'latency': (5.49, None, 0.1)
'latency': (5.49, None, 0.1, 'us')
},
'daint:gpu': {
'latency': (5.73, None, 1.0)
'latency': (5.73, None, 1.0, 'us')
},
'kesch:cn': {
'latency': (23.09, None, 0.1)
'latency': (23.09, None, 0.1, 'us')
},
}
self.perf_patterns = {
Expand Down
17 changes: 17 additions & 0 deletions cscs-checks/microbenchmarks/osu/src/Makefile_allreduce
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Build the OSU MPI_Allreduce microbenchmark.
EXECUTABLE := osu_allreduce

all: $(EXECUTABLE)

SRCS += osu_util.c \
	osu_allreduce.c

OBJS := $(SRCS:.c=.o)

# Pattern rule with a real prerequisite so each object is rebuilt when
# its source changes.  (The previous `$(OBJS):` rule had no
# prerequisites, so stale objects were never recompiled.)
%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -I. -o $@ -c $<

$(EXECUTABLE): $(OBJS)
	$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $(OBJS) $(LDFLAGS)

# `all` and `clean` produce no files of those names.
.PHONY: all clean
clean:
	rm -f $(OBJS) $(EXECUTABLE)
143 changes: 143 additions & 0 deletions cscs-checks/microbenchmarks/osu/src/osu_allreduce.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#define BENCHMARK "OSU MPI%s Allreduce Latency Test"
/*
* Copyright (C) 2002-2018 the Network-Based Computing Laboratory
* (NBCL), The Ohio State University.
*
* Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
*
* For detailed copyright and licensing information, please refer to the
* copyright file COPYRIGHT in the top level OMB directory.
*/
#include <osu_util.h>

/*
 * Entry point of the OSU MPI_Allreduce latency benchmark.
 *
 * Parses the benchmark options, allocates send/receive buffers
 * (optionally on an accelerator), then for each power-of-two message
 * size times a series of MPI_Allreduce calls and prints the min/max/avg
 * latency across ranks.  Returns EXIT_SUCCESS on completion; exits or
 * aborts on option-parsing or allocation failures.
 */
int main(int argc, char *argv[])
{
    int i, numprocs, rank, size;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double timer = 0.0;
    double avg_time = 0.0, max_time = 0.0, min_time = 0.0;
    float *sendbuf, *recvbuf;
    int po_ret;
    size_t bufsize;

    /* Configure the shared option parser for a collective latency test
     * (options is a global provided by osu_util). */
    options.bench = COLLECTIVE;
    options.subtype = LAT;

    set_header(HEADER);
    set_benchmark_name("osu_allreduce");
    po_ret = process_options(argc, argv);

    /* Initialize the accelerator before MPI when one was requested. */
    if (PO_OKAY == po_ret && NONE != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));

    /* Handle usage/help/version outcomes only after MPI_Init so the
     * message is printed once (by rank 0 inside the helpers). */
    switch (po_ret) {
        case PO_BAD_USAGE:
            print_bad_usage_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        case PO_HELP_MESSAGE:
            print_help_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_VERSION_MESSAGE:
            print_version_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_OKAY:
            break;
    }

    /* An allreduce over a single process is meaningless. */
    if (numprocs < 2) {
        if (rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_CHECK(MPI_Finalize());
        exit(EXIT_FAILURE);
    }

    /* Clamp the requested maximum message size to the memory limit. */
    if (options.max_message_size > options.max_mem_limit) {
        if (rank == 0) {
            fprintf(stderr, "Warning! Increase the Max Memory Limit to be able to run up to %ld bytes.\n"
                            "Continuing with max message size of %ld bytes\n",
                            options.max_message_size, options.max_mem_limit);
        }
        options.max_message_size = options.max_mem_limit;
    }

    /* Message sizes are counted in floats from here on (see the size
     * loop below), so convert the byte-based minimum. */
    options.min_message_size /= sizeof(float);
    if (options.min_message_size < MIN_MESSAGE_SIZE) {
        options.min_message_size = MIN_MESSAGE_SIZE;
    }

    /* Round the buffer size down to a whole number of floats. */
    bufsize = sizeof(float) * (options.max_message_size / sizeof(float));
    if (allocate_memory_coll((void **)&sendbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(sendbuf, options.accel, 1, bufsize);

    bufsize = sizeof(float) * (options.max_message_size / sizeof(float));
    if (allocate_memory_coll((void **)&recvbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(recvbuf, options.accel, 0, bufsize);

    print_preamble(rank);

    /* `size` is the element count; the byte size (size * sizeof(float))
     * doubles each iteration up to the configured maximum. */
    for (size = options.min_message_size; size * sizeof(float) <= options.max_message_size; size *= 2) {

        /* Large messages use the (fewer) large-message iteration counts. */
        if (size > LARGE_MESSAGE_SIZE) {
            options.skip = options.skip_large;
            options.iterations = options.iterations_large;
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        timer = 0.0;
        for (i = 0; i < options.iterations + options.skip; i++) {
            t_start = MPI_Wtime();
            MPI_CHECK(MPI_Allreduce(sendbuf, recvbuf, size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
            t_stop = MPI_Wtime();
            /* The first options.skip iterations are warm-up and are
             * excluded from the timing. */
            if (i >= options.skip) {

                timer += t_stop - t_start;
            }
            /* Barrier between iterations so ranks start each allreduce
             * together. */
            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        }
        /* Per-rank average latency in microseconds. */
        latency = (double)(timer * 1e6) / options.iterations;

        /* Reduce the per-rank latencies to min/max/avg on rank 0. */
        MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
                             MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
                             MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
                             MPI_COMM_WORLD));
        avg_time = avg_time / numprocs;

        print_stats(rank, size * sizeof(float), avg_time, min_time, max_time);
        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    }

    free_buffer(sendbuf, options.accel);
    free_buffer(recvbuf, options.accel);

    MPI_CHECK(MPI_Finalize());

    /* Release accelerator resources after MPI has shut down. */
    if (NONE != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}

0 comments on commit 28fbb8f

Please sign in to comment.