diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py
index 5e5919b321..f9d006edef 100644
--- a/cscs-checks/microbenchmarks/osu/osu_tests.py
+++ b/cscs-checks/microbenchmarks/osu/osu_tests.py
@@ -2,7 +2,7 @@
 import reframe.utility.sanity as sn
 
 
-@rfm.required_version('>=2.14')
+@rfm.required_version('>=2.16')
 @rfm.parameterized_test(['production'])
 class AlltoallTest(rfm.RegressionTest):
     def __init__(self, variant):
@@ -21,16 +21,16 @@ def __init__(self, variant):
         self.maintainers = ['RS', 'VK']
         self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
         self.perf_patterns = {
-            'perf': sn.extractsingle(r'^8\s+(?P<perf>\S+)',
-                                     self.stdout, 'perf', float)
+            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
+                                        self.stdout, 'latency', float)
         }
         self.tags = {variant}
         self.reference = {
             'dom:gpu': {
-                'perf': (8.23, None, 0.1)
+                'latency': (8.23, None, 0.1, 'us')
             },
             'daint:gpu': {
-                'perf': (20.73, None, 2.0)
+                'latency': (20.73, None, 2.0, 'us')
             },
         }
         self.num_tasks_per_node = 1
@@ -72,6 +72,50 @@ def __init__(self):
         self.tags = {'diagnostic', 'ops'}
 
 
+@rfm.required_version('>=2.16')
+@rfm.simple_test
+class AllreduceTest(rfm.RegressionTest):
+    def __init__(self):
+        super().__init__()
+        self.strict_check = False
+        self.valid_systems = ['daint:gpu', 'dom:gpu']
+        self.descr = 'Allreduce OSU microbenchmark'
+        self.build_system = 'Make'
+        self.build_system.makefile = 'Makefile_allreduce'
+        self.executable = './osu_allreduce'
+        # The -x option controls the number of warm-up iterations
+        # The -i option controls the number of iterations
+        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.maintainers = ['RS', 'VK']
+        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
+        self.perf_patterns = {
+            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
+                                        self.stdout, 'latency', float)
+        }
+        self.tags = {'production'}
+        self.reference = {
+            'dom:gpu': {
+                'latency': (6.0, None, 0.1, 'us')
+            },
+            'daint:gpu': {
+                'latency': (20.5, None, 2.0, 'us')
+            },
+        }
+        self.num_tasks_per_node = 1
+        self.num_gpus_per_node = 1
+        if self.current_system.name == 'dom':
+            self.num_tasks = 6
+        elif self.current_system.name == 'daint':
+            self.num_tasks = 16
+
+        self.extra_resources = {
+            'switches': {
+                'num_switches': 1
+            }
+        }
+
+
 # FIXME: This test is obsolete; it is kept only for reference.
 @rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2)))
 class AlltoallMonchAcceptanceTest(AlltoallTest):
@@ -125,7 +169,7 @@ def __init__(self):
         }
 
 
-@rfm.required_version('>=2.14')
+@rfm.required_version('>=2.16')
 @rfm.simple_test
 class P2PCPUBandwidthTest(P2PBaseTest):
     def __init__(self):
@@ -137,22 +181,22 @@ def __init__(self):
 
         self.reference = {
             'daint:gpu': {
-                'bw': (9798.29, -0.1, None)
+                'bw': (9798.29, -0.1, None, 'MB/s')
             },
             'daint:mc': {
-                'bw': (9865.00, -0.2, None)
+                'bw': (9865.00, -0.2, None, 'MB/s')
             },
             'dom:gpu': {
-                'bw': (9815.66, -0.1, None)
+                'bw': (9815.66, -0.1, None, 'MB/s')
             },
             'dom:mc': {
-                'bw': (9472.59, -0.20, None)
+                'bw': (9472.59, -0.20, None, 'MB/s')
             },
             'monch:compute': {
-                'bw': (6317.84, -0.15, None)
+                'bw': (6317.84, -0.15, None, 'MB/s')
             },
             'kesch:cn': {
-                'bw': (6311.48, -0.15, None)
+                'bw': (6311.48, -0.15, None, 'MB/s')
             }
         }
         self.perf_patterns = {
@@ -162,7 +206,7 @@ def __init__(self):
         self.tags |= {'monch_acceptance'}
 
 
-@rfm.required_version('>=2.14')
+@rfm.required_version('>=2.16')
 @rfm.simple_test
 class P2PCPULatencyTest(P2PBaseTest):
     def __init__(self):
@@ -174,22 +218,22 @@ def __init__(self):
         self.executable = './p2p_osu_latency'
         self.reference = {
             'daint:gpu': {
-                'latency': (1.16, None, 1.0)
+                'latency': (1.16, None, 1.0, 'us')
             },
             'daint:mc': {
-                'latency': (1.15, None, 0.6)
+                'latency': (1.15, None, 0.6, 'us')
             },
             'dom:gpu': {
-                'latency': (1.13, None, 0.1)
+                'latency': (1.13, None, 0.1, 'us')
             },
             'dom:mc': {
-                'latency': (1.27, None, 0.2)
+                'latency': (1.27, None, 0.2, 'us')
             },
             'monch:compute': {
-                'latency': (1.27, None, 0.1)
+                'latency': (1.27, None, 0.1, 'us')
             },
             'kesch:cn': {
-                'latency': (1.17, None, 0.1)
+                'latency': (1.17, None, 0.1, 'us')
             }
         }
         self.perf_patterns = {
@@ -199,7 +243,7 @@ def __init__(self):
         self.tags |= {'monch_acceptance'}
 
 
-@rfm.required_version('>=2.14')
+@rfm.required_version('>=2.16')
 @rfm.simple_test
 class G2GBandwidthTest(P2PBaseTest):
     def __init__(self):
@@ -212,13 +256,13 @@ def __init__(self):
 
         self.reference = {
             'dom:gpu': {
-                'bw': (8897.86, -0.1, None)
+                'bw': (8897.86, -0.1, None, 'MB/s')
            },
             'daint:gpu': {
-                'bw': (8765.65, -0.1, None)
+                'bw': (8765.65, -0.1, None, 'MB/s')
             },
             'kesch:cn': {
-                'bw': (6288.98, -0.1, None)
+                'bw': (6288.98, -0.1, None, 'MB/s')
             },
         }
         self.perf_patterns = {
@@ -236,7 +280,7 @@ def __init__(self):
         self.build_system.cppflags = ['-D_ENABLE_CUDA_']
 
 
-@rfm.required_version('>=2.14')
+@rfm.required_version('>=2.16')
 @rfm.simple_test
 class G2GLatencyTest(P2PBaseTest):
     def __init__(self):
@@ -249,13 +293,13 @@ def __init__(self):
 
         self.reference = {
             'dom:gpu': {
-                'latency': (5.49, None, 0.1)
+                'latency': (5.49, None, 0.1, 'us')
             },
             'daint:gpu': {
-                'latency': (5.73, None, 1.0)
+                'latency': (5.73, None, 1.0, 'us')
             },
             'kesch:cn': {
-                'latency': (23.09, None, 0.1)
+                'latency': (23.09, None, 0.1, 'us')
             },
         }
         self.perf_patterns = {
diff --git a/cscs-checks/microbenchmarks/osu/src/Makefile_allreduce b/cscs-checks/microbenchmarks/osu/src/Makefile_allreduce
new file mode 100644
index 0000000000..f5027a4bc7
--- /dev/null
+++ b/cscs-checks/microbenchmarks/osu/src/Makefile_allreduce
@@ -0,0 +1,17 @@
+EXECUTABLE := osu_allreduce
+
+all: $(EXECUTABLE)
+
+SRCS += osu_util.c \
+        osu_allreduce.c
+
+OBJS := $(SRCS:.c=.o)
+
+$(OBJS):
+	$(CC) $(CPPFLAGS) $(CFLAGS) -I. -o $(@) -c $(@:.o=.c)
+
+$(EXECUTABLE): $(OBJS)
+	$(CC) $(CPPFLAGS) $(CFLAGS) -o $(@) $(OBJS) $(LDFLAGS)
+
+clean:
+	rm -f $(OBJS) $(EXECUTABLE)
diff --git a/cscs-checks/microbenchmarks/osu/src/osu_allreduce.c b/cscs-checks/microbenchmarks/osu/src/osu_allreduce.c
new file mode 100644
index 0000000000..be31b667c0
--- /dev/null
+++ b/cscs-checks/microbenchmarks/osu/src/osu_allreduce.c
@@ -0,0 +1,143 @@
+#define BENCHMARK "OSU MPI%s Allreduce Latency Test"
+/*
+ * Copyright (C) 2002-2018 the Network-Based Computing Laboratory
+ * (NBCL), The Ohio State University.
+ *
+ * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
+ *
+ * For detailed copyright and licensing information, please refer to the
+ * copyright file COPYRIGHT in the top level OMB directory.
+ */
+#include <osu_util.h>
+
+int main(int argc, char *argv[])
+{
+    int i, numprocs, rank, size;
+    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
+    double timer=0.0;
+    double avg_time = 0.0, max_time = 0.0, min_time = 0.0;
+    float *sendbuf, *recvbuf;
+    int po_ret;
+    size_t bufsize;
+    options.bench = COLLECTIVE;
+    options.subtype = LAT;
+
+    set_header(HEADER);
+    set_benchmark_name("osu_allreduce");
+    po_ret = process_options(argc, argv);
+
+    if (PO_OKAY == po_ret && NONE != options.accel) {
+        if (init_accel()) {
+            fprintf(stderr, "Error initializing device\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    MPI_CHECK(MPI_Init(&argc, &argv));
+    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));
+
+    switch (po_ret) {
+        case PO_BAD_USAGE:
+            print_bad_usage_message(rank);
+            MPI_CHECK(MPI_Finalize());
+            exit(EXIT_FAILURE);
+        case PO_HELP_MESSAGE:
+            print_help_message(rank);
+            MPI_CHECK(MPI_Finalize());
+            exit(EXIT_SUCCESS);
+        case PO_VERSION_MESSAGE:
+            print_version_message(rank);
+            MPI_CHECK(MPI_Finalize());
+            exit(EXIT_SUCCESS);
+        case PO_OKAY:
+            break;
+    }
+
+    if(numprocs < 2) {
+        if (rank == 0) {
+            fprintf(stderr, "This test requires at least two processes\n");
+        }
+
+        MPI_CHECK(MPI_Finalize());
+        exit(EXIT_FAILURE);
+    }
+
+    if (options.max_message_size > options.max_mem_limit) {
+        if (rank == 0) {
+            fprintf(stderr, "Warning! Increase the Max Memory Limit to be able to run up to %ld bytes.\n"
+                            "Continuing with max message size of %ld bytes\n",
+                            options.max_message_size, options.max_mem_limit);
+        }
+        options.max_message_size = options.max_mem_limit;
+    }
+
+    options.min_message_size /= sizeof(float);
+    if (options.min_message_size < MIN_MESSAGE_SIZE) {
+        options.min_message_size = MIN_MESSAGE_SIZE;
+    }
+
+    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
+    if (allocate_memory_coll((void**)&sendbuf, bufsize, options.accel)) {
+        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
+        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
+    }
+    set_buffer(sendbuf, options.accel, 1, bufsize);
+
+    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
+    if (allocate_memory_coll((void**)&recvbuf, bufsize, options.accel)) {
+        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
+        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
+    }
+    set_buffer(recvbuf, options.accel, 0, bufsize);
+
+    print_preamble(rank);
+
+    for(size=options.min_message_size; size*sizeof(float) <= options.max_message_size; size *= 2) {
+
+        if(size > LARGE_MESSAGE_SIZE) {
+            options.skip = options.skip_large;
+            options.iterations = options.iterations_large;
+        }
+
+        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
+
+        timer=0.0;
+        for(i=0; i < options.iterations + options.skip ; i++) {
+            t_start = MPI_Wtime();
+            MPI_CHECK(MPI_Allreduce(sendbuf, recvbuf, size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ));
+            t_stop=MPI_Wtime();
+            if(i>=options.skip){
+
+                timer+=t_stop-t_start;
+            }
+            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
+        }
+        latency = (double)(timer * 1e6) / options.iterations;
+
+        MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
+                MPI_COMM_WORLD));
+        MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
+                MPI_COMM_WORLD));
+        MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
+                MPI_COMM_WORLD));
+        avg_time = avg_time/numprocs;
+
+        print_stats(rank, size * sizeof(float), avg_time, min_time, max_time);
+        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
+    }
+
+    free_buffer(sendbuf, options.accel);
+    free_buffer(recvbuf, options.accel);
+
+    MPI_CHECK(MPI_Finalize());
+
+    if (NONE != options.accel) {
+        if (cleanup_accel()) {
+            fprintf(stderr, "Error cleaning up device\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return EXIT_SUCCESS;
+}