diff --git a/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py b/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py index 76d983608a..5610e5eca9 100644 --- a/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py @@ -3,139 +3,142 @@ # # SPDX-License-Identifier: BSD-3-Clause +import contextlib import reframe as rfm -import reframe.utility.sanity as sn +from hpctestlib.microbenchmarks.mpi.osu import (build_osu_benchmarks, + osu_build_run) -@rfm.simple_test -class AlltoallTest(rfm.RegressionTest): - variant = parameter(['production']) - strict_check = False - valid_systems = ['daint:gpu', 'dom:gpu'] - descr = 'Alltoall OSU microbenchmark' - build_system = 'Make' - executable = './osu_alltoall' - # The -m option sets the maximum message size - # The -x option sets the number of warm-up iterations - # The -i option sets the number of iterations - executable_opts = ['-m', '8', '-x', '1000', '-i', '20000'] - valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu', - 'PrgEnv-intel', 'PrgEnv-nvidia'] - maintainers = ['RS', 'AJ'] - reference = { - 'dom:gpu': { - 'latency': (8.23, None, 0.1, 'us') - }, - 'daint:gpu': { - 'latency': (20.73, None, 2.0, 'us') - } - } - num_tasks_per_node = 1 - num_gpus_per_node = 1 - extra_resources = { - 'switches': { - 'num_switches': 1 - } - } - @run_after('init') - def set_tags(self): - self.tags = {self.variant, 'benchmark', 'craype'} +class cscs_build_osu_benchmarks(build_osu_benchmarks): + build_type = parameter(['cpu', 'cuda']) - @run_before('compile') - def set_makefile(self): - self.build_system.makefile = 'Makefile_alltoall' + @run_after('init') + def setup_modules(self): + if self.build_type != 'cuda': + return - @run_before('run') - def set_num_tasks(self): - if self.current_system.name == 'daint': - self.num_tasks = 16 - else: - self.num_tasks = 6 + if self.current_system.name in ('daint', 'dom'): + self.modules = ['cudatoolkit/21.3_11.2'] + elif self.current_system.name in ('arolla', 'tsa'): + self.modules = ['cuda/10.1.243'] + self.build_system.ldflags = ['-L$EBROOTCUDA/lib64', + '-lcudart', '-lcuda'] - @sanity_function - def assert_found_8MB_latency(self): - return sn.assert_found(r'^8', self.stdout) - @run_before('performance') - def set_performance_patterns(self): - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)', - self.stdout, 'latency', float) - } +class cscs_osu_benchmarks(osu_build_run): + exclusive_access = True + tags = {'production', 'benchmark', 'craype'} + maintainers = ['@rsarm', '@vkarak'] @rfm.simple_test -class FlexAlltoallTest(rfm.RegressionTest): +class cscs_osu_pt2pt_check(cscs_osu_benchmarks): valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn'] - valid_prog_environs = ['PrgEnv-cray'] - descr = 'Flexible Alltoall OSU test' - build_system = 'Make' - executable = './osu_alltoall' - maintainers = ['RS', 'AJ'] - num_tasks_per_node = 1 - num_tasks = 0 - tags = {'diagnostic', 'ops', 'benchmark', 'craype'} + 'eiger:mc', 'pilatus:mc', 'arolla:cn', 'tsa:cn'] + valid_prog_environs = ['PrgEnv-gnu'] + benchmark_info = parameter([ + ('mpi.pt2pt.osu_bw', 'bandwidth'), + ('mpi.pt2pt.osu_latency', 'latency') + ], fmt=lambda x: x[0], loggable=True) + osu_binaries = fixture(cscs_build_osu_benchmarks, scope='environment') + allref = { + 'mpi.pt2pt.osu_bw': { + 'cpu': { + 'daint:gpu': { + 'bandwidth': (9481.0, -0.10, None, 'MB/s') + }, + 'daint:mc': { + 'bandwidth': (8507, -0.15, None, 'MB/s') + }, + 'dom:gpu': { + 'bandwidth': 
(9476.3, -0.05, None, 'MB/s') + }, + 'dom:mc': { + 'bandwidth': (9528.0, -0.20, None, 'MB/s') + }, + 'eiger:mc': { + 'bandwidth': (12240.0, -0.10, None, 'MB/s') + }, + 'pilatus:mc': { + 'bandwidth': (12240.0, -0.10, None, 'MB/s') + } + }, + 'cuda': { + 'daint:gpu': { + 'bandwidth': (8560, -0.10, None, 'MB/s') + }, + 'dom:gpu': { + 'bandwidth': (8813.09, -0.05, None, 'MB/s') + } + } + }, + 'mpi.pt2pt.osu_latency': { + 'cpu': { + 'daint:gpu': { + 'latency': (1.40, None, 0.80, 'us') + }, + 'daint:mc': { + 'latency': (1.61, None, 0.70, 'us') + }, + 'dom:gpu': { + 'latency': (1.138, None, 0.10, 'us') + }, + 'dom:mc': { + 'latency': (1.47, None, 0.10, 'us') + }, + 'eiger:mc': { + 'latency': (2.33, None, 0.15, 'us') + }, + 'pilatus:mc': { + 'latency': (2.33, None, 0.15, 'us') + } + }, + 'cuda': { + 'daint:gpu': { + 'latency': (6.82, None, 0.50, 'us') + }, + 'dom:gpu': { + 'latency': (5.56, None, 0.1, 'us') + }, + } + } + } @run_after('init') - def add_prog_environ(self): - if self.current_system.name in ['arolla', 'tsa']: - self.exclusive_access = True - self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi'] + def setup_per_build_type(self): + build_type = self.osu_binaries.build_type + if build_type == 'cuda': + self.device_buffers = 'cuda' + self.num_gpus_per_node = 1 + self.valid_systems = ['daint:gpu', + 'dom:gpu', 'arolla:cn', 'tsa:cn'] + if self.current_system.name in ('daint', 'dom'): + self.valid_prog_environs = ['PrgEnv-nvidia'] + self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'} - @run_before('compile') - def set_makefile(self): - self.build_system.makefile = 'Makefile_alltoall' - - @sanity_function - def assert_found_1KB_bw(self): - return sn.assert_found(r'^1048576', self.stdout) + with contextlib.suppress(KeyError): + self.reference = self.allref[self.benchmark_info[0]][build_type] @rfm.simple_test -class AllreduceTest(rfm.RegressionTest): - variant = parameter(['small', 'large']) - strict_check = False - valid_systems = ['daint:gpu', 'daint:mc'] - descr = 'Allreduce OSU microbenchmark' - build_system = 'Make' - executable = './osu_allreduce' - # The -x option controls the number of warm-up iterations - # The -i option controls the number of iterations - executable_opts = ['-m', '8', '-x', '1000', '-i', '20000'] - valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-nvidia'] - maintainers = ['RS', 'AJ'] - tags = {'production', 'benchmark', 'craype'} - num_tasks_per_node = 1 - num_gpus_per_node = 1 +class cscs_osu_collective_check(cscs_osu_benchmarks): + benchmark_info = parameter([ + ('mpi.collective.osu_alltoall', 'latency'), + ('mpi.collective.osu_allreduce', 'latency'), + ], fmt=lambda x: x[0], loggable=True) + num_nodes = parameter([6, 16]) extra_resources = { 'switches': { 'num_switches': 1 } } - - @run_after('init') - def add_valid_systems(self): - if self.variant == 'small': - self.valid_systems += ['dom:gpu', 'dom:mc'] - - @run_before('compile') - def set_makefile(self): - self.build_system.makefile = 'Makefile_allreduce' - - @run_before('run') - def set_num_tasks(self): - self.num_tasks = 6 if self.variant == 'small' else 16 - - @sanity_function - def assert_found_8MB_latency(self): - return sn.assert_found(r'^8', self.stdout) - - @run_before('performance') - def set_performance_patterns(self): - if self.variant == 'small': - self.reference = { + valid_systems = ['daint:gpu', 'daint:mc'] + valid_prog_environs = ['PrgEnv-gnu'] + osu_binaries = fixture(cscs_build_osu_benchmarks, scope='environment') + allref = { + 'mpi.collective.osu_allreduce': { + 6: { 'dom:gpu': { 'latency': 
(5.67, None, 0.05, 'us') }, @@ -145,9 +148,8 @@ def set_performance_patterns(self): 'daint:mc': { 'latency': (10.90, None, 1.90, 'us') } - } - else: - self.reference = { + }, + 16: { 'daint:gpu': { 'latency': (13.62, None, 1.16, 'us') }, @@ -155,209 +157,43 @@ 'latency': (19.07, None, 1.64, 'us') } } - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)', - self.stdout, 'latency', float) - } - - -class P2PBaseTest(rfm.RegressionTest): - exclusive_access = True - strict_check = False - num_tasks = 2 - num_tasks_per_node = 1 - descr = 'P2P microbenchmark' - build_system = 'Make' - maintainers = ['RS', 'AJ'] - tags = {'production', 'benchmark', 'craype'} - extra_resources = { - 'switches': { - 'num_switches': 1 - } - } - - @run_after('init') - def add_valid_prog_environs(self): - if self.current_system.name in ['arolla', 'tsa']: - self.exclusive_access = True - self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi'] - else: - self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu', - 'PrgEnv-intel', 'PrgEnv-nvidia'] - - @run_before('compile') - def set_makefile(self): - self.build_system.makefile = 'Makefile_p2p' - - @sanity_function - def assert_found_4KB_bw(self): - return sn.assert_found(r'^4194304', self.stdout) - - -@rfm.simple_test -class P2PCPUBandwidthTest(P2PBaseTest): - valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc'] - executable = './p2p_osu_bw' - executable_opts = ['-x', '100', '-i', '1000'] - reference = { - 'daint:gpu': { - 'bw': (9481.0, -0.10, None, 'MB/s') - }, - 'daint:mc': { - 'bw': (8507, -0.15, None, 'MB/s') - }, - 'dom:gpu': { - 'bw': (9476.3, -0.05, None, 'MB/s') - }, - 'dom:mc': { - 'bw': (9528.0, -0.20, None, 'MB/s') - }, - 'eiger:mc': { - 'bw': (12240.0, -0.10, None, 'MB/s') - }, - 'pilatus:mc': { - 'bw': (12240.0, -0.10, None, 'MB/s') - }, - # keeping as reference: - # 'monch:compute': { - # 'bw': (6317.84, -0.15, None, 'MB/s') - # }, - } - - @run_before('performance') - def set_performance_patterns(self): - self.perf_patterns = { - 'bw': sn.extractsingle(r'^4194304\s+(?P<bw>\S+)', - self.stdout, 'bw', float) - } - - -@rfm.simple_test -class P2PCPULatencyTest(P2PBaseTest): - valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc'] - executable = './p2p_osu_latency' - reference = { - 'daint:gpu': { - 'latency': (1.40, None, 0.80, 'us') - }, - 'daint:mc': { - 'latency': (1.61, None, 0.70, 'us') - }, - 'dom:gpu': { - 'latency': (1.138, None, 0.10, 'us') - }, - 'dom:mc': { - 'latency': (1.47, None, 0.10, 'us') - }, - 'eiger:mc': { - 'latency': (2.33, None, 0.15, 'us') - }, - 'pilatus:mc': { - 'latency': (2.33, None, 0.15, 'us') - }, - # keeping as reference: - # 'monch:compute': { - # 'latency': (1.27, None, 0.1, 'us') - # }, - } - - @run_before('performance') - def set_performance_patterns(self): - self.perf_patterns = { - 'bw': sn.extractsingle(r'^4194304\s+(?P<bw>\S+)', - self.stdout, 'bw', float) - } - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)', - self.stdout, 'latency', float) - } - - -@rfm.simple_test -class G2GBandwidthTest(P2PBaseTest): - valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn'] - num_gpus_per_node = 1 - executable = './p2p_osu_bw' - executable_opts = ['-x', '100', '-i', '1000', '-d', - 'cuda', 'D', 'D'] - reference = { - 'dom:gpu': { - 'bw': (8813.09, -0.05, None, 'MB/s') }, - 'daint:gpu': { - 'bw': (8560, -0.10, None, 'MB/s') - }, - '*': { - 'bw': 
(0, None, None, 'MB/s') - } - } - - @run_before('performance') - def set_performance_patterns(self): - self.perf_patterns = { - 'bw': sn.extractsingle(r'^4194304\s+(?P<bw>\S+)', - self.stdout, 'bw', float) + 'mpi.collective.osu_alltoall': { + 6: { + 'dom:gpu': { + 'latency': (8.23, None, 0.1, 'us') + }, + 'daint:gpu': { + 'latency': (20.73, None, 2.0, 'us') + }, + 'dom:mc': { + 'latency': (0, None, None, 'us') + }, + 'daint:mc': { + 'latency': (0, None, None, 'us') + } + }, + 16: { + 'daint:gpu': { + 'latency': (0, None, None, 'us') + }, + 'daint:mc': { + 'latency': (0, None, None, 'us') + } + } } - - @run_before('compile') - def set_cpp_flags(self): - self.build_system.cppflags = ['-D_ENABLE_CUDA_'] - - @run_before('compile') - def set_modules(self): - if self.current_system.name in ['daint', 'dom']: - self.num_gpus_per_node = 1 - self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'} - if self.current_environ.name == 'PrgEnv-nvidia': - self.modules = ['cudatoolkit/21.3_11.2'] - else: - self.modules = ['craype-accel-nvidia60'] - elif self.current_system.name in ['arolla', 'tsa']: - self.modules = ['cuda/10.1.243'] - self.build_system.ldflags = ['-L$EBROOTCUDA/lib64', - '-lcudart', '-lcuda'] - - -@rfm.simple_test -class G2GLatencyTest(P2PBaseTest): - valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn'] - num_gpus_per_node = 1 - executable = './p2p_osu_latency' - executable_opts = ['-x', '100', '-i', '1000', '-d', - 'cuda', 'D', 'D'] - - reference = { - 'dom:gpu': { - 'latency': (5.56, None, 0.1, 'us') - }, - 'daint:gpu': { - 'latency': (6.82, None, 0.50, 'us') - }, - } - @run_before('performance') - def set_performance_patterns(self): - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)', - self.stdout, 'latency', float) - } - - @run_before('compile') - def set_cpp_flags(self): - self.build_system.cppflags = ['-D_ENABLE_CUDA_'] + @run_after('init') + def setup_by_scale(self): + if self.osu_binaries.build_type == 'cuda': + # Filter out CUDA-aware versions + self.valid_systems = [] + return + + self.num_tasks = self.num_nodes + if self.num_nodes == 6: + self.valid_systems += ['dom:gpu', 'dom:mc'] - @run_before('compile') - def set_modules(self): - if self.current_system.name in ['daint', 'dom']: - self.num_gpus_per_node = 1 - self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'} - if self.current_environ.name == 'PrgEnv-nvidia': - self.modules = ['cudatoolkit/21.3_11.2'] - else: - self.modules = ['craype-accel-nvidia60'] - elif self.current_system.name in ['arolla', 'tsa']: - self.modules = ['cuda/10.1.243'] - self.build_system.ldflags = ['-L$EBROOTCUDA/lib64', - '-lcudart', '-lcuda'] + with contextlib.suppress(KeyError): + self.reference = self.allref[self.num_nodes] diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_allreduce b/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_allreduce deleted file mode 100644 index f5027a4bc7..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_allreduce +++ /dev/null @@ -1,17 +0,0 @@ -EXECUTABLE := osu_allreduce - -all: $(EXECUTABLE) - -SRCS += osu_util.c \ - osu_allreduce.c - -OBJS := $(SRCS:.c=.o) - -$(OBJS): - $(CC) $(CPPFLAGS) $(CFLAGS) -I. 
-o $(@) -c $(@:.o=.c) - -$(EXECUTABLE): $(OBJS) - $(CC) $(CPPFLAGS) $(CFLAGS) -o $(@) $(OBJS) $(LDFLAGS) - -clean: - rm -f $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_alltoall b/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_alltoall deleted file mode 100644 index 5c514508cb..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_alltoall +++ /dev/null @@ -1,17 +0,0 @@ -EXECUTABLE := osu_alltoall - -all: $(EXECUTABLE) - -SRCS += osu_util.c \ - osu_alltoall.c - -OBJS := $(SRCS:.c=.o) - -$(OBJS): - $(CC) $(CPPFLAGS) $(CFLAGS) -o $(@) -c $(@:.o=.c) - -$(EXECUTABLE): $(OBJS) - $(CC) $(CPPFLAGS) $(CFLAGS) -o $(@) $(OBJS) $(LDFLAGS) - -clean: - rm -f $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_p2p b/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_p2p deleted file mode 100644 index 4700f3638c..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/Makefile_p2p +++ /dev/null @@ -1,24 +0,0 @@ -EXECUTABLES := p2p_osu_bw p2p_osu_latency - -all: $(EXECUTABLES) - -SRCS += osu_util.c \ - osu_latency.c \ - osu_bw.c - -OBJS_BW = osu_util.o osu_bw.o -OBJS_LT = osu_util.o osu_latency.o - -OBJS := $(SRCS:.c=.o) - -$(OBJS): - $(CC) $(CPPFLAGS) $(CXXFLAGS) -o $(@) -c $(@:.o=.c) - -p2p_osu_bw: $(OBJS_BW) - $(CC) $(CPPFLAGS) $(CXXFLAGS) -o $(@) $(OBJS_BW) $(LDFLAGS) - -p2p_osu_latency: $(OBJS_LT) - $(CC) $(CPPFLAGS) $(CXXFLAGS) -o $(@) $(OBJS_LT) $(LDFLAGS) - -clean: - rm -f $(OBJS) $(EXECUTABLES) diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_allreduce.c b/cscs-checks/microbenchmarks/mpi/osu/src/osu_allreduce.c deleted file mode 100644 index be31b667c0..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_allreduce.c +++ /dev/null @@ -1,143 +0,0 @@ -#define BENCHMARK "OSU MPI%s Allreduce Latency Test" -/* - * Copyright (C) 2002-2018 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level OMB directory. - */ -#include <osu_util.h> - -int main(int argc, char *argv[]) -{ - int i, numprocs, rank, size; - double latency = 0.0, t_start = 0.0, t_stop = 0.0; - double timer=0.0; - double avg_time = 0.0, max_time = 0.0, min_time = 0.0; - float *sendbuf, *recvbuf; - int po_ret; - size_t bufsize; - options.bench = COLLECTIVE; - options.subtype = LAT; - - set_header(HEADER); - set_benchmark_name("osu_allreduce"); - po_ret = process_options(argc, argv); - - if (PO_OKAY == po_ret && NONE != options.accel) { - if (init_accel()) { - fprintf(stderr, "Error initializing device\n"); - exit(EXIT_FAILURE); - } - } - - MPI_CHECK(MPI_Init(&argc, &argv)); - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs)); - - switch (po_ret) { - case PO_BAD_USAGE: - print_bad_usage_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - case PO_HELP_MESSAGE: - print_help_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_VERSION_MESSAGE: - print_version_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - - if(numprocs < 2) { - if (rank == 0) { - fprintf(stderr, "This test requires at least two processes\n"); - } - - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - if (options.max_message_size > options.max_mem_limit) { - if (rank == 0) { - fprintf(stderr, "Warning! 
Increase the Max Memory Limit to be able to run up to %ld bytes.\n" - "Continuing with max message size of %ld bytes\n", - options.max_message_size, options.max_mem_limit); - } - options.max_message_size = options.max_mem_limit; - } - - options.min_message_size /= sizeof(float); - if (options.min_message_size < MIN_MESSAGE_SIZE) { - options.min_message_size = MIN_MESSAGE_SIZE; - } - - bufsize = sizeof(float)*(options.max_message_size/sizeof(float)); - if (allocate_memory_coll((void**)&sendbuf, bufsize, options.accel)) { - fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE)); - } - set_buffer(sendbuf, options.accel, 1, bufsize); - - bufsize = sizeof(float)*(options.max_message_size/sizeof(float)); - if (allocate_memory_coll((void**)&recvbuf, bufsize, options.accel)) { - fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE)); - } - set_buffer(recvbuf, options.accel, 0, bufsize); - - print_preamble(rank); - - for(size=options.min_message_size; size*sizeof(float) <= options.max_message_size; size *= 2) { - - if(size > LARGE_MESSAGE_SIZE) { - options.skip = options.skip_large; - options.iterations = options.iterations_large; - } - - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - - timer=0.0; - for(i=0; i < options.iterations + options.skip ; i++) { - t_start = MPI_Wtime(); - MPI_CHECK(MPI_Allreduce(sendbuf, recvbuf, size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD )); - t_stop=MPI_Wtime(); - if(i>=options.skip){ - - timer+=t_stop-t_start; - } - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - } - latency = (double)(timer * 1e6) / options.iterations; - - MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - avg_time = avg_time/numprocs; - - print_stats(rank, size * sizeof(float), avg_time, min_time, max_time); - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - } - - free_buffer(sendbuf, options.accel); - free_buffer(recvbuf, options.accel); - - MPI_CHECK(MPI_Finalize()); - - if (NONE != options.accel) { - if (cleanup_accel()) { - fprintf(stderr, "Error cleaning up device\n"); - exit(EXIT_FAILURE); - } - } - - return EXIT_SUCCESS; -} diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_alltoall.c b/cscs-checks/microbenchmarks/mpi/osu/src/osu_alltoall.c deleted file mode 100644 index 255605d31e..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_alltoall.c +++ /dev/null @@ -1,138 +0,0 @@ -#define BENCHMARK "OSU MPI%s All-to-All Personalized Exchange Latency Test" -/* - * Copyright (C) 2002-2017 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level OMB directory. 
- */ -#include "osu_util.h" - -int -main (int argc, char *argv[]) -{ - int i, numprocs, rank, size; - double latency = 0.0, t_start = 0.0, t_stop = 0.0; - double timer=0.0; - double avg_time = 0.0, max_time = 0.0, min_time = 0.0; - char * sendbuf = NULL, * recvbuf = NULL; - int po_ret; - size_t bufsize; - options.bench = COLLECTIVE; - options.subtype = LAT; - - set_header(HEADER); - set_benchmark_name("osu_alltoall"); - po_ret = process_options(argc, argv); - - if (PO_OKAY == po_ret && NONE != options.accel) { - if (init_accel()) { - fprintf(stderr, "Error initializing device\n"); - exit(EXIT_FAILURE); - } - } - - MPI_CHECK(MPI_Init(&argc, &argv)); - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs)); - - switch (po_ret) { - case PO_BAD_USAGE: - print_bad_usage_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - case PO_HELP_MESSAGE: - print_help_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_VERSION_MESSAGE: - print_version_message(rank); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - - if(numprocs < 2) { - if (rank == 0) { - fprintf(stderr, "This test requires at least two processes\n"); - } - - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - if ((options.max_message_size * numprocs) > options.max_mem_limit) { - options.max_message_size = options.max_mem_limit / numprocs; - } - - bufsize = options.max_message_size * numprocs; - - if (allocate_memory_coll((void**)&sendbuf, bufsize, options.accel)) { - fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE)); - } - - set_buffer(sendbuf, options.accel, 1, bufsize); - - if (allocate_memory_coll((void**)&recvbuf, options.max_message_size * numprocs, - options.accel)) { - fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE)); - } - - set_buffer(recvbuf, options.accel, 0, bufsize); - print_preamble(rank); - - for(size=options.min_message_size; size <= options.max_message_size; size *= 2) { - if (size > LARGE_MESSAGE_SIZE) { - options.skip = options.skip_large; - options.iterations = options.iterations_large; - } - - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - timer=0.0; - - for (i=0; i < options.iterations + options.skip ; i++) { - t_start = MPI_Wtime(); - MPI_CHECK(MPI_Alltoall(sendbuf, size, MPI_CHAR, recvbuf, size, MPI_CHAR, - MPI_COMM_WORLD)); - t_stop = MPI_Wtime(); - - if (i >= options.skip) { - timer+=t_stop-t_start; - } - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - } - latency = (double)(timer * 1e6) / options.iterations; - - MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - avg_time = avg_time/numprocs; - - print_stats(rank, size, avg_time, min_time, max_time); - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - } - - free_buffer(sendbuf, options.accel); - free_buffer(recvbuf, options.accel); - - MPI_CHECK(MPI_Finalize()); - - if (NONE != options.accel) { - if (cleanup_accel()) { - fprintf(stderr, "Error cleaning up device\n"); - exit(EXIT_FAILURE); - } - } - - return EXIT_SUCCESS; -} - -/* vi: set sw=4 sts=4 tw=80: */ diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_bw.c b/cscs-checks/microbenchmarks/mpi/osu/src/osu_bw.c deleted file mode 100644 index 
78217261b8..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_bw.c +++ /dev/null @@ -1,162 +0,0 @@ -#define BENCHMARK "OSU MPI%s Bandwidth Test" -/* - * Copyright (C) 2002-2017 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level OMB directory. - */ - -#include "osu_util.h" - -int -main (int argc, char *argv[]) -{ - int myid, numprocs, i, j; - int size; - char *s_buf, *r_buf; - double t_start = 0.0, t_end = 0.0, t = 0.0; - int window_size = 64; - int po_ret = 0; - options.bench = PT2PT; - options.subtype = BW; - - set_header(HEADER); - set_benchmark_name("osu_bw"); - - po_ret = process_options(argc, argv); - - if (PO_OKAY == po_ret && NONE != options.accel) { - if (init_accel()) { - fprintf(stderr, "Error initializing device\n"); - exit(EXIT_FAILURE); - } - } - - MPI_CHECK(MPI_Init(&argc, &argv)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs)); - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myid)); - - if (0 == myid) { - switch (po_ret) { - case PO_CUDA_NOT_AVAIL: - fprintf(stderr, "CUDA support not enabled. Please recompile " - "benchmark with CUDA support.\n"); - break; - case PO_OPENACC_NOT_AVAIL: - fprintf(stderr, "OPENACC support not enabled. Please " - "recompile benchmark with OPENACC support.\n"); - break; - case PO_BAD_USAGE: - print_bad_usage_message(myid); - break; - case PO_HELP_MESSAGE: - print_help_message(myid); - break; - case PO_VERSION_MESSAGE: - print_version_message(myid); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - } - - switch (po_ret) { - case PO_CUDA_NOT_AVAIL: - case PO_OPENACC_NOT_AVAIL: - case PO_BAD_USAGE: - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - case PO_HELP_MESSAGE: - case PO_VERSION_MESSAGE: - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - - if(numprocs != 2) { - if(myid == 0) { - fprintf(stderr, "This test requires exactly two processes\n"); - } - - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - if (allocate_memory_pt2pt(&s_buf, &r_buf, myid)) { - /* Error allocating memory */ - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - print_header(myid, BW); - - /* Bandwidth test */ - for(size = options.min_message_size; size <= options.max_message_size; size *= 2) { - set_buffer(s_buf, options.accel, 'a', size); - set_buffer(r_buf, options.accel, 'b', size); - - - if(size > LARGE_MESSAGE_SIZE) { - options.iterations = options.iterations_large; - options.skip = options.skip_large; - window_size = options.window_size_large; - } - - if(myid == 0) { - for(i = 0; i < options.iterations + options.skip; i++) { - if(i == options.skip) { - t_start = MPI_Wtime(); - } - - for(j = 0; j < window_size; j++) { - MPI_CHECK(MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD, - request + j)); - } - - MPI_CHECK(MPI_Waitall(window_size, request, reqstat)); - MPI_CHECK(MPI_Recv(r_buf, 4, MPI_CHAR, 1, 101, MPI_COMM_WORLD, - &reqstat[0])); - } - - t_end = MPI_Wtime(); - t = t_end - t_start; - } - - else if(myid == 1) { - for(i = 0; i < options.iterations + options.skip; i++) { - for(j = 0; j < window_size; j++) { - MPI_CHECK(MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD, - request + j)); - } - - MPI_CHECK(MPI_Waitall(window_size, request, reqstat)); - MPI_CHECK(MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD)); - } - } - - if(myid == 0) { - double tmp 
= size / 1e6 * options.iterations * window_size; - - fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, tmp / t); - fflush(stdout); - } - } - - free_memory(s_buf, r_buf, myid); - MPI_CHECK(MPI_Finalize()); - - if (NONE != options.accel) { - if (cleanup_accel()) { - fprintf(stderr, "Error cleaning up device\n"); - exit(EXIT_FAILURE); - } - } - - return EXIT_SUCCESS; -} diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_latency.c b/cscs-checks/microbenchmarks/mpi/osu/src/osu_latency.c deleted file mode 100644 index b1beed5d14..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_latency.c +++ /dev/null @@ -1,151 +0,0 @@ -#define BENCHMARK "OSU MPI%s Latency Test" -/* - * Copyright (C) 2002-2017 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level OMB directory. - */ -#include "osu_util.h" - -int -main (int argc, char *argv[]) -{ - int myid, numprocs, i; - int size; - MPI_Status reqstat; - char *s_buf, *r_buf; - double t_start = 0.0, t_end = 0.0; - int po_ret = 0; - options.bench = PT2PT; - options.subtype = LAT; - - set_header(HEADER); - set_benchmark_name("osu_latency"); - - po_ret = process_options(argc, argv); - - if (PO_OKAY == po_ret && NONE != options.accel) { - if (init_accel()) { - fprintf(stderr, "Error initializing device\n"); - exit(EXIT_FAILURE); - } - } - - MPI_CHECK(MPI_Init(&argc, &argv)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs)); - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myid)); - - if (0 == myid) { - switch (po_ret) { - case PO_CUDA_NOT_AVAIL: - fprintf(stderr, "CUDA support not enabled. Please recompile " - "benchmark with CUDA support.\n"); - break; - case PO_OPENACC_NOT_AVAIL: - fprintf(stderr, "OPENACC support not enabled. Please " - "recompile benchmark with OPENACC support.\n"); - break; - case PO_BAD_USAGE: - print_bad_usage_message(myid); - break; - case PO_HELP_MESSAGE: - print_help_message(myid); - break; - case PO_VERSION_MESSAGE: - print_version_message(myid); - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - } - - switch (po_ret) { - case PO_CUDA_NOT_AVAIL: - case PO_OPENACC_NOT_AVAIL: - case PO_BAD_USAGE: - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - case PO_HELP_MESSAGE: - case PO_VERSION_MESSAGE: - MPI_CHECK(MPI_Finalize()); - exit(EXIT_SUCCESS); - case PO_OKAY: - break; - } - - if(numprocs != 2) { - if(myid == 0) { - fprintf(stderr, "This test requires exactly two processes\n"); - } - - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - if (allocate_memory_pt2pt(&s_buf, &r_buf, myid)) { - /* Error allocating memory */ - MPI_CHECK(MPI_Finalize()); - exit(EXIT_FAILURE); - } - - print_header(myid, LAT); - - - /* Latency test */ - for(size = options.min_message_size; size <= options.max_message_size; size = (size ? 
size * 2 : 1)) { - set_buffer(s_buf, options.accel, 'a', size); - set_buffer(r_buf, options.accel, 'b', size); - - if(size > LARGE_MESSAGE_SIZE) { - options.iterations = options.iterations_large; - options.skip = options.skip_large; - } - - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - - if(myid == 0) { - for(i = 0; i < options.iterations + options.skip; i++) { - if(i == options.skip) { - t_start = MPI_Wtime(); - } - - MPI_CHECK(MPI_Send(s_buf, size, MPI_CHAR, 1, 1, MPI_COMM_WORLD)); - MPI_CHECK(MPI_Recv(r_buf, size, MPI_CHAR, 1, 1, MPI_COMM_WORLD, &reqstat)); - } - - t_end = MPI_Wtime(); - } - - else if(myid == 1) { - for(i = 0; i < options.iterations + options.skip; i++) { - MPI_CHECK(MPI_Recv(r_buf, size, MPI_CHAR, 0, 1, MPI_COMM_WORLD, &reqstat)); - MPI_CHECK(MPI_Send(s_buf, size, MPI_CHAR, 0, 1, MPI_COMM_WORLD)); - } - } - - if(myid == 0) { - double latency = (t_end - t_start) * 1e6 / (2.0 * options.iterations); - - fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, latency); - fflush(stdout); - } - } - - free_memory(s_buf, r_buf, myid); - MPI_CHECK(MPI_Finalize()); - - if (NONE != options.accel) { - if (cleanup_accel()) { - fprintf(stderr, "Error cleaning up device\n"); - exit(EXIT_FAILURE); - } - } - - return EXIT_SUCCESS; -} - diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.c b/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.c deleted file mode 100644 index 86653e81e5..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.c +++ /dev/null @@ -1,2186 +0,0 @@ -/* - * Copyright (C) 2002-2017 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level directory. 
- */ - -#include "osu_util.h" - -MPI_Request request[MAX_REQ_NUM]; -MPI_Status reqstat[MAX_REQ_NUM]; -MPI_Request send_request[MAX_REQ_NUM]; -MPI_Request recv_request[MAX_REQ_NUM]; - -#ifdef _ENABLE_OPENACC_ -#include <openacc.h> -#endif - -/* - * GLOBAL VARIABLES - */ -#ifdef _ENABLE_CUDA_ -CUcontext cuContext; -#endif - -char const *win_info[20] = { - "MPI_Win_create", -#if MPI_VERSION >=3 - "MPI_Win_allocate", - "MPI_Win_create_dynamic", -#endif -}; - -char const *sync_info[20] = { - "MPI_Win_lock/unlock", - "MPI_Win_post/start/complete/wait", - "MPI_Win_fence", -#if MPI_VERSION >=3 - "MPI_Win_flush", - "MPI_Win_flush_local", - "MPI_Win_lock_all/unlock_all", -#endif -}; - -MPI_Aint disp_remote; -MPI_Aint disp_local; - -int mem_on_dev; - -static char const * benchmark_header = NULL; -static char const * benchmark_name = NULL; -static int benchmark_type; -static int accel_enabled = 0; -struct options_t options; - -/* A is the A in DAXPY for the Compute Kernel */ -#define A 2.0 -#define DEBUG 0 -/* - * We are using a 2-D matrix to perform dummy - * computation in non-blocking collective benchmarks - */ -#define DIM 25 -static float **a, *x, *y; - -#ifdef _ENABLE_CUDA_KERNEL_ -/* Using new stream for kernels on gpu */ -static cudaStream_t stream; - -static int is_alloc = 0; - -/* Arrays on device for dummy compute */ -static float *d_x, *d_y; -#endif - - -static struct { - char const * message; - char const * optarg; - int opt; -} bad_usage; - - -void -print_header(int rank, int full) -{ - switch(options.bench) { - case PT2PT : - if (0 == rank) { - switch (options.accel) { - case CUDA: - printf(benchmark_header, "-CUDA"); - break; - case OPENACC: - printf(benchmark_header, "-OPENACC"); - break; - default: - printf(benchmark_header, ""); - break; - } - - switch (options.accel) { - case CUDA: - case OPENACC: - fprintf(stdout, "# Send Buffer on %s and Receive Buffer on %s\n", - 'M' == options.src ? "MANAGED (M)" : ('D' == options.src ? "DEVICE (D)" : "HOST (H)"), - 'M' == options.dst ? "MANAGED (M)" : ('D' == options.dst ? 
"DEVICE (D)" : "HOST (H)")); - default: - if (options.subtype == BW) { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)"); - } - else { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Latency (us)"); - } - fflush(stdout); - } - } - break; - case COLLECTIVE : - if(rank == 0) { - fprintf(stdout, HEADER, ""); - - if (options.show_size) { - fprintf(stdout, "%-*s", 10, "# Size"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Avg Latency(us)"); - } - - else { - fprintf(stdout, "# Avg Latency(us)"); - } - - if (full) { - fprintf(stdout, "%*s", FIELD_WIDTH, "Min Latency(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Max Latency(us)"); - fprintf(stdout, "%*s\n", 12, "Iterations"); - } - - else { - fprintf(stdout, "\n"); - } - - fflush(stdout); - } - break; - default: - break; - } -} - -void print_header_pgas (const char *header, int rank, int full) -{ - if(rank == 0) { - fprintf(stdout, header, ""); - - if (options.show_size) { - fprintf(stdout, "%-*s", 10, "# Size"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Avg Latency(us)"); - } - - else { - fprintf(stdout, "# Avg Latency(us)"); - } - - if (full) { - fprintf(stdout, "%*s", FIELD_WIDTH, "Min Latency(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Max Latency(us)"); - fprintf(stdout, "%*s\n", 12, "Iterations"); - } - - else { - fprintf(stdout, "\n"); - } - - fflush(stdout); - } -} - -void print_data_pgas (int rank, int full, int size, double avg_time, double -min_time, double max_time, int iterations) -{ - if(rank == 0) { - if (size) { - fprintf(stdout, "%-*d", 10, size); - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, avg_time); - } - - else { - fprintf(stdout, "%*.*f", 17, FLOAT_PRECISION, avg_time); - } - - if (full) { - fprintf(stdout, "%*.*f%*.*f%*d\n", - FIELD_WIDTH, FLOAT_PRECISION, min_time, - FIELD_WIDTH, FLOAT_PRECISION, max_time, - 12, iterations); - } - - else { - fprintf(stdout, "\n"); - } - - fflush(stdout); - } -} - -void print_header_one_sided (int rank, enum WINDOW win, enum SYNC sync) -{ - if(rank == 0) { - switch (options.accel) { - case CUDA: - printf(benchmark_header, "-CUDA"); - break; - case OPENACC: - printf(benchmark_header, "-OPENACC"); - break; - default: - printf(benchmark_header, ""); - break; - } - fprintf(stdout, "# Window creation: %s\n", - win_info[win]); - fprintf(stdout, "# Synchronization: %s\n", - sync_info[sync]); - - switch (options.accel) { - case CUDA: - case OPENACC: - fprintf(stdout, "# Rank 0 Memory on %s and Rank 1 Memory on %s\n", - 'D' == options.src ? "DEVICE (D)" : "HOST (H)", - 'D' == options.dst ? 
"DEVICE (D)" : "HOST (H)"); - default: - if (options.subtype == BW) { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)"); - } else { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Latency (us)"); - } - fflush(stdout); - } - } -} - -void print_data (int rank, int full, int size, double avg_time, - double min_time, double max_time, int iterations) -{ - if(rank == 0) { - if (options.show_size) { - fprintf(stdout, "%-*d", 10, size); - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, avg_time); - } else { - fprintf(stdout, "%*.*f", 17, FLOAT_PRECISION, avg_time); - } - - if (full) { - fprintf(stdout, "%*.*f%*.*f%*d\n", - FIELD_WIDTH, FLOAT_PRECISION, min_time, - FIELD_WIDTH, FLOAT_PRECISION, max_time, - 12, iterations); - } else { - fprintf(stdout, "\n"); - - } - - fflush(stdout); - } -} - - -static int -set_min_message_size (long long value) -{ - if (0 >= value) { - return -1; - } - - options.min_message_size = value; - - return 0; -} - -static int -set_max_message_size (long long value) -{ - if (0 > value) { - return -1; - } - - options.max_message_size = value; - - return 0; -} - -static int -set_message_size (char *val_str) -{ - int retval = -1; - int i, count = 0; - char *val1, *val2; - - for (i=0; val_str[i]; i++) { - if (val_str[i] == ':') - count++; - } - - if (!count) { - retval = set_max_message_size(atoll(val_str)); - } else if (count == 1) { - val1 = strtok(val_str, ":"); - val2 = strtok(NULL, ":"); - - if (val1 && val2) { - retval = set_min_message_size(atoll(val1)); - retval = set_max_message_size(atoll(val2)); - } else if (val1) { - if (val_str[0] == ':') { - retval = set_max_message_size(atoll(val1)); - } else { - retval = set_min_message_size(atoll(val1)); - } - } - } - - return retval; -} - -static int set_num_warmup (int value) -{ - if (0 > value) { - return -1; - } - - options.skip = value; - options.skip_large = value; - - return 0; -} - -static int set_num_iterations (int value) -{ - if (1 > value) { - return -1; - } - - options.iterations = value; - options.iterations_large = value; - - return 0; -} - -static int set_window_size_large (int value) -{ - if (1 > value) { - return -1; - } - - options.window_size_large = value; - - return 0; -} - -static int set_window_size (int value) -{ - if (1 > value) { - return -1; - } - - options.window_size = value; - - return 0; -} - -static int set_device_array_size (int value) -{ - if (value < 1 ) { - return -1; - } - - options.device_array_size = value; - - return 0; -} - -void set_device_memory (void * ptr, int data, size_t size) -{ -#ifdef _ENABLE_OPENACC_ - size_t i; - char * p = (char *)ptr; -#endif - - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case CUDA: - cudaMemset(ptr, data, size); - break; -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: -#pragma acc parallel copyin(size) deviceptr(p) - for(i = 0; i < size; i++) { - p[i] = data; - } - break; -#endif - default: - break; - } -} - -int free_device_buffer (void * buf) -{ - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case CUDA: - cudaFree(buf); - break; -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: - acc_free(buf); - break; -#endif - default: - /* unknown device */ - return 1; - } - - return 0; -} - -void *align_buffer (void * ptr, unsigned long align_size) -{ - unsigned long buf = (((unsigned long)ptr + (align_size - 1)) / align_size * align_size); - return (void *) buf; -} - - -static int set_num_probes (int value) -{ - if (value < 0 ) { - return -1; - } - - options.num_probes = value; - - return 0; -} - -static int 
set_max_memlimit (int value) -{ - options.max_mem_limit = value; - - if (value < MAX_MEM_LOWER_LIMIT) { - options.max_mem_limit = MAX_MEM_LOWER_LIMIT; - fprintf(stderr,"Requested memory limit too low, using [%d] instead.", - MAX_MEM_LOWER_LIMIT); - } - - return 0; -} - -void set_header (const char * header) -{ - benchmark_header = header; -} - -void set_benchmark_name (const char * name) -{ - benchmark_name = name; -} - -void enable_accel_support (void) -{ - accel_enabled = (CUDA_ENABLED || OPENACC_ENABLED); -} - -void usage_one_sided (char const * name) -{ - if (accel_enabled) { - fprintf(stdout, "Usage: %s [options] [RANK0 RANK1] \n", name); - fprintf(stdout, "RANK0 and RANK1 may be `D' or `H' which specifies whether\n" - "the buffer is allocated on the accelerator device or host\n" - "memory for each mpi rank\n\n"); - } else { - fprintf(stdout, "Usage: %s [options] \n", name); - } - - fprintf(stdout, "Options:\n"); - - fprintf(stdout, " -d --accelerator accelerator device buffers can be of " - "`cuda' or `openacc'\n"); - fprintf(stdout, "\n"); - -#if MPI_VERSION >= 3 - fprintf(stdout, " -w --win-option <win_option>\n"); - fprintf(stdout, " <win_option> can be any of the follows:\n"); - fprintf(stdout, " create use MPI_Win_create to create an MPI Window object\n"); - if (accel_enabled) { - fprintf(stdout, " allocate use MPI_Win_allocate to create an MPI Window object (not valid when using device memory)\n"); - } else { - fprintf(stdout, " allocate use MPI_Win_allocate to create an MPI Window object\n"); - } - fprintf(stdout, " dynamic use MPI_Win_create_dynamic to create an MPI Window object\n"); - fprintf(stdout, "\n"); -#endif - - fprintf(stdout, " -s, --sync-option <sync_option>\n"); - fprintf(stdout, " <sync_option> can be any of the follows:\n"); - fprintf(stdout, " pscw use Post/Start/Complete/Wait synchronization calls \n"); - fprintf(stdout, " fence use MPI_Win_fence synchronization call\n"); - if (options.synctype == ALL_SYNC) { - fprintf(stdout, " lock use MPI_Win_lock/unlock synchronizations calls\n"); -#if MPI_VERSION >= 3 - fprintf(stdout, " flush use MPI_Win_flush synchronization call\n"); - fprintf(stdout, " flush_local use MPI_Win_flush_local synchronization call\n"); - fprintf(stdout, " lock_all use MPI_Win_lock_all/unlock_all synchronization calls\n"); -#endif - } - fprintf(stdout, "\n"); - if (options.show_size) { - fprintf(stdout, " -m, --message-size [MIN:]MAX set the minimum and/or the maximum message size to MIN and/or MAX\n"); - fprintf(stdout, " bytes respectively. 
Examples:\n"); - fprintf(stdout, " -m 128 // min = default, max = 128\n"); - fprintf(stdout, " -m 2:128 // min = 2, max = 128\n"); - fprintf(stdout, " -m 2: // min = 2, max = default\n"); - fprintf(stdout, " -M, --mem-limit SIZE set per process maximum memory consumption to SIZE bytes\n"); - fprintf(stdout, " (default %d)\n", MAX_MEM_LIMIT); - } - fprintf(stdout, " -x, --warmup ITER number of warmup iterations to skip before timing " - "(default 100)\n"); - fprintf(stdout, " -i, --iterations ITER number of iterations for timing (default 10000)\n"); - - fprintf(stdout, " -h, --help print this help message\n"); - fflush(stdout); -} - -void usage_mbw_mr() { - fprintf(stdout, "Options:\n"); - fprintf(stdout, " -R=<0,1>, --print-rate Print uni-directional message rate (default 1)\n"); - fprintf(stdout, " -p=<num_pairs>, --num-pairs Number of pairs involved (default np / 2)\n"); - fprintf(stdout, " -W=<window_size>, --window-size Number of messages sent before acknowledgement (64, 10)\n"); - fprintf(stdout, " [cannot be used with -v]\n"); - fprintf(stdout, " -V, --vary-window Vary the window size (default no)\n"); - fprintf(stdout, " [cannot be used with -w]\n"); - if (options.show_size) { - fprintf(stdout, " -m, --message-size [MIN:]MAX set the minimum and/or the maximum message size to MIN and/or MAX\n"); - fprintf(stdout, " bytes respectively. Examples:\n"); - fprintf(stdout, " -m 128 // min = default, max = 128\n"); - fprintf(stdout, " -m 2:128 // min = 2, max = 128\n"); - fprintf(stdout, " -m 2: // min = 2, max = default\n"); - fprintf(stdout, " -M, --mem-limit SIZE set per process maximum memory consumption to SIZE bytes\n"); - fprintf(stdout, " (default %d)\n", MAX_MEM_LIMIT); - } - fprintf(stdout, " -h, --help Print this help\n"); - fprintf(stdout, "\n"); - fprintf(stdout, " Note: This benchmark relies on block ordering of the ranks. Please see\n"); - fprintf(stdout, " the README for more information.\n"); - fflush(stdout); -} - -void print_usage_pgas(int rank, const char * prog, int has_size) -{ - if (rank == 0) { - if (has_size) { - fprintf(stdout, " USAGE : %s [-m SIZE] [-i ITER] [-f] [-hv] [-M SIZE]\n", prog); - fprintf(stdout, " -m, --message-size : Set maximum message size to SIZE.\n"); - fprintf(stdout, " By default, the value of SIZE is 1MB.\n"); - fprintf(stdout, " -i, --iterations : Set number of iterations per message size to ITER.\n"); - fprintf(stdout, " By default, the value of ITER is 1000 for small messages\n"); - fprintf(stdout, " and 100 for large messages.\n"); - fprintf(stdout, " -M, --mem-limit : Set maximum memory consumption (per process) to SIZE. \n"); - fprintf(stdout, " By default, the value of SIZE is 512MB.\n"); - } - - else { - fprintf(stdout, " USAGE : %s [-i ITER] [-f] [-hv] \n", prog); - fprintf(stdout, " -i, --iterations : Set number of iterations to ITER.\n"); - fprintf(stdout, " By default, the value of ITER is 1000.\n"); - } - - fprintf(stdout, " -f, --full : Print full format listing. With this option\n"); - fprintf(stdout, " the MIN/MAX latency and number of ITERATIONS are\n"); - fprintf(stdout, " printed out in addition to the AVERAGE latency.\n"); - - fprintf(stdout, " -h, --help : Print this help.\n"); - fprintf(stdout, " -v, --version : Print version info.\n"); - fprintf(stdout, "\n"); - fflush(stdout); - } -} - -void usage_oshm_pt2pt(int myid) -{ - if(myid == 0) { - fprintf(stderr, "Invalid arguments. 
Usage: \n"); - } -} - -void print_bad_usage_message (int rank) -{ - if (rank) { - return; - } - - if (bad_usage.optarg) { - fprintf(stderr, "%s [-%c %s]\n\n", bad_usage.message, - (char)bad_usage.opt, bad_usage.optarg); - } else { - fprintf(stderr, "%s [-%c]\n\n", bad_usage.message, - (char)bad_usage.opt); - } - - print_help_message(rank); -} - -int process_options (int argc, char *argv[]) -{ - extern char * optarg; - extern int optind, optopt; - - char const * optstring = NULL; - int c; - - int option_index = 0; - - static struct option long_options[] = { - {"help", no_argument, 0, 'h'}, - {"version", no_argument, 0, 'v'}, - {"full", no_argument, 0, 'f'}, - {"message-size", required_argument, 0, 'm'}, - {"window-size", required_argument, 0, 'W'}, - {"num-test-calls", required_argument, 0, 't'}, - {"iterations", required_argument, 0, 'i'}, - {"warmup", required_argument, 0, 'x'}, - {"array-size", required_argument, 0, 'a'}, - {"sync-option", required_argument, 0, 's'}, - {"win-options", required_argument, 0, 'w'}, - {"mem-limit", required_argument, 0, 'M'}, - {"accelerator", required_argument, 0, 'd'}, - {"cuda-target", required_argument, 0, 'r'}, - {"print-rate", required_argument, 0, 'R'}, - {"num-pairs", required_argument, 0, 'p'}, - {"vary-window", required_argument, 0, 'V'} - }; - - enable_accel_support(); - - if(options.bench == PT2PT) { - if (accel_enabled) { - optstring = (LAT_MT == options.subtype) ? "+:x:i:t:m:hv" : "+:x:i:m:d:hv"; - } else{ - optstring = (LAT_MT == options.subtype) ? "+:hvm:x:i:t:" : "+:hvm:x:i:"; - } - } else if (options.bench == COLLECTIVE) { - optstring = "+:hvfm:i:x:M:t:a:"; - if (accel_enabled) { - optstring = (CUDA_KERNEL_ENABLED) ? "+:d:hvfm:i:x:M:t:r:a:" : "+:d:hvfm:i:x:M:t:a:"; - } - } else if (options.bench == ONE_SIDED) { -#if MPI_VERSION >= 3 - optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:" : "+:w:s:hvm:x:i:"; -#else - optstring = (accel_enabled) ? 
"+:s:hvm:d:x:i:" : "+s:hvm:x:i:"; -#endif - } else if (options.bench == MBW_MR){ - optstring = "p:W:R:x:i:m:Vhv"; - } else if (options.bench == OSHM || options.bench == UPC || options.bench == UPCXX) { - optstring = ":hvfm:i:M:"; - } else { - fprintf(stderr,"Invalid benchmark type"); - exit(1); - } - - /* Set default options*/ - options.accel = NONE; - options.show_size = 1; - options.show_full = 0; - options.num_probes = 0; - options.device_array_size = 32; - options.target = CPU; - options.min_message_size = MIN_MESSAGE_SIZE; - if (options.bench == COLLECTIVE) { - options.max_message_size = MAX_MSG_SIZE_COLL; - } else { - options.max_message_size = MAX_MESSAGE_SIZE; - } - options.max_mem_limit = MAX_MEM_LIMIT; - options.window_size_large = WINDOW_SIZE_LARGE; - options.window_size = WINDOW_SIZE_LARGE; - options.window_varied = 0; - options.print_rate = 1; - - options.src = 'H'; - options.dst = 'H'; - - switch (options.subtype) { - case BW: - options.iterations = BW_LOOP_SMALL; - options.skip = BW_SKIP_SMALL; - options.iterations_large = BW_LOOP_LARGE; - options.skip_large = BW_SKIP_LARGE; - break; - case LAT_MT: - options.num_threads = DEF_NUM_THREADS; - options.min_message_size = 0; - case LAT: - if (options.bench == COLLECTIVE) { - options.iterations = COLL_LOOP_SMALL; - options.skip = COLL_SKIP_SMALL; - options.iterations_large = COLL_LOOP_LARGE; - options.skip_large = COLL_SKIP_LARGE; - } else { - options.iterations = LAT_LOOP_SMALL; - options.skip = LAT_SKIP_SMALL; - options.iterations_large = LAT_LOOP_LARGE; - options.skip_large = LAT_SKIP_LARGE; - } - if (options.bench == PT2PT) { - options.min_message_size = 0; - } - break; - default: - break; - } - - switch (options.bench) { - case UPCXX: - case UPC: - options.show_size = 0; - case OSHM: - options.iterations = OSHM_LOOP_SMALL; - options.skip = OSHM_SKIP_SMALL; - options.iterations_large = OSHM_LOOP_LARGE; - options.skip_large = OSHM_SKIP_LARGE; - options.max_message_size = 1<<20; - break; - default: - break; - } - - while ((c = getopt_long(argc, argv, optstring, long_options, &option_index)) != -1) { - bad_usage.opt = c; - bad_usage.optarg = NULL; - bad_usage.message = NULL; - - switch(c) { - case 'h': - return PO_HELP_MESSAGE; - case 'v': - return PO_VERSION_MESSAGE; - case 'm': - if (set_message_size(optarg)) { - bad_usage.message = "Invalid Message Size"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - break; - case 't': - if (options.bench == COLLECTIVE) { - if (set_num_probes(atoi(optarg))){ - bad_usage.message = "Invalid Number of Probes"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - } else if (options.bench == PT2PT) { - options.num_threads = atoi(optarg); - if (options.num_threads < MIN_NUM_THREADS - || options.num_threads >= MAX_NUM_THREADS) { - bad_usage.message = "Invalid Number of Threads"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - } - break; - case 'i': - if (set_num_iterations(atoi(optarg))) { - bad_usage.message = "Invalid Number of Iterations"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - break; - case 'x': - if (set_num_warmup(atoi(optarg))) { - bad_usage.message = "Invalid Number of Warmup Iterations"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - break; - case 'R': - options.print_rate = atoi(optarg); - if(0 != options.print_rate && 1 != options.print_rate) { - return PO_BAD_USAGE; - } - break; - case 'W': - if (options.bench == MBW_MR) { - if (set_window_size(atoi(optarg))) { - bad_usage.message = "Invalid Number of Iterations"; - 
bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - } - else { - if (set_window_size_large(atoi(optarg))) { - bad_usage.message = "Invalid Number of Iterations"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - } - break; - case 'V': - options.window_varied = 1; - break; - case 'p': - options.pairs = atoi(optarg); - break; - case 'a': - if (set_device_array_size(atoi(optarg))){ - bad_usage.message = "Invalid Device Array Size"; - bad_usage.optarg = optarg; - - return PO_BAD_USAGE; - } - break; - case 'f': - options.show_full = 1; - break; - case 'M': - /* - * This function does not error but prints a warning message if - * the value is too low. - */ - set_max_memlimit(atoll(optarg)); - break; - case 'd': - if (!accel_enabled) { - bad_usage.message = "Benchmark Does Not Support " - "Accelerator Transfers"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } else if (0 == strncasecmp(optarg, "cuda", 10)) { - if (CUDA_ENABLED) { - options.accel = CUDA; - } else { - bad_usage.message = "CUDA Support Not Enabled\n" - "Please recompile benchmark with CUDA support"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - } else if (0 == strncasecmp(optarg, "managed", 10)) { - if (CUDA_ENABLED) { - options.accel = MANAGED; - } else { - bad_usage.message = "CUDA Managed Memory Support Not Enabled\n" - "Please recompile benchmark with CUDA support"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - } else if (0 == strncasecmp(optarg, "openacc", 10)) { - if (OPENACC_ENABLED) { - options.accel = OPENACC; - } else { - bad_usage.message = "OpenACC Support Not Enabled\n" - "Please recompile benchmark with OpenACC support"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - } else { - bad_usage.message = "Invalid Accel Type Specified"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - break; - case 'r': - if (CUDA_KERNEL_ENABLED) { - if (0 == strncasecmp(optarg, "cpu", 10)) { - options.target = CPU; - } else if (0 == strncasecmp(optarg, "gpu", 10)) { - options.target = GPU; - } else if (0 == strncasecmp(optarg, "both", 10)) { - options.target = BOTH; - } else { - bad_usage.message = "Please use cpu, gpu, or both"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - } else { - bad_usage.message = "CUDA Kernel Support Not Enabled\n" - "Please recompile benchmark with CUDA Kernel support"; - bad_usage.optarg = optarg; - return PO_BAD_USAGE; - } - break; - -#if MPI_VERSION >= 3 - case 'w': - if (0 == strcasecmp(optarg, "create")) { - options.win = WIN_CREATE; - } else if (0 == strcasecmp(optarg, "allocate")) { - options.win = WIN_ALLOCATE; - } else if (0 == strcasecmp(optarg, "dynamic")) { - options.win = WIN_DYNAMIC; - } else { - return PO_BAD_USAGE; - } - break; -#endif - case 's': - if (0 == strcasecmp(optarg, "pscw")) { - options.sync = PSCW; - } else if (0 == strcasecmp(optarg, "fence")) { - options.sync = FENCE; - } else if (options.synctype== ALL_SYNC) { - if (0 == strcasecmp(optarg, "lock")) { - options.sync = LOCK; - } -#if MPI_VERSION >= 3 - else if (0 == strcasecmp(optarg, "flush")) { - options.sync = FLUSH; - } else if (0 == strcasecmp(optarg, "flush_local")) { - options.sync = FLUSH_LOCAL; - } else if (0 == strcasecmp(optarg, "lock_all")) { - options.sync = LOCK_ALL; - } -#endif - else { - return PO_BAD_USAGE; - } - } else { - return PO_BAD_USAGE; - } - break; - - case ':': - bad_usage.message = "Option Missing Required Argument"; - bad_usage.opt = optopt; - return PO_BAD_USAGE; - default: - bad_usage.message = "Invalid Option"; - bad_usage.opt 
= optopt; - return PO_BAD_USAGE; - } - - } - - if (accel_enabled) { - - if ((optind + 2) == argc) { - options.src = argv[optind][0]; - options.dst = argv[optind + 1][0]; - - switch (options.src) { - case 'D': - case 'H': - case 'M': - if (options.bench != PT2PT && options.bench != ONE_SIDED) { - bad_usage.opt = options.src; - bad_usage.message = "This argument is only supported for one-sided and pt2pt benchmarks"; - return PO_BAD_USAGE; - } - options.accel = CUDA; - break; - default: - return PO_BAD_USAGE; - } - - switch (options.dst) { - case 'D': - case 'H': - case 'M': - if (options.bench != PT2PT && options.bench != ONE_SIDED) { - bad_usage.opt = options.dst; - bad_usage.message = "This argument is only supported for one-sided and pt2pt benchmarks"; - return PO_BAD_USAGE; - } - options.accel = CUDA; - break; - default: - return PO_BAD_USAGE; - } - } else if (optind != argc) { - return PO_BAD_USAGE; - } - } - - return PO_OKAY; -} - - -void print_help_message (int rank) -{ - if (rank) { - return; - } - - if (accel_enabled && (options.bench == PT2PT)) { - fprintf(stdout, "Usage: %s [options] [RANK0 RANK1]\n\n", benchmark_name); - fprintf(stdout, "RANK0 and RANK1 may be `D', `H', or 'M' which specifies whether\n" - "the buffer is allocated on the accelerator device memory, host\n" - "memory or using CUDA Unified memory respectively for each mpi rank\n\n"); - } else { - fprintf(stdout, "Usage: %s [options]\n", benchmark_name); - fprintf(stdout, "Options:\n"); - } - - if (accel_enabled && (options.subtype != LAT_MT)) { - fprintf(stdout, " -d, --accelerator TYPE use accelerator device buffers, which can be of TYPE `cuda', \n"); - fprintf(stdout, " `managed' or `openacc' (uses standard host buffers if not specified)\n"); - } - - if (options.show_size) { - fprintf(stdout, " -m, --message-size [MIN:]MAX set the minimum and/or the maximum message size to MIN and/or MAX\n"); - fprintf(stdout, " bytes respectively. 
Examples:\n"); - fprintf(stdout, " -m 128 // min = default, max = 128\n"); - fprintf(stdout, " -m 2:128 // min = 2, max = 128\n"); - fprintf(stdout, " -m 2: // min = 2, max = default\n"); - fprintf(stdout, " -M, --mem-limit SIZE set per process maximum memory consumption to SIZE bytes\n"); - fprintf(stdout, " (default %d)\n", MAX_MEM_LIMIT); - } - - fprintf(stdout, " -i, --iterations ITER set iterations per message size to ITER (default 1000 for small\n"); - fprintf(stdout, " messages, 100 for large messages)\n"); - fprintf(stdout, " -x, --warmup ITER set number of warmup iterations to skip before timing (default 200)\n"); - - if (options.bench == COLLECTIVE) { - fprintf(stdout, " -f, --full print full format listing (MIN/MAX latency and ITERATIONS\n"); - fprintf(stdout, " displayed in addition to AVERAGE latency)\n"); - - fprintf(stdout, " -t, --num_test_calls CALLS set the number of MPI_Test() calls during the dummy computation, \n"); - fprintf(stdout, " set CALLS to 100, 1000, or any number > 0.\n"); - - if (CUDA_KERNEL_ENABLED) { - fprintf(stdout, " -r, --cuda-target TARGET set the compute target for dummy computation\n"); - fprintf(stdout, " set TARGET to cpu (default) to execute \n"); - fprintf(stdout, " on CPU only, set to gpu for executing kernel \n"); - fprintf(stdout, " on the GPU only, and set to both for compute on both.\n"); - - fprintf(stdout, " -a, --array-size SIZE set the size of arrays to be allocated on device (GPU) \n"); - fprintf(stdout, " for dummy compute on device (GPU) (default 32) \n"); - } - } - if (LAT_MT == options.subtype) { - fprintf(stdout, " -t, --num_threads number of recv threads to test with (min: %d, " - "default: %d, max: %d)\n", MIN_NUM_THREADS, DEF_NUM_THREADS, - MAX_NUM_THREADS); - } - - fprintf(stdout, " -h, --help print this help\n"); - fprintf(stdout, " -v, --version print version info\n"); - fprintf(stdout, "\n"); - fflush(stdout); -} - -void print_help_message_get_acc_lat (int rank) -{ - if (rank) { - return; - } - - fprintf(stdout, "Usage: ./osu_get_acc_latency -w -s < sync_option> [-x ITER] [-i ITER]\n"); - if (options.show_size) { - fprintf(stdout, " -m, --message-size [MIN:]MAX set the minimum and/or the maximum message size to MIN and/or MAX\n"); - fprintf(stdout, " bytes respectively. 
Examples:\n"); - fprintf(stdout, " -m 128 // min = default, max = 128\n"); - fprintf(stdout, " -m 2:128 // min = 2, max = 128\n"); - fprintf(stdout, " -m 2: // min = 2, max = default\n"); - fprintf(stdout, " -M, --mem-limit SIZE set per process maximum memory consumption to SIZE bytes\n"); - fprintf(stdout, " (default %d)\n", MAX_MEM_LIMIT); - } - - fprintf(stdout, " -x ITER number of warmup iterations to skip before timing" - "(default 100)\n"); - fprintf(stdout, " -i ITER number of iterations for timing (default 10000)\n"); - fprintf(stdout, "\n"); - fprintf(stdout, "win_option:\n"); - fprintf(stdout, " create use MPI_Win_create to create an MPI Window object\n"); - fprintf(stdout, " allocate use MPI_Win_allocate to create an MPI Window object\n"); - fprintf(stdout, " dynamic use MPI_Win_create_dynamic to create an MPI Window object\n"); - fprintf(stdout, "\n"); - - fprintf(stdout, "sync_option:\n"); - fprintf(stdout, " lock use MPI_Win_lock/unlock synchronizations calls\n"); - fprintf(stdout, " flush use MPI_Win_flush synchronization call\n"); - fprintf(stdout, " flush_local use MPI_Win_flush_local synchronization call\n"); - fprintf(stdout, " lock_all use MPI_Win_lock_all/unlock_all synchronization calls\n"); - fprintf(stdout, " pscw use Post/Start/Complete/Wait synchronization calls \n"); - fprintf(stdout, " fence use MPI_Win_fence synchronization call\n"); - fprintf(stdout, "\n"); - - fflush(stdout); -} - -void print_version_pgas(const char *header) -{ - fprintf(stdout, header, ""); - fflush(stdout); -} - -void print_version_message (int rank) -{ - if (rank) { - return; - } - - switch (options.accel) { - case CUDA: - printf(benchmark_header, "-CUDA"); - break; - case OPENACC: - printf(benchmark_header, "-OPENACC"); - break; - case MANAGED: - printf(benchmark_header, "-CUDA MANAGED"); - break; - default: - printf(benchmark_header, ""); - break; - } - - fflush(stdout); -} - -void print_preamble_nbc (int rank) -{ - if (rank) { - return; - } - - fprintf(stdout, "\n"); - - switch (options.accel) { - case CUDA: - printf(benchmark_header, "-CUDA"); - break; - case OPENACC: - printf(benchmark_header, "-OPENACC"); - break; - case MANAGED: - printf(benchmark_header, "-MANAGED"); - break; - default: - printf(benchmark_header, ""); - break; - } - - fprintf(stdout, "# Overall = Coll. Init + Compute + MPI_Test + MPI_Wait\n\n"); - - if (options.show_size) { - fprintf(stdout, "%-*s", 10, "# Size"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Overall(us)"); - } else { - fprintf(stdout, "%*s", FIELD_WIDTH, "Overall(us)"); - } - - if (options.show_full) { - fprintf(stdout, "%*s", FIELD_WIDTH, "Compute(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Coll. Init(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "MPI_Test(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "MPI_Wait(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Pure Comm.(us)"); - fprintf(stdout, "%*s\n", FIELD_WIDTH, "Overlap(%)"); - - } else { - fprintf(stdout, "%*s", FIELD_WIDTH, "Compute(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Pure Comm.(us)"); - fprintf(stdout, "%*s\n", FIELD_WIDTH, "Overlap(%)"); - } - - fflush(stdout); -} - -void display_nbc_params() -{ - if (options.show_full) { - fprintf(stdout, "%*s", FIELD_WIDTH, "Compute(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Coll. 
Init(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "MPI_Test(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "MPI_Wait(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Pure Comm.(us)"); - fprintf(stdout, "%*s\n", FIELD_WIDTH, "Overlap(%)"); - - } else { - fprintf(stdout, "%*s", FIELD_WIDTH, "Compute(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Pure Comm.(us)"); - fprintf(stdout, "%*s\n", FIELD_WIDTH, "Overlap(%)"); - } -} - -void print_preamble (int rank) -{ - if (rank) { - return; - } - - fprintf(stdout, "\n"); - - switch (options.accel) { - case CUDA: - printf(benchmark_header, "-CUDA"); - break; - case OPENACC: - printf(benchmark_header, "-OPENACC"); - break; - default: - printf(benchmark_header, ""); - break; - } - - if (options.show_size) { - fprintf(stdout, "%-*s", 10, "# Size"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Avg Latency(us)"); - } else { - fprintf(stdout, "# Avg Latency(us)"); - } - - if (options.show_full) { - fprintf(stdout, "%*s", FIELD_WIDTH, "Min Latency(us)"); - fprintf(stdout, "%*s", FIELD_WIDTH, "Max Latency(us)"); - fprintf(stdout, "%*s\n", 12, "Iterations"); - } else { - fprintf(stdout, "\n"); - } - - fflush(stdout); -} - -void calculate_and_print_stats(int rank, int size, int numprocs, - double timer, double latency, - double test_time, double cpu_time, - double wait_time, double init_time) -{ - double test_total = (test_time * 1e6) / options.iterations; - double tcomp_total = (cpu_time * 1e6) / options.iterations; - double overall_time = (timer * 1e6) / options.iterations; - double wait_total = (wait_time * 1e6) / options.iterations; - double init_total = (init_time * 1e6) / options.iterations; - double comm_time = latency; - - if(rank != 0) { - MPI_CHECK(MPI_Reduce(&test_total, &test_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&comm_time, &comm_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&overall_time, &overall_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&tcomp_total, &tcomp_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&wait_total, &wait_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(&init_total, &init_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - } else { - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &test_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &comm_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &overall_time, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &tcomp_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &wait_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, &init_total, 1, MPI_DOUBLE, MPI_SUM, 0, - MPI_COMM_WORLD)); - } - - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - - /* Overall Time (Overlapped) */ - overall_time = overall_time/numprocs; - /* Computation Time */ - tcomp_total = tcomp_total/numprocs; - /* Time taken by MPI_Test calls */ - test_total = test_total/numprocs; - /* Pure Communication Time */ - comm_time = comm_time/numprocs; - /* Time for MPI_Wait() call */ - wait_total = wait_total/numprocs; - /* Time for the NBC call */ - init_total = init_total/numprocs; - - print_stats_nbc(rank, size, overall_time, tcomp_total, comm_time, - wait_total, init_total, test_total); - -} - -void print_stats_nbc (int rank, int size, double overall_time, - double cpu_time, double comm_time, - 
double wait_time, double init_time, - double test_time) -{ - if (rank) { - return; - } - - double overlap; - - /* Note : cpu_time received in this function includes time for - * dummy compute as well as test calls so we will subtract - * the test_time for overlap calculation as test is an - * overhead - */ - - overlap = MAX(0, 100 - (((overall_time - (cpu_time - test_time)) / comm_time) * 100)); - - if (options.show_size) { - fprintf(stdout, "%-*d", 10, size); - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, overall_time); - } else { - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, overall_time); - } - - if (options.show_full) { - fprintf(stdout, "%*.*f%*.*f%*.*f%*.*f%*.*f%*.*f\n", - FIELD_WIDTH, FLOAT_PRECISION, (cpu_time - test_time), - FIELD_WIDTH, FLOAT_PRECISION, init_time, - FIELD_WIDTH, FLOAT_PRECISION, test_time, - FIELD_WIDTH, FLOAT_PRECISION, wait_time, - FIELD_WIDTH, FLOAT_PRECISION, comm_time, - FIELD_WIDTH, FLOAT_PRECISION, overlap); - } else { - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, (cpu_time - test_time)); - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, comm_time); - fprintf(stdout, "%*.*f\n", FIELD_WIDTH, FLOAT_PRECISION, overlap); - } - - fflush(stdout); -} - -void print_stats (int rank, int size, double avg_time, double min_time, double max_time) -{ - if (rank) { - return; - } - - if (options.show_size) { - fprintf(stdout, "%-*d", 10, size); - fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, avg_time); - } else { - fprintf(stdout, "%*.*f", 17, FLOAT_PRECISION, avg_time); - } - - if (options.show_full) { - fprintf(stdout, "%*.*f%*.*f%*lu\n", - FIELD_WIDTH, FLOAT_PRECISION, min_time, - FIELD_WIDTH, FLOAT_PRECISION, max_time, - 12, options.iterations); - } else { - fprintf(stdout, "\n"); - } - - fflush(stdout); -} - -double getMicrosecondTimeStamp() -{ - double retval; - struct timeval tv; - if (gettimeofday(&tv, NULL)) { - perror("gettimeofday"); - abort(); - } - retval = ((double)tv.tv_sec) * 1000000 + tv.tv_usec; - return retval; -} - -void set_buffer (void * buffer, enum accel_type type, int data, size_t size) -{ -#ifdef _ENABLE_OPENACC_ - size_t i; - char * p = (char *)buffer; -#endif - switch (type) { - case NONE: - memset(buffer, data, size); - break; - case CUDA: - case MANAGED: -#ifdef _ENABLE_CUDA_ - cudaMemset(buffer, data, size); -#endif - break; - case OPENACC: -#ifdef _ENABLE_OPENACC_ - #pragma acc parallel loop deviceptr(p) - for (i = 0; i < size; i++) { - p[i] = data; - } -#endif - break; - } -} - -int allocate_memory_coll (void ** buffer, size_t size, enum accel_type type) -{ - if (options.target == CPU || options.target == BOTH) { - allocate_host_arrays(); - } - - size_t alignment = sysconf(_SC_PAGESIZE); -#ifdef _ENABLE_CUDA_ - cudaError_t cuerr = cudaSuccess; -#endif - - switch (type) { - case NONE: - return posix_memalign(buffer, alignment, size); -#ifdef _ENABLE_CUDA_ - case CUDA: - cuerr = cudaMalloc(buffer, size); - if (cudaSuccess != cuerr) { - return 1; - } else { - return 0; - } - case MANAGED: - cuerr = cudaMallocManaged(buffer, size, cudaMemAttachGlobal); - if (cudaSuccess != cuerr) { - return 1; - } else { - return 0; - } -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: - *buffer = acc_malloc(size); - if (NULL == *buffer) { - return 1; - } else { - return 0; - } -#endif - default: - return 1; - } -} - -int allocate_device_buffer (char ** buffer) -{ -#ifdef _ENABLE_CUDA_ - cudaError_t cuerr = cudaSuccess; -#endif - - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case CUDA: - cuerr = 
cudaMalloc((void **)buffer, options.max_message_size); - - if (cudaSuccess != cuerr) { - fprintf(stderr, "Could not allocate device memory\n"); - return 1; - } - break; -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: - *buffer = acc_malloc(options.max_message_size); - if (NULL == *buffer) { - fprintf(stderr, "Could not allocate device memory\n"); - return 1; - } - break; -#endif - default: - fprintf(stderr, "Could not allocate device memory\n"); - return 1; - } - - return 0; -} - -int allocate_managed_buffer (char ** buffer) -{ -#ifdef _ENABLE_CUDA_ - cudaError_t cuerr = cudaSuccess; -#endif - - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case CUDA: - cuerr = cudaMallocManaged((void **)buffer, options.max_message_size, cudaMemAttachGlobal); - - if (cudaSuccess != cuerr) { - fprintf(stderr, "Could not allocate device memory\n"); - return 1; - } - break; -#endif - default: - fprintf(stderr, "Could not allocate device memory\n"); - return 1; - - } - return 0; -} - -int allocate_memory_pt2pt (char ** sbuf, char ** rbuf, int rank) -{ - unsigned long align_size = sysconf(_SC_PAGESIZE); - - switch (rank) { - case 0: - if ('D' == options.src) { - if (allocate_device_buffer(sbuf)) { - fprintf(stderr, "Error allocating cuda memory\n"); - return 1; - } - - if (allocate_device_buffer(rbuf)) { - fprintf(stderr, "Error allocating cuda memory\n"); - return 1; - } - } else if ('M' == options.src) { - if (allocate_managed_buffer(sbuf)) { - fprintf(stderr, "Error allocating cuda unified memory\n"); - return 1; - } - - if (allocate_managed_buffer(rbuf)) { - fprintf(stderr, "Error allocating cuda unified memory\n"); - return 1; - } - } else { - if (posix_memalign((void**)sbuf, align_size, options.max_message_size)) { - fprintf(stderr, "Error allocating host memory\n"); - return 1; - } - - if (posix_memalign((void**)rbuf, align_size, options.max_message_size)) { - fprintf(stderr, "Error allocating host memory\n"); - return 1; - } - } - break; - case 1: - if ('D' == options.dst) { - if (allocate_device_buffer(sbuf)) { - fprintf(stderr, "Error allocating cuda memory\n"); - return 1; - } - - if (allocate_device_buffer(rbuf)) { - fprintf(stderr, "Error allocating cuda memory\n"); - return 1; - } - } else if ('M' == options.dst) { - if (allocate_managed_buffer(sbuf)) { - fprintf(stderr, "Error allocating cuda unified memory\n"); - return 1; - } - - if (allocate_managed_buffer(rbuf)) { - fprintf(stderr, "Error allocating cuda unified memory\n"); - return 1; - } - } else { - if (posix_memalign((void**)sbuf, align_size, options.max_message_size)) { - fprintf(stderr, "Error allocating host memory\n"); - return 1; - } - - if (posix_memalign((void**)rbuf, align_size, options.max_message_size)) { - fprintf(stderr, "Error allocating host memory\n"); - return 1; - } - } - break; - } - - return 0; -} - -void allocate_memory_one_sided(int rank, char *sbuf_orig, char *rbuf_orig, char **sbuf, char **rbuf, - char **win_base, int size, enum WINDOW type, MPI_Win *win) -{ - int page_size; - - page_size = getpagesize(); - assert(page_size <= MAX_ALIGNMENT); - - if (rank == 0) { - mem_on_dev = ('D' == options.src) ? 1 : 0; - } else { - mem_on_dev = ('D' == options.dst) ? 
1 : 0; - } - - if (mem_on_dev) { - CHECK(allocate_device_buffer(sbuf)); - set_device_memory(*sbuf, 'a', size); - CHECK(allocate_device_buffer(rbuf)); - set_device_memory(*rbuf, 'b', size); - } else { - *sbuf = (char *)align_buffer((void *)sbuf_orig, page_size); - memset(*sbuf, 'a', size); - *rbuf = (char *)align_buffer((void *)rbuf_orig, page_size); - memset(*rbuf, 'b', size); - } - -#if MPI_VERSION >= 3 - MPI_Status reqstat; - - switch (type) { - case WIN_CREATE: - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); - break; - case WIN_DYNAMIC: - MPI_CHECK(MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, win)); - MPI_CHECK(MPI_Win_attach(*win, (void *)*win_base, size)); - MPI_CHECK(MPI_Get_address(*win_base, &disp_local)); - if(rank == 0){ - MPI_CHECK(MPI_Send(&disp_local, 1, MPI_AINT, 1, 1, MPI_COMM_WORLD)); - MPI_CHECK(MPI_Recv(&disp_remote, 1, MPI_AINT, 1, 1, MPI_COMM_WORLD, &reqstat)); - } else { - MPI_CHECK(MPI_Recv(&disp_remote, 1, MPI_AINT, 0, 1, MPI_COMM_WORLD, &reqstat)); - MPI_CHECK(MPI_Send(&disp_local, 1, MPI_AINT, 0, 1, MPI_COMM_WORLD)); - } - break; - default: - if (mem_on_dev) { - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); - } else { - MPI_CHECK(MPI_Win_allocate(size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, *win_base, win)); - } - break; - } -#else - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); -#endif -} - -void free_buffer (void * buffer, enum accel_type type) -{ - switch (type) { - case NONE: - free(buffer); - break; - case MANAGED: - case CUDA: -#ifdef _ENABLE_CUDA_ - cudaFree(buffer); -#endif - break; - case OPENACC: -#ifdef _ENABLE_OPENACC_ - acc_free(buffer); -#endif - break; - } - - /* Free dummy compute related resources */ - if (CPU == options.target || BOTH == options.target) { - free_host_arrays(); - } - - if (GPU == options.target || BOTH == options.target) { -#ifdef _ENABLE_CUDA_KERNEL_ - free_device_arrays(); -#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */ - } -} - -int init_accel (void) -{ -#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_) - char * str; - int local_rank, dev_count; - int dev_id = 0; -#endif -#ifdef _ENABLE_CUDA_ - CUresult curesult = CUDA_SUCCESS; - CUdevice cuDevice; -#endif - - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case MANAGED: - case CUDA: - if ((str = getenv("LOCAL_RANK")) != NULL) { - cudaGetDeviceCount(&dev_count); - local_rank = atoi(str); - dev_id = local_rank % dev_count; - } - - curesult = cuInit(0); - if (curesult != CUDA_SUCCESS) { - return 1; - } - - curesult = cuDeviceGet(&cuDevice, dev_id); - if (curesult != CUDA_SUCCESS) { - return 1; - } - - curesult = cuCtxCreate(&cuContext, 0, cuDevice); - if (curesult != CUDA_SUCCESS) { - return 1; - } - break; -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: - if ((str = getenv("LOCAL_RANK")) != NULL) { - dev_count = acc_get_num_devices(acc_device_not_host); - local_rank = atoi(str); - dev_id = local_rank % dev_count; - } - - acc_set_device_num (dev_id, acc_device_not_host); - break; -#endif - default: - fprintf(stderr, "Invalid device type, should be cuda or openacc\n"); - return 1; - } - - return 0; -} - -int cleanup_accel (void) -{ -#ifdef _ENABLE_CUDA_ - CUresult curesult = CUDA_SUCCESS; -#endif - - switch (options.accel) { -#ifdef _ENABLE_CUDA_ - case MANAGED: - case CUDA: - curesult = cuCtxDestroy(cuContext); - - if (curesult != CUDA_SUCCESS) { - return 1; - } - break; -#endif -#ifdef _ENABLE_OPENACC_ - case OPENACC: - acc_shutdown(acc_device_nvidia); - break; 
-#endif - default: - fprintf(stderr, "Invalid accel type, should be cuda or openacc\n"); - return 1; - } - - return 0; -} - -#ifdef _ENABLE_CUDA_KERNEL_ -void free_device_arrays() -{ - cudaError_t cuerr = cudaSuccess; - if (is_alloc) { - cuerr = cudaFree(d_x); - if (cuerr != cudaSuccess) { - fprintf(stderr, "Failed to free device array\n"); - } - - cuerr = cudaFree(d_y); - if (cuerr != cudaSuccess) { - fprintf(stderr, "Failed to free device array\n"); - } - - is_alloc = 0; - } -} -#endif - -void free_host_arrays() -{ - int i = 0; - - if (x) { - free(x); - } - if (y) { - free(y); - } - - if (a) { - for (i = 0; i < DIM; i++) { - free(a[i]); - } - free(a); - } - - x = NULL; - y = NULL; - a = NULL; -} - -void free_memory (void * sbuf, void * rbuf, int rank) -{ - switch (rank) { - case 0: - if ('D' == options.src || 'M' == options.src) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - } else { - free(sbuf); - free(rbuf); - } - break; - case 1: - if ('D' == options.dst || 'M' == options.dst) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - } else { - free(sbuf); - free(rbuf); - } - break; - } -} - -void free_memory_one_sided (void *sbuf, void *rbuf, MPI_Win win, int rank) -{ - MPI_CHECK(MPI_Win_free(&win)); - - switch (rank) { - case 0: - if ('D' == options.src) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - } - break; - case 1: - if ('D' == options.dst) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - } - break; - } -} - -double dummy_compute(double seconds, MPI_Request* request) -{ - double test_time = 0.0; - - test_time = do_compute_and_probe(seconds, request); - - return test_time; -} - -#ifdef _ENABLE_CUDA_KERNEL_ -void do_compute_gpu(double seconds) -{ - int i,j; - double time_elapsed = 0.0, t1 = 0.0, t2 = 0.0; - - { - t1 = MPI_Wtime(); - - /* Execute Dummy Kernel on GPU if set by user */ - if (options.target == BOTH || options.target == GPU) { - { - cudaStreamCreate(&stream); - call_kernel(A, d_x, d_y, options.device_array_size, &stream); - } - } - - t2 = MPI_Wtime(); - time_elapsed += (t2-t1); - } -} -#endif - -void -compute_on_host() -{ - int i = 0, j = 0; - for (i = 0; i < DIM; i++) - for (j = 0; j < DIM; j++) - x[i] = x[i] + a[i][j]*a[j][i] + y[j]; -} - - -static inline void do_compute_cpu(double target_seconds) -{ - double t1 = 0.0, t2 = 0.0; - double time_elapsed = 0.0; - while (time_elapsed < target_seconds) { - t1 = MPI_Wtime(); - compute_on_host(); - t2 = MPI_Wtime(); - time_elapsed += (t2-t1); - } - if (DEBUG) { - fprintf(stderr, "time elapsed = %f\n", (time_elapsed * 1e6)); - } -} - -void wtime(double *t) -{ - static int sec = -1; - struct timeval tv; - //gettimeofday(&tv, (void *)0); - gettimeofday(&tv, 0); - if (sec < 0) sec = tv.tv_sec; - *t = (tv.tv_sec - sec)*1.0e+6 + tv.tv_usec; -} - -double do_compute_and_probe(double seconds, MPI_Request* request) -{ - double t1 = 0.0, t2 = 0.0; - double test_time = 0.0; - int num_tests = 0; - double target_seconds_for_compute = 0.0; - int flag = 0; - MPI_Status status; - - if (options.num_probes) { - target_seconds_for_compute = (double) seconds/options.num_probes; - if (DEBUG) { - fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 )); - } - } else { - target_seconds_for_compute = seconds; - if (DEBUG) { - fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 )); - } - } - -#ifdef _ENABLE_CUDA_KERNEL_ - if (options.target == GPU) { - if (options.num_probes) { - /* Do the dummy compute on GPU only */ - 
do_compute_gpu(target_seconds_for_compute); - num_tests = 0; - while (num_tests < options.num_probes) { - t1 = MPI_Wtime(); - MPI_CHECK(MPI_Test(request, &flag, &status)); - t2 = MPI_Wtime(); - test_time += (t2-t1); - num_tests++; - } - } else { - do_compute_gpu(target_seconds_for_compute); - } - } else if (options.target == BOTH) { - if (options.num_probes) { - /* Do the dummy compute on GPU and CPU*/ - do_compute_gpu(target_seconds_for_compute); - num_tests = 0; - while (num_tests < options.num_probes) { - t1 = MPI_Wtime(); - MPI_CHECK(MPI_Test(request, &flag, &status)); - t2 = MPI_Wtime(); - test_time += (t2-t1); - num_tests++; - do_compute_cpu(target_seconds_for_compute); - } - } else { - do_compute_gpu(target_seconds_for_compute); - do_compute_cpu(target_seconds_for_compute); - } - } else -#endif - if (options.target == CPU) { - if (options.num_probes) { - num_tests = 0; - while (num_tests < options.num_probes) { - do_compute_cpu(target_seconds_for_compute); - t1 = MPI_Wtime(); - MPI_CHECK(MPI_Test(request, &flag, &status)); - t2 = MPI_Wtime(); - test_time += (t2-t1); - num_tests++; - } - } else { - do_compute_cpu(target_seconds_for_compute); - } - } - -#ifdef _ENABLE_CUDA_KERNEL_ - if (options.target == GPU || options.target == BOTH) { - cudaDeviceSynchronize(); - cudaStreamDestroy(stream); - } -#endif - - return test_time; -} - -void allocate_host_arrays() -{ - int i=0, j=0; - a = (float **)malloc(DIM * sizeof(float *)); - - for (i = 0; i < DIM; i++) { - a[i] = (float *)malloc(DIM * sizeof(float)); - } - - x = (float *)malloc(DIM * sizeof(float)); - y = (float *)malloc(DIM * sizeof(float)); - - for (i = 0; i < DIM; i++) { - x[i] = y[i] = 1.0f; - for (j = 0; j < DIM; j++) { - a[i][j] = 2.0f; - } - } -} - -void allocate_atomic_memory(int rank, char *sbuf_orig, char *rbuf_orig, char *tbuf_orig, - char *cbuf_orig, char **sbuf, char **rbuf, char **tbuf, - char **cbuf, char **win_base, int size, enum WINDOW type, MPI_Win *win) -{ - int page_size; - - page_size = getpagesize(); - assert(page_size <= MAX_ALIGNMENT); - - if (rank == 0) { - mem_on_dev = ('D' == options.src) ? 1 : 0; - } else { - mem_on_dev = ('D' == options.dst) ? 
1 : 0; - } - - if (mem_on_dev) { - CHECK(allocate_device_buffer(sbuf)); - set_device_memory(*sbuf, 'a', size); - CHECK(allocate_device_buffer(rbuf)); - set_device_memory(*rbuf, 'b', size); - CHECK(allocate_device_buffer(tbuf)); - set_device_memory(*tbuf, 'c', size); - if (cbuf != NULL) { - CHECK(allocate_device_buffer(cbuf)); - set_device_memory(*cbuf, 'a', size); - } - } else { - *sbuf = (char *)align_buffer((void *)sbuf_orig, page_size); - memset(*sbuf, 'a', size); - *rbuf = (char *)align_buffer((void *)rbuf_orig, page_size); - memset(*rbuf, 'b', size); - *tbuf = (char *)align_buffer((void *)tbuf_orig, page_size); - memset(*tbuf, 'c', size); - if (cbuf != NULL) { - *cbuf = (char *)align_buffer((void *)cbuf_orig, page_size); - memset(*cbuf, 'a', size); - } - } - -#if MPI_VERSION >= 3 - MPI_Status reqstat; - - switch (type) { - case WIN_CREATE: - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); - break; - case WIN_DYNAMIC: - MPI_CHECK(MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, win)); - MPI_CHECK(MPI_Win_attach(*win, (void *)*win_base, size)); - MPI_CHECK(MPI_Get_address(*win_base, &disp_local)); - if(rank == 0){ - MPI_CHECK(MPI_Send(&disp_local, 1, MPI_AINT, 1, 1, MPI_COMM_WORLD)); - MPI_CHECK(MPI_Recv(&disp_remote, 1, MPI_AINT, 1, 1, MPI_COMM_WORLD, &reqstat)); - } else { - MPI_CHECK(MPI_Recv(&disp_remote, 1, MPI_AINT, 0, 1, MPI_COMM_WORLD, &reqstat)); - MPI_CHECK(MPI_Send(&disp_local, 1, MPI_AINT, 0, 1, MPI_COMM_WORLD)); - } - break; - default: - if (mem_on_dev) { - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); - } else { - MPI_CHECK(MPI_Win_allocate(size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, *win_base, win)); - } - break; - } -#else - MPI_CHECK(MPI_Win_create(*win_base, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win)); -#endif -} - -void free_atomic_memory (void *sbuf, void *rbuf, void *tbuf, void *cbuf, MPI_Win win, int rank) -{ - MPI_CHECK(MPI_Win_free(&win)); - - switch (rank) { - case 0: - if ('D' == options.src) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - free_device_buffer(tbuf); - if (cbuf != NULL) - free_device_buffer(cbuf); - } - break; - case 1: - if ('D' == options.dst) { - free_device_buffer(sbuf); - free_device_buffer(rbuf); - free_device_buffer(tbuf); - if (cbuf != NULL) - free_device_buffer(cbuf); - } - break; - } -} - -void init_arrays(double target_time) -{ - - if (DEBUG) { - fprintf(stderr, "called init_arrays with target_time = %f \n", - (target_time * 1e6)); - } - -#ifdef _ENABLE_CUDA_KERNEL_ - if (options.target == GPU || options.target == BOTH) { - /* Setting size of arrays for Dummy Compute */ - int N = options.device_array_size; - - /* Device Arrays for Dummy Compute */ - allocate_device_arrays(N); - - double time_elapsed = 0.0; - double t1 = 0.0, t2 = 0.0; - - while (1) { - t1 = MPI_Wtime(); - - if (options.target == GPU || options.target == BOTH) { - cudaStreamCreate(&stream); - call_kernel(A, d_x, d_y, N, &stream); - - cudaDeviceSynchronize(); - cudaStreamDestroy(stream); - } - - t2 = MPI_Wtime(); - if ((t2-t1) < target_time) - { - N += 32; - - /* Now allocate arrays of size N */ - allocate_device_arrays(N); - } else { - break; - } - } - - /* we reach here with desired N so save it and pass it to options */ - options.device_array_size = N; - if (DEBUG) { - fprintf(stderr, "correct N = %d\n", N); - } - } -#endif - -} - -#ifdef _ENABLE_CUDA_KERNEL_ -void allocate_device_arrays(int n) -{ - cudaError_t cuerr = cudaSuccess; - - /* First free the old arrays */ - 
free_device_arrays(); - - /* Allocate Device Arrays for Dummy Compute */ - cuerr = cudaMalloc((void**)&d_x, n * sizeof(float)); - if (cuerr != cudaSuccess) { - fprintf(stderr, "Failed to free device array"); - } - - cuerr = cudaMalloc((void**)&d_y, n * sizeof(float)); - if (cuerr != cudaSuccess) { - fprintf(stderr, "Failed to free device array"); - } - - cudaMemset(d_x, 1.0f, n); - cudaMemset(d_y, 2.0f, n); - is_alloc = 1; -} -#endif -/* vi:set sw=4 sts=4 tw=80: */ diff --git a/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.h b/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.h deleted file mode 100644 index 0eae7f68bf..0000000000 --- a/cscs-checks/microbenchmarks/mpi/osu/src/osu_util.h +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (C) 2002-2017 the Network-Based Computing Laboratory - * (NBCL), The Ohio State University. - * - * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) - * - * For detailed copyright and licensing information, please refer to the - * copyright file COPYRIGHT in the top level OMB directory. - */ -#ifndef OSU_COLL_H -#define OSU_COLL_H 1 -#endif - -#ifndef OSU_PT2PT_H -#define OSU_PT2PT_H 1 -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -#ifdef _ENABLE_CUDA_ -#include "cuda.h" -#include "cuda_runtime.h" -#endif - -#define MIN(a,b) ((a)<(b)?(a):(b)) -#define MAX(a,b) ((a)>(b)?(a):(b)) - -#ifdef _ENABLE_OPENACC_ -# define OPENACC_ENABLED 1 -# include -#else -# define OPENACC_ENABLED 0 -#endif - -#ifdef _ENABLE_CUDA_ -# define CUDA_ENABLED 1 -#else -# define CUDA_ENABLED 0 -#endif - -#ifdef _ENABLE_CUDA_KERNEL_ -# define CUDA_KERNEL_ENABLED 1 -#else -# define CUDA_KERNEL_ENABLED 0 -#endif - -#ifndef BENCHMARK -# define BENCHMARK "MPI%s BENCHMARK NAME UNSET" -#endif - -#ifdef PACKAGE_VERSION -# define HEADER "# " BENCHMARK " v" PACKAGE_VERSION "\n" -#else -# define HEADER "# " BENCHMARK "\n" -#endif - -#ifndef FIELD_WIDTH -# define FIELD_WIDTH 20 -#endif - -#ifndef FLOAT_PRECISION -# define FLOAT_PRECISION 2 -#endif - -#define SYNC_MODE (UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC) - -#define CHECK(stmt) \ -do { \ - int errno = (stmt); \ - if (0 != errno) { \ - fprintf(stderr, "[%s:%d] function call failed with %d \n",\ - __FILE__, __LINE__, errno); \ - exit(EXIT_FAILURE); \ - } \ - assert(0 == errno); \ -} while (0) - -#define MPI_CHECK(stmt) \ -do { \ - int mpi_errno = (stmt); \ - if (MPI_SUCCESS != mpi_errno) { \ - fprintf(stderr, "[%s:%d] MPI call failed with %d \n", \ - __FILE__, __LINE__,mpi_errno); \ - exit(EXIT_FAILURE); \ - } \ - assert(MPI_SUCCESS == mpi_errno); \ -} while (0) - -#define TIME() getMicrosecondTimeStamp() -double getMicrosecondTimeStamp(); - -void print_header_coll (int rank, int full) __attribute__((unused)); -void print_header_nbc (int rank, int full); -void print_data (int rank, int full, int size, double avg_time, double -min_time, double max_time, int iterations) __attribute__((unused)); -void print_data_nbc (int rank, int full, int size, double ovrl, double -cpu, double comm, double wait, double init, int iterations); - -void allocate_host_arrays(); - -void -calculate_and_print_stats(int rank, int size, int numprocs, - double timer, double latency, - double test_time, double cpu_time, - double wait_time, double init_time); - - -enum mpi_req{ - MAX_REQ_NUM = 1000 -}; - -#define BW_LOOP_SMALL 100 -#define BW_SKIP_SMALL 10 -#define BW_LOOP_LARGE 20 -#define BW_SKIP_LARGE 2 -#define LAT_LOOP_SMALL 10000 -#define LAT_SKIP_SMALL 100 -#define 
LAT_LOOP_LARGE 1000 -#define LAT_SKIP_LARGE 10 -#define COLL_LOOP_SMALL 1000 -#define COLL_SKIP_SMALL 100 -#define COLL_LOOP_LARGE 100 -#define COLL_SKIP_LARGE 10 -#define OSHM_LOOP_SMALL 1000 -#define OSHM_LOOP_LARGE 100 -#define OSHM_SKIP_SMALL 200 -#define OSHM_SKIP_LARGE 10 -#define OSHM_LOOP_SMALL_MR 500 -#define OSHM_LOOP_LARGE_MR 50 -#define OSHM_LOOP_ATOMIC 500 -#define UPC_LOOP_SMALL 1000 -#define UPC_LOOP_LARGE 100 -#define UPC_SKIP_SMALL 200 -#define UPC_SKIP_LARGE 10 - -#define MAX_MESSAGE_SIZE (1 << 22) -#define MAX_MSG_SIZE_PT2PT (1<<20) -#define MAX_MSG_SIZE_COLL (1<<20) -#define MIN_MESSAGE_SIZE 1 -#define LARGE_MESSAGE_SIZE 8192 - -#define MAX_ALIGNMENT 65536 -#define MAX_MEM_LIMIT (512*1024*1024) -#define MAX_MEM_LOWER_LIMIT (1*1024*1024) -#define WINDOW_SIZE_LARGE 64 -#define MYBUFSIZE MAX_MESSAGE_SIZE -#define ONESBUFSIZE ((MAX_MESSAGE_SIZE * WINDOW_SIZE_LARGE) + MAX_ALIGNMENT) -#define MESSAGE_ALIGNMENT 64 -#define MESSAGE_ALIGNMENT_MR (1<<12) -#define MYBUFSIZE_MR (MAX_MESSAGE_SIZE * OSHM_LOOP_LARGE_MR + MESSAGE_ALIGNMENT) - -enum po_ret_type { - PO_CUDA_NOT_AVAIL, - PO_OPENACC_NOT_AVAIL, - PO_BAD_USAGE, - PO_HELP_MESSAGE, - PO_VERSION_MESSAGE, - PO_OKAY, -}; - -enum accel_type { - NONE, - CUDA, - OPENACC, - MANAGED -}; - -enum target_type { - CPU, - GPU, - BOTH -}; - -enum benchmark_type { - COLLECTIVE, - PT2PT, - ONE_SIDED, - MBW_MR, - OSHM, - UPC, - UPCXX -}; - -enum test_subtype { - BW, - LAT, - LAT_MT, -}; - -enum test_synctype { - ALL_SYNC, - ACTIVE_SYNC -}; - -enum WINDOW { - WIN_CREATE=0, -#if MPI_VERSION >= 3 - WIN_ALLOCATE, - WIN_DYNAMIC -#endif -}; - -/* Synchronization */ -enum SYNC { - LOCK=0, - PSCW, - FENCE, -#if MPI_VERSION >= 3 - FLUSH, - FLUSH_LOCAL, - LOCK_ALL, -#endif -}; - -/*variables*/ -extern char const *win_info[20]; -extern char const *sync_info[20]; - -extern MPI_Aint disp_remote; -extern MPI_Aint disp_local; - -struct options_t { - enum accel_type accel; - enum target_type target; - int show_size; - int show_full; - size_t min_message_size; - size_t max_message_size; - size_t iterations; - size_t iterations_large; - size_t max_mem_limit; - size_t skip; - size_t skip_large; - size_t window_size_large; - int num_probes; - int device_array_size; - - enum benchmark_type bench; - enum test_subtype subtype; - enum test_synctype synctype; - - char src; - char dst; - int num_threads; - char managedSend; - char managedRecv; - enum WINDOW win; - enum SYNC sync; - - int window_size; - int window_varied; - int print_rate; - int pairs; -}; - - -extern struct options_t options; - - -/* - * Non-blocking Collectives - */ -double call_test(int * num_tests, MPI_Request** request); -void allocate_device_arrays(int n); -double dummy_compute(double target_secs, MPI_Request *request); -void init_arrays(double seconds); -double do_compute_and_probe(double seconds, MPI_Request *request); -void free_host_arrays(); - -#ifdef _ENABLE_CUDA_KERNEL_ -void free_device_arrays(); -#endif - -/* - * Option Processing - */ -int process_options (int argc, char *argv[]); - -/* - * Print Information - */ -void print_bad_usage_message (int rank); -void print_help_message (int rank); -void print_version_message (int rank); -void print_preamble (int rank); -void print_preamble_nbc (int rank); -void print_stats (int rank, int size, double avg, double min, double max); -void print_stats_nbc (int rank, int size, double ovrl, double cpu, double comm, - double wait, double init, double test); -/* - * Memory Management - */ -int allocate_memory_coll (void ** buffer, size_t size, enum 
accel_type type);
-void free_buffer (void * buffer, enum accel_type type);
-void set_buffer (void * buffer, enum accel_type type, int data, size_t size);
-
-/*
- * CUDA Context Management
- */
-int init_accel (void);
-int cleanup_accel (void);
-
-/*
- * Set Benchmark Properties
- */
-void set_header (const char * header);
-void set_benchmark_name (const char * name);
-void enable_accel_support (void);
-
-extern MPI_Request request[MAX_REQ_NUM];
-extern MPI_Status reqstat[MAX_REQ_NUM];
-extern MPI_Request send_request[MAX_REQ_NUM];
-extern MPI_Request recv_request[MAX_REQ_NUM];
-
-#ifdef _ENABLE_CUDA_
-extern CUcontext cuContext;
-#endif
-
-
-#define DEF_NUM_THREADS 2
-#define MIN_NUM_THREADS 1
-#define MAX_NUM_THREADS 128
-
-#define WINDOW_SIZES {8, 16, 32, 64, 128}
-#define WINDOW_SIZES_COUNT (5)
-
-void usage_mbw_mr();
-int allocate_memory_pt2pt (char **sbuf, char **rbuf, int rank);
-void print_header_pt2pt (int rank, int type);
-void free_memory (void *sbuf, void *rbuf, int rank);
-void print_header(int rank, int full);
-void allocate_memory_one_sided(int, char *, char *, char **, char **,
-                char **win_base, int, enum WINDOW, MPI_Win *);
-void free_memory_one_sided (void *, void *, MPI_Win, int);
-void allocate_atomic_memory(int, char *, char *, char *,
-                char *, char **, char **, char **, char **,
-                char **win_base, int, enum WINDOW, MPI_Win *);
-void free_atomic_memory (void *, void *, void *, void *, MPI_Win, int);
-void usage_one_sided (char const *);
-void print_header_one_sided (int, enum WINDOW, enum SYNC);
-
-void print_help_message_get_acc_lat (int);
-
-void print_header_pgas (const char *header, int rank, int full);
-void print_data_pgas (int rank, int full, int size, double avg_time, double min_time, double max_time, int iterations);
-void print_usage_pgas(int rank, const char * prog, int has_size);
-void print_version_pgas(const char *header);
-void usage_oshm_pt2pt(int myid);
-void wtime(double *t);
diff --git a/docs/hpctestlib.rst b/docs/hpctestlib.rst
index 9edc2c3797..68c95ac253 100644
--- a/docs/hpctestlib.rst
+++ b/docs/hpctestlib.rst
@@ -1,11 +1,55 @@
+***********************************
 ReFrame Test Library (experimental)
-===================================
+***********************************
+
 This is a collection of generic tests that you can either run out-of-the-box by specializing them for your system using the :option:`-S` option or create your site-specific tests by building upon them.

+Microbenchmarks
+===============
+
+
+OSU microbenchmarks
+-------------------
+
+There are two final parameterized tests that represent the various OSU benchmarks:
+
+  - The :class:`osu_run` test that runs the benchmarks only.
+    This assumes that the OSU microbenchmarks are installed and available.
+  - The :class:`osu_build_run` test that builds and runs the benchmarks.
+    This test uses two fixtures in total: one to fetch the benchmarks and one to build them.
+
+Depending on your setup, you can select the most appropriate final test.
+The benchmarks define various variables with reasonable default values that affect their execution.
+For collective communication benchmarks, setting :attr:`num_tasks` is required.
+All tests set :attr:`num_tasks_per_node` to ``1`` by default.
+
+Examples
+^^^^^^^^
+
+Run the run-only version of the point-to-point bandwidth benchmark:
+
+.. code-block:: console
+
+   reframe -n 'osu_run.*benchmark_info=mpi.pt2pt.osu_bw' -S modules=my-osu-benchmarks -S valid_systems=mysystem -S valid_prog_environs=myenv -l
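+
+Any of the benchmark variables can also be overridden from the command line with the :option:`-S` option.
+The following hypothetical invocation (it reuses the ``my-osu-benchmarks``, ``mysystem`` and
+``myenv`` placeholders from above) increases the number of timed iterations of the
+point-to-point latency benchmark:
+
+.. code-block:: console
+
+   reframe -n 'osu_run.*benchmark_info=mpi.pt2pt.osu_latency' -S modules=my-osu-benchmarks -S num_iters=20000 -S valid_systems=mysystem -S valid_prog_environs=myenv -l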
+
+
+Build and run the CUDA-aware version of the allreduce benchmark:
+
+.. code-block:: console
+
+   reframe -n 'osu_build_run.*benchmark_info=mpi.collective.osu_allreduce.*build_type=cuda' -S device_buffers=cuda -S num_tasks=16 -S valid_systems=sys:part -S valid_prog_environs=myenv -l
+
+
+.. automodule:: hpctestlib.microbenchmarks.mpi.osu
+   :members:
+   :show-inheritance:
+
+
 Scientific Applications
------------------------
+=======================

 .. automodule:: hpctestlib.sciapps.amber.nve
    :members:
@@ -16,7 +60,7 @@ Scientific Applications
    :show-inheritance:

 Data Analytics
---------------
+==============

 .. automodule:: hpctestlib.data_analytics.spark.spark_checks
    :members:
@@ -24,7 +68,7 @@ Data Analytics
    :show-inheritance:

 Python
-------
+======

 .. automodule:: hpctestlib.python.numpy.numpy_ops
    :members:
@@ -32,7 +76,7 @@ Python
    :show-inheritance:

 Interactive Computing
----------------------
+=====================

 .. automodule:: hpctestlib.interactive.jupyter.ipcmagic
    :members:
@@ -40,7 +84,7 @@ Interactive Computing
    :show-inheritance:

 Machine Learning
----------------
+================

 .. automodule:: hpctestlib.ml.tensorflow.horovod
    :members:
diff --git a/hpctestlib/microbenchmarks/mpi/osu.py b/hpctestlib/microbenchmarks/mpi/osu.py
new file mode 100644
index 0000000000..7a94dc4c1a
--- /dev/null
+++ b/hpctestlib/microbenchmarks/mpi/osu.py
@@ -0,0 +1,192 @@
+# Copyright 2016-2022 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import os
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+class fetch_osu_benchmarks(rfm.RunOnlyRegressionTest):
+    '''Fixture for fetching the OSU benchmarks.'''
+
+    #: The version of the benchmarks to fetch.
+    #:
+    #: :type: :class:`str`
+    #: :default: ``'5.9'``
+    version = variable(str, value='5.9')
+
+    local = True
+    osu_file_name = f'osu-micro-benchmarks-{version}.tar.gz'
+    executable = f'curl -LJO http://mvapich.cse.ohio-state.edu/download/mvapich/{osu_file_name}'  # noqa: E501
+
+    @sanity_function
+    def validate_download(self):
+        return sn.assert_eq(self.job.exitcode, 0)
+
+
+class build_osu_benchmarks(rfm.CompileOnlyRegressionTest):
+    '''Fixture for building the OSU benchmarks.'''
+
+    #: Build variant parameter.
+    #:
+    #: :type: :class:`str`
+    #: :values: ``'cpu', 'cuda', 'rocm', 'openacc'``
+    build_type = parameter(['cpu', 'cuda', 'rocm', 'openacc'])
+
+    build_system = 'Autotools'
+    build_prefix = variable(str)
+
+    #: The fixture object that retrieves the benchmarks.
+    #:
+    #: :type: :class:`fetch_osu_benchmarks`
+    #: :scope: *session*
+    osu_benchmarks = fixture(fetch_osu_benchmarks, scope='session')
+
+    @run_before('compile')
+    def prepare_build(self):
+        tarball = f'osu-micro-benchmarks-{self.osu_benchmarks.version}.tar.gz'
+        self.build_prefix = tarball[:-7]  # remove .tar.gz extension
+        fullpath = os.path.join(self.osu_benchmarks.stagedir, tarball)
+        self.prebuild_cmds += [
+            f'cp {fullpath} {self.stagedir}',
+            f'tar xzf {tarball}',
+            f'cd {self.build_prefix}'
+        ]
+        self.build_system.config_opts = [f'--enable-{self.build_type}']
+        self.build_system.make_opts = ['-C', 'mpi']
+        self.build_system.max_concurrency = 8
+
+    @sanity_function
+    def validate_build(self):
+        # If the build fails, the test will fail before reaching this point.
+        return True
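+
+
+# ``build_osu_benchmarks`` consumes ``fetch_osu_benchmarks`` as a
+# session-scoped fixture: the benchmark sources are downloaded only once per
+# ReFrame session, and each ``build_type`` variant of each programming
+# environment is then built from the same staged tarball.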
+
+
+class osu_benchmark(rfm.RunOnlyRegressionTest):
+    '''OSU benchmark test base class.'''
+
+    #: Number of warmup iterations.
+    #:
+    #: This value is passed to the executable through the -x option.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``10``
+    num_warmup_iters = variable(int, value=10)
+
+    #: Number of iterations.
+    #:
+    #: This value is passed to the executable through the -i option.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``1000``
+    num_iters = variable(int, value=1000)
+
+    #: Maximum message size.
+    #:
+    #: Both the performance and the sanity checks will be done
+    #: for this message size.
+    #:
+    #: This value is set to ``8`` for latency benchmarks and to ``4194304``
+    #: for bandwidth benchmarks.
+    #:
+    #: :type: :class:`int`
+    message_size = variable(int)
+
+    #: Device buffers.
+    #:
+    #: Use accelerator device buffers.
+    #: Valid values are ``cpu``, ``cuda``, ``openacc`` or ``rocm``.
+    #:
+    #: :type: :class:`str`
+    #: :default: ``'cpu'``
+    device_buffers = variable(str, value='cpu')
+
+    #: Number of tasks to use.
+    #:
+    #: This variable is required.
+    #: It is set to ``2`` for point-to-point benchmarks, but it is undefined
+    #: for collective benchmarks.
+    #:
+    #: :required: Yes
+    num_tasks = required
+    num_tasks_per_node = 1
+
+    #: Parameter indicating the benchmark to execute.
+    #:
+    #: :type: 2-element tuple containing the benchmark name and whether
+    #:     latency or bandwidth is to be measured.
+    #:
+    #: :values:
+    #:     ``mpi.collective.osu_alltoall``,
+    #:     ``mpi.collective.osu_allreduce``,
+    #:     ``mpi.pt2pt.osu_bw``,
+    #:     ``mpi.pt2pt.osu_latency``
+    benchmark_info = parameter([
+        ('mpi.collective.osu_alltoall', 'latency'),
+        ('mpi.collective.osu_allreduce', 'latency'),
+        ('mpi.pt2pt.osu_bw', 'bandwidth'),
+        ('mpi.pt2pt.osu_latency', 'latency')
+    ], fmt=lambda x: x[0], loggable=True)
+
+    @run_after('init')
+    def setup_per_benchmark(self):
+        bench, bench_metric = self.benchmark_info
+        if bench_metric == 'latency':
+            self.message_size = 8
+            unit = 'us'
+        elif bench_metric == 'bandwidth':
+            self.message_size = 4194304
+            unit = 'MB/s'
+        else:
+            raise ValueError(f'unknown benchmark metric: {bench_metric}')
+
+        self.executable = bench.split('.')[-1]
+        self.executable_opts = ['-m', f'{self.message_size}',
+                                '-x', f'{self.num_warmup_iters}',
+                                '-i', f'{self.num_iters}']
+        if self.device_buffers != 'cpu':
+            self.executable_opts += ['-d', self.device_buffers]
+
+        if bench.startswith('mpi.pt2pt'):
+            self.executable_opts += ['D', 'D']
+            self.num_tasks = 2
+
+        self.perf_variables = {
+            bench_metric: sn.make_performance_function(
+                self._extract_metric, unit
+            )
+        }
+
+    @sanity_function
+    def validate_test(self):
+        return sn.assert_found(rf'^{self.message_size}', self.stdout)
+
+    @deferrable
+    def _extract_metric(self):
+        return sn.extractsingle(rf'^{self.message_size}\s+(\S+)',
+                                self.stdout, 1, float)
+
+
+@rfm.simple_test
+class osu_run(osu_benchmark):
+    '''Run-only OSU benchmark test.'''
+
+
+@rfm.simple_test
+class osu_build_run(osu_benchmark):
+    '''OSU benchmark test (build and run).'''
+
+    #: The fixture object that builds the OSU binaries.
+    #:
+    #: :type: :class:`build_osu_benchmarks`
+    #: :scope: *environment*
+    osu_binaries = fixture(build_osu_benchmarks, scope='environment')
+
+    @run_before('run')
+    def prepend_build_prefix(self):
+        bench_path = self.benchmark_info[0].replace('.', '/')
+        self.executable = os.path.join(self.osu_binaries.stagedir,
+                                       self.osu_binaries.build_prefix,
+                                       bench_path)