diff --git a/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py b/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py
index 9adf34cf2a..c66a047daf 100644
--- a/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py
+++ b/cscs-checks/microbenchmarks/mpi/osu/osu_tests.py
@@ -7,96 +7,134 @@
 import reframe.utility.sanity as sn
 
 
-@rfm.parameterized_test(['production'])
+@rfm.simple_test
 class AlltoallTest(rfm.RegressionTest):
-    def __init__(self, variant):
-        self.strict_check = False
-        self.valid_systems = ['daint:gpu', 'dom:gpu']
-        self.descr = 'Alltoall OSU microbenchmark'
-        self.build_system = 'Make'
-        self.build_system.makefile = 'Makefile_alltoall'
-        self.executable = './osu_alltoall'
-        # The -m option sets the maximum message size
-        # The -x option sets the number of warm-up iterations
-        # The -i option sets the number of iterations
-        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']
-        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu',
-                                    'PrgEnv-intel']
-        self.maintainers = ['RS', 'AJ']
-        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
-        self.perf_patterns = {
-            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
-                                        self.stdout, 'latency', float)
+    variant = parameter(['production'])
+    strict_check = False
+    valid_systems = ['daint:gpu', 'dom:gpu']
+    descr = 'Alltoall OSU microbenchmark'
+    build_system = 'Make'
+    executable = './osu_alltoall'
+    # The -m option sets the maximum message size
+    # The -x option sets the number of warm-up iterations
+    # The -i option sets the number of iterations
+    executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']
+    valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu',
+                           'PrgEnv-intel', 'PrgEnv-nvidia']
+    maintainers = ['RS', 'AJ']
+    reference = {
+        'dom:gpu': {
+            'latency': (8.23, None, 0.1, 'us')
+        },
+        'daint:gpu': {
+            'latency': (20.73, None, 2.0, 'us')
         }
-        self.tags = {variant, 'benchmark', 'craype'}
-        self.reference = {
-            'dom:gpu': {
-                'latency': (8.23, None, 0.1, 'us')
-            },
-            'daint:gpu': {
-                'latency': (20.73, None, 2.0, 'us')
-            }
+    }
+    num_tasks_per_node = 1
+    num_gpus_per_node  = 1
+    extra_resources = {
+        'switches': {
+            'num_switches': 1
         }
-        self.num_tasks_per_node = 1
-        self.num_gpus_per_node  = 1
+    }
+
+    @run_after('init')
+    def set_tags(self):
+        self.tags = {self.variant, 'benchmark', 'craype'}
+
+    @run_before('compile')
+    def set_makefile(self):
+        self.build_system.makefile = 'Makefile_alltoall'
+
+    @run_before('run')
+    def set_num_tasks(self):
         if self.current_system.name == 'daint':
             self.num_tasks = 16
         else:
             self.num_tasks = 6
 
-        self.extra_resources = {
-            'switches': {
-                'num_switches': 1
-            }
+    @run_before('sanity')
+    def set_sanity(self):
+        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
+
+    @run_before('performance')
+    def set_performance_patterns(self):
+        self.perf_patterns = {
+            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
+                                        self.stdout, 'latency', float)
         }
 
 
 @rfm.simple_test
 class FlexAlltoallTest(rfm.RegressionTest):
-    def __init__(self):
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']
-        self.valid_prog_environs = ['PrgEnv-cray']
+    valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+                     'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']
+    valid_prog_environs = ['PrgEnv-cray']
+    descr = 'Flexible Alltoall OSU test'
+    build_system = 'Make'
+    executable = './osu_alltoall'
+    maintainers = ['RS', 'AJ']
+    num_tasks_per_node = 1
+    num_tasks = 0
+    tags = {'diagnostic', 'ops', 'benchmark', 'craype'}
+
+    @run_after('init')
+    def add_prog_environ(self):  # arolla/tsa: exclusive nodes, GNU/PGI only
         if self.current_system.name in ['arolla', 'tsa']:
             self.exclusive_access = True
             self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi']
 
-        self.descr = 'Flexible Alltoall OSU test'
-        self.build_system = 'Make'
+    @run_before('compile')
+    def set_makefile(self):
         self.build_system.makefile = 'Makefile_alltoall'
-        self.executable = './osu_alltoall'
-        self.maintainers = ['RS', 'AJ']
-        self.num_tasks_per_node = 1
-        self.num_tasks = 0
+
+    @run_before('sanity')
+    def set_sanity(self):
         self.sanity_patterns = sn.assert_found(r'^1048576', self.stdout)
-        self.tags = {'diagnostic', 'ops', 'benchmark', 'craype'}
 
 
-@rfm.parameterized_test(['small'], ['large'])
+@rfm.simple_test
 class AllreduceTest(rfm.RegressionTest):
-    def __init__(self, variant):
-        self.strict_check = False
-        self.valid_systems = ['daint:gpu', 'daint:mc']
-        if variant == 'small':
+    variant = parameter(['small', 'large'])  # every value goes in ONE list
+    strict_check = False
+    valid_systems = ['daint:gpu', 'daint:mc']
+    descr = 'Allreduce OSU microbenchmark'
+    build_system = 'Make'
+    executable = './osu_allreduce'
+    # The -x option controls the number of warm-up iterations
+    # The -i option controls the number of iterations
+    executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']
+    valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-nvidia']
+    maintainers = ['RS', 'AJ']
+    tags = {'production', 'benchmark', 'craype'}
+    num_tasks_per_node = 1
+    num_gpus_per_node  = 1
+    extra_resources = {
+        'switches': {
+            'num_switches': 1
+        }
+    }
+
+    @run_after('init')
+    def add_valid_systems(self):
+        if self.variant == 'small':
             self.valid_systems += ['dom:gpu', 'dom:mc']
 
-        self.descr = 'Allreduce OSU microbenchmark'
-        self.build_system = 'Make'
+    @run_before('compile')
+    def set_makefile(self):
         self.build_system.makefile = 'Makefile_allreduce'
-        self.executable = './osu_allreduce'
-        # The -x option controls the number of warm-up iterations
-        # The -i option controls the number of iterations
-        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.maintainers = ['RS', 'AJ']
+
+    @run_before('run')
+    def set_num_tasks(self):
+        self.num_tasks = 6 if self.variant == 'small' else 16
+
+    @run_before('sanity')
+    def set_sanity(self):
         self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
-        self.perf_patterns = {
-            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
-                                        self.stdout, 'latency', float)
-        }
-        self.tags = {'production', 'benchmark', 'craype'}
-        if variant == 'small':
-            self.num_tasks = 6
+
+    @run_before('performance')
+    def set_performance_patterns(self):
+        if self.variant == 'small':
             self.reference = {
                 'dom:gpu': {
                     'latency': (5.67, None, 0.05, 'us')
@@ -109,7 +147,6 @@ def __init__(self, variant):
                 }
             }
         else:
-            self.num_tasks = 16
             self.reference = {
                 'daint:gpu': {
                     'latency': (13.62, None, 1.16, 'us')
@@ -118,74 +155,78 @@ def __init__(self, variant):
                     'latency': (19.07, None, 1.64, 'us')
                 }
             }
-
-        self.num_tasks_per_node = 1
-        self.num_gpus_per_node  = 1
-        self.extra_resources = {
-            'switches': {
-                'num_switches': 1
-            }
+        self.perf_patterns = {
+            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
+                                        self.stdout, 'latency', float)
         }
 
 
 class P2PBaseTest(rfm.RegressionTest):
-    def __init__(self):
-        self.exclusive_access = True
-        self.strict_check = False
-        self.num_tasks = 2
-        self.num_tasks_per_node = 1
-        self.descr = 'P2P microbenchmark'
-        self.build_system = 'Make'
-        self.build_system.makefile = 'Makefile_p2p'
+    exclusive_access = True
+    strict_check = False
+    num_tasks = 2
+    num_tasks_per_node = 1
+    descr = 'P2P microbenchmark'
+    build_system = 'Make'
+    maintainers = ['RS', 'AJ']
+    tags = {'production', 'benchmark', 'craype'}
+    extra_resources = {
+        'switches': {
+            'num_switches': 1
+        }
+    }
+
+    @run_after('init')
+    def add_valid_prog_environs(self):
         if self.current_system.name in ['arolla', 'tsa']:
             self.exclusive_access = True
             self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi']
         else:
             self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu',
-                                        'PrgEnv-intel']
-        self.maintainers = ['RS', 'AJ']
-        self.tags = {'production', 'benchmark', 'craype'}
-        self.sanity_patterns = sn.assert_found(r'^4194304', self.stdout)
+                                        'PrgEnv-intel', 'PrgEnv-nvidia']
 
-        self.extra_resources = {
-            'switches': {
-                'num_switches': 1
-            }
-        }
+    @run_before('compile')
+    def set_makefile(self):
+        self.build_system.makefile = 'Makefile_p2p'
+
+    @run_before('sanity')
+    def set_sanity(self):
+        self.sanity_patterns = sn.assert_found(r'^4194304', self.stdout)
 
 
 @rfm.simple_test
 class P2PCPUBandwidthTest(P2PBaseTest):
-    def __init__(self):
-        super().__init__()
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc']
-        self.executable = './p2p_osu_bw'
-        self.executable_opts = ['-x', '100', '-i', '1000']
-        self.reference = {
-            'daint:gpu': {
-                'bw': (9607.0, -0.10, None, 'MB/s')
-            },
-            'daint:mc': {
-                'bw': (9649.0, -0.10, None, 'MB/s')
-            },
-            'dom:gpu': {
-                'bw': (9476.3, -0.05, None, 'MB/s')
-            },
-            'dom:mc': {
-                'bw': (9528.0, -0.20, None, 'MB/s')
-            },
-            'eiger:mc': {
-                'bw': (12240.0, -0.10, None, 'MB/s')
-            },
-            'pilatus:mc': {
-                'bw': (12240.0, -0.10, None, 'MB/s')
-            },
-            # keeping as reference:
-            # 'monch:compute': {
-            #     'bw': (6317.84, -0.15, None, 'MB/s')
-            # },
-        }
+    valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+                     'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc']
+    executable = './p2p_osu_bw'
+    executable_opts = ['-x', '100', '-i', '1000']
+    reference = {
+        'daint:gpu': {
+            'bw': (9607.0, -0.10, None, 'MB/s')
+        },
+        'daint:mc': {
+            'bw': (9649.0, -0.10, None, 'MB/s')
+        },
+        'dom:gpu': {
+            'bw': (9476.3, -0.05, None, 'MB/s')
+        },
+        'dom:mc': {
+            'bw': (9528.0, -0.20, None, 'MB/s')
+        },
+        'eiger:mc': {
+            'bw': (12240.0, -0.10, None, 'MB/s')
+        },
+        'pilatus:mc': {
+            'bw': (12240.0, -0.10, None, 'MB/s')
+        },
+        # keeping as reference:
+        # 'monch:compute': {
+        #     'bw': (6317.84, -0.15, None, 'MB/s')
+        # },
+    }
+
+    @run_before('performance')
+    def set_performance_patterns(self):
         self.perf_patterns = {
             'bw': sn.extractsingle(r'^4194304\s+(?P<bw>\S+)',
                                    self.stdout, 'bw', float)
@@ -194,36 +235,37 @@ def __init__(self):
 
 @rfm.simple_test
 class P2PCPULatencyTest(P2PBaseTest):
-    def __init__(self):
-        super().__init__()
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc']
-        self.executable_opts = ['-x', '100', '-i', '1000']
-
-        self.executable = './p2p_osu_latency'
-        self.reference = {
-            'daint:gpu': {
-                'latency': (1.30, None, 0.70, 'us')
-            },
-            'daint:mc': {
-                'latency': (1.61, None, 0.85, 'us')
-            },
-            'dom:gpu': {
-                'latency': (1.138, None, 0.10, 'us')
-            },
-            'dom:mc': {
-                'latency': (1.24, None, 0.15, 'us')
-            },
-            'eiger:mc': {
-                'latency': (2.33, None, 0.15, 'us')
-            },
-            'pilatus:mc': {
-                'latency': (2.33, None, 0.15, 'us')
-            },
-            # keeping as reference:
-            # 'monch:compute': {
-            #     'latency': (1.27, None, 0.1, 'us')
-            # },
-        }
+    valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+                     'arolla:cn', 'tsa:cn', 'eiger:mc', 'pilatus:mc']
+    executable = './p2p_osu_latency'
+    executable_opts = ['-x', '100', '-i', '1000']
+    reference = {
+        'daint:gpu': {
+            'latency': (1.30, None, 0.70, 'us')
+        },
+        'daint:mc': {
+            'latency': (1.61, None, 0.85, 'us')
+        },
+        'dom:gpu': {
+            'latency': (1.138, None, 0.10, 'us')
+        },
+        'dom:mc': {
+            'latency': (1.24, None, 0.15, 'us')
+        },
+        'eiger:mc': {
+            'latency': (2.33, None, 0.15, 'us')
+        },
+        'pilatus:mc': {
+            'latency': (2.33, None, 0.15, 'us')
+        },
+        # keeping as reference:
+        # 'monch:compute': {
+        #     'latency': (1.27, None, 0.1, 'us')
+        # },
+    }
+
+    @run_before('performance')
+    def set_performance_patterns(self):
+        # This test reports latency only, read from the row starting with 8.
         self.perf_patterns = {
             'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
@@ -233,70 +277,88 @@ def __init__(self):
 
 @rfm.simple_test
 class G2GBandwidthTest(P2PBaseTest):
-    def __init__(self):
-        super().__init__()
-        self.valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn']
-        self.num_gpus_per_node = 1
-        self.executable = './p2p_osu_bw'
-        self.executable_opts = ['-x', '100', '-i', '1000', '-d',
-                                'cuda', 'D', 'D']
-
-        self.reference = {
-            'dom:gpu': {
-                'bw': (8813.09, -0.05, None, 'MB/s')
-            },
-            'daint:gpu': {
-                'bw': (8765.65, -0.1, None, 'MB/s')
-            },
-            '*': {
-                'bw': (0, None, None, 'MB/s')
-            }
+    valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn']
+    num_gpus_per_node = 1
+    executable = './p2p_osu_bw'
+    executable_opts = ['-x', '100', '-i', '1000', '-d',
+                       'cuda', 'D', 'D']
+
+    reference = {
+        'dom:gpu': {
+            'bw': (8813.09, -0.05, None, 'MB/s')
+        },
+        'daint:gpu': {
+            'bw': (8765.65, -0.1, None, 'MB/s')
+        },
+        '*': {
+            'bw': (0, None, None, 'MB/s')
         }
+    }
+
+    @run_before('performance')
+    def set_performance_patterns(self):
         self.perf_patterns = {
             'bw': sn.extractsingle(r'^4194304\s+(?P<bw>\S+)',
                                    self.stdout, 'bw', float)
         }
+
+    @run_before('compile')
+    def set_cpp_flags(self):
+        self.build_system.cppflags = ['-D_ENABLE_CUDA_']
+
+    @run_before('compile')
+    def set_modules(self):  # besides modules: env variables and linker flags
         if self.current_system.name in ['daint', 'dom']:
             self.num_gpus_per_node  = 1
-            self.modules = ['craype-accel-nvidia60']
             self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'}
+            if self.current_environ.name == 'PrgEnv-nvidia':
+                self.modules = ['cudatoolkit/21.3_11.2']
+            else:
+                self.modules = ['craype-accel-nvidia60']
         elif self.current_system.name in ['arolla', 'tsa']:
             self.modules = ['cuda/10.1.243']
             self.build_system.ldflags = ['-L$EBROOTCUDA/lib64',
                                          '-lcudart', '-lcuda']
 
-        self.build_system.cppflags = ['-D_ENABLE_CUDA_']
-
 
 @rfm.simple_test
 class G2GLatencyTest(P2PBaseTest):
-    def __init__(self):
-        super().__init__()
-        self.valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn']
-        self.num_gpus_per_node = 1
-        self.executable = './p2p_osu_latency'
-        self.executable_opts = ['-x', '100', '-i', '1000', '-d',
-                                'cuda', 'D', 'D']
-
-        self.reference = {
-            'dom:gpu': {
-                'latency': (5.56, None, 0.1, 'us')
-            },
-            'daint:gpu': {
-                'latency': (6.8, None, 0.65, 'us')
-            },
-        }
+    valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn']
+    num_gpus_per_node = 1
+    executable = './p2p_osu_latency'
+    executable_opts = ['-x', '100', '-i', '1000', '-d',
+                       'cuda', 'D', 'D']
+
+    reference = {
+        'dom:gpu': {
+            'latency': (5.56, None, 0.1, 'us')
+        },
+        'daint:gpu': {
+            'latency': (6.8, None, 0.65, 'us')
+        },
+    }
+
+    @run_before('performance')
+    def set_performance_patterns(self):
         self.perf_patterns = {
             'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
                                         self.stdout, 'latency', float)
         }
+
+    @run_before('compile')
+    def set_cpp_flags(self):
+        self.build_system.cppflags = ['-D_ENABLE_CUDA_']
+
+    @run_before('compile')
+    def set_modules(self):  # besides modules: env variables and linker flags
         if self.current_system.name in ['daint', 'dom']:
             self.num_gpus_per_node  = 1
-            self.modules = ['craype-accel-nvidia60']
             self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'}
+            if self.current_environ.name == 'PrgEnv-nvidia':
+                self.modules = ['cudatoolkit/21.3_11.2']
+            else:
+                self.modules = ['craype-accel-nvidia60']
         elif self.current_system.name in ['arolla', 'tsa']:
             self.modules = ['cuda/10.1.243']
             self.build_system.ldflags = ['-L$EBROOTCUDA/lib64',
                                          '-lcudart', '-lcuda']
-
-        self.build_system.cppflags = ['-D_ENABLE_CUDA_']