From c2e76ea6d1d71d794c759030d3b16d24caf6a393 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Tue, 21 Dec 2021 11:35:57 +0100
Subject: [PATCH 1/2] New test for Slurm heterogeneous job support

---
 cscs-checks/system/slurm/heterogeneous.py    | 65 ++++++++++++++++++++
 cscs-checks/system/slurm/src/heterogeneous.c | 20 ++++
 2 files changed, 85 insertions(+)
 create mode 100644 cscs-checks/system/slurm/heterogeneous.py
 create mode 100644 cscs-checks/system/slurm/src/heterogeneous.c

diff --git a/cscs-checks/system/slurm/heterogeneous.py b/cscs-checks/system/slurm/heterogeneous.py
new file mode 100644
index 0000000000..c9e3851909
--- /dev/null
+++ b/cscs-checks/system/slurm/heterogeneous.py
@@ -0,0 +1,65 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.simple_test
+class HeterogenousSlurmJobTest(rfm.RegressionTest):
+    descr = 'Heterogeneous Slurm job test'
+    sourcepath = 'heterogeneous.c'
+    valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
+    valid_prog_environs = ['PrgEnv-cray']
+    num_tasks = 12
+    num_tasks_per_node = 3
+    num_cpus_per_task = 4
+    num_tasks2 = 1
+    num_tasks_per_node2 = 1
+    num_cpus_per_task2 = 6
+    build_system = 'SingleSource'
+    maintainers = ['TM', 'VH']
+    tags = {'slurm'}
+
+    @run_before('compile')
+    def set_cflags(self):
+        self.build_system.cflags = ['-fopenmp']
+
+    @run_before('run')
+    def set_threads_per_core(self):
+        self.job.options = ['--threads-per-core=1']
+
+    @run_before('run')
+    def setup_heterogeneous_job(self):
+        self.job.options += [
+            f'#SBATCH hetjob', f'--ntasks={self.num_tasks2}',
+            f'--ntasks-per-node={self.num_tasks_per_node2}',
+            f'--cpus-per-task={self.num_cpus_per_task2}',
+            f'--threads-per-core=1',
+            # The second constraint has to be passed using the #SBATCH prefix
+            # verbatim, so that ReFrame does not combine the constraints
+            f'#SBATCH --constraint={self.current_partition.name}'
+        ]
+        # Ensure that the two components of the heterogeneous job share the
+        # MPI_COMM_WORLD communicator
+        self.job.launcher.options = ['--het-group=0,1']
+
+    @sanity_function
+    def validate(self):
+        return sn.all([
+            *[sn.assert_found(f'Hello from rank {rank} running omp thread '
+                              f'{thread}/{self.num_cpus_per_task}',
+                              self.stdout)
+              for rank in range(self.num_tasks)
+              for thread in range(self.num_cpus_per_task)],
+            # The MPI ranks of the second job are assigned the remaining ids
+            # of the total ranks of the MPI_COMM_WORLD communicator
+            *[sn.assert_found(f'Hello from rank {rank + self.num_tasks} '
+                              f'running omp thread '
+                              f'{thread}/{self.num_cpus_per_task2}',
+                              self.stdout)
+              for rank in range(self.num_tasks2)
+              for thread in range(self.num_cpus_per_task2)],
+        ])
diff --git a/cscs-checks/system/slurm/src/heterogeneous.c b/cscs-checks/system/slurm/src/heterogeneous.c
new file mode 100644
index 0000000000..60dd015178
--- /dev/null
+++ b/cscs-checks/system/slurm/src/heterogeneous.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <omp.h>
+#include "mpi.h"
+
+int main(int argc, char **argv) {
+    int thread_safety;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_safety);
+    int size, rank;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    #pragma omp parallel
+    {
+        int tid = omp_get_thread_num();
+        int num_threads = omp_get_num_threads();
+        printf("Hello from rank %d running omp thread %d/%d\n", rank, tid, num_threads);
+    }
+
+    return 0;
+}

From bc8fbedc97dd719491af4c07e9b81e0b5f331f23 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Wed, 12 Jan 2022 11:00:57 +0100
Subject: [PATCH 2/2] Address PR comments

---
 .../system/slurm/{heterogeneous.py => hetjob.py}   | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)
 rename cscs-checks/system/slurm/{heterogeneous.py => hetjob.py} (86%)

diff --git a/cscs-checks/system/slurm/heterogeneous.py b/cscs-checks/system/slurm/hetjob.py
similarity index 86%
rename from cscs-checks/system/slurm/heterogeneous.py
rename to cscs-checks/system/slurm/hetjob.py
index c9e3851909..f030d470a3 100644
--- a/cscs-checks/system/slurm/heterogeneous.py
+++ b/cscs-checks/system/slurm/hetjob.py
@@ -16,9 +16,9 @@ class HeterogenousSlurmJobTest(rfm.RegressionTest):
     num_tasks = 12
     num_tasks_per_node = 3
     num_cpus_per_task = 4
-    num_tasks2 = 1
-    num_tasks_per_node2 = 1
-    num_cpus_per_task2 = 6
+    num_tasks_het = 1
+    num_tasks_per_node_het = 1
+    num_cpus_per_task_het = 6
     build_system = 'SingleSource'
     maintainers = ['TM', 'VH']
     tags = {'slurm'}
@@ -34,8 +34,8 @@ def setup_heterogeneous_job(self):
     @run_before('run')
     def setup_heterogeneous_job(self):
         self.job.options += [
-            f'#SBATCH hetjob', f'--ntasks={self.num_tasks2}',
-            f'--ntasks-per-node={self.num_tasks_per_node2}',
-            f'--cpus-per-task={self.num_cpus_per_task2}',
+            f'#SBATCH hetjob', f'--ntasks={self.num_tasks_het}',
+            f'--ntasks-per-node={self.num_tasks_per_node_het}',
+            f'--cpus-per-task={self.num_cpus_per_task_het}',
             f'--threads-per-core=1',
             # The second constraint has to be passed using the #SBATCH prefix
@@ -58,8 +58,8 @@ def validate(self):
             # of the total ranks of the MPI_COMM_WORLD communicator
             *[sn.assert_found(f'Hello from rank {rank + self.num_tasks} '
                               f'running omp thread '
-                              f'{thread}/{self.num_cpus_per_task2}',
+                              f'{thread}/{self.num_cpus_per_task_het}',
                               self.stdout)
-              for rank in range(self.num_tasks2)
-              for thread in range(self.num_cpus_per_task2)],
+              for rank in range(self.num_tasks_het)
+              for thread in range(self.num_cpus_per_task_het)],
         ])
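
For reference, the heterogeneous submission that this test builds corresponds roughly to the batch script sketched below. This is only a sketch: the job name and executable name are illustrative, the gpu constraint stands in for the current partition, and the exact directive order depends on how ReFrame emits the partition access options and the entries of self.job.options. Slurm 20.02 and later use the hetjob separator and --het-group (older releases used packjob and --pack-group). The single srun line spanning both het groups is what lets the 12-task and 1-task components share one MPI_COMM_WORLD, as the test's comments require.

    #!/bin/bash
    #SBATCH --job-name=rfm_HeterogenousSlurmJobTest_job
    #SBATCH --ntasks=12
    #SBATCH --ntasks-per-node=3
    #SBATCH --cpus-per-task=4
    #SBATCH --constraint=gpu
    #SBATCH --threads-per-core=1
    #SBATCH hetjob
    #SBATCH --ntasks=1
    #SBATCH --ntasks-per-node=1
    #SBATCH --cpus-per-task=6
    #SBATCH --threads-per-core=1
    #SBATCH --constraint=gpu

    srun --het-group=0,1 ./HeterogenousSlurmJobTest

With this layout the first component prints ranks 0-11 with 4 OpenMP threads each, and the second component's single rank is appended to the same communicator (rank 12) with 6 threads, which is exactly what the sanity function checks for.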