From 1b56d726dd3a26770ed49fe206b4c0e1b7abeacd Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Tue, 3 Mar 2020 15:18:38 +0100 Subject: [PATCH 1/8] Add GREASY test --- config/cscs.py | 11 +- cscs-checks/apps/greasy/greasy_check.py | 218 ++++++++++++++++++ .../apps/greasy/src/tasks_mpi_openmp.c | 62 +++++ 3 files changed, 288 insertions(+), 3 deletions(-) create mode 100644 cscs-checks/apps/greasy/greasy_check.py create mode 100644 cscs-checks/apps/greasy/src/tasks_mpi_openmp.c diff --git a/config/cscs.py b/config/cscs.py index e18f8437e2..e820d18881 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -114,7 +114,8 @@ class ReframeSettings: 'descr': 'Hybrid nodes (Haswell/P100)', 'max_jobs': 100, 'resources': { - 'switches': ['--switches={num_switches}'] + 'switches': ['--switches={num_switches}'], + 'gres': ['--gres={gres}'] } }, @@ -135,7 +136,8 @@ class ReframeSettings: 'descr': 'Multicore nodes (Broadwell)', 'max_jobs': 100, 'resources': { - 'switches': ['--switches={num_switches}'] + 'switches': ['--switches={num_switches}'], + 'gres': ['--gres={gres}'] } }, @@ -192,6 +194,9 @@ class ReframeSettings: 'PrgEnv-pgi'], 'descr': 'Hybrid nodes (Haswell/P100)', 'max_jobs': 100, + 'resources': { + 'gres': ['--gres={gres}'] + } }, 'mc': { @@ -212,7 +217,7 @@ class ReframeSettings: 'descr': 'Multicore nodes (Broadwell)', 'max_jobs': 100, 'resources': { - 'switches': ['--switches={num_switches}'] + 'gres': ['--gres={gres}'] } }, diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py new file mode 100644 index 0000000000..227abd6742 --- /dev/null +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -0,0 +1,218 @@ +# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import os +import fnmatch + +from datetime import datetime + +import reframe as rfm +import reframe.utility.sanity as sn + +from reframe.core.launchers.registry import getlauncher + + +def toSeconds(str): + return (datetime.strptime(str, '%H:%M:%S') - + datetime.strptime('00:00:00', '%H:%M:%S')).total_seconds() + + +@rfm.required_version('>=2.19') +@rfm.parameterized_test( + ['serial', 'daint:gpu', 24, 12, 1, 1], + ['serial', 'daint:mc', 72, 36, 1, 1], + ['openmp', 'daint:gpu', 24, 3, 1, 4], + ['openmp', 'daint:mc', 72, 9, 1, 4], + ['mpi', 'daint:gpu', 24, 4, 3, 1], + ['mpi', 'daint:mc', 72, 12, 3, 1], + ['mpi+openmp', 'daint:gpu', 24, 3, 2, 2], + ['mpi+openmp', 'daint:mc', 72, 6, 3, 2]) +class GREASYCheck(rfm.RegressionTest): + def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, + nranks_per_worker, ncpus_per_worker): + + self.valid_systems = [system] + if system.startswith('daint'): + self.valid_systems += [system.replace('daint', 'dom')] + + self.valid_prog_environs = ['PrgEnv-gnu'] + self.sourcepath = 'tasks_mpi_openmp.c' + self.build_system = 'SingleSource' + + # sleep enough time to distinguish if the files are running in parallel + # or not + self.sleep_time = 60 + self.build_system.cflags = ['-DSLEEP_TIME=%d' % self.sleep_time] + + if variant in ['openmp']: + self.build_system.cflags += ['-fopenmp'] + elif variant in ['mpi']: + self.build_system.cflags += ['-D_MPI'] + elif variant in ['mpi+openmp']: + self.build_system.cflags += ['-fopenmp', '-D_MPI'] + + self.executable = 'tasks_mpi_openmp.x' + self.tasks_file = 'tasks.txt' + self.executable_opts = [self.tasks_file] + self.greasy_logfile = 'greasy.log' + self.keep_files = [self.tasks_file, self.greasy_logfile] + + self.sanity_patterns = self.eval_sanity() + + nnodes = 2 + self.num_greasy_tasks = num_greasy_tasks + self.nworkes_per_node = nworkes_per_node + self.nranks_per_worker = nranks_per_worker + self.num_tasks_per_node = nranks_per_worker * nworkes_per_node + self.num_tasks = self.num_tasks_per_node * nnodes + self.num_cpus_per_task = ncpus_per_worker + + # Reference value is system agnostic and depnes + refperf = self.sleep_time * num_greasy_tasks / nworkes_per_node / nnodes + self.reference = { + '*': { + 'time': (refperf, None, 0.3, 's') + } + } + self.perf_patterns = { + 'time': sn.extractsingle(r'Total time: (?P\S+)', + self.greasy_logfile, + 'perf', toSeconds) + } + + # On SLURM there is no need to set OMP_NUM_THREADS if one defines + # num_cpus_per_task, but adding for completeness and portability + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'GREASY_NWORKERS_PER_NODE': str(nworkes_per_node), + 'GREASY_LOGFILE': self.greasy_logfile + } + + self.modules = ['GREASY'] + self.maintainers = ['VH', 'SK'] + self.use_multithreading = False + + self.tags = {'production'} + + @rfm.run_before('run') + def generate_tasks_file(self): + with open(os.path.join(self.stagedir, self.tasks_file), 'w') as outfile: + for i in range(self.num_greasy_tasks): + outfile.write("./%s output-%d\n" % (self.executable, i)) + + @rfm.run_before('run') + def daint_dom_gpu_specific_workaround(self): + if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: + self.variables['CRAY_CUDA_MPS'] = "1" + self.variables['CUDA_VISIBLE_DEVICES'] = "0" + self.variables['GPU_DEVICE_ORDINAL'] = "0" + + self.extra_resources = { + 'gres': { + 'gres': 'gpu:0,craynetwork:4' + } + } + + @rfm.run_before('run') + def change_executable_name(self): + # After compiling the code we can change the executable to be + # the greasy one + self.executable = 'greasy' + + @rfm.run_before('run') + def set_launcher(self): + # The job launcher has to be changed to local since greasy + # make calls to srun + self.job.launcher = getlauncher('local')() + + @sn.sanity_function + def eval_sanity(self): + output_files = [] + for file in os.listdir(self.stagedir): + if file.startswith("output-"): + output_files.append(file) + + num_greasy_tasks = len(output_files) + failure_msg = ('Requested %s task(s), but only executed %s tasks(s)' % + (self.num_greasy_tasks, num_greasy_tasks)) + sn.evaluate(sn.assert_eq(num_greasy_tasks, self.num_greasy_tasks, + msg=failure_msg)) + + num_tasks = sn.getattr(self, 'nranks_per_worker') + num_cpus_per_task = sn.getattr(self, 'num_cpus_per_task') + + def tid(match): + return int(match.group(1)) + + def num_threads(match): + return int(match.group(2)) + + def rank(match): + return int(match.group(3)) + + def num_ranks(match): + return int(match.group(4)) + + for output_file in output_files: + result = sn.findall(r'Hello, World from thread \s*(\d+) out ' + r'of \s*(\d+) from process \s*(\d+) out of ' + r'\s*(\d+)', output_file) + + failure_msg = ('Found %s Hello, World... pattern(s), but expected ' + '%s pattern(s) inside the output file %s' % ( + sn.count(result), + num_tasks * num_cpus_per_task, + output_file)) + sn.evaluate(sn.assert_eq(sn.count(result), + num_tasks * num_cpus_per_task, + msg=failure_msg)) + + sn.evaluate(sn.assert_true(sn.all( + sn.chain( + sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x)), + result), + sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x)), + result), + sn.map( + lambda x: sn.assert_lt(tid(x), num_cpus_per_task), + result), + sn.map( + lambda x: sn.assert_eq(num_threads(x), + num_cpus_per_task), + result), + sn.map(lambda x: sn.assert_lt(rank(x), num_tasks), + result), + sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks), + result), + ) + ))) + + sn.evaluate(sn.assert_eq(sn.count(sn.findall(r'Finished greasing', + self.greasy_logfile)), 1)) + + result = sn.findall(r'INFO: Summary of (\d+) tasks: ' + r'(\d+) OK, ' + r'(\d+) FAILED, ' + r'(\d+) CANCELLED, ' + r'(\d+) INVALID\.', output_file) + sn.evaluate(sn.assert_true(sn.all( + sn.chain( + sn.map(lambda x: sn.assert_eq(int(x.group(1)), + self.num_greasy_tasks), + result), + sn.map(lambda x: sn.assert_eq(int(x.group(2)), + self.num_greasy_tasks), + result), + sn.map(lambda x: sn.assert_eq(int(x.group(3)), 0), + result), + sn.map(lambda x: sn.assert_eq(int(x.group(4)), 0), + result), + sn.map(lambda x: sn.assert_eq(int(x.group(5)), 0), + result), + ) + ))) + + return True diff --git a/cscs-checks/apps/greasy/src/tasks_mpi_openmp.c b/cscs-checks/apps/greasy/src/tasks_mpi_openmp.c new file mode 100644 index 0000000000..527c456e1d --- /dev/null +++ b/cscs-checks/apps/greasy/src/tasks_mpi_openmp.c @@ -0,0 +1,62 @@ +#include +#include +#include + +#ifdef _OPENMP +#include +#endif +#ifdef _MPI +#include "mpi.h" +#endif + +#define STRINGIZE_MACRO(A) #A +#define STRINGIZE(A) STRINGIZE_MACRO(A) + +int main(int argc, char *argv[]) +{ + int size = 1; + int rank = 0; + int tid = 0; + FILE *outputfile; + + if (argc > 1) + { + outputfile = fopen(argv[1], "a"); + if (outputfile == NULL) { + fprintf(stdout, "Error. Unable to open output file %s", argv[1]); + } + } + else + { + outputfile = stdout; + } + +#ifdef _MPI + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + +#ifdef _OPENMP + #pragma omp parallel default(shared) private(tid) +#endif + { +#ifdef _OPENMP + int nthreads = omp_get_num_threads(); + tid = omp_get_thread_num(); +#else + int nthreads = 1; +#endif + // sleep for as long as it is necessary to distinguish whether we are running in parallel or not + // if GREASY is running correctly the test should take approximatelly this amount of time to run + sleep(atoi(STRINGIZE(SLEEP_TIME))); + fprintf(outputfile, "Hello, World from thread %d out of %d from process %d out of %d\n", + tid, nthreads, rank, size); + } + +#ifdef _MPI + MPI_Finalize(); +#endif + + return 0; +} From f2afae1f7af8ff071e9db6547aa8c30c2980fbcb Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Wed, 25 Mar 2020 13:02:53 +0100 Subject: [PATCH 2/8] Address PR remarks --- cscs-checks/apps/greasy/greasy_check.py | 105 ++++++++++++++++++------ 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index 227abd6742..8a655319d6 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -22,21 +22,19 @@ def toSeconds(str): @rfm.required_version('>=2.19') @rfm.parameterized_test( - ['serial', 'daint:gpu', 24, 12, 1, 1], - ['serial', 'daint:mc', 72, 36, 1, 1], - ['openmp', 'daint:gpu', 24, 3, 1, 4], - ['openmp', 'daint:mc', 72, 9, 1, 4], - ['mpi', 'daint:gpu', 24, 4, 3, 1], - ['mpi', 'daint:mc', 72, 12, 3, 1], - ['mpi+openmp', 'daint:gpu', 24, 3, 2, 2], - ['mpi+openmp', 'daint:mc', 72, 6, 3, 2]) + ['serial', 'gpu', 24, 12, 1, 1], + ['serial', 'mc', 72, 36, 1, 1], + ['openmp', 'gpu', 24, 3, 1, 4], + ['openmp', 'mc', 72, 9, 1, 4], + ['mpi', 'gpu', 24, 4, 3, 1], + ['mpi', 'mc', 72, 12, 3, 1], + ['mpi+openmp', 'gpu', 24, 3, 2, 2], + ['mpi+openmp', 'mc', 72, 6, 3, 2]) class GREASYCheck(rfm.RegressionTest): def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, nranks_per_worker, ncpus_per_worker): - self.valid_systems = [system] - if system.startswith('daint'): - self.valid_systems += [system.replace('daint', 'dom')] + self.valid_systems = ['daint:' + system, 'dom:' + system] self.valid_prog_environs = ['PrgEnv-gnu'] self.sourcepath = 'tasks_mpi_openmp.c' @@ -115,6 +113,12 @@ def daint_dom_gpu_specific_workaround(self): 'gres': 'gpu:0,craynetwork:4' } } + elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: + self.extra_resources = { + 'gres': { + 'gres': 'craynetwork:72' + } + } @rfm.run_before('run') def change_executable_name(self): @@ -172,45 +176,98 @@ def num_ranks(match): sn.evaluate(sn.assert_true(sn.all( sn.chain( - sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x)), + sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x), + msg='Found %d threads rather than %d' %(tid(x), num_threads(x))), result), - sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x)), + sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x), + msg='Rank id %d is not lower ' + 'than the number of ranks' + ' %d in output file %s' + % (rank(x), + self.nranks_per_worker, + output_file)), result), sn.map( - lambda x: sn.assert_lt(tid(x), num_cpus_per_task), + lambda x: sn.assert_lt(tid(x), + self.num_cpus_per_task, + msg='Rank id %d is not lower ' + 'than the number of cpus per' + ' task %d in output file %s' + % (tid(x), + self.num_cpus_per_task, + output_file)), result), sn.map( lambda x: sn.assert_eq(num_threads(x), - num_cpus_per_task), + num_cpus_per_task, + msg='Found %d threads rather ' + 'than %d in output file %s' + % (num_threads(x), + self.num_cpus_per_task, + output_file)), result), - sn.map(lambda x: sn.assert_lt(rank(x), num_tasks), + sn.map(lambda x: sn.assert_lt(rank(x), num_tasks, + msg='Found %d threads rather ' + 'than %d in output file %s' + % (rank(x), + self.num_cpus_per_task, + output_file)), result), - sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks), + sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks, + msg='Number of ranks %d is ' + 'not equal to %d in ' + ' output file %s' + % (num_ranks(x), + self.nranks_per_worker, + output_file)), result), ) ))) sn.evaluate(sn.assert_eq(sn.count(sn.findall(r'Finished greasing', - self.greasy_logfile)), 1)) + self.greasy_logfile)), 1)) result = sn.findall(r'INFO: Summary of (\d+) tasks: ' r'(\d+) OK, ' r'(\d+) FAILED, ' r'(\d+) CANCELLED, ' - r'(\d+) INVALID\.', output_file) + r'(\d+) INVALID\.', self.greasy_logfile) + sn.evaluate(sn.assert_true(sn.all( sn.chain( sn.map(lambda x: sn.assert_eq(int(x.group(1)), - self.num_greasy_tasks), + self.num_greasy_tasks, + msg='Number of greasy ' + ' tasks is %d but found ' + '%s' + % (self.num_greasy_tasks, + x.group(1))), result), sn.map(lambda x: sn.assert_eq(int(x.group(2)), - self.num_greasy_tasks), + self.num_greasy_tasks, + msg='Expected %d ' + 'successful tasks but ' + 'found %s' + % (self.num_greasy_tasks, + x.group(2))), result), - sn.map(lambda x: sn.assert_eq(int(x.group(3)), 0), + sn.map(lambda x: sn.assert_eq(int(x.group(3)), 0, + msg='Expected 0 ' + 'failed tasks but ' + 'found %s' + % x.group(3)), result), - sn.map(lambda x: sn.assert_eq(int(x.group(4)), 0), + sn.map(lambda x: sn.assert_eq(int(x.group(4)), 0, + msg='Expected 0 ' + 'cancelled tasks but ' + 'found %s' + % x.group(4)), result), - sn.map(lambda x: sn.assert_eq(int(x.group(5)), 0), + sn.map(lambda x: sn.assert_eq(int(x.group(5)), 0, + msg='Expected 0 ' + 'invalid tasks but ' + 'found %s' + % x.group(5)), result), ) ))) From 13119972bc6ce5a40e576aedcd3efb0529a44f2c Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Wed, 25 Mar 2020 14:35:11 +0100 Subject: [PATCH 3/8] Address further PR remarks --- cscs-checks/apps/greasy/greasy_check.py | 156 +++++++++--------------- 1 file changed, 58 insertions(+), 98 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index 8a655319d6..0a91561d6f 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -6,16 +6,14 @@ import itertools import os import fnmatch - from datetime import datetime import reframe as rfm import reframe.utility.sanity as sn - from reframe.core.launchers.registry import getlauncher -def toSeconds(str): +def to_seconds(str): return (datetime.strptime(str, '%H:%M:%S') - datetime.strptime('00:00:00', '%H:%M:%S')).total_seconds() @@ -29,11 +27,11 @@ def toSeconds(str): ['mpi', 'gpu', 24, 4, 3, 1], ['mpi', 'mc', 72, 12, 3, 1], ['mpi+openmp', 'gpu', 24, 3, 2, 2], - ['mpi+openmp', 'mc', 72, 6, 3, 2]) + ['mpi+openmp', 'mc', 72, 6, 3, 2] +) class GREASYCheck(rfm.RegressionTest): def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, nranks_per_worker, ncpus_per_worker): - self.valid_systems = ['daint:' + system, 'dom:' + system] self.valid_prog_environs = ['PrgEnv-gnu'] @@ -78,7 +76,7 @@ def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, self.perf_patterns = { 'time': sn.extractsingle(r'Total time: (?P\S+)', self.greasy_logfile, - 'perf', toSeconds) + 'perf', to_seconds) } # On SLURM there is no need to set OMP_NUM_THREADS if one defines @@ -140,11 +138,10 @@ def eval_sanity(self): output_files.append(file) num_greasy_tasks = len(output_files) - failure_msg = ('Requested %s task(s), but only executed %s tasks(s)' % - (self.num_greasy_tasks, num_greasy_tasks)) + failure_msg = ('Requested {0} task(s), but only executed {0} ' + 'tasks(s)'.format(self.num_greasy_tasks)) sn.evaluate(sn.assert_eq(num_greasy_tasks, self.num_greasy_tasks, msg=failure_msg)) - num_tasks = sn.getattr(self, 'nranks_per_worker') num_cpus_per_task = sn.getattr(self, 'num_cpus_per_task') @@ -165,111 +162,74 @@ def num_ranks(match): r'of \s*(\d+) from process \s*(\d+) out of ' r'\s*(\d+)', output_file) - failure_msg = ('Found %s Hello, World... pattern(s), but expected ' - '%s pattern(s) inside the output file %s' % ( - sn.count(result), - num_tasks * num_cpus_per_task, - output_file)) + failure_msg = ('Found {0} Hello, World... pattern(s), but expected' + ' {1} pattern(s) inside the output file {0}'.format( + sn.count(result), num_tasks * num_cpus_per_task, + output_file)) sn.evaluate(sn.assert_eq(sn.count(result), num_tasks * num_cpus_per_task, msg=failure_msg)) - sn.evaluate(sn.assert_true(sn.all( + sn.evaluate(sn.all( sn.chain( sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x), - msg='Found %d threads rather than %d' %(tid(x), num_threads(x))), - result), + msg='Found {0} threads ' + 'rather than {1}'.format( + tid(x), num_threads(x))), + result), sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x), - msg='Rank id %d is not lower ' - 'than the number of ranks' - ' %d in output file %s' - % (rank(x), - self.nranks_per_worker, - output_file)), - result), - sn.map( - lambda x: sn.assert_lt(tid(x), - self.num_cpus_per_task, - msg='Rank id %d is not lower ' - 'than the number of cpus per' - ' task %d in output file %s' - % (tid(x), + msg='Rank id {0} is not ' + 'lower than the number of ' + 'ranks {1} in output file ' + '{2}'.format(rank(x), + self.nranks_per_worker, + output_file)), + result), + sn.map(lambda x: sn.assert_lt(tid(x), + self.num_cpus_per_task, + msg='Rank id {0} is not ' + 'lower than the number of ' + 'cpus per task {1} in output' + ' file {2}'.format(tid(x), self.num_cpus_per_task, - output_file)), - result), + output_file)), + result), sn.map( lambda x: sn.assert_eq(num_threads(x), num_cpus_per_task, - msg='Found %d threads rather ' - 'than %d in output file %s' - % (num_threads(x), - self.num_cpus_per_task, - output_file)), + msg='Found {0} threads rather ' + 'than {1} in output file ' + '{2}'.format(num_threads(x), + self.num_cpus_per_task, + output_file)), result), sn.map(lambda x: sn.assert_lt(rank(x), num_tasks, - msg='Found %d threads rather ' - 'than %d in output file %s' - % (rank(x), - self.num_cpus_per_task, - output_file)), - result), + msg='Found {0} threads ' + 'rather than {1} in output ' + 'file {2}'.format(rank(x), + self.num_cpus_per_task, + output_file)), + result), sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks, - msg='Number of ranks %d is ' - 'not equal to %d in ' - ' output file %s' - % (num_ranks(x), - self.nranks_per_worker, + msg='Number of ranks {0} is ' + 'not equal to {1} in ' + 'output file ' + '{2}'.format(num_ranks(x), + self.nranks_per_worker, output_file)), - result), + result), ) - ))) - - sn.evaluate(sn.assert_eq(sn.count(sn.findall(r'Finished greasing', - self.greasy_logfile)), 1)) - - result = sn.findall(r'INFO: Summary of (\d+) tasks: ' - r'(\d+) OK, ' - r'(\d+) FAILED, ' - r'(\d+) CANCELLED, ' - r'(\d+) INVALID\.', self.greasy_logfile) - - sn.evaluate(sn.assert_true(sn.all( - sn.chain( - sn.map(lambda x: sn.assert_eq(int(x.group(1)), - self.num_greasy_tasks, - msg='Number of greasy ' - ' tasks is %d but found ' - '%s' - % (self.num_greasy_tasks, - x.group(1))), - result), - sn.map(lambda x: sn.assert_eq(int(x.group(2)), - self.num_greasy_tasks, - msg='Expected %d ' - 'successful tasks but ' - 'found %s' - % (self.num_greasy_tasks, - x.group(2))), - result), - sn.map(lambda x: sn.assert_eq(int(x.group(3)), 0, - msg='Expected 0 ' - 'failed tasks but ' - 'found %s' - % x.group(3)), - result), - sn.map(lambda x: sn.assert_eq(int(x.group(4)), 0, - msg='Expected 0 ' - 'cancelled tasks but ' - 'found %s' - % x.group(4)), - result), - sn.map(lambda x: sn.assert_eq(int(x.group(5)), 0, - msg='Expected 0 ' - 'invalid tasks but ' - 'found %s' - % x.group(5)), - result), + )) + + sn.evaluate(sn.assert_found(r'Finished greasing', self.greasy_logfile)) + + sn.evaluate(sn.assert_found(r'INFO: Summary of {0} tasks: ' + r'{0} OK, ' + r'0 FAILED, ' + r'0 CANCELLED, ' + r'0 INVALID\.'.format( + self.num_greasy_tasks), + self.greasy_logfile) ) - ))) return True From ffa5bda86d60b2333af466b0a793b16fe0f71721 Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Tue, 7 Apr 2020 14:10:03 +0200 Subject: [PATCH 4/8] Address PR remarks --- cscs-checks/apps/greasy/greasy_check.py | 133 +++++++++++------------- 1 file changed, 58 insertions(+), 75 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index 0a91561d6f..d2787205b4 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -30,9 +30,9 @@ def to_seconds(str): ['mpi+openmp', 'mc', 72, 6, 3, 2] ) class GREASYCheck(rfm.RegressionTest): - def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, + def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node, nranks_per_worker, ncpus_per_worker): - self.valid_systems = ['daint:' + system, 'dom:' + system] + self.valid_systems = ['daint:' + partition, 'dom:' + partition] self.valid_prog_environs = ['PrgEnv-gnu'] self.sourcepath = 'tasks_mpi_openmp.c' @@ -42,12 +42,11 @@ def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, # or not self.sleep_time = 60 self.build_system.cflags = ['-DSLEEP_TIME=%d' % self.sleep_time] - - if variant in ['openmp']: + if variant == 'openmp': self.build_system.cflags += ['-fopenmp'] - elif variant in ['mpi']: + elif variant == 'mpi': self.build_system.cflags += ['-D_MPI'] - elif variant in ['mpi+openmp']: + elif variant == 'mpi+openmp': self.build_system.cflags += ['-fopenmp', '-D_MPI'] self.executable = 'tasks_mpi_openmp.x' @@ -55,18 +54,16 @@ def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, self.executable_opts = [self.tasks_file] self.greasy_logfile = 'greasy.log' self.keep_files = [self.tasks_file, self.greasy_logfile] - - self.sanity_patterns = self.eval_sanity() - nnodes = 2 + self.use_multithreading = False self.num_greasy_tasks = num_greasy_tasks self.nworkes_per_node = nworkes_per_node self.nranks_per_worker = nranks_per_worker self.num_tasks_per_node = nranks_per_worker * nworkes_per_node self.num_tasks = self.num_tasks_per_node * nnodes self.num_cpus_per_task = ncpus_per_worker - - # Reference value is system agnostic and depnes + self.sanity_patterns = self.eval_sanity() + # Reference value is system agnostic refperf = self.sleep_time * num_greasy_tasks / nworkes_per_node / nnodes self.reference = { '*': { @@ -78,7 +75,6 @@ def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, self.greasy_logfile, 'perf', to_seconds) } - # On SLURM there is no need to set OMP_NUM_THREADS if one defines # num_cpus_per_task, but adding for completeness and portability self.variables = { @@ -86,26 +82,22 @@ def __init__(self, variant, system, num_greasy_tasks, nworkes_per_node, 'GREASY_NWORKERS_PER_NODE': str(nworkes_per_node), 'GREASY_LOGFILE': self.greasy_logfile } - self.modules = ['GREASY'] self.maintainers = ['VH', 'SK'] - self.use_multithreading = False - self.tags = {'production'} @rfm.run_before('run') def generate_tasks_file(self): - with open(os.path.join(self.stagedir, self.tasks_file), 'w') as outfile: + with open(os.path.join(self.stagedir, self.tasks_file), 'w') as fp: for i in range(self.num_greasy_tasks): - outfile.write("./%s output-%d\n" % (self.executable, i)) + fp.write('./%s output-%d\n' % (self.executable, i)) @rfm.run_before('run') def daint_dom_gpu_specific_workaround(self): if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - self.variables['CRAY_CUDA_MPS'] = "1" - self.variables['CUDA_VISIBLE_DEVICES'] = "0" - self.variables['GPU_DEVICE_ORDINAL'] = "0" - + self.variables['CRAY_CUDA_MPS'] = '1' + self.variables['CUDA_VISIBLE_DEVICES'] = '0' + self.variables['GPU_DEVICE_ORDINAL'] = '0' self.extra_resources = { 'gres': { 'gres': 'gpu:0,craynetwork:4' @@ -133,13 +125,11 @@ def set_launcher(self): @sn.sanity_function def eval_sanity(self): output_files = [] - for file in os.listdir(self.stagedir): - if file.startswith("output-"): - output_files.append(file) - + output_files = [file for file in os.listdir(self.stagedir) + if file.startswith('output-')] num_greasy_tasks = len(output_files) - failure_msg = ('Requested {0} task(s), but only executed {0} ' - 'tasks(s)'.format(self.num_greasy_tasks)) + failure_msg = (f'Requested {self.num_greasy_tasks} task(s), but ' + f'executed only {num_greasy_tasks} tasks(s)') sn.evaluate(sn.assert_eq(num_greasy_tasks, self.num_greasy_tasks, msg=failure_msg)) num_tasks = sn.getattr(self, 'nranks_per_worker') @@ -162,10 +152,10 @@ def num_ranks(match): r'of \s*(\d+) from process \s*(\d+) out of ' r'\s*(\d+)', output_file) - failure_msg = ('Found {0} Hello, World... pattern(s), but expected' - ' {1} pattern(s) inside the output file {0}'.format( - sn.count(result), num_tasks * num_cpus_per_task, - output_file)) + failure_msg = (f'Found {sn.count(result)} Hello, World... ' + f'pattern(s) but expected ' + f'{num_tasks * num_cpus_per_task} pattern(s) inside ' + f'the output file {output_file}') sn.evaluate(sn.assert_eq(sn.count(result), num_tasks * num_cpus_per_task, msg=failure_msg)) @@ -173,62 +163,55 @@ def num_ranks(match): sn.evaluate(sn.all( sn.chain( sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x), - msg='Found {0} threads ' - 'rather than {1}'.format( - tid(x), num_threads(x))), - result), + msg=f'Found {tid(x)} threads ' + 'rather than ' + f'{num_threads(x)}'), result), sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x), - msg='Rank id {0} is not ' - 'lower than the number of ' - 'ranks {1} in output file ' - '{2}'.format(rank(x), - self.nranks_per_worker, - output_file)), - result), + msg=f'Rank id {rank(x)} is ' + 'not lower than the number' + 'of ranks ' + f'{self.nranks_per_worker} in' + ' output file '), result), sn.map(lambda x: sn.assert_lt(tid(x), self.num_cpus_per_task, - msg='Rank id {0} is not ' - 'lower than the number of ' - 'cpus per task {1} in output' - ' file {2}'.format(tid(x), - self.num_cpus_per_task, - output_file)), + msg=f'Rank id {tid(x)} is not' + ' lower than the number of ' + 'cpus per task ' + f'{self.num_cpus_per_task} in' + f' output file {output_file}'), result), - sn.map( - lambda x: sn.assert_eq(num_threads(x), - num_cpus_per_task, - msg='Found {0} threads rather ' - 'than {1} in output file ' - '{2}'.format(num_threads(x), - self.num_cpus_per_task, - output_file)), - result), + sn.map(lambda x: sn.assert_eq(num_threads(x), + num_cpus_per_task, + msg=f'Found {num_threads(x)} ' + 'threads rather ' + 'than ' + f'{self.num_cpus_per_task}' + ' in output file ' + f'{output_file}'), result), sn.map(lambda x: sn.assert_lt(rank(x), num_tasks, - msg='Found {0} threads ' - 'rather than {1} in output ' - 'file {2}'.format(rank(x), - self.num_cpus_per_task, - output_file)), + msg=f'Found {rank(x)} threads' + ' rather than ' + f'{self.num_cpus_per_task} in' + f' output file {output_file}'), result), sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks, - msg='Number of ranks {0} is ' - 'not equal to {1} in ' - 'output file ' - '{2}'.format(num_ranks(x), - self.nranks_per_worker, - output_file)), - result), + msg=f'Number of ranks ' + f'{num_ranks(x)} is not ' + 'equal to ' + f'{self.nranks_per_worker}' + ' in output file ' + f'{output_file}'), result), ) )) sn.evaluate(sn.assert_found(r'Finished greasing', self.greasy_logfile)) - sn.evaluate(sn.assert_found(r'INFO: Summary of {0} tasks: ' - r'{0} OK, ' - r'0 FAILED, ' - r'0 CANCELLED, ' - r'0 INVALID\.'.format( - self.num_greasy_tasks), + sn.evaluate(sn.assert_found(f'INFO: Summary of {self.num_greasy_tasks} ' + f'tasks: ' + f'{self.num_greasy_tasks} OK, ' + f'0 FAILED, ' + f'0 CANCELLED, ' + f'0 INVALID\.', self.greasy_logfile) ) From 5bd02899c08a953e9f0255b30b7ed6927eb35df7 Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Thu, 9 Apr 2020 12:31:38 +0200 Subject: [PATCH 5/8] Address PR remarks --- cscs-checks/apps/greasy/greasy_check.py | 85 ++++++++++++------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index d2787205b4..bbfe314ce2 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -63,11 +63,12 @@ def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node, self.num_tasks = self.num_tasks_per_node * nnodes self.num_cpus_per_task = ncpus_per_worker self.sanity_patterns = self.eval_sanity() + # Reference value is system agnostic - refperf = self.sleep_time * num_greasy_tasks / nworkes_per_node / nnodes + refperf = self.sleep_time*num_greasy_tasks / nworkes_per_node / nnodes self.reference = { '*': { - 'time': (refperf, None, 0.3, 's') + 'time': (refperf, None, 0.5, 's') } } self.perf_patterns = { @@ -90,7 +91,7 @@ def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node, def generate_tasks_file(self): with open(os.path.join(self.stagedir, self.tasks_file), 'w') as fp: for i in range(self.num_greasy_tasks): - fp.write('./%s output-%d\n' % (self.executable, i)) + fp.write(f'./{self.executable} output-{i}\n') @rfm.run_before('run') def daint_dom_gpu_specific_workaround(self): @@ -126,7 +127,7 @@ def set_launcher(self): def eval_sanity(self): output_files = [] output_files = [file for file in os.listdir(self.stagedir) - if file.startswith('output-')] + if file.startswith('output-')] num_greasy_tasks = len(output_files) failure_msg = (f'Requested {self.num_greasy_tasks} task(s), but ' f'executed only {num_greasy_tasks} tasks(s)') @@ -162,45 +163,41 @@ def num_ranks(match): sn.evaluate(sn.all( sn.chain( - sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x), - msg=f'Found {tid(x)} threads ' - 'rather than ' - f'{num_threads(x)}'), result), - sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x), - msg=f'Rank id {rank(x)} is ' - 'not lower than the number' - 'of ranks ' - f'{self.nranks_per_worker} in' - ' output file '), result), - sn.map(lambda x: sn.assert_lt(tid(x), - self.num_cpus_per_task, - msg=f'Rank id {tid(x)} is not' - ' lower than the number of ' - 'cpus per task ' - f'{self.num_cpus_per_task} in' - f' output file {output_file}'), - result), - sn.map(lambda x: sn.assert_eq(num_threads(x), - num_cpus_per_task, - msg=f'Found {num_threads(x)} ' - 'threads rather ' - 'than ' - f'{self.num_cpus_per_task}' - ' in output file ' - f'{output_file}'), result), - sn.map(lambda x: sn.assert_lt(rank(x), num_tasks, - msg=f'Found {rank(x)} threads' - ' rather than ' - f'{self.num_cpus_per_task} in' - f' output file {output_file}'), - result), - sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks, - msg=f'Number of ranks ' - f'{num_ranks(x)} is not ' - 'equal to ' - f'{self.nranks_per_worker}' - ' in output file ' - f'{output_file}'), result), + sn.map(lambda x: sn.assert_lt( + tid(x), num_threads(x), + msg=f'Found {tid(x)} threads rather than ' + f'{num_threads(x)}'), result + ), + sn.map(lambda x: sn.assert_lt( + rank(x), num_ranks(x), + msg=f'Rank id {rank(x)} is not lower than the number of' + f' ranks {self.nranks_per_worker} in output file'), + result + ), + sn.map(lambda x: sn.assert_lt( + tid(x), self.num_cpus_per_task, + msg=f'Rank id {tid(x)} is not lower than the number of ' + f'cpus per task {self.num_cpus_per_task} in output ' + f'file {output_file}'), result + ), + sn.map(lambda x: sn.assert_eq( + num_threads(x), num_cpus_per_task, + msg=f'Found {num_threads(x)} threads rather than ' + f'{self.num_cpus_per_task} in output file ' + f'{output_file}'), result + ), + sn.map(lambda x: sn.assert_lt( + rank(x), num_tasks, + msg=f'Found {rank(x)} threads rather than ' + f'{self.num_cpus_per_task} in output file ' + f'{output_file}'), result + ), + sn.map(lambda x: sn.assert_eq( + num_ranks(x), num_tasks, + msg=f'Number of ranks {num_ranks(x)} is not equal to ' + f'{self.nranks_per_worker} in output file ' + f'{output_file}'), result + ), ) )) @@ -211,7 +208,7 @@ def num_ranks(match): f'{self.num_greasy_tasks} OK, ' f'0 FAILED, ' f'0 CANCELLED, ' - f'0 INVALID\.', + fr'0 INVALID\.', self.greasy_logfile) ) From 5f0f3f4f2c23f19bd88e16dbcfadf4aae369b036 Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Thu, 9 Apr 2020 13:46:51 +0200 Subject: [PATCH 6/8] Add delay for srun slowdowns --- cscs-checks/apps/greasy/greasy_check.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index bbfe314ce2..a2ba1a5358 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -65,7 +65,10 @@ def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node, self.sanity_patterns = self.eval_sanity() # Reference value is system agnostic - refperf = self.sleep_time*num_greasy_tasks / nworkes_per_node / nnodes + # Adding 10 secs of slowdown per greasy tasks + # this is to compensate for whenever the systems are full and srun gets + # slightly slower + refperf = (self.sleep_time+10)*num_greasy_tasks / nworkes_per_node / nnodes self.reference = { '*': { 'time': (refperf, None, 0.5, 's') From 85b7307c2d915c6a6a1df7db0008d326d059dd88 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 9 Apr 2020 14:39:27 +0200 Subject: [PATCH 7/8] Fix PEP8 issues --- cscs-checks/apps/greasy/greasy_check.py | 101 +++++++++++++----------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index a2ba1a5358..69d70214e7 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -63,12 +63,14 @@ def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node, self.num_tasks = self.num_tasks_per_node * nnodes self.num_cpus_per_task = ncpus_per_worker self.sanity_patterns = self.eval_sanity() - + # Reference value is system agnostic # Adding 10 secs of slowdown per greasy tasks # this is to compensate for whenever the systems are full and srun gets # slightly slower - refperf = (self.sleep_time+10)*num_greasy_tasks / nworkes_per_node / nnodes + refperf = ( + (self.sleep_time+10)*num_greasy_tasks / nworkes_per_node / nnodes + ) self.reference = { '*': { 'time': (refperf, None, 0.5, 's') @@ -158,61 +160,72 @@ def num_ranks(match): failure_msg = (f'Found {sn.count(result)} Hello, World... ' f'pattern(s) but expected ' - f'{num_tasks * num_cpus_per_task} pattern(s) inside ' - f'the output file {output_file}') + f'{num_tasks * num_cpus_per_task} pattern(s) ' + f'inside the output file {output_file}') sn.evaluate(sn.assert_eq(sn.count(result), num_tasks * num_cpus_per_task, msg=failure_msg)) sn.evaluate(sn.all( sn.chain( - sn.map(lambda x: sn.assert_lt( - tid(x), num_threads(x), - msg=f'Found {tid(x)} threads rather than ' - f'{num_threads(x)}'), result - ), - sn.map(lambda x: sn.assert_lt( - rank(x), num_ranks(x), - msg=f'Rank id {rank(x)} is not lower than the number of' - f' ranks {self.nranks_per_worker} in output file'), - result + sn.map( + lambda x: sn.assert_lt( + tid(x), num_threads(x), + msg=(f'Found {tid(x)} threads rather than ' + f'{num_threads(x)}') + ), result ), - sn.map(lambda x: sn.assert_lt( - tid(x), self.num_cpus_per_task, - msg=f'Rank id {tid(x)} is not lower than the number of ' - f'cpus per task {self.num_cpus_per_task} in output ' - f'file {output_file}'), result + sn.map( + lambda x: sn.assert_lt( + rank(x), num_ranks(x), + msg=(f'Rank id {rank(x)} is not lower than the ' + f'number of ranks {self.nranks_per_worker} ' + f'in output file') + ), result ), - sn.map(lambda x: sn.assert_eq( - num_threads(x), num_cpus_per_task, - msg=f'Found {num_threads(x)} threads rather than ' - f'{self.num_cpus_per_task} in output file ' - f'{output_file}'), result + sn.map( + lambda x: sn.assert_lt( + tid(x), self.num_cpus_per_task, + msg=(f'Rank id {tid(x)} is not lower than the ' + f'number of cpus per task ' + f'{self.num_cpus_per_task} in output ' + f'file {output_file}') + ), result ), - sn.map(lambda x: sn.assert_lt( - rank(x), num_tasks, - msg=f'Found {rank(x)} threads rather than ' - f'{self.num_cpus_per_task} in output file ' - f'{output_file}'), result + sn.map( + lambda x: sn.assert_eq( + num_threads(x), num_cpus_per_task, + msg=(f'Found {num_threads(x)} threads rather than ' + f'{self.num_cpus_per_task} in output file ' + f'{output_file}') + ), result ), - sn.map(lambda x: sn.assert_eq( - num_ranks(x), num_tasks, - msg=f'Number of ranks {num_ranks(x)} is not equal to ' - f'{self.nranks_per_worker} in output file ' - f'{output_file}'), result + sn.map( + lambda x: sn.assert_lt( + rank(x), num_tasks, + msg=(f'Found {rank(x)} threads rather than ' + f'{self.num_cpus_per_task} in output file ' + f'{output_file}') + ), result ), + sn.map( + lambda x: sn.assert_eq( + num_ranks(x), num_tasks, + msg=(f'Number of ranks {num_ranks(x)} is not equal to ' + f'{self.nranks_per_worker} in output file ' + f'{output_file}') + ), result + ) ) )) - sn.evaluate(sn.assert_found(r'Finished greasing', self.greasy_logfile)) - - sn.evaluate(sn.assert_found(f'INFO: Summary of {self.num_greasy_tasks} ' - f'tasks: ' - f'{self.num_greasy_tasks} OK, ' - f'0 FAILED, ' - f'0 CANCELLED, ' - fr'0 INVALID\.', - self.greasy_logfile) - ) + sn.evaluate(sn.assert_found( + (f'INFO: Summary of {self.num_greasy_tasks} ' + f'tasks: ' + f'{self.num_greasy_tasks} OK, ' + f'0 FAILED, ' + f'0 CANCELLED, ' + fr'0 INVALID\.'), self.greasy_logfile + )) return True From aed5438e1106174890bc00262ff24f649785c99c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 9 Apr 2020 14:42:15 +0200 Subject: [PATCH 8/8] Fix remaining PEP8 issues --- cscs-checks/apps/greasy/greasy_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cscs-checks/apps/greasy/greasy_check.py b/cscs-checks/apps/greasy/greasy_check.py index 69d70214e7..eaf4fd89f0 100644 --- a/cscs-checks/apps/greasy/greasy_check.py +++ b/cscs-checks/apps/greasy/greasy_check.py @@ -211,9 +211,9 @@ def num_ranks(match): sn.map( lambda x: sn.assert_eq( num_ranks(x), num_tasks, - msg=(f'Number of ranks {num_ranks(x)} is not equal to ' - f'{self.nranks_per_worker} in output file ' - f'{output_file}') + msg=(f'Number of ranks {num_ranks(x)} is not ' + f'equal to {self.nranks_per_worker} in ' + f'output file {output_file}') ), result ) )