From 45d092024e669dd553378baef21b0c3436d6dda9 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 20 Nov 2020 14:47:49 +0100 Subject: [PATCH 01/10] eatmem_mpi --- cscs-checks/system/slurm/slurm.py | 54 ++++++ cscs-checks/system/slurm/src/eatmemory_mpi.c | 173 +++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 cscs-checks/system/slurm/src/eatmemory_mpi.c diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 3b1edc2a78..e47999c426 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -187,3 +187,57 @@ def __init__(self): @rfm.run_before('run') def set_memory_limit(self): self.job.options += ['--mem=2000'] + + +@rfm.simple_test +class MemoryMpiCheck(SlurmCompiledBaseCheck): + def __init__(self): + super().__init__() + self.valid_systems.append('eiger:mc') + self.time_limit = '5m' + self.sourcepath = 'eatmemory_mpi.c' + self.tags.add('mem') + self.executable_opts = ['100%'] + self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)', + self.stderr) + # {{{ perf + regex = (r'^Eating 256 MB\/mpi \*\d+mpi = -\d+ MB Mem: total: \d+ GB, ' + r'free: \d+ GB, avail: \d+ GB, using: (\d+) GB') + self.perf_patterns = { + 'max_cn_memory': sn.getattr(self, 'reference_meminfo'), + 'max_allocated_memory': sn.max(sn.extractall(regex, self.stdout, 1, + int)), + } + no_limit = (0, None, None, 'GB') + self.reference = { + '*': { + 'max_cn_memory': no_limit, + 'max_allocated_memory': (sn.getattr(self, 'reference_meminfo'), + -0.05, 0.05, 'GB'), + } + } + # }}} + + # {{{ hooks + @rfm.run_before('run') + def set_tasks(self): + tasks_per_node = { + 'dom:mc': 36, + 'daint:mc': 36, + 'dom:gpu': 12, + 'daint:gpu': 12, + 'eiger:mc': 128, + } + self.num_tasks_per_node = \ + tasks_per_node[self.current_partition.fullname] + self.num_tasks = self.num_tasks_per_node + self.job.launcher.options = ['-u'] + + @rfm.run_before('sanity') + def get_meminfo(self): + regex_mem = r'^Currently avail memory: (\d+)' + # regex_mem = r'^Currently total memory: (\d+)' + self.reference_meminfo = \ + sn.extractsingle(regex_mem, self.stdout, 1, + conv=lambda x: int(int(x) / 1024**3)) + # }}} diff --git a/cscs-checks/system/slurm/src/eatmemory_mpi.c b/cscs-checks/system/slurm/src/eatmemory_mpi.c new file mode 100644 index 0000000000..9498ce67c5 --- /dev/null +++ b/cscs-checks/system/slurm/src/eatmemory_mpi.c @@ -0,0 +1,173 @@ +// MPI version of eatmemory.c from Julio Viera +#include +#include +#include +#include +#include +#include +#include +#include + +#define PROC_FILE "/proc/meminfo" +#define MEMTOTAL 0 +#define MEMFREE 1 +#define MEMCACHED 2 +#define SWAPTOTAL 3 +#define SWAPFREE 4 +#define SWAPCACHED 5 +#define MEMAVAIL 6 +#define MEMORY_PERCENTAGE + +typedef struct { + char *str; + uint32_t val; +} meminfo_t; + +int cscs_read_proc_meminfo(int); + +#ifdef MEMORY_PERCENTAGE +size_t getTotalSystemMemory() { + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +size_t getFreeSystemMemory() { + long pages = sysconf(_SC_AVPHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} +#endif + +bool eat(long total, int chunk) { + long i; + int rank, mpi_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + for (i = 0; i < total; i += chunk) { + if (rank == 0) { + int mb_mpi = chunk / 1048576; + printf("Eating %d MB/mpi *%dmpi = -%d MB ", mb_mpi, mpi_size, + mb_mpi * mpi_size); + cscs_read_proc_meminfo(i); + } + short *buffer = malloc(sizeof(char) * chunk); + if (buffer == NULL) { + return false; + } + memset(buffer, 0, chunk); + } + return true; +} + +int cscs_read_proc_meminfo(int i) { + FILE *fp; + meminfo_t meminfo[] = {{"MemTotal:", 0}, {"MemFree:", 0}, + {"Cached:", 0}, {"SwapCached:", 0}, + {"SwapTotal:", 0}, {"SwapFree:", 0}, + {"MemAvailable:", 0}, {NULL, 0}}; + fp = fopen(PROC_FILE, "r"); + if (!fp) { + printf("Cannot read %s", PROC_FILE); + return -1; + } + char buf[80]; + while (fgets(buf, sizeof(buf), fp)) { + int i; + for (i = 0; meminfo[i].str; i++) { + size_t len = strlen(meminfo[i].str); + if (!strncmp(buf, meminfo[i].str, len)) { + char *ptr = buf + len + 1; + while (isspace(*ptr)) + ptr++; + sscanf(ptr, "%u kB", &meminfo[i].val); + } + } + } + fclose(fp); + + printf("Mem: total: %u GB, free: %u GB, avail: %u GB, using: %u GB\n", + meminfo[MEMTOTAL].val / 1048576, meminfo[MEMFREE].val / 1048576, + meminfo[MEMAVAIL].val / 1048576, + (meminfo[MEMTOTAL].val - meminfo[MEMAVAIL].val) / 1048576); + return 0; +} + +int main(int argc, char *argv[]) { + int rank, mpi_size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#ifdef MEMORY_PERCENTAGE + if (rank == 0) { + printf("Currently total memory: %zd\n", getTotalSystemMemory()); + printf("Currently avail memory: %zd\n", getFreeSystemMemory()); + } +#endif + int i; + for (i = 0; i < argc; i++) { + char *arg = argv[i]; + if (strcmp(arg, "-h") == 0 || strcmp(arg, "-?") == 0 || argc == 1) { + printf("Usage: eatmemory \n"); + printf("Size can be specified in megabytes or gigabytes in the following " + "way:\n"); + printf("# # Bytes example: 1024\n"); + printf("#M # Megabytes example: 15M\n"); + printf("#G # Gigabytes example: 2G\n"); +#ifdef MEMORY_PERCENTAGE + printf("#%% # Percent example: 50%%\n"); +#endif + printf("\n"); + } else if (i > 0) { + int len = strlen(arg); + char unit = arg[len - 1]; + long size = -1; + int chunk = 268435456; // = 256M + // int chunk=536870912; // = 512M + // int chunk=1073741824; // = 1G + if (!isdigit(unit)) { + if (unit == 'M' || unit == 'G') { + arg[len - 1] = 0; + size = atol(arg) * (unit == 'M' ? 1024 * 1024 : 1024 * 1024 * 1024); + } +#ifdef MEMORY_PERCENTAGE + else if (unit == '%') { + size = (atol(arg) * (long)getFreeSystemMemory()) / 100; + } +#endif + else { + printf("Invalid size format\n"); + exit(0); + } + } else { + size = atoi(arg); + } + + if (rank == 0) { + cscs_read_proc_meminfo(i); + printf("Peak: %d mpi * %ld bytes = %ld Mbytes\n", mpi_size, size, + mpi_size * size / 1000000); + printf("Eating %ld bytes in chunks of %d...\n", size, chunk); + printf("Eating %ld (1byte=8bits) Mbytes in chunks of %d Kbytes\n", + (size / 1000000), (chunk / 1000)); + } + if (eat(size, chunk)) { + if (isatty(fileno(stdin))) { + printf("Done, press any key to free the memory\n"); + } else { + if (rank == 0) + printf("rank %d Done, kill this process to free the memory\n", + rank); + while (true) { + sleep(1); + } + } + } else { + printf("ERROR: Could not allocate the memory"); + } + } + } + + MPI_Finalize(); + return 0; +} From 18abccaf8ea3190185e7285ded11b45092b363ac Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 20 Nov 2020 15:02:01 +0100 Subject: [PATCH 02/10] fix for amd --- cscs-checks/system/slurm/slurm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index e47999c426..7045e3cf3b 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -193,7 +193,7 @@ def set_memory_limit(self): class MemoryMpiCheck(SlurmCompiledBaseCheck): def __init__(self): super().__init__() - self.valid_systems.append('eiger:mc') + self.valid_systems.append('eiger:mc_lowmem') self.time_limit = '5m' self.sourcepath = 'eatmemory_mpi.c' self.tags.add('mem') @@ -226,7 +226,7 @@ def set_tasks(self): 'daint:mc': 36, 'dom:gpu': 12, 'daint:gpu': 12, - 'eiger:mc': 128, + 'eiger:mc_lowmem': 128, } self.num_tasks_per_node = \ tasks_per_node[self.current_partition.fullname] @@ -236,7 +236,6 @@ def set_tasks(self): @rfm.run_before('sanity') def get_meminfo(self): regex_mem = r'^Currently avail memory: (\d+)' - # regex_mem = r'^Currently total memory: (\d+)' self.reference_meminfo = \ sn.extractsingle(regex_mem, self.stdout, 1, conv=lambda x: int(int(x) / 1024**3)) From 421fe35b03a4305a33c08fa72543d3defc828667 Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 23 Nov 2020 09:00:47 +0100 Subject: [PATCH 03/10] fix for comment --- cscs-checks/system/slurm/slurm.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 7045e3cf3b..777d1c3da6 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -201,6 +201,10 @@ def __init__(self): self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)', self.stderr) # {{{ perf + regex_mem = r'^Currently avail memory: (\d+)' + self.reference_meminfo = \ + sn.extractsingle(regex_mem, self.stdout, 1, + conv=lambda x: int(int(x) / 1024**3)) regex = (r'^Eating 256 MB\/mpi \*\d+mpi = -\d+ MB Mem: total: \d+ GB, ' r'free: \d+ GB, avail: \d+ GB, using: (\d+) GB') self.perf_patterns = { @@ -213,7 +217,7 @@ def __init__(self): '*': { 'max_cn_memory': no_limit, 'max_allocated_memory': (sn.getattr(self, 'reference_meminfo'), - -0.05, 0.05, 'GB'), + -0.05, None, 'GB'), } } # }}} @@ -232,11 +236,4 @@ def set_tasks(self): tasks_per_node[self.current_partition.fullname] self.num_tasks = self.num_tasks_per_node self.job.launcher.options = ['-u'] - - @rfm.run_before('sanity') - def get_meminfo(self): - regex_mem = r'^Currently avail memory: (\d+)' - self.reference_meminfo = \ - sn.extractsingle(regex_mem, self.stdout, 1, - conv=lambda x: int(int(x) / 1024**3)) # }}} From e65da20624c5781637b555e9141b17960434aff0 Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 23 Nov 2020 14:42:51 +0100 Subject: [PATCH 04/10] fix for review --- cscs-checks/system/slurm/slurm.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 777d1c3da6..04cfd98039 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +import os import reframe as rfm import reframe.utility.osext as osext import reframe.utility.sanity as sn @@ -201,10 +202,6 @@ def __init__(self): self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)', self.stderr) # {{{ perf - regex_mem = r'^Currently avail memory: (\d+)' - self.reference_meminfo = \ - sn.extractsingle(regex_mem, self.stdout, 1, - conv=lambda x: int(int(x) / 1024**3)) regex = (r'^Eating 256 MB\/mpi \*\d+mpi = -\d+ MB Mem: total: \d+ GB, ' r'free: \d+ GB, avail: \d+ GB, using: (\d+) GB') self.perf_patterns = { @@ -236,4 +233,12 @@ def set_tasks(self): tasks_per_node[self.current_partition.fullname] self.num_tasks = self.num_tasks_per_node self.job.launcher.options = ['-u'] + + @rfm.run_after('run') + def get_meminfo(self): + regex_mem = r'^Currently avail memory: (\d+)' + abs_path = os.path.join(self.stagedir, str(self.stdout)) + self.reference_meminfo = \ + sn.extractsingle(regex_mem, abs_path, 1, + conv=lambda x: int(int(x) / 1024**3)) # }}} From 4f3225683f6bdb0dd8dcd3089368d6d97df4436a Mon Sep 17 00:00:00 2001 From: jgp Date: Wed, 2 Dec 2020 08:19:47 +0100 Subject: [PATCH 05/10] fix for review --- cscs-checks/system/slurm/slurm.py | 10 +++++----- cscs-checks/system/slurm/src/eatmemory_mpi.c | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 04cfd98039..b5ba7d25cf 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -194,6 +194,7 @@ def set_memory_limit(self): class MemoryMpiCheck(SlurmCompiledBaseCheck): def __init__(self): super().__init__() + self.maintainers = ['JG'] self.valid_systems.append('eiger:mc_lowmem') self.time_limit = '5m' self.sourcepath = 'eatmemory_mpi.c' @@ -229,8 +230,8 @@ def set_tasks(self): 'daint:gpu': 12, 'eiger:mc_lowmem': 128, } - self.num_tasks_per_node = \ - tasks_per_node[self.current_partition.fullname] + partname = self.current_partition.fullname + self.num_tasks_per_node = tasks_per_node[partname] self.num_tasks = self.num_tasks_per_node self.job.launcher.options = ['-u'] @@ -238,7 +239,6 @@ def set_tasks(self): def get_meminfo(self): regex_mem = r'^Currently avail memory: (\d+)' abs_path = os.path.join(self.stagedir, str(self.stdout)) - self.reference_meminfo = \ - sn.extractsingle(regex_mem, abs_path, 1, - conv=lambda x: int(int(x) / 1024**3)) + self.reference_meminfo = sn.extractsingle( + regex_mem, abs_path, 1, conv=lambda x: int(int(x) / 1024 ** 3)) # }}} diff --git a/cscs-checks/system/slurm/src/eatmemory_mpi.c b/cscs-checks/system/slurm/src/eatmemory_mpi.c index 9498ce67c5..0bfd91d804 100644 --- a/cscs-checks/system/slurm/src/eatmemory_mpi.c +++ b/cscs-checks/system/slurm/src/eatmemory_mpi.c @@ -1,4 +1,5 @@ // MPI version of eatmemory.c from Julio Viera +// 12/2020: add cscs_read_proc_meminfo from jg (cscs) #include #include #include From 86ad6fc8410747dd1b8a87d95341fdc9bd07c85e Mon Sep 17 00:00:00 2001 From: jgp Date: Sat, 5 Dec 2020 15:28:18 +0100 Subject: [PATCH 06/10] simpler --- cscs-checks/system/slurm/slurm.py | 31 +++++++++++++++----- cscs-checks/system/slurm/src/eatmemory_mpi.c | 12 +++++--- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index b5ba7d25cf..3186e9f181 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -195,7 +195,7 @@ class MemoryMpiCheck(SlurmCompiledBaseCheck): def __init__(self): super().__init__() self.maintainers = ['JG'] - self.valid_systems.append('eiger:mc_lowmem') + self.valid_systems.append('eiger:mc', 'pilatus:mc') self.time_limit = '5m' self.sourcepath = 'eatmemory_mpi.c' self.tags.add('mem') @@ -228,17 +228,32 @@ def set_tasks(self): 'daint:mc': 36, 'dom:gpu': 12, 'daint:gpu': 12, - 'eiger:mc_lowmem': 128, + 'eiger:mc': 128, + 'pilatus:mc': 128, } partname = self.current_partition.fullname self.num_tasks_per_node = tasks_per_node[partname] self.num_tasks = self.num_tasks_per_node self.job.launcher.options = ['-u'] - @rfm.run_after('run') - def get_meminfo(self): - regex_mem = r'^Currently avail memory: (\d+)' - abs_path = os.path.join(self.stagedir, str(self.stdout)) - self.reference_meminfo = sn.extractsingle( - regex_mem, abs_path, 1, conv=lambda x: int(int(x) / 1024 ** 3)) + @rfm.run_before('run') + def set_reference_memory(self): + reference_meminfo = { + 'dom:gpu': 64, + 'dom:mc': 64, + 'daint:gpu': 64, + 'daint:mc': 64, # this will pass with 64 GB and above memory sizes + # this will pass with 256 GB and above memory sizes: + 'eiger:mc': 250, + 'pilatus:mc': 250, + } + partname = self.current_partition.fullname + self.reference_meminfo = reference_meminfo[partname] + +## @rfm.run_after('run') +## def get_meminfo(self): +## regex_mem = r'^Currently avail memory: (\d+)' +## abs_path = os.path.join(self.stagedir, str(self.stdout)) +## self.reference_meminfo = sn.extractsingle( +## regex_mem, abs_path, 1, conv=lambda x: int(int(x) / 1024 ** 3)) # }}} diff --git a/cscs-checks/system/slurm/src/eatmemory_mpi.c b/cscs-checks/system/slurm/src/eatmemory_mpi.c index 0bfd91d804..84be87c8c2 100644 --- a/cscs-checks/system/slurm/src/eatmemory_mpi.c +++ b/cscs-checks/system/slurm/src/eatmemory_mpi.c @@ -87,7 +87,8 @@ int cscs_read_proc_meminfo(int i) { } fclose(fp); - printf("Mem: total: %u GB, free: %u GB, avail: %u GB, using: %u GB\n", + printf("memory from %s: total: %u GB, free: %u GB, avail: %u GB, using: %u GB\n", + PROC_FILE, meminfo[MEMTOTAL].val / 1048576, meminfo[MEMFREE].val / 1048576, meminfo[MEMAVAIL].val / 1048576, (meminfo[MEMTOTAL].val - meminfo[MEMAVAIL].val) / 1048576); @@ -101,8 +102,8 @@ int main(int argc, char *argv[]) { MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); #ifdef MEMORY_PERCENTAGE if (rank == 0) { - printf("Currently total memory: %zd\n", getTotalSystemMemory()); - printf("Currently avail memory: %zd\n", getFreeSystemMemory()); + printf("memory from sysconf: total: %zd avail: %zd\n", \ + getTotalSystemMemory(), getFreeSystemMemory() ); } #endif int i; @@ -123,7 +124,10 @@ int main(int argc, char *argv[]) { int len = strlen(arg); char unit = arg[len - 1]; long size = -1; - int chunk = 268435456; // = 256M + int chunk = 33554432; // 32M + // int chunk = 67108864; // 64M + // int chunk = 134217728; // 128M + // int chunk = 268435456; // = 256M // int chunk=536870912; // = 512M // int chunk=1073741824; // = 1G if (!isdigit(unit)) { From 2680f696f39f63ee2ecd8029868acf644d667cee Mon Sep 17 00:00:00 2001 From: jgp Date: Sat, 5 Dec 2020 15:43:19 +0100 Subject: [PATCH 07/10] simpler --- cscs-checks/system/slurm/slurm.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 3186e9f181..3362ea54a0 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -195,7 +195,8 @@ class MemoryMpiCheck(SlurmCompiledBaseCheck): def __init__(self): super().__init__() self.maintainers = ['JG'] - self.valid_systems.append('eiger:mc', 'pilatus:mc') + self.valid_systems.append('eiger:mc') + self.valid_systems.append('pilatus:mc') self.time_limit = '5m' self.sourcepath = 'eatmemory_mpi.c' self.tags.add('mem') @@ -203,8 +204,9 @@ def __init__(self): self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)', self.stderr) # {{{ perf - regex = (r'^Eating 256 MB\/mpi \*\d+mpi = -\d+ MB Mem: total: \d+ GB, ' - r'free: \d+ GB, avail: \d+ GB, using: (\d+) GB') + regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/' + r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:' + r' (\d+) GB') self.perf_patterns = { 'max_cn_memory': sn.getattr(self, 'reference_meminfo'), 'max_allocated_memory': sn.max(sn.extractall(regex, self.stdout, 1, @@ -249,11 +251,4 @@ def set_reference_memory(self): } partname = self.current_partition.fullname self.reference_meminfo = reference_meminfo[partname] - -## @rfm.run_after('run') -## def get_meminfo(self): -## regex_mem = r'^Currently avail memory: (\d+)' -## abs_path = os.path.join(self.stagedir, str(self.stdout)) -## self.reference_meminfo = sn.extractsingle( -## regex_mem, abs_path, 1, conv=lambda x: int(int(x) / 1024 ** 3)) # }}} From aad86494e2e9acb39357a65eb3455fd68d879e21 Mon Sep 17 00:00:00 2001 From: jgp Date: Sun, 6 Dec 2020 15:51:40 +0100 Subject: [PATCH 08/10] remove import os --- cscs-checks/system/slurm/slurm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 3362ea54a0..e19b699c61 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: BSD-3-Clause -import os import reframe as rfm import reframe.utility.osext as osext import reframe.utility.sanity as sn From 5ec66bcc29ce8fdab31f7179b16a59a42ab46f4b Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 7 Dec 2020 12:02:25 +0100 Subject: [PATCH 09/10] Coding style changes --- cscs-checks/system/slurm/slurm.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index e19b699c61..412c1ad5fd 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -190,12 +190,11 @@ def set_memory_limit(self): @rfm.simple_test -class MemoryMpiCheck(SlurmCompiledBaseCheck): +class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck): def __init__(self): super().__init__() self.maintainers = ['JG'] - self.valid_systems.append('eiger:mc') - self.valid_systems.append('pilatus:mc') + self.valid_systems += ['eiger:mc', 'pilatus:mc'] self.time_limit = '5m' self.sourcepath = 'eatmemory_mpi.c' self.tags.add('mem') @@ -208,15 +207,17 @@ def __init__(self): r' (\d+) GB') self.perf_patterns = { 'max_cn_memory': sn.getattr(self, 'reference_meminfo'), - 'max_allocated_memory': sn.max(sn.extractall(regex, self.stdout, 1, - int)), + 'max_allocated_memory': sn.max( + sn.extractall(regex, self.stdout, 1, int) + ), } no_limit = (0, None, None, 'GB') self.reference = { '*': { 'max_cn_memory': no_limit, - 'max_allocated_memory': (sn.getattr(self, 'reference_meminfo'), - -0.05, None, 'GB'), + 'max_allocated_memory': ( + sn.getattr(self, 'reference_meminfo'), -0.05, None, 'GB' + ), } } # }}} @@ -236,9 +237,11 @@ def set_tasks(self): self.num_tasks_per_node = tasks_per_node[partname] self.num_tasks = self.num_tasks_per_node self.job.launcher.options = ['-u'] + # }}} - @rfm.run_before('run') - def set_reference_memory(self): + @property + @sn.sanity_function + def reference_meminfo(self): reference_meminfo = { 'dom:gpu': 64, 'dom:mc': 64, @@ -248,6 +251,4 @@ def set_reference_memory(self): 'eiger:mc': 250, 'pilatus:mc': 250, } - partname = self.current_partition.fullname - self.reference_meminfo = reference_meminfo[partname] - # }}} + return reference_meminfo[self.current_partition.fullname] From 193da196f744aebc8359b30fd4eb00a2c2622ceb Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 7 Dec 2020 12:29:17 +0100 Subject: [PATCH 10/10] fix for review --- cscs-checks/system/slurm/slurm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py index 412c1ad5fd..339bffb83e 100644 --- a/cscs-checks/system/slurm/slurm.py +++ b/cscs-checks/system/slurm/slurm.py @@ -243,10 +243,10 @@ def set_tasks(self): @sn.sanity_function def reference_meminfo(self): reference_meminfo = { - 'dom:gpu': 64, - 'dom:mc': 64, - 'daint:gpu': 64, - 'daint:mc': 64, # this will pass with 64 GB and above memory sizes + 'dom:gpu': 62, + 'dom:mc': 62, + 'daint:gpu': 62, + 'daint:mc': 62, # this will pass with 64 GB and above memory sizes # this will pass with 256 GB and above memory sizes: 'eiger:mc': 250, 'pilatus:mc': 250,