From 3f89a5ae6858140ec4ea658ded2198566856139d Mon Sep 17 00:00:00 2001 From: Victor Holanda Date: Thu, 18 Oct 2018 15:09:05 +0200 Subject: [PATCH 1/8] Promote DGEMM check to multinode --- cscs-checks/libraries/math/dgemm.py | 114 ++++++++++++++++++------- cscs-checks/libraries/math/src/dgemm.c | 34 +++++--- 2 files changed, 105 insertions(+), 43 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index cae4dcd5ea..80da2fb7e8 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -2,48 +2,100 @@ import reframe.utility.sanity as sn +@rfm.required_version('>=2.14') +@rfm.simple_test class DGEMMTest(rfm.RegressionTest): def __init__(self): super().__init__() self.descr = 'DGEMM performance test' self.sourcepath = 'dgemm.c' - self.executable_opts = ['5000', '5000', '5000'] - self.sanity_patterns = sn.assert_found( - r'Time for \d+ DGEMM operations', self.stdout) - self.maintainers = ['AJ'] - self.tags = {'production'} + self.sanity_patterns = self.eval_sanity() + # the perf patterns are automaticaly generated inside sanity + self.perf_patterns = {} -@rfm.required_version('>=2.14') -@rfm.simple_test -class DGEMMTestMonch(DGEMMTest): - def __init__(self): - super().__init__() - self.tags = {'monch_acceptance'} - self.valid_systems = ['monch:compute'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.num_tasks = 1 + self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'monch:compute'] + self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel'] + + # FIXME: set the num_tasks to zero. 
+ self.num_tasks = 2 self.num_tasks_per_node = 1 self.num_tasks_per_core = 1 - self.num_cpus_per_task = 20 - self.num_tasks_per_socket = 10 + self.num_tasks_per_socket = 1 self.use_multithreading = False + + self.build_system = 'SingleSource' + self.build_system.cflags = ['-O3'] + + self.my_reference = { + 'daint:gpu': (430, -0.1, None), + 'daint:mc': (430, -0.1, None), + 'dom:gpu': (430, -0.1, None), + 'dom:mc': (430, -0.1, None), + 'monch:compute': (350, -0.1, None), + } + + self.maintainers = ['AJ', 'VH', 'VK'] + self.tags = {'production'} + + + def setup(self, partition, environ, **job_opts): + if partition.fullname in ['daint:gpu', 'dom:gpu']: + self.num_cpus_per_task = 12 + self.executable_opts = ['6000', '6000', '6000'] + + elif partition.fullname in ['daint:mc', 'dom:mc']: + self.num_cpus_per_task = 36 + self.executable_opts = ['6000', '6000', '6000'] + + elif partition.fullname in ['monch:compute']: + self.num_cpus_per_task = 20 + self.executable_opts = ['5000', '5000', '5000'] + self.build_system.cflags += ['-I$EBROOTOPENBLAS/include'] + self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', + '-lpthread', '-lgfortran'] + self.variables = { 'OMP_NUM_THREADS': str(self.num_cpus_per_task), 'MV2_ENABLE_AFFINITY': '0' } - self.build_system = 'SingleSource' - self.build_system.cflags = ['-O3', '-I$EBROOTOPENBLAS/include'] - self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', - '-lpthread', '-lgfortran'] - self.perf_patterns = { - 'perf': sn.max( - sn.extractall(r'Run\s\d\s+:\s+(?P<gflops>\S+)\s\S+', - self.stdout, "gflops", float) - ) - } - self.reference = { - 'monch:compute': { - 'perf': (350, -0.1, None) - } - } + + if environ.name.startswith('PrgEnv-cray'): + self.build_system.cflags += ['-hnoomp'] + + super().setup(partition, environ, **job_opts) + + + @sn.sanity_function + def eval_sanity(self): + failures = [] + + all_tested_nodes = sn.evaluate(sn.findall( + r'(?P<name>.*):\s+Time for \d+ DGEMM operations', + self.stdout + )) + 
number_of_tested_nodes = len(all_tested_nodes) + + if number_of_tested_nodes != self.num_tasks: + failures.append('Requested %s nodes, but found %s nodes)' % + (self.num_tasks, number_of_tested_nodes)) + #FIXME: list detected nodes in error message + sn.assert_false(failures, msg=', '.join(failures)) + + update_reference = False + if self.my_reference[self.current_partition.fullname]: + update_reference = True + + for node in all_tested_nodes: + nodename = node.group('name') + + if update_reference: + partition_name = self.current_partition.fullname + ref_name = '%s:%s' % (partition_name, nodename) + self.reference[ref_name] = self.my_reference[partition_name] + self.perf_patterns[nodename] = sn.extractsingle( + '%s:\\s+Flops based on.*:\\s+(?P<gflops>.*)\\sGFlops\\/sec' + % nodename, self.stdout, "gflops", float) + + return sn.assert_false(failures, msg=', '.join(failures)) diff --git a/cscs-checks/libraries/math/src/dgemm.c b/cscs-checks/libraries/math/src/dgemm.c index 961698a7a5..00f3c1206f 100644 --- a/cscs-checks/libraries/math/src/dgemm.c +++ b/cscs-checks/libraries/math/src/dgemm.c @@ -1,6 +1,7 @@ #include #include #include +#include extern void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); @@ -22,7 +23,15 @@ int main(int argc, char* argv[]) char tb='N'; struct timeval start_time, end_time, duration[LOOP_COUNT]; - + + +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX sysconf (_SC_HOST_NAME_MAX) +#endif + + char hostname[HOST_NAME_MAX]; + gethostname(hostname, sizeof(hostname)); + if (argc >= 2) m = atoi(argv[1]); if (argc >= 3) n = atoi(argv[2]); if (argc >= 4) k = atoi(argv[3]); @@ -34,10 +43,10 @@ int main(int argc, char* argv[]) double* B = (double*)malloc(sizeof(double)*k*n); double* C = (double*)malloc(sizeof(double)*m*n); - printf("Size of Matrix A(mxk)\t\t:\t%d x %d\n", m, k); - printf("Size of Matrix B(kxn)\t\t:\t%d x %d\n", k, n); - printf("Size of Matrix C(mxn)\t\t:\t%d x %d\n", m, n); - printf("LOOP 
COUNT\t\t\t:\t%d \n", LOOP_COUNT); + printf("%s: Size of Matrix A(mxk)\t\t:\t%d x %d\n", hostname, m, k); + printf("%s: Size of Matrix B(kxn)\t\t:\t%d x %d\n", hostname, k, n); + printf("%s: Size of Matrix C(mxn)\t\t:\t%d x %d\n", hostname, m, n); + printf("%s: LOOP COUNT\t\t\t:\t%d \n", hostname, LOOP_COUNT); printf("\n"); for (i=0; i Date: Thu, 18 Oct 2018 16:53:48 +0200 Subject: [PATCH 2/8] Adapt DGEMM to flexible allocation --- cscs-checks/libraries/math/dgemm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 80da2fb7e8..238e9defc0 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -77,9 +77,9 @@ def eval_sanity(self): )) number_of_tested_nodes = len(all_tested_nodes) - if number_of_tested_nodes != self.num_tasks: + if number_of_tested_nodes != self.job.num_tasks: failures.append('Requested %s nodes, but found %s nodes)' % - (self.num_tasks, number_of_tested_nodes)) + (self.job.num_tasks, number_of_tested_nodes)) #FIXME: list detected nodes in error message sn.assert_false(failures, msg=', '.join(failures)) From 95af8a881e9872ca7e915aeef26398aacfcc61ed Mon Sep 17 00:00:00 2001 From: Victor Holanda Date: Mon, 19 Nov 2018 21:35:33 +0100 Subject: [PATCH 3/8] Make DGEMM a flexible check --- cscs-checks/libraries/math/dgemm.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index a62c060425..6b5fe47f90 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -17,7 +17,6 @@ def __init__(self): self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-gnu'] - # FIXME: set the num_tasks to zero. 
self.num_tasks = 0 self.num_tasks_per_node = 1 self.num_tasks_per_core = 1 @@ -55,27 +54,25 @@ def setup(self, partition, environ, **job_opts): self.variables = { 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'MV2_ENABLE_AFFINITY': '0' } - - if environ.name.startswith('PrgEnv-cray'): - self.build_system.cflags += ['-hnoomp'] - super().setup(partition, environ, **job_opts) @sn.sanity_function def eval_sanity(self): - all_tested_nodes = sn.evaluate(sn.findall( + failure_msg = "" + + all_tested_nodes = sn.evaluate(sn.extractall( r'(?P<name>.*):\s+Time for \d+ DGEMM operations', self.stdout )) - number_of_tested_nodes = len(all_tested_nodes) + num_tested_nodes = len(all_tested_nodes) - if number_of_tested_nodes != self.job.num_tasks: - failed_nodes.append('Requested %s nodes, but found %s nodes)' % - (self.job.num_tasks, number_of_tested_nodes)) - sn.assert_false(failed_nodes, msg=', '.join(failed_nodes)) + # if num_tested_nodes != self.job.num_tasks: + if num_tested_nodes != self.job.num_tasks: + failure_msg = ('Requested %s nodes, but found %s nodes' % + (self.job.num_tasks, num_tested_nodes)) + sn.assert_false(failure_msg, msg=failure_msg) for node in all_tested_nodes: nodename = node.group('name') @@ -88,4 +85,4 @@ def eval_sanity(self): r'%s:\s+Flops based on.*:\s+(?P<gflops>.*)\sGFlops\/sec' % nodename, self.stdout, 'gflops', float) - return sn.assert_false(failed_nodes, msg=', '.join(failures)) + return sn.assert_false(failure_msg, msg=failure_msg) From ca11f81af38dcf704e088ce40b59e76380b17630 Mon Sep 17 00:00:00 2001 From: Victor Holanda Rusu Date: Wed, 28 Nov 2018 14:39:20 +0100 Subject: [PATCH 4/8] Clean the flexible DGEMM test --- cscs-checks/libraries/math/dgemm.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 6b5fe47f90..33e053fe57 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -35,7 +35,7 
@@ def __init__(self): } self.maintainers = ['AJ', 'VH', 'VK'] - self.tags = {'production'} + self.tags = {'diagnostic'} def setup(self, partition, environ, **job_opts): @@ -53,36 +53,30 @@ def setup(self, partition, environ, **job_opts): '-lpthread', '-lgfortran'] self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) } super().setup(partition, environ, **job_opts) @sn.sanity_function def eval_sanity(self): - failure_msg = "" - all_tested_nodes = sn.evaluate(sn.extractall( - r'(?P<name>.*):\s+Time for \d+ DGEMM operations', - self.stdout - )) + r'(?P<hostname>\S+):\s+Time for \d+ DGEMM operations', + self.stdout, 'hostname')) num_tested_nodes = len(all_tested_nodes) - # if num_tested_nodes != self.job.num_tasks: if num_tested_nodes != self.job.num_tasks: - failure_msg = ('Requested %s nodes, but found %s nodes' % + failure_msg = ('Requested %s node(s), but found %s node(s)' % (self.job.num_tasks, num_tested_nodes)) - sn.assert_false(failure_msg, msg=failure_msg) - - for node in all_tested_nodes: - nodename = node.group('name') + return sn.assert_false(failure_msg, msg=failure_msg) + for hostname in all_tested_nodes: if self.sys_reference[self.current_partition.fullname]: partition_name = self.current_partition.fullname - ref_name = '%s:%s' % (partition_name, nodename) + ref_name = '%s:%s' % (partition_name, hostname) self.reference[ref_name] = self.sys_reference[partition_name] - self.perf_patterns[nodename] = sn.extractsingle( + self.perf_patterns[hostname] = sn.extractsingle( r'%s:\s+Flops based on.*:\s+(?P<gflops>.*)\sGFlops\/sec' % - nodename, self.stdout, 'gflops', float) + hostname, self.stdout, 'gflops', float) - return sn.assert_false(failure_msg, msg=failure_msg) + return True From b2f0c0e173fbdc10c04d87cb5f041a49b2432261 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Wed, 5 Dec 2018 16:11:42 +0100 Subject: [PATCH 5/8] Fix printing of the average performance * Correct the reference values for daint/dom. 
* Minor code style fixes --- cscs-checks/libraries/math/dgemm.py | 18 +++++++++--------- cscs-checks/libraries/math/src/dgemm.c | 6 ++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 33e053fe57..06f7039057 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -24,20 +24,20 @@ def __init__(self): self.use_multithreading = False self.build_system = 'SingleSource' - self.build_system.cflags = ['-O3'] + self.build_system.cflags = ['-O3', '-fopenmp'] self.sys_reference = { - 'daint:gpu': (460, -0.1, None), - 'daint:mc': (460, -0.1, None), - 'dom:gpu': (460, -0.1, None), - 'dom:mc': (460, -0.1, None), + 'daint:gpu': (300.0, -0.15, None), + 'daint:mc': (860.0, -0.15, None), + 'dom:gpu': (300.0, -0.15, None), + 'dom:mc': (860.0, -0.15, None), + # FIXME update the values for monch 'monch:compute': (350, -0.1, None), } self.maintainers = ['AJ', 'VH', 'VK'] self.tags = {'diagnostic'} - def setup(self, partition, environ, **job_opts): if partition.fullname in ['daint:gpu', 'dom:gpu']: self.num_cpus_per_task = 12 @@ -67,7 +67,7 @@ def eval_sanity(self): if num_tested_nodes != self.job.num_tasks: failure_msg = ('Requested %s node(s), but found %s node(s)' % - (self.job.num_tasks, num_tested_nodes)) + (self.job.num_tasks, num_tested_nodes)) return sn.assert_false(failure_msg, msg=failure_msg) for hostname in all_tested_nodes: @@ -76,7 +76,7 @@ def eval_sanity(self): ref_name = '%s:%s' % (partition_name, hostname) self.reference[ref_name] = self.sys_reference[partition_name] self.perf_patterns[hostname] = sn.extractsingle( - r'%s:\s+Flops based on.*:\s+(?P<gflops>.*)\sGFlops\/sec' % - hostname, self.stdout, 'gflops', float) + r'%s:\s+Avg\. 
performance\s+:\s+(?P<gflops>\S+)' + r'\sGFlops/sec' % hostname, self.stdout, 'gflops', float) return True diff --git a/cscs-checks/libraries/math/src/dgemm.c b/cscs-checks/libraries/math/src/dgemm.c index 00f3c1206f..ef4bc5431d 100644 --- a/cscs-checks/libraries/math/src/dgemm.c +++ b/cscs-checks/libraries/math/src/dgemm.c @@ -77,9 +77,11 @@ int main(int argc, char* argv[]) printf("%s: Run %d \t\t\t\t:\t%.5f GFlops/sec\n", hostname, i, perf[i]); } + printf("\n"); - printf("%s: Flops based on given dimensions\t:\t%.5f GFlops/sec\n", hostname, gflop); - printf("%s: Avg. time / DGEMM operation\t:\t%f secs \n", hostname, time_avg/LOOP_COUNT); + printf("%s: Flops based on given dimensions\t:\t%.5f GFlops\n", hostname, gflop); + printf("%s: Avg. performance \t:\t%.5f GFlops/sec\n", hostname, gflop * LOOP_COUNT / time_avg); + printf("%s: Avg. time / DGEMM operation\t:\t%f secs \n", hostname, time_avg / LOOP_COUNT); printf("%s: Time for %d DGEMM operations\t:\t%f secs \n", hostname, LOOP_COUNT, time_avg); printf("\n"); From f6addce2308b6b9e22a2686b785233baf61a8274 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Wed, 5 Dec 2018 16:17:47 +0100 Subject: [PATCH 6/8] Fix code style problem --- cscs-checks/libraries/math/dgemm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 06f7039057..55dd521423 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -57,7 +57,6 @@ def setup(self, partition, environ, **job_opts): } super().setup(partition, environ, **job_opts) - @sn.sanity_function def eval_sanity(self): all_tested_nodes = sn.evaluate(sn.extractall( From aa1c4f33ebcec20ef1d5c7a81bfe06498b027922 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Thu, 6 Dec 2018 11:27:56 +0100 Subject: [PATCH 7/8] Address PR comments and add perf units --- cscs-checks/libraries/math/dgemm.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 
deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 55dd521423..9d2689cec3 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -2,7 +2,7 @@ import reframe.utility.sanity as sn -@rfm.required_version('>=2.14') +@rfm.required_version('>=2.16-dev0') @rfm.simple_test class DGEMMTest(rfm.RegressionTest): def __init__(self): @@ -16,23 +16,21 @@ def __init__(self): self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-gnu'] - self.num_tasks = 0 self.num_tasks_per_node = 1 self.num_tasks_per_core = 1 self.num_tasks_per_socket = 1 self.use_multithreading = False - self.build_system = 'SingleSource' self.build_system.cflags = ['-O3', '-fopenmp'] - + perf_units = 'GFlops/sec' self.sys_reference = { - 'daint:gpu': (300.0, -0.15, None), - 'daint:mc': (860.0, -0.15, None), - 'dom:gpu': (300.0, -0.15, None), - 'dom:mc': (860.0, -0.15, None), + 'daint:gpu': (300.0, -0.15, None, perf_units), + 'daint:mc': (860.0, -0.15, None, perf_units), + 'dom:gpu': (300.0, -0.15, None, perf_units), + 'dom:mc': (860.0, -0.15, None, perf_units), # FIXME update the values for monch - 'monch:compute': (350, -0.1, None), + 'monch:compute': (350, -0.1, None, perf_units), } self.maintainers = ['AJ', 'VH', 'VK'] @@ -63,11 +61,9 @@ def eval_sanity(self): r'(?P<hostname>\S+):\s+Time for \d+ DGEMM operations', self.stdout, 'hostname')) num_tested_nodes = len(all_tested_nodes) - - if num_tested_nodes != self.job.num_tasks: - failure_msg = ('Requested %s node(s), but found %s node(s)' % - (self.job.num_tasks, num_tested_nodes)) - return sn.assert_false(failure_msg, msg=failure_msg) + failure_msg = ('Requested %s node(s), but found %s node(s)' % + (self.job.num_tasks, num_tested_nodes)) + sn.assert_eq(num_tested_nodes, self.job.num_tasks, msg=failure_msg) for hostname in all_tested_nodes: if self.sys_reference[self.current_partition.fullname]: From 
f314f0d88d463bbb0ca7761cc9c9f06dfb97b865 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Thu, 6 Dec 2018 13:09:36 +0100 Subject: [PATCH 8/8] Address PR comments (version 2) --- cscs-checks/libraries/math/dgemm.py | 13 ++++++------- cscs-checks/libraries/math/src/dgemm.c | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cscs-checks/libraries/math/dgemm.py b/cscs-checks/libraries/math/dgemm.py index 9d2689cec3..32a984b1a2 100644 --- a/cscs-checks/libraries/math/dgemm.py +++ b/cscs-checks/libraries/math/dgemm.py @@ -23,14 +23,13 @@ def __init__(self): self.use_multithreading = False self.build_system = 'SingleSource' self.build_system.cflags = ['-O3', '-fopenmp'] - perf_units = 'GFlops/sec' self.sys_reference = { - 'daint:gpu': (300.0, -0.15, None, perf_units), - 'daint:mc': (860.0, -0.15, None, perf_units), - 'dom:gpu': (300.0, -0.15, None, perf_units), - 'dom:mc': (860.0, -0.15, None, perf_units), + 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'daint:mc': (860.0, -0.15, None, 'Gflop/s'), + 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'dom:mc': (860.0, -0.15, None, 'Gflop/s'), # FIXME update the values for monch - 'monch:compute': (350, -0.1, None, perf_units), + 'monch:compute': (350, -0.1, None, 'Gflop/s'), } self.maintainers = ['AJ', 'VH', 'VK'] @@ -72,6 +71,6 @@ def eval_sanity(self): self.reference[ref_name] = self.sys_reference[partition_name] self.perf_patterns[hostname] = sn.extractsingle( r'%s:\s+Avg\. 
performance\s+:\s+(?P<gflops>\S+)' - r'\sGFlops/sec' % hostname, self.stdout, 'gflops', float) + r'\sGflop/s' % hostname, self.stdout, 'gflops', float) return True diff --git a/cscs-checks/libraries/math/src/dgemm.c b/cscs-checks/libraries/math/src/dgemm.c index ef4bc5431d..211c346671 100644 --- a/cscs-checks/libraries/math/src/dgemm.c +++ b/cscs-checks/libraries/math/src/dgemm.c @@ -79,8 +79,8 @@ int main(int argc, char* argv[]) printf("\n"); - printf("%s: Flops based on given dimensions\t:\t%.5f GFlops\n", hostname, gflop); - printf("%s: Avg. performance \t:\t%.5f GFlops/sec\n", hostname, gflop * LOOP_COUNT / time_avg); + printf("%s: Flops based on given dimensions\t:\t%.5f Gflops\n", hostname, gflop); + printf("%s: Avg. performance \t:\t%.5f Gflop/s\n", hostname, gflop * LOOP_COUNT / time_avg); printf("%s: Avg. time / DGEMM operation\t:\t%f secs \n", hostname, time_avg / LOOP_COUNT); printf("%s: Time for %d DGEMM operations\t:\t%f secs \n", hostname, LOOP_COUNT, time_avg); printf("\n");