From 2265821b8c48fb46ad4d47a20b85970ffe0c654c Mon Sep 17 00:00:00 2001 From: Sergei Kliavinek Date: Thu, 9 Sep 2021 17:36:32 +0300 Subject: [PATCH 1/6] add redesigned spark tests --- cscs-checks/apps/spark/spark_check.py | 33 ++++------ hpctestlib/apps/spark/base_check.py | 65 +++++++++++++++++++ .../apps/spark/src/spark_pi.py | 0 3 files changed, 76 insertions(+), 22 deletions(-) create mode 100644 hpctestlib/apps/spark/base_check.py rename {cscs-checks => hpctestlib}/apps/spark/src/spark_pi.py (100%) diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py index 3a89e5483a..8d7075a8dc 100644 --- a/cscs-checks/apps/spark/spark_check.py +++ b/cscs-checks/apps/spark/spark_check.py @@ -8,27 +8,17 @@ import reframe as rfm import reframe.utility.sanity as sn from reframe.core.backends import getlauncher - +from hpctestlib.apps.spark.base_check import Spark_BaseCheck @rfm.simple_test -class SparkCheck(rfm.RunOnlyRegressionTest): - variant = parameter(['spark', 'pyspark']) - - def __init__(self): - self.descr = f'Simple calculation of pi with {self.variant}' - self.valid_systems = ['daint:gpu', 'daint:mc', - 'dom:gpu', 'dom:mc'] - self.valid_prog_environs = ['builtin'] - self.modules = ['Spark'] - self.prerun_cmds = ['start-all.sh'] - self.postrun_cmds = ['stop-all.sh'] - self.num_tasks = 3 - self.num_tasks_per_node = 1 - pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)', - self.stdout, 'pi', float) - self.sanity_patterns = sn.assert_lt(sn.abs(pi_value - math.pi), 0.01) - self.maintainers = ['TM', 'RS'] - self.tags = {'production'} +class SparkCheck(Spark_BaseCheck): + valid_systems = ['daint:gpu', 'daint:mc','dom:gpu', 'dom:mc'] + valid_prog_environs = ['builtin'] + modules = ['Spark'] + num_tasks = 3 + num_tasks_per_node = 1 + maintainers = ['TM', 'RS'] + tags = {'production'} @run_before('run') def prepare_run(self): @@ -43,7 +33,6 @@ def prepare_run(self): 'SPARK_WORKER_CORES': str(num_workers), 'SPARK_LOCAL_DIRS': '"/tmp"', } - 
self.executable = 'spark-submit' self.executable_opts = [ f'--conf spark.default.parallelism={num_workers}', f'--conf spark.executor.cores={exec_cores}', @@ -55,9 +44,9 @@ def prepare_run(self): '--class org.apache.spark.examples.SparkPi', '$EBROOTSPARK/examples/jars/spark-examples*.jar 10000' ] - else: - self.executable_opts.append('spark_pi.py') + @run_before('run') + def set_job_launcher(self): # The job launcher has to be changed since the `spark-submit` # script is not used with srun. self.job.launcher = getlauncher('local')() diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py new file mode 100644 index 0000000000..4eeb5fd89c --- /dev/null +++ b/hpctestlib/apps/spark/base_check.py @@ -0,0 +1,65 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import math + +import reframe as rfm +import reframe.utility.sanity as sn +from reframe.core.backends import getlauncher + + +class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): + '''Base class for the Spark Test. + + Apache Spark is a unified analytics engine for large-scale data + processing. It provides high-level APIs in Java, Scala, Python + and R, and an optimized engine that supports general execution + graphs. It also supports a rich set of higher-level tools including + Spark SQL for SQL and structured data processing, MLlib for machine + learning, GraphX for graph processing, and Structured Streaming for + incremental computation and stream processing (see spark.apache.org). + + The presented abstract run-only class checks the spark perfomance. + To do this, it is necessary to define the tolerance of admissible + deviation . This data is used to check if + the task is being executed correctly, that is, the final value of pi + is correct (approximately the same as obtained from library math). 
+ The default assumption is that Spark is already installed on + the device under test. + ''' + + #: Name of the package to be checked + variant = parameter(['spark', 'pyspark']) + + #: Maximum deviation from the table value of pi, + #: that is acceptable. + #: + #: :type: float + #: :default: 0.01 + tolerance = variable(float, value=0.01) + + + @run_after('init') + def set_description(self): + self.mydescr = f'Simple calculation of pi with {self.variant}' + + @run_before('run') + def set_run_cmds(self): + self.prerun_cmds = ['start-all.sh'] + self.postrun_cmds = ['stop-all.sh'] + + @run_before('run') + def set_executable_opts(self): + self.executable = 'spark-submit' + if self.variant == 'pyspark': + self.executable_opts.append('spark_pi.py') + + @sanity_function + def assert_pi_readout(self): + '''Assert the obtained pi value meets the specified tolerances.''' + + pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)', + self.stdout, 'pi', float) + return sn.assert_lt(sn.abs(pi_value - math.pi), self.tolerance) diff --git a/cscs-checks/apps/spark/src/spark_pi.py b/hpctestlib/apps/spark/src/spark_pi.py similarity index 100% rename from cscs-checks/apps/spark/src/spark_pi.py rename to hpctestlib/apps/spark/src/spark_pi.py From 7988d3a8633499ace69a70ec5662bce1fad26846 Mon Sep 17 00:00:00 2001 From: Sergei Kliavinek Date: Thu, 9 Sep 2021 17:41:12 +0300 Subject: [PATCH 2/6] fix problems with white space and blank lines --- cscs-checks/apps/spark/spark_check.py | 2 +- hpctestlib/apps/spark/base_check.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py index 8d7075a8dc..20767e32d1 100644 --- a/cscs-checks/apps/spark/spark_check.py +++ b/cscs-checks/apps/spark/spark_check.py @@ -12,7 +12,7 @@ @rfm.simple_test class SparkCheck(Spark_BaseCheck): - valid_systems = ['daint:gpu', 'daint:mc','dom:gpu', 'dom:mc'] + valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] 
valid_prog_environs = ['builtin'] modules = ['Spark'] num_tasks = 3 diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py index 4eeb5fd89c..e857213509 100644 --- a/hpctestlib/apps/spark/base_check.py +++ b/hpctestlib/apps/spark/base_check.py @@ -40,7 +40,6 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): #: :default: 0.01 tolerance = variable(float, value=0.01) - @run_after('init') def set_description(self): self.mydescr = f'Simple calculation of pi with {self.variant}' @@ -52,9 +51,9 @@ def set_run_cmds(self): @run_before('run') def set_executable_opts(self): - self.executable = 'spark-submit' - if self.variant == 'pyspark': - self.executable_opts.append('spark_pi.py') + self.executable = 'spark-submit' + if self.variant == 'pyspark': + self.executable_opts.append('spark_pi.py') @sanity_function def assert_pi_readout(self): From beb072df8b00d82ba5e52becba66e19d60034595 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Thu, 30 Sep 2021 16:39:42 +0200 Subject: [PATCH 3/6] Improve formatting and remove unused imports --- cscs-checks/apps/spark/spark_check.py | 1 + hpctestlib/apps/spark/base_check.py | 23 +++++++---------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py index 20767e32d1..6a441b82b1 100644 --- a/cscs-checks/apps/spark/spark_check.py +++ b/cscs-checks/apps/spark/spark_check.py @@ -7,6 +7,7 @@ import reframe as rfm import reframe.utility.sanity as sn + from reframe.core.backends import getlauncher from hpctestlib.apps.spark.base_check import Spark_BaseCheck diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py index e857213509..3202a4d92f 100644 --- a/hpctestlib/apps/spark/base_check.py +++ b/hpctestlib/apps/spark/base_check.py @@ -7,7 +7,6 @@ import reframe as rfm import reframe.utility.sanity as sn -from reframe.core.backends import getlauncher class 
Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): @@ -21,23 +20,15 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): learning, GraphX for graph processing, and Structured Streaming for incremental computation and stream processing (see spark.apache.org). - The presented abstract run-only class checks the spark perfomance. - To do this, it is necessary to define the tolerance of admissible - deviation . This data is used to check if - the task is being executed correctly, that is, the final value of pi - is correct (approximately the same as obtained from library math). - The default assumption is that Spark is already installed on - the device under test. + The present abstract run-only class checks the Spark perfomance. + To do this, it is necessary to define the tolerance of acceptable + deviation. The tolerance is used to check if the task executed correctly, + comparing the value of pi calculated to the one obtained from the math + library. The default assumption is that Spark is already installed on the + system under test. ''' - #: Name of the package to be checked variant = parameter(['spark', 'pyspark']) - - #: Maximum deviation from the table value of pi, - #: that is acceptable. 
- #: - #: :type: float - #: :default: 0.01 tolerance = variable(float, value=0.01) @run_after('init') @@ -57,7 +48,7 @@ def set_executable_opts(self): @sanity_function def assert_pi_readout(self): - '''Assert the obtained pi value meets the specified tolerances.''' + '''Assert that the obtained pi value meets the specified tolerances.''' pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)', self.stdout, 'pi', float) From cdbc3f54100212a286911e8723ab088a5a65eaf0 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Thu, 30 Sep 2021 16:51:03 +0200 Subject: [PATCH 4/6] Remove remaining unused imports --- cscs-checks/apps/spark/spark_check.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py index 6a441b82b1..8101d41755 100644 --- a/cscs-checks/apps/spark/spark_check.py +++ b/cscs-checks/apps/spark/spark_check.py @@ -3,10 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause -import math - import reframe as rfm -import reframe.utility.sanity as sn from reframe.core.backends import getlauncher from hpctestlib.apps.spark.base_check import Spark_BaseCheck From 142bfb69d028280cfc9634ad59f4db9f95e01a9c Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Fri, 1 Oct 2021 11:25:03 +0200 Subject: [PATCH 5/6] Move test variables outside of methods --- hpctestlib/apps/spark/base_check.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py index 3202a4d92f..46f2e8bbd1 100644 --- a/hpctestlib/apps/spark/base_check.py +++ b/hpctestlib/apps/spark/base_check.py @@ -30,19 +30,17 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): variant = parameter(['spark', 'pyspark']) tolerance = variable(float, value=0.01) + prerun_cmds = ['start-all.sh'] + postrun_cmds = ['stop-all.sh'] + executable = 'spark-submit' + executable_opts = required @run_after('init') def set_description(self): 
self.mydescr = f'Simple calculation of pi with {self.variant}' @run_before('run') - def set_run_cmds(self): - self.prerun_cmds = ['start-all.sh'] - self.postrun_cmds = ['stop-all.sh'] - - @run_before('run') - def set_executable_opts(self): - self.executable = 'spark-submit' + def set_pyspark_opts(self): if self.variant == 'pyspark': self.executable_opts.append('spark_pi.py') From a868ccfc216e858de1213851843e1ce628f8510c Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Fri, 8 Oct 2021 09:29:13 +0200 Subject: [PATCH 6/6] Address PR comments --- cscs-checks/apps/spark/spark_check.py | 15 ++++++--------- .../{base_check.py => compute_pi/__init__.py} | 7 +------ .../apps/spark/{ => compute_pi}/src/spark_pi.py | 0 3 files changed, 7 insertions(+), 15 deletions(-) rename hpctestlib/apps/spark/{base_check.py => compute_pi/__init__.py} (89%) rename hpctestlib/apps/spark/{ => compute_pi}/src/spark_pi.py (100%) diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py index 8101d41755..97e716f3c2 100644 --- a/cscs-checks/apps/spark/spark_check.py +++ b/cscs-checks/apps/spark/spark_check.py @@ -6,10 +6,10 @@ import reframe as rfm from reframe.core.backends import getlauncher -from hpctestlib.apps.spark.base_check import Spark_BaseCheck +from hpctestlib.apps.spark.compute_pi import ComputePi @rfm.simple_test -class SparkCheck(Spark_BaseCheck): +class SparkCheck(ComputePi): valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] valid_prog_environs = ['builtin'] modules = ['Spark'] @@ -20,13 +20,8 @@ class SparkCheck(Spark_BaseCheck): @run_before('run') def prepare_run(self): - if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - num_workers = 12 - exec_cores = 3 - else: - num_workers = 36 - exec_cores = 9 - + num_workers = self.current_partition.processor.num_cores + exec_cores = num_workers // 4 self.variables = { 'SPARK_WORKER_CORES': str(num_workers), 'SPARK_LOCAL_DIRS': '"/tmp"', @@ -42,6 +37,8 @@ def 
prepare_run(self): '--class org.apache.spark.examples.SparkPi', '$EBROOTSPARK/examples/jars/spark-examples*.jar 10000' ] + elif self.variant == 'pyspark': + self.executable_opts += ['spark_pi.py'] @run_before('run') def set_job_launcher(self): diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/compute_pi/__init__.py similarity index 89% rename from hpctestlib/apps/spark/base_check.py rename to hpctestlib/apps/spark/compute_pi/__init__.py index 46f2e8bbd1..323bee711d 100644 --- a/hpctestlib/apps/spark/base_check.py +++ b/hpctestlib/apps/spark/compute_pi/__init__.py @@ -9,7 +9,7 @@ import reframe.utility.sanity as sn -class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): +class ComputePi(rfm.RunOnlyRegressionTest, pin_prefix=True): '''Base class for the Spark Test. Apache Spark is a unified analytics engine for large-scale data @@ -39,11 +39,6 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True): def set_description(self): self.mydescr = f'Simple calculation of pi with {self.variant}' - @run_before('run') - def set_pyspark_opts(self): - if self.variant == 'pyspark': - self.executable_opts.append('spark_pi.py') - @sanity_function def assert_pi_readout(self): '''Assert that the obtained pi value meets the specified tolerances.''' diff --git a/hpctestlib/apps/spark/src/spark_pi.py b/hpctestlib/apps/spark/compute_pi/src/spark_pi.py similarity index 100% rename from hpctestlib/apps/spark/src/spark_pi.py rename to hpctestlib/apps/spark/compute_pi/src/spark_pi.py