From 2265821b8c48fb46ad4d47a20b85970ffe0c654c Mon Sep 17 00:00:00 2001
From: Sergei Kliavinek <klyavinekss@gmail.com>
Date: Thu, 9 Sep 2021 17:36:32 +0300
Subject: [PATCH 1/6] add redesigned spark tests

---
 cscs-checks/apps/spark/spark_check.py         | 33 ++++------
 hpctestlib/apps/spark/base_check.py           | 65 +++++++++++++++++++
 .../apps/spark/src/spark_pi.py                |  0
 3 files changed, 76 insertions(+), 22 deletions(-)
 create mode 100644 hpctestlib/apps/spark/base_check.py
 rename {cscs-checks => hpctestlib}/apps/spark/src/spark_pi.py (100%)
diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py
index 3a89e5483a..8d7075a8dc 100644
--- a/cscs-checks/apps/spark/spark_check.py
+++ b/cscs-checks/apps/spark/spark_check.py
@@ -8,27 +8,17 @@
 import reframe as rfm
 import reframe.utility.sanity as sn
 from reframe.core.backends import getlauncher
-
+from hpctestlib.apps.spark.base_check import Spark_BaseCheck
 
 @rfm.simple_test
-class SparkCheck(rfm.RunOnlyRegressionTest):
-    variant = parameter(['spark', 'pyspark'])
-
-    def __init__(self):
-        self.descr = f'Simple calculation of pi with {self.variant}'
-        self.valid_systems = ['daint:gpu', 'daint:mc',
-                              'dom:gpu', 'dom:mc']
-        self.valid_prog_environs = ['builtin']
-        self.modules = ['Spark']
-        self.prerun_cmds = ['start-all.sh']
-        self.postrun_cmds = ['stop-all.sh']
-        self.num_tasks = 3
-        self.num_tasks_per_node = 1
-        pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)',
-                                    self.stdout, 'pi', float)
-        self.sanity_patterns = sn.assert_lt(sn.abs(pi_value - math.pi), 0.01)
-        self.maintainers = ['TM', 'RS']
-        self.tags = {'production'}
+class SparkCheck(Spark_BaseCheck):
+    valid_systems = ['daint:gpu', 'daint:mc','dom:gpu', 'dom:mc']
+    valid_prog_environs = ['builtin']
+    modules = ['Spark']
+    num_tasks = 3
+    num_tasks_per_node = 1
+    maintainers = ['TM', 'RS']
+    tags = {'production'}
 
     @run_before('run')
     def prepare_run(self):
@@ -43,7 +33,6 @@ def prepare_run(self):
             'SPARK_WORKER_CORES': str(num_workers),
             'SPARK_LOCAL_DIRS': '"/tmp"',
         }
-        self.executable = 'spark-submit'
         self.executable_opts = [
             f'--conf spark.default.parallelism={num_workers}',
             f'--conf spark.executor.cores={exec_cores}',
@@ -55,9 +44,9 @@ def prepare_run(self):
                 '--class org.apache.spark.examples.SparkPi',
                 '$EBROOTSPARK/examples/jars/spark-examples*.jar 10000'
             ]
-        else:
-            self.executable_opts.append('spark_pi.py')
 
+    @run_before('run')
+    def set_job_launcher(self):
         # The job launcher has to be changed since the `spark-submit`
         # script is not used with srun.
         self.job.launcher = getlauncher('local')()
diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py
new file mode 100644
index 0000000000..4eeb5fd89c
--- /dev/null
+++ b/hpctestlib/apps/spark/base_check.py
@@ -0,0 +1,65 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import math
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+from reframe.core.backends import getlauncher
+
+
+class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
+    '''Base class for the Spark Test.
+
+    Apache Spark is a unified analytics engine for large-scale data
+    processing. It provides high-level APIs in Java, Scala, Python
+    and R, and an optimized engine that supports general execution
+    graphs. It also supports a rich set of higher-level tools including
+    Spark SQL for SQL and structured data processing, MLlib for machine
+    learning, GraphX for graph processing, and Structured Streaming for
+    incremental computation and stream processing (see spark.apache.org).
+
+    The presented abstract run-only class checks the spark perfomance.
+    To do this, it is necessary to define the tolerance of admissible
+    deviation . This data is used to check if
+    the task is being executed correctly, that is, the final value of pi
+    is correct (approximately the same as obtained from library math).
+    The default assumption is that Spark is already installed on
+    the device under test.
+    '''
+
+    #: Name of the package to be checked
+    variant = parameter(['spark', 'pyspark'])
+
+    #: Maximum deviation from the table value of pi,
+    #: that is acceptable.
+    #:
+    #: :type: float
+    #: :default: 0.01
+    tolerance = variable(float, value=0.01)
+
+
+    @run_after('init')
+    def set_description(self):
+        self.mydescr = f'Simple calculation of pi with {self.variant}'
+
+    @run_before('run')
+    def set_run_cmds(self):
+        self.prerun_cmds = ['start-all.sh']
+        self.postrun_cmds = ['stop-all.sh']
+
+    @run_before('run')
+    def set_executable_opts(self):
+            self.executable = 'spark-submit'
+            if self.variant == 'pyspark':
+                self.executable_opts.append('spark_pi.py')
+
+    @sanity_function
+    def assert_pi_readout(self):
+        '''Assert the obtained pi value meets the specified tolerances.'''
+
+        pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)',
+                                    self.stdout, 'pi', float)
+        return sn.assert_lt(sn.abs(pi_value - math.pi), self.tolerance)
diff --git a/cscs-checks/apps/spark/src/spark_pi.py b/hpctestlib/apps/spark/src/spark_pi.py
similarity index 100%
rename from cscs-checks/apps/spark/src/spark_pi.py
rename to hpctestlib/apps/spark/src/spark_pi.py

From 7988d3a8633499ace69a70ec5662bce1fad26846 Mon Sep 17 00:00:00 2001
From: Sergei Kliavinek <klyavinekss@gmail.com>
Date: Thu, 9 Sep 2021 17:41:12 +0300
Subject: [PATCH 2/6] fix problems with white space and blank lines

---
 cscs-checks/apps/spark/spark_check.py | 2 +-
 hpctestlib/apps/spark/base_check.py   | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py
index 8d7075a8dc..20767e32d1 100644
--- a/cscs-checks/apps/spark/spark_check.py
+++ b/cscs-checks/apps/spark/spark_check.py
@@ -12,7 +12,7 @@
 
 @rfm.simple_test
 class SparkCheck(Spark_BaseCheck):
-    valid_systems = ['daint:gpu', 'daint:mc','dom:gpu', 'dom:mc']
+    valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
     valid_prog_environs = ['builtin']
     modules = ['Spark']
     num_tasks = 3
diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py
index 4eeb5fd89c..e857213509 100644
--- a/hpctestlib/apps/spark/base_check.py
+++ b/hpctestlib/apps/spark/base_check.py
@@ -40,7 +40,6 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
     #: :default: 0.01
     tolerance = variable(float, value=0.01)
 
-
     @run_after('init')
     def set_description(self):
         self.mydescr = f'Simple calculation of pi with {self.variant}'
@@ -52,9 +51,9 @@ def set_run_cmds(self):
 
     @run_before('run')
     def set_executable_opts(self):
-            self.executable = 'spark-submit'
-            if self.variant == 'pyspark':
-                self.executable_opts.append('spark_pi.py')
+        self.executable = 'spark-submit'
+        if self.variant == 'pyspark':
+            self.executable_opts.append('spark_pi.py')
 
     @sanity_function
     def assert_pi_readout(self):

From beb072df8b00d82ba5e52becba66e19d60034595 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras <manitaras@cscs.ch>
Date: Thu, 30 Sep 2021 16:39:42 +0200
Subject: [PATCH 3/6] Improve formatting and remove unused imports

---
 cscs-checks/apps/spark/spark_check.py |  1 +
 hpctestlib/apps/spark/base_check.py   | 23 +++++++----------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py
index 20767e32d1..6a441b82b1 100644
--- a/cscs-checks/apps/spark/spark_check.py
+++ b/cscs-checks/apps/spark/spark_check.py
@@ -7,6 +7,7 @@
 
 import reframe as rfm
 import reframe.utility.sanity as sn
+
 from reframe.core.backends import getlauncher
 from hpctestlib.apps.spark.base_check import Spark_BaseCheck
 
diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py
index e857213509..3202a4d92f 100644
--- a/hpctestlib/apps/spark/base_check.py
+++ b/hpctestlib/apps/spark/base_check.py
@@ -7,7 +7,6 @@
 
 import reframe as rfm
 import reframe.utility.sanity as sn
-from reframe.core.backends import getlauncher
 
 
 class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
@@ -21,23 +20,15 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
     learning, GraphX for graph processing, and Structured Streaming for
     incremental computation and stream processing (see spark.apache.org).
 
-    The presented abstract run-only class checks the spark perfomance.
-    To do this, it is necessary to define the tolerance of admissible
-    deviation . This data is used to check if
-    the task is being executed correctly, that is, the final value of pi
-    is correct (approximately the same as obtained from library math).
-    The default assumption is that Spark is already installed on
-    the device under test.
+    The present abstract run-only class checks the Spark performance.
+    To do this, it is necessary to define the tolerance of acceptable
+    deviation. The tolerance is used to check if the task executed correctly,
+    comparing the value of pi calculated to the one obtained from the math
+    library. The default assumption is that Spark is already installed on the
+    system under test.
     '''
 
-    #: Name of the package to be checked
     variant = parameter(['spark', 'pyspark'])
-
-    #: Maximum deviation from the table value of pi,
-    #: that is acceptable.
-    #:
-    #: :type: float
-    #: :default: 0.01
     tolerance = variable(float, value=0.01)
 
     @run_after('init')
@@ -57,7 +48,7 @@ def set_executable_opts(self):
 
     @sanity_function
     def assert_pi_readout(self):
-        '''Assert the obtained pi value meets the specified tolerances.'''
+        '''Assert that the obtained pi value meets the specified tolerances.'''
 
         pi_value = sn.extractsingle(r'Pi is roughly\s+(?P<pi>\S+)',
                                     self.stdout, 'pi', float)

From cdbc3f54100212a286911e8723ab088a5a65eaf0 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras <manitaras@cscs.ch>
Date: Thu, 30 Sep 2021 16:51:03 +0200
Subject: [PATCH 4/6] Remove remaining unused imports

---
 cscs-checks/apps/spark/spark_check.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py
index 6a441b82b1..8101d41755 100644
--- a/cscs-checks/apps/spark/spark_check.py
+++ b/cscs-checks/apps/spark/spark_check.py
@@ -3,10 +3,7 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-import math
-
 import reframe as rfm
-import reframe.utility.sanity as sn
 
 from reframe.core.backends import getlauncher
 from hpctestlib.apps.spark.base_check import Spark_BaseCheck

From 142bfb69d028280cfc9634ad59f4db9f95e01a9c Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras <manitaras@cscs.ch>
Date: Fri, 1 Oct 2021 11:25:03 +0200
Subject: [PATCH 5/6] Move test variables outside of methods

---
 hpctestlib/apps/spark/base_check.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/base_check.py
index 3202a4d92f..46f2e8bbd1 100644
--- a/hpctestlib/apps/spark/base_check.py
+++ b/hpctestlib/apps/spark/base_check.py
@@ -30,19 +30,17 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
 
     variant = parameter(['spark', 'pyspark'])
     tolerance = variable(float, value=0.01)
+    prerun_cmds = ['start-all.sh']
+    postrun_cmds = ['stop-all.sh']
+    executable = 'spark-submit'
+    executable_opts = required
 
     @run_after('init')
     def set_description(self):
         self.mydescr = f'Simple calculation of pi with {self.variant}'
 
     @run_before('run')
-    def set_run_cmds(self):
-        self.prerun_cmds = ['start-all.sh']
-        self.postrun_cmds = ['stop-all.sh']
-
-    @run_before('run')
-    def set_executable_opts(self):
-        self.executable = 'spark-submit'
+    def set_pyspark_opts(self):
         if self.variant == 'pyspark':
             self.executable_opts.append('spark_pi.py')
 

From a868ccfc216e858de1213851843e1ce628f8510c Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras <manitaras@cscs.ch>
Date: Fri, 8 Oct 2021 09:29:13 +0200
Subject: [PATCH 6/6] Address PR comments

---
 cscs-checks/apps/spark/spark_check.py             | 15 ++++++---------
 .../{base_check.py => compute_pi/__init__.py}     |  7 +------
 .../apps/spark/{ => compute_pi}/src/spark_pi.py   |  0
 3 files changed, 7 insertions(+), 15 deletions(-)
 rename hpctestlib/apps/spark/{base_check.py => compute_pi/__init__.py} (89%)
 rename hpctestlib/apps/spark/{ => compute_pi}/src/spark_pi.py (100%)

diff --git a/cscs-checks/apps/spark/spark_check.py b/cscs-checks/apps/spark/spark_check.py
index 8101d41755..97e716f3c2 100644
--- a/cscs-checks/apps/spark/spark_check.py
+++ b/cscs-checks/apps/spark/spark_check.py
@@ -6,10 +6,10 @@
 import reframe as rfm
 
 from reframe.core.backends import getlauncher
-from hpctestlib.apps.spark.base_check import Spark_BaseCheck
+from hpctestlib.apps.spark.compute_pi import ComputePi
 
 @rfm.simple_test
-class SparkCheck(Spark_BaseCheck):
+class SparkCheck(ComputePi):
     valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
     valid_prog_environs = ['builtin']
     modules = ['Spark']
@@ -20,13 +20,8 @@ class SparkCheck(Spark_BaseCheck):
 
     @run_before('run')
     def prepare_run(self):
-        if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']:
-            num_workers = 12
-            exec_cores = 3
-        else:
-            num_workers = 36
-            exec_cores = 9
-
+        num_workers = self.current_partition.processor.num_cores
+        exec_cores = num_workers // 4
         self.variables = {
             'SPARK_WORKER_CORES': str(num_workers),
             'SPARK_LOCAL_DIRS': '"/tmp"',
@@ -42,6 +37,8 @@ def prepare_run(self):
                 '--class org.apache.spark.examples.SparkPi',
                 '$EBROOTSPARK/examples/jars/spark-examples*.jar 10000'
             ]
+        elif self.variant == 'pyspark':
+            self.executable_opts += ['spark_pi.py']
 
     @run_before('run')
     def set_job_launcher(self):
diff --git a/hpctestlib/apps/spark/base_check.py b/hpctestlib/apps/spark/compute_pi/__init__.py
similarity index 89%
rename from hpctestlib/apps/spark/base_check.py
rename to hpctestlib/apps/spark/compute_pi/__init__.py
index 46f2e8bbd1..323bee711d 100644
--- a/hpctestlib/apps/spark/base_check.py
+++ b/hpctestlib/apps/spark/compute_pi/__init__.py
@@ -9,7 +9,7 @@
 import reframe.utility.sanity as sn
 
 
-class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
+class ComputePi(rfm.RunOnlyRegressionTest, pin_prefix=True):
     '''Base class for the Spark Test.
 
     Apache Spark is a unified analytics engine for large-scale data
@@ -39,11 +39,6 @@ class Spark_BaseCheck(rfm.RunOnlyRegressionTest, pin_prefix=True):
     def set_description(self):
         self.mydescr = f'Simple calculation of pi with {self.variant}'
 
-    @run_before('run')
-    def set_pyspark_opts(self):
-        if self.variant == 'pyspark':
-            self.executable_opts.append('spark_pi.py')
-
     @sanity_function
     def assert_pi_readout(self):
         '''Assert that the obtained pi value meets the specified tolerances.'''
diff --git a/hpctestlib/apps/spark/src/spark_pi.py b/hpctestlib/apps/spark/compute_pi/src/spark_pi.py
similarity index 100%
rename from hpctestlib/apps/spark/src/spark_pi.py
rename to hpctestlib/apps/spark/compute_pi/src/spark_pi.py