From 86576608d35cb0e82f4f5ff3a71430c785e6da36 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 11 May 2020 21:42:24 +0200 Subject: [PATCH 01/11] Source code for check Set OpenACC device --- cscs-checks/mch/src/set_openacc_cuda_mpi.F90 | 147 +++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 cscs-checks/mch/src/set_openacc_cuda_mpi.F90 diff --git a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 b/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 new file mode 100644 index 0000000000..2f03e58928 --- /dev/null +++ b/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 @@ -0,0 +1,147 @@ +! This code tests MPI tasks communication with GPU devices +! using OpenACC directives and setting one device per task +program set_openacc_cuda_mpi + use openacc + implicit none + + include 'mpif.h' +#ifdef CRAY + integer, parameter :: ACC_DEVICE_TYPE = 8 +#else + integer, parameter :: ACC_DEVICE_TYPE = 4 +#endif + integer, parameter :: ARRAYSIZE = 10 + integer(kind=ACC_DEVICE_TYPE) :: devicetype + integer :: status(MPI_STATUS_SIZE), mpi_size, mpi_rank + integer :: ierr, i, gpuid, ngpus, localsum(2), globalsum(2) + real, allocatable :: array1(:), array2(:) + + call MPI_Init(ierr) + call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) + +! each task creates two different arrays: the sum of their elements will be 10*mpi_rank + allocate(array1(ARRAYSIZE)) + allocate(array2(ARRAYSIZE)) + +#ifdef _OPENACC + if (mpi_rank == 0) then + devicetype = acc_get_device_type() + ngpus = acc_get_num_devices(devicetype) + write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus-1, "GPU devices" + do i = 1, ARRAYSIZE + array1(i) = .0 + array2(i) = .0 + end do + else + ! each task different from 0 addresses a different GPU device + gpuid = mod(mpi_rank, ngpus) + call acc_set_device_num(gpuid, acc_device_nvidia) + call acc_init(acc_device_nvidia) + gpuid = acc_get_device_num(devicetype) + write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid + + !$acc data pcreate(array1,array2) + !$acc parallel loop + do i = 1, ARRAYSIZE + array1(i) = mpi_rank*0.25 + array2(i) = mpi_rank*0.75 + end do + !$acc update host(array1,array2) + +! the current mpi_rank computes localsum(1) + localsum(1) = sum(array1)+sum(array2) + call call_cpp_std(array1, ARRAYSIZE, i) + + ! compute the sum of the arrays on the GPU calling a CUDA kernel using device ptr + call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE) + !$acc update host(array1) + !$acc end data + +! array1 is now equal to sum(array1)+sum(array2): compute localsum(2) + localsum(2) = sum(array1) + end if +#endif + +! 
the current mpi_rank sends localsum to compute globalsum over all mpi tasks + call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) + + if(mpi_rank == 0) then + if (globalsum(1) == globalsum(2)) then + write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) + write (*,*) "Test Result : OK" + else + write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) + write (*,*) "Test Result : FAIL" + end if + end if + + deallocate(array1) + deallocate(array2) + call MPI_Finalize(ierr); + + +contains + subroutine call_cuda_kernel_with_copy(a,b,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout), target :: a(:) + real, intent(in), target :: b(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_with_copy(a,b,n) bind(c,name='cuda_kernel_with_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: a, b + integer, intent(in), value :: n + end subroutine cuda_kernel_with_copy + end interface + + call cuda_kernel_with_copy(c_loc(a(1)), c_loc(b(1)), n) + end subroutine call_cuda_kernel_with_copy + + subroutine call_cuda_kernel_no_copy(a,b,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout), target :: a(:) + real, intent(in), target :: b(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_no_copy(a,b,n) bind(c,name='cuda_kernel_no_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: a, b + integer, intent(in), value :: n + end subroutine cuda_kernel_no_copy + end interface + + !$acc data present(a, b) + !$acc host_data use_device(a, b) + call cuda_kernel_no_copy(c_loc(a(1)), c_loc(b(1)), n) + !$acc end host_data + !$acc end data + end subroutine call_cuda_kernel_no_copy + + subroutine call_cpp_std(f,n,i) + use, intrinsic :: iso_c_binding + implicit none + real(kind=c_float), intent(in), target :: f(:) + real(kind=c_float), pointer :: fp(:) + integer, intent(in) :: n + integer(kind=c_int), intent(out) :: i + + interface + subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f + integer, intent(in), value :: n + integer(kind=c_int), intent(out) :: i + end subroutine cpp_call + end interface + + fp => f + + call cpp_call(c_loc(fp(1)), n, i) + end subroutine call_cpp_std + +end program set_openacc_cuda_mpi From b2feb1fc5037a2346f8d3e18609aafdc1897b5f0 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:03:26 +0200 Subject: [PATCH 02/11] Check Set_Openacc_Device for Tsa --- cscs-checks/mch/set_openacc_device.py | 63 +++++++++++++++++++ .../mch/src/Makefile.set_openacc_device | 29 +++++++++ ...cc_cuda_mpi.F90 => set_openacc_device.F90} | 40 +++++++----- 3 files changed, 116 insertions(+), 16 deletions(-) create mode 100644 cscs-checks/mch/set_openacc_device.py create mode 100644 cscs-checks/mch/src/Makefile.set_openacc_device rename cscs-checks/mch/src/{set_openacc_cuda_mpi.F90 => set_openacc_device.F90} (78%) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py new file mode 100644 index 0000000000..9bc97c1d1e --- /dev/null +++ b/cscs-checks/mch/set_openacc_device.py @@ -0,0 +1,63 @@ +# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.simple_test +class SetOpenaccDevice(rfm.RegressionTest): + def __init__(self): + self.descr = 'Use OpenAcc, CUDA, MPI, and C++ on multi-device nodes' + self.valid_systems = ['arolla:cn', 'tsa:cn'] + self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] + self.build_system = 'Make' + self.build_system.makefile = 'Makefile.set_openacc_device' + self.build_system.fflags = ['-O2'] + +# if self.current_system.name == 'kesch': +# self.exclusive_access = True +# self.modules = ['cudatoolkit/8.0.61'] +# self.num_tasks = 9 +# self.num_tasks_per_node = 9 +# self.num_gpus_per_node = 8 +# self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] +# self.variables = { +# 'MV2_USE_CUDA': '1', +# 'G2G': '1' +# } + if self.current_system.name in ['arolla', 'tsa']: + self.exclusive_access = True + self.modules = ['cuda/10.1.243'] + self.num_tasks = 9 + self.num_tasks_per_node = 9 + self.num_gpus_per_node = 8 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"'] + self.variables = { + 'G2G': '1' + } + + self.executable = 'set_openacc_device' + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) + self.maintainers = ['LM'] + self.tags = {'production', 'mch'} + + @rfm.run_before('compile') + def setflags(self): + if self.current_environ.name.startswith('PrgEnv-pgi'): + self.build_system.fflags += ['-acc'] + #if self.current_system.name == 'kesch': + # self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + # self.build_system.ldflags = [ + # '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + # '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + # '-lcublas', '-lcudart' + # ] + if self.current_system.name in ['arolla', 'tsa']: + self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', + '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' + ] diff --git a/cscs-checks/mch/src/Makefile.set_openacc_device b/cscs-checks/mch/src/Makefile.set_openacc_device new file mode 100644 index 0000000000..fb0d18c05d --- /dev/null +++ b/cscs-checks/mch/src/Makefile.set_openacc_device @@ -0,0 +1,29 @@ +RM := rm -f +EXECUTABLE := set_openacc_device + +all: $(EXECUTABLE) +LD = $(FC) + +OBJS = compute_cuda.o set_openacc_device.o std_cpp_call.o +# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) +LIB = + +.SUFFIXES: .o .cu .cpp .F90 + +%.o: %.cu + $(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@ + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + +%.o: %.F90 + $(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@ + +$(EXECUTABLE): $(OBJS) + $(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB) + +clean: + -$(RM) $(OBJS) + +distclean: + -$(RM) $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 b/cscs-checks/mch/src/set_openacc_device.F90 similarity index 78% rename from cscs-checks/mch/src/set_openacc_cuda_mpi.F90 rename to cscs-checks/mch/src/set_openacc_device.F90 index 2f03e58928..b397f977b8 100644 --- a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 +++ b/cscs-checks/mch/src/set_openacc_device.F90 @@ -1,6 +1,6 @@ ! This code tests MPI tasks communication with GPU devices ! using OpenACC directives and setting one device per task -program set_openacc_cuda_mpi +program set_openacc_device use openacc implicit none @@ -20,54 +20,62 @@ program set_openacc_cuda_mpi call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) -! 
each task creates two different arrays: the sum of their elements will be 10*mpi_rank +! each task creates two arrays: the sum of their elements will be 10*mpi_rank allocate(array1(ARRAYSIZE)) allocate(array2(ARRAYSIZE)) -#ifdef _OPENACC +! get number of gpu devices + devicetype = acc_get_device_type() + ngpus = acc_get_num_devices(devicetype) + +! rank 0 prints number of tasks and number of gpu devices if (mpi_rank == 0) then - devicetype = acc_get_device_type() - ngpus = acc_get_num_devices(devicetype) - write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus-1, "GPU devices" + write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus, "GPU devices" +! initialization of the arrays on rank 0 do i = 1, ARRAYSIZE array1(i) = .0 array2(i) = .0 end do else - ! each task different from 0 addresses a different GPU device +! each MPI rank different from 0 addresses a different GPU device gpuid = mod(mpi_rank, ngpus) call acc_set_device_num(gpuid, acc_device_nvidia) call acc_init(acc_device_nvidia) gpuid = acc_get_device_num(devicetype) - write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid + write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid, "out of ", ngpus +! initialization of the arrays on the gpu device used by the current MPI rank !$acc data pcreate(array1,array2) !$acc parallel loop do i = 1, ARRAYSIZE array1(i) = mpi_rank*0.25 array2(i) = mpi_rank*0.75 end do +! update the arrays on the current MPI rank !$acc update host(array1,array2) -! the current mpi_rank computes localsum(1) +! the current MPI rank computes localsum(1) localsum(1) = sum(array1)+sum(array2) + +! call external c++ function call call_cpp_std(array1, ARRAYSIZE, i) - ! compute the sum of the arrays on the GPU calling a CUDA kernel using device ptr +! compute the sum of the arrays on the GPU using device ptr call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE) +! update array1 on the current MPI rank !$acc update host(array1) !$acc end data +! array1 is now equal to sum(array1)+sum(array2) -! array1 is now equal to sum(array1)+sum(array2): compute localsum(2) +! compute localsum(2) localsum(2) = sum(array1) end if -#endif -! the current mpi_rank sends localsum to compute globalsum over all mpi tasks +! the current MPI rank adds localsum to globalsum on rank 0 call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) - +! 
globalsum is 10*n*(n+1)/2 where n is the number of gpu devices if(mpi_rank == 0) then - if (globalsum(1) == globalsum(2)) then + if (globalsum(1) == globalsum(2)) then write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) write (*,*) "Test Result : OK" else @@ -144,4 +152,4 @@ end subroutine cpp_call call cpp_call(c_loc(fp(1)), n, i) end subroutine call_cpp_std -end program set_openacc_cuda_mpi +end program set_openacc_device From df52de7b973ce4089d608c960e15252c56d3b1e2 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:23:02 +0200 Subject: [PATCH 03/11] Fix long line an d comments --- cscs-checks/mch/set_openacc_device.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 9bc97c1d1e..1e383c5524 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -40,7 +40,8 @@ def __init__(self): } self.executable = 'set_openacc_device' - self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', + self.stdout) self.maintainers = ['LM'] self.tags = {'production', 'mch'} @@ -48,13 +49,13 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] - #if self.current_system.name == 'kesch': - # self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] - # self.build_system.ldflags = [ - # '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', - # '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', - # '-lcublas', '-lcudart' - # ] +# if self.current_system.name == 'kesch': +# self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] +# self.build_system.ldflags = [ +# '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', +# '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', +# '-lcublas', '-lcudart' +# ] if self.current_system.name in ['arolla', 'tsa']: self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] self.build_system.ldflags = [ From 06ca72d27a93683e34842766d23361325408b4bf Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:26:00 +0200 Subject: [PATCH 04/11] Removed trailing white space on line 43 --- cscs-checks/mch/set_openacc_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 1e383c5524..7dabb0d880 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -40,7 +40,7 @@ def __init__(self): } self.executable = 'set_openacc_device' - self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) self.maintainers = ['LM'] self.tags = {'production', 'mch'} From 80c248f5367f111624a360454a242097f75d160c Mon Sep 17 00:00:00 2001 From: lucamar Date: Thu, 14 May 2020 12:53:10 +0200 Subject: [PATCH 05/11] Update cscs-checks/mch/set_openacc_device.py Co-authored-by: Vasileios Karakasis --- cscs-checks/mch/set_openacc_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 7dabb0d880..835b35e978 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -8,7 +8,7 @@ @rfm.simple_test -class SetOpenaccDevice(rfm.RegressionTest): +class SetOpenaccDeviceTest(rfm.RegressionTest): def __init__(self): self.descr = 'Use 
OpenAcc, CUDA, MPI, and C++ on multi-device nodes' self.valid_systems = ['arolla:cn', 'tsa:cn'] From d223354a9ec58e7ef3b201056e0fd3831ce5e02c Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 11:44:56 +0200 Subject: [PATCH 06/11] Changing name of the test as suggested --- .../mch/{set_openacc_device.py => multi_device_openacc} | 8 ++++---- ...e.set_openacc_device => Makefile.multi_device_openacc} | 4 ++-- .../{set_openacc_device.F90 => multi_device_openacc.F90} | 0 3 files changed, 6 insertions(+), 6 deletions(-) rename cscs-checks/mch/{set_openacc_device.py => multi_device_openacc} (89%) rename cscs-checks/mch/src/{Makefile.set_openacc_device => Makefile.multi_device_openacc} (81%) rename cscs-checks/mch/src/{set_openacc_device.F90 => multi_device_openacc.F90} (100%) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/multi_device_openacc similarity index 89% rename from cscs-checks/mch/set_openacc_device.py rename to cscs-checks/mch/multi_device_openacc index 835b35e978..1e5cfaa932 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/multi_device_openacc @@ -8,13 +8,13 @@ @rfm.simple_test -class SetOpenaccDeviceTest(rfm.RegressionTest): +class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Use OpenAcc, CUDA, MPI, and C++ on multi-device nodes' + self.descr = 'Allocate one accelerator per MPI task using OpenAcc with CUDA, MPI, and C++ on multi-device nodes' self.valid_systems = ['arolla:cn', 'tsa:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' - self.build_system.makefile = 'Makefile.set_openacc_device' + self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] # if self.current_system.name == 'kesch': @@ -39,7 +39,7 @@ def __init__(self): 'G2G': '1' } - self.executable = 'set_openacc_device' + self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) self.maintainers = ['LM'] diff --git a/cscs-checks/mch/src/Makefile.set_openacc_device b/cscs-checks/mch/src/Makefile.multi_device_openacc similarity index 81% rename from cscs-checks/mch/src/Makefile.set_openacc_device rename to cscs-checks/mch/src/Makefile.multi_device_openacc index fb0d18c05d..eaaacd697c 100644 --- a/cscs-checks/mch/src/Makefile.set_openacc_device +++ b/cscs-checks/mch/src/Makefile.multi_device_openacc @@ -1,10 +1,10 @@ RM := rm -f -EXECUTABLE := set_openacc_device +EXECUTABLE := multi_device_openacc all: $(EXECUTABLE) LD = $(FC) -OBJS = compute_cuda.o set_openacc_device.o std_cpp_call.o +OBJS = compute_cuda.o multi_device_openacc.o std_cpp_call.o # OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) LIB = diff --git a/cscs-checks/mch/src/set_openacc_device.F90 b/cscs-checks/mch/src/multi_device_openacc.F90 similarity index 100% rename from cscs-checks/mch/src/set_openacc_device.F90 rename to cscs-checks/mch/src/multi_device_openacc.F90 From cb682dc3606a66fa34869095ddf7085c06560956 Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:22:16 +0200 Subject: [PATCH 07/11] Adding python extension to check --- cscs-checks/mch/{multi_device_openacc => multi_device_openacc.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cscs-checks/mch/{multi_device_openacc => multi_device_openacc.py} (100%) diff --git a/cscs-checks/mch/multi_device_openacc b/cscs-checks/mch/multi_device_openacc.py similarity index 100% rename from cscs-checks/mch/multi_device_openacc rename to cscs-checks/mch/multi_device_openacc.py From 
89c10c47739c80e7b7402629849c9f29b2a779bb Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:24:46 +0200 Subject: [PATCH 08/11] Improving test description as suggested --- cscs-checks/mch/multi_device_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index 1e5cfaa932..de5ac46bba 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -10,7 +10,7 @@ @rfm.simple_test class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Allocate one accelerator per MPI task using OpenAcc with CUDA, MPI, and C++ on multi-device nodes' + self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' self.valid_systems = ['arolla:cn', 'tsa:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' From f627414bc8b44870448bade5dfcc3e29d7d4d1fa Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:38:24 +0200 Subject: [PATCH 09/11] Fixing program name and restoring Kesch --- cscs-checks/mch/multi_device_openacc.py | 43 ++++++++++---------- cscs-checks/mch/src/multi_device_openacc.F90 | 4 +- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index de5ac46bba..fb13e03824 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -11,24 +11,24 @@ class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' - self.valid_systems = ['arolla:cn', 'tsa:cn'] + self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] -# if self.current_system.name == 'kesch': -# self.exclusive_access = True -# self.modules = ['cudatoolkit/8.0.61'] -# self.num_tasks = 9 -# self.num_tasks_per_node = 9 -# self.num_gpus_per_node = 8 -# self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] -# self.variables = { -# 'MV2_USE_CUDA': '1', -# 'G2G': '1' -# } - if self.current_system.name in ['arolla', 'tsa']: + if self.current_system.name == 'kesch': + self.exclusive_access = True + self.modules = ['cudatoolkit/8.0.61'] + self.num_tasks = 9 + self.num_tasks_per_node = 9 + self.num_gpus_per_node = 8 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] + self.variables = { + 'MV2_USE_CUDA': '1', + 'G2G': '1' + } + elif self.current_system.name in ['arolla', 'tsa']: self.exclusive_access = True self.modules = ['cuda/10.1.243'] self.num_tasks = 9 @@ -49,14 +49,15 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] -# if self.current_system.name == 'kesch': -# self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] -# self.build_system.ldflags = [ -# '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', -# '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', -# '-lcublas', '-lcudart' -# ] - if self.current_system.name in ['arolla', 'tsa']: + + if self.current_system.name == 'kesch': + self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + 
'-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + '-lcublas', '-lcudart' + ] + elif self.current_system.name in ['arolla', 'tsa']: self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] self.build_system.ldflags = [ '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', diff --git a/cscs-checks/mch/src/multi_device_openacc.F90 b/cscs-checks/mch/src/multi_device_openacc.F90 index b397f977b8..8a4ba28531 100644 --- a/cscs-checks/mch/src/multi_device_openacc.F90 +++ b/cscs-checks/mch/src/multi_device_openacc.F90 @@ -1,6 +1,6 @@ ! This code tests MPI tasks communication with GPU devices ! using OpenACC directives and setting one device per task -program set_openacc_device +program multi_device_openacc use openacc implicit none @@ -152,4 +152,4 @@ end subroutine cpp_call call cpp_call(c_loc(fp(1)), n, i) end subroutine call_cpp_std -end program set_openacc_device +end program multi_device_openacc From dff8beca2d239e6d7111a6c889d775da381d334d Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 18 May 2020 10:56:26 +0200 Subject: [PATCH 10/11] Fixing test on Kesch --- cscs-checks/mch/multi_device_openacc.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index fb13e03824..413abcaf3b 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -12,7 +12,7 @@ class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] - self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] + self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] @@ -24,10 +24,6 @@ def __init__(self): self.num_tasks_per_node = 9 self.num_gpus_per_node = 8 self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] - self.variables = { - 'MV2_USE_CUDA': '1', - 'G2G': '1' - } elif self.current_system.name in ['arolla', 'tsa']: self.exclusive_access = True self.modules = ['cuda/10.1.243'] @@ -35,9 +31,6 @@ def __init__(self): self.num_tasks_per_node = 9 self.num_gpus_per_node = 8 self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"'] - self.variables = { - 'G2G': '1' - } self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', @@ -49,7 +42,6 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] - if self.current_system.name == 'kesch': self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] self.build_system.ldflags = [ @@ -63,3 +55,9 @@ def setflags(self): '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' ] + elif self.current_environ.name.startswith('PrgEnv-cray'): + self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] + self.variables = { + 'CRAY_ACCEL_TARGET': 'nvidia35', + 'MV2_USE_CUDA': '1' + } From 64e109765792cbb2bc8535706a135da52c1a9942 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 25 May 2020 21:13:31 +0200 Subject: [PATCH 11/11] Address PR comments --- cscs-checks/mch/multi_device_openacc.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index 
413abcaf3b..50f2e5c026 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -10,13 +10,15 @@ @rfm.simple_test class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' + self.descr = ( + 'Allocate one accelerator per MPI task using OpenACC on ' + 'multi-device nodes with additional CUDA, MPI, and C++ calls' + ) self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] - if self.current_system.name == 'kesch': self.exclusive_access = True self.modules = ['cudatoolkit/8.0.61'] @@ -35,7 +37,7 @@ def __init__(self): self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) - self.maintainers = ['LM'] + self.maintainers = ['LM', 'AJ'] self.tags = {'production', 'mch'} @rfm.run_before('compile') @@ -56,8 +58,8 @@ def setflags(self): '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' ] elif self.current_environ.name.startswith('PrgEnv-cray'): - self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] - self.variables = { - 'CRAY_ACCEL_TARGET': 'nvidia35', - 'MV2_USE_CUDA': '1' - } + self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] + self.variables = { + 'CRAY_ACCEL_TARGET': 'nvidia35', + 'MV2_USE_CUDA': '1' + }
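
Note (illustrative addition, not part of the series): the Makefile introduced in PATCH 02/11 links compute_cuda.o and std_cpp_call.o, but neither source file is touched by these patches. The sketch below shows what those translation units could look like. The extern "C" signatures are taken from the bind(c) interface blocks in multi_device_openacc.F90; the file layout, kernel launch parameters, and the kernel body (a[i] += b[i], inferred from the host-side check that expects array1 to end up holding the old array1 + array2) are assumptions, not the actual CSCS sources.

    /* compute_cuda.cu (hypothetical) -- illustrative sketch only.
     * Signatures follow the bind(c) interfaces declared in
     * multi_device_openacc.F90. */
    #include <cuda_runtime.h>

    __global__ void add_arrays(float *a, const float *b, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            a[i] += b[i];   /* accumulate b into a, element by element */
        }
    }

    /* Called from inside `!$acc host_data use_device(a, b)`, so a and b are
     * already device pointers and no host<->device copies are needed here. */
    extern "C" void cuda_kernel_no_copy(float *a, float *b, int n)
    {
        const int threads = 128;
        const int blocks = (n + threads - 1) / threads;
        add_arrays<<<blocks, threads>>>(a, b, n);
        cudaDeviceSynchronize();
    }

    /* std_cpp_call.cpp equivalent (shown here for brevity).  The Fortran side
     * only requires the symbol do_smth_with_std with this signature; what it
     * computes is not specified by the patches, so this sketch simply reports
     * the array length through i. */
    extern "C" void do_smth_with_std(const float *f, int n, int *i)
    {
        (void)f;
        *i = n;
    }

Because the Fortran caller wraps the call in !$acc host_data use_device(array1, array2), the pointers received by cuda_kernel_no_copy already refer to device memory, which is why the sketch launches the kernel directly without any cudaMemcpy. The cuda_kernel_with_copy variant declared in the same Fortran file would instead receive host pointers and would have to stage the data to and from the device itself.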