Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions cscs-checks/mch/multi_device_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.simple_test
class MultiDeviceOpenaccTest(rfm.RegressionTest):
    """Check that each MPI task can drive its own accelerator via OpenACC.

    Builds and runs ``src/multi_device_openacc.F90`` (through
    ``Makefile.multi_device_openacc``), which mixes OpenACC, CUDA, MPI and
    a C++ call, and asserts the program reports ``Test Result : OK``.
    """

    def __init__(self):
        self.descr = (
            'Allocate one accelerator per MPI task using OpenACC on '
            'multi-device nodes with additional CUDA, MPI, and C++ calls'
        )
        self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
        self.build_system = 'Make'
        self.build_system.makefile = 'Makefile.multi_device_openacc'
        self.build_system.fflags = ['-O2']

        # Only the CUDA module and the nvcc target architecture differ
        # between the supported systems; every other setting is common.
        if self.current_system.name == 'kesch':
            cuda_module, nvcc_arch = 'cudatoolkit/8.0.61', 'compute_37'
        elif self.current_system.name in ['arolla', 'tsa']:
            cuda_module, nvcc_arch = 'cuda/10.1.243', 'compute_70'
        else:
            cuda_module, nvcc_arch = None, None

        if cuda_module is not None:
            self.exclusive_access = True
            self.modules = [cuda_module]
            # 9 tasks on a node with 8 GPUs: rank 0 only checks the result,
            # ranks 1-8 each attach to one device (see the Fortran source).
            self.num_tasks = 9
            self.num_tasks_per_node = 9
            self.num_gpus_per_node = 8
            self.build_system.options = [
                'NVCC_FLAGS="-arch={0}"'.format(nvcc_arch)
            ]

        self.executable = 'multi_device_openacc'
        self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK',
                                               self.stdout)
        self.maintainers = ['LM', 'AJ']
        self.tags = {'production', 'mch'}

    @rfm.run_before('compile')
    def setflags(self):
        """Inject compiler/system specific OpenACC flags before compiling."""
        if self.current_environ.name.startswith('PrgEnv-pgi'):
            self.build_system.fflags += ['-acc']
            if self.current_system.name == 'kesch':
                self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
                self.build_system.ldflags = [
                    '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++',
                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
                    '-lcublas', '-lcudart'
                ]
            elif self.current_system.name in ['arolla', 'tsa']:
                self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1']
                self.build_system.ldflags = [
                    '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++',
                    '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart'
                ]
        elif self.current_environ.name.startswith('PrgEnv-cray'):
            # NOTE(review): CRAY_ACCEL_TARGET is pinned to nvidia35 for all
            # systems — confirm PrgEnv-cray is only exercised on kesch (K80s).
            self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp']
            self.variables = {
                'CRAY_ACCEL_TARGET': 'nvidia35',
                'MV2_USE_CUDA': '1'
            }
29 changes: 29 additions & 0 deletions cscs-checks/mch/src/Makefile.multi_device_openacc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
RM := rm -f
EXECUTABLE := multi_device_openacc

# The main program is Fortran, so link with the Fortran compiler driver.
LD = $(FC)

OBJS = compute_cuda.o multi_device_openacc.o std_cpp_call.o
# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ))
LIB =

# Declare administrative targets phony so a stray file named 'all', 'clean'
# or 'distclean' in the source directory cannot shadow them.
.PHONY: all clean distclean

all: $(EXECUTABLE)

.SUFFIXES: .o .cu .cpp .F90

%.o: %.cu
	$(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@

%.o: %.cpp
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

%.o: %.F90
	$(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@

$(EXECUTABLE): $(OBJS)
	$(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB)

clean:
	-$(RM) $(OBJS)

distclean:
	-$(RM) $(OBJS) $(EXECUTABLE)
155 changes: 155 additions & 0 deletions cscs-checks/mch/src/multi_device_openacc.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
! This code tests MPI tasks communication with GPU devices
! using OpenACC directives and setting one device per task.
!
! Rank 0 only initializes host arrays and checks the final result;
! ranks > 0 each attach to one GPU (round-robin over the available
! devices), fill two arrays on the device, sum them on the host
! (localsum(1)) and on the device via a CUDA kernel (localsum(2)).
! Both partial sums are reduced to rank 0, which reports OK iff the
! CPU and GPU totals agree.
program multi_device_openacc
  use openacc
  implicit none

  include 'mpif.h'
! The integer kind used for the value returned by acc_get_device_type()
! differs between compilers: 8 bytes under the Cray compiler, 4 otherwise.
#ifdef CRAY
  integer, parameter :: ACC_DEVICE_TYPE = 8
#else
  integer, parameter :: ACC_DEVICE_TYPE = 4
#endif
  integer, parameter :: ARRAYSIZE = 10
  integer(kind=ACC_DEVICE_TYPE) :: devicetype
  integer :: mpi_size, mpi_rank
  integer :: ierr, i, gpuid, ngpus, localsum(2), globalsum(2)
  real, allocatable :: array1(:), array2(:)

  call MPI_Init(ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)

  ! each task creates two arrays: the sum of their elements will be 10*mpi_rank
  allocate(array1(ARRAYSIZE))
  allocate(array2(ARRAYSIZE))

  ! Every rank contributes localsum to the reduction below, but rank 0
  ! never computes it; initialize to zero so rank 0's contribution is
  ! well-defined instead of uninitialized stack memory.
  localsum = 0

  ! get number of gpu devices
  devicetype = acc_get_device_type()
  ngpus = acc_get_num_devices(devicetype)

  ! rank 0 prints number of tasks and number of gpu devices
  if (mpi_rank == 0) then
    write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus, "GPU devices"
    ! initialization of the arrays on rank 0
    do i = 1, ARRAYSIZE
      array1(i) = .0
      array2(i) = .0
    end do
  else
    ! each MPI rank different from 0 addresses a different GPU device
    gpuid = mod(mpi_rank, ngpus)
    call acc_set_device_num(gpuid, acc_device_nvidia)
    call acc_init(acc_device_nvidia)
    gpuid = acc_get_device_num(devicetype)
    write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid, "out of ", ngpus

    ! initialization of the arrays on the gpu device used by the current MPI rank
!$acc data pcreate(array1,array2)
!$acc parallel loop
    do i = 1, ARRAYSIZE
      array1(i) = mpi_rank*0.25
      array2(i) = mpi_rank*0.75
    end do
    ! update the arrays on the current MPI rank
!$acc update host(array1,array2)

    ! the current MPI rank computes localsum(1) on the host
    ! (0.25 and 0.75 are exactly representable, so the real->integer
    ! truncation here is exact: 10*mpi_rank)
    localsum(1) = sum(array1)+sum(array2)

    ! call external c++ function
    call call_cpp_std(array1, ARRAYSIZE, i)

    ! compute the sum of the arrays on the GPU using device ptr
    call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE)
    ! update array1 on the current MPI rank
!$acc update host(array1)
!$acc end data
    ! array1 is now equal to sum(array1)+sum(array2)

    ! compute localsum(2)
    localsum(2) = sum(array1)
  end if

  ! the current MPI rank adds localsum to globalsum on rank 0
  call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
  ! globalsum is 10*n*(n+1)/2 where n is the number of gpu devices
  if(mpi_rank == 0) then
    if (globalsum(1) == globalsum(2)) then
      write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2)
      write (*,*) "Test Result : OK"
    else
      write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2)
      write (*,*) "Test Result : FAIL"
    end if
  end if

  deallocate(array1)
  deallocate(array2)
  call MPI_Finalize(ierr)


contains

  !> Launch the external CUDA kernel with explicit host<->device copies.
  !> NOTE(review): not called from the main program — kept as the
  !> copy-based counterpart of call_cuda_kernel_no_copy.
  subroutine call_cuda_kernel_with_copy(a,b,n)
    use, intrinsic :: iso_c_binding
    implicit none
    real, intent(inout), target :: a(:)
    real, intent(in), target :: b(:)
    integer, intent(in) :: n

    interface
      subroutine cuda_kernel_with_copy(a,b,n) bind(c,name='cuda_kernel_with_copy')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: a, b
        integer, intent(in), value :: n
      end subroutine cuda_kernel_with_copy
    end interface

    call cuda_kernel_with_copy(c_loc(a(1)), c_loc(b(1)), n)
  end subroutine call_cuda_kernel_with_copy

  !> Launch the external CUDA kernel on data already present on the
  !> device: host_data hands the device addresses of a and b to C.
  subroutine call_cuda_kernel_no_copy(a,b,n)
    use, intrinsic :: iso_c_binding
    implicit none
    real, intent(inout), target :: a(:)
    real, intent(in), target :: b(:)
    integer, intent(in) :: n

    interface
      subroutine cuda_kernel_no_copy(a,b,n) bind(c,name='cuda_kernel_no_copy')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: a, b
        integer, intent(in), value :: n
      end subroutine cuda_kernel_no_copy
    end interface

!$acc data present(a, b)
!$acc host_data use_device(a, b)
    call cuda_kernel_no_copy(c_loc(a(1)), c_loc(b(1)), n)
!$acc end host_data
!$acc end data
  end subroutine call_cuda_kernel_no_copy

  !> Pass the host copy of f to the external C++ routine
  !> 'do_smth_with_std'; i receives its integer result.
  subroutine call_cpp_std(f,n,i)
    use, intrinsic :: iso_c_binding
    implicit none
    real(kind=c_float), intent(in), target :: f(:)
    real(kind=c_float), pointer :: fp(:)
    integer, intent(in) :: n
    integer(kind=c_int), intent(out) :: i

    interface
      subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: f
        integer, intent(in), value :: n
        integer(kind=c_int), intent(out) :: i
      end subroutine cpp_call
    end interface

    ! pointer alias exercises Fortran pointer -> c_loc interop
    fp => f

    call cpp_call(c_loc(fp(1)), n, i)
  end subroutine call_cpp_std

end program multi_device_openacc