From 86576608d35cb0e82f4f5ff3a71430c785e6da36 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 11 May 2020 21:42:24 +0200 Subject: [PATCH 01/11] Source code for check Set OpenACC device --- cscs-checks/mch/src/set_openacc_cuda_mpi.F90 | 147 +++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 cscs-checks/mch/src/set_openacc_cuda_mpi.F90 diff --git a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 b/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 new file mode 100644 index 0000000000..2f03e58928 --- /dev/null +++ b/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 @@ -0,0 +1,147 @@ +! This code tests MPI tasks communication with GPU devices +! using OpenACC directives and setting one device per task +program set_openacc_cuda_mpi + use openacc + implicit none + + include 'mpif.h' +#ifdef CRAY + integer, parameter :: ACC_DEVICE_TYPE = 8 +#else + integer, parameter :: ACC_DEVICE_TYPE = 4 +#endif + integer, parameter :: ARRAYSIZE = 10 + integer(kind=ACC_DEVICE_TYPE) :: devicetype + integer :: status(MPI_STATUS_SIZE), mpi_size, mpi_rank + integer :: ierr, i, gpuid, ngpus, localsum(2), globalsum(2) + real, allocatable :: array1(:), array2(:) + + call MPI_Init(ierr) + call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) + +! each task creates two different arrays: the sum of their elements will be 10*mpi_rank + allocate(array1(ARRAYSIZE)) + allocate(array2(ARRAYSIZE)) + +#ifdef _OPENACC + if (mpi_rank == 0) then + devicetype = acc_get_device_type() + ngpus = acc_get_num_devices(devicetype) + write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus-1, "GPU devices" + do i = 1, ARRAYSIZE + array1(i) = .0 + array2(i) = .0 + end do + else + ! each task different from 0 addresses a different GPU device + gpuid = mod(mpi_rank, ngpus) + call acc_set_device_num(gpuid, acc_device_nvidia) + call acc_init(acc_device_nvidia) + gpuid = acc_get_device_num(devicetype) + write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid + + !$acc data pcreate(array1,array2) + !$acc parallel loop + do i = 1, ARRAYSIZE + array1(i) = mpi_rank*0.25 + array2(i) = mpi_rank*0.75 + end do + !$acc update host(array1,array2) + +! the current mpi_rank computes localsum(1) + localsum(1) = sum(array1)+sum(array2) + call call_cpp_std(array1, ARRAYSIZE, i) + + ! compute the sum of the arrays on the GPU calling a CUDA kernel using device ptr + call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE) + !$acc update host(array1) + !$acc end data + +! array1 is now equal to sum(array1)+sum(array2): compute localsum(2) + localsum(2) = sum(array1) + end if +#endif + +! 
the current mpi_rank sends localsum to compute globalsum over all mpi tasks + call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) + + if(mpi_rank == 0) then + if (globalsum(1) == globalsum(2)) then + write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) + write (*,*) "Test Result : OK" + else + write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) + write (*,*) "Test Result : FAIL" + end if + end if + + deallocate(array1) + deallocate(array2) + call MPI_Finalize(ierr); + + +contains + subroutine call_cuda_kernel_with_copy(a,b,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout), target :: a(:) + real, intent(in), target :: b(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_with_copy(a,b,n) bind(c,name='cuda_kernel_with_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: a, b + integer, intent(in), value :: n + end subroutine cuda_kernel_with_copy + end interface + + call cuda_kernel_with_copy(c_loc(a(1)), c_loc(b(1)), n) + end subroutine call_cuda_kernel_with_copy + + subroutine call_cuda_kernel_no_copy(a,b,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout), target :: a(:) + real, intent(in), target :: b(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_no_copy(a,b,n) bind(c,name='cuda_kernel_no_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: a, b + integer, intent(in), value :: n + end subroutine cuda_kernel_no_copy + end interface + + !$acc data present(a, b) + !$acc host_data use_device(a, b) + call cuda_kernel_no_copy(c_loc(a(1)), c_loc(b(1)), n) + !$acc end host_data + !$acc end data + end subroutine call_cuda_kernel_no_copy + + subroutine call_cpp_std(f,n,i) + use, intrinsic :: iso_c_binding + implicit none + real(kind=c_float), intent(in), target :: f(:) + real(kind=c_float), pointer :: fp(:) + integer, intent(in) :: n + integer(kind=c_int), intent(out) :: i + + interface + subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f + integer, intent(in), value :: n + integer(kind=c_int), intent(out) :: i + end subroutine cpp_call + end interface + + fp => f + + call cpp_call(c_loc(fp(1)), n, i) + end subroutine call_cpp_std + +end program set_openacc_cuda_mpi From b2feb1fc5037a2346f8d3e18609aafdc1897b5f0 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:03:26 +0200 Subject: [PATCH 02/11] Check Set_Openacc_Device for Tsa --- cscs-checks/mch/set_openacc_device.py | 63 +++++++++++++++++++ .../mch/src/Makefile.set_openacc_device | 29 +++++++++ ...cc_cuda_mpi.F90 => set_openacc_device.F90} | 40 +++++++----- 3 files changed, 116 insertions(+), 16 deletions(-) create mode 100644 cscs-checks/mch/set_openacc_device.py create mode 100644 cscs-checks/mch/src/Makefile.set_openacc_device rename cscs-checks/mch/src/{set_openacc_cuda_mpi.F90 => set_openacc_device.F90} (78%) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py new file mode 100644 index 0000000000..9bc97c1d1e --- /dev/null +++ b/cscs-checks/mch/set_openacc_device.py @@ -0,0 +1,63 @@ +# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.simple_test +class SetOpenaccDevice(rfm.RegressionTest): + def __init__(self): + self.descr = 'Use OpenAcc, CUDA, MPI, and C++ on multi-device nodes' + self.valid_systems = ['arolla:cn', 'tsa:cn'] + self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] + self.build_system = 'Make' + self.build_system.makefile = 'Makefile.set_openacc_device' + self.build_system.fflags = ['-O2'] + +# if self.current_system.name == 'kesch': +# self.exclusive_access = True +# self.modules = ['cudatoolkit/8.0.61'] +# self.num_tasks = 9 +# self.num_tasks_per_node = 9 +# self.num_gpus_per_node = 8 +# self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] +# self.variables = { +# 'MV2_USE_CUDA': '1', +# 'G2G': '1' +# } + if self.current_system.name in ['arolla', 'tsa']: + self.exclusive_access = True + self.modules = ['cuda/10.1.243'] + self.num_tasks = 9 + self.num_tasks_per_node = 9 + self.num_gpus_per_node = 8 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"'] + self.variables = { + 'G2G': '1' + } + + self.executable = 'set_openacc_device' + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) + self.maintainers = ['LM'] + self.tags = {'production', 'mch'} + + @rfm.run_before('compile') + def setflags(self): + if self.current_environ.name.startswith('PrgEnv-pgi'): + self.build_system.fflags += ['-acc'] + #if self.current_system.name == 'kesch': + # self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + # self.build_system.ldflags = [ + # '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + # '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + # '-lcublas', '-lcudart' + # ] + if self.current_system.name in ['arolla', 'tsa']: + self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', + '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' + ] diff --git a/cscs-checks/mch/src/Makefile.set_openacc_device b/cscs-checks/mch/src/Makefile.set_openacc_device new file mode 100644 index 0000000000..fb0d18c05d --- /dev/null +++ b/cscs-checks/mch/src/Makefile.set_openacc_device @@ -0,0 +1,29 @@ +RM := rm -f +EXECUTABLE := set_openacc_device + +all: $(EXECUTABLE) +LD = $(FC) + +OBJS = compute_cuda.o set_openacc_device.o std_cpp_call.o +# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) +LIB = + +.SUFFIXES: .o .cu .cpp .F90 + +%.o: %.cu + $(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@ + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + +%.o: %.F90 + $(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@ + +$(EXECUTABLE): $(OBJS) + $(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB) + +clean: + -$(RM) $(OBJS) + +distclean: + -$(RM) $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 b/cscs-checks/mch/src/set_openacc_device.F90 similarity index 78% rename from cscs-checks/mch/src/set_openacc_cuda_mpi.F90 rename to cscs-checks/mch/src/set_openacc_device.F90 index 2f03e58928..b397f977b8 100644 --- a/cscs-checks/mch/src/set_openacc_cuda_mpi.F90 +++ b/cscs-checks/mch/src/set_openacc_device.F90 @@ -1,6 +1,6 @@ ! This code tests MPI tasks communication with GPU devices ! using OpenACC directives and setting one device per task -program set_openacc_cuda_mpi +program set_openacc_device use openacc implicit none @@ -20,54 +20,62 @@ program set_openacc_cuda_mpi call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) -! 
each task creates two different arrays: the sum of their elements will be 10*mpi_rank +! each task creates two arrays: the sum of their elements will be 10*mpi_rank allocate(array1(ARRAYSIZE)) allocate(array2(ARRAYSIZE)) -#ifdef _OPENACC +! get number of gpu devices + devicetype = acc_get_device_type() + ngpus = acc_get_num_devices(devicetype) + +! rank 0 prints number of tasks and number of gpu devices if (mpi_rank == 0) then - devicetype = acc_get_device_type() - ngpus = acc_get_num_devices(devicetype) - write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus-1, "GPU devices" + write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus, "GPU devices" +! initialization of the arrays on rank 0 do i = 1, ARRAYSIZE array1(i) = .0 array2(i) = .0 end do else - ! each task different from 0 addresses a different GPU device +! each MPI rank different from 0 addresses a different GPU device gpuid = mod(mpi_rank, ngpus) call acc_set_device_num(gpuid, acc_device_nvidia) call acc_init(acc_device_nvidia) gpuid = acc_get_device_num(devicetype) - write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid + write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid, "out of ", ngpus +! initialization of the arrays on the gpu device used by the current MPI rank !$acc data pcreate(array1,array2) !$acc parallel loop do i = 1, ARRAYSIZE array1(i) = mpi_rank*0.25 array2(i) = mpi_rank*0.75 end do +! update the arrays on the current MPI rank !$acc update host(array1,array2) -! the current mpi_rank computes localsum(1) +! the current MPI rank computes localsum(1) localsum(1) = sum(array1)+sum(array2) + +! call external c++ function call call_cpp_std(array1, ARRAYSIZE, i) - ! compute the sum of the arrays on the GPU calling a CUDA kernel using device ptr +! compute the sum of the arrays on the GPU using device ptr call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE) +! update array1 on the current MPI rank !$acc update host(array1) !$acc end data +! array1 is now equal to sum(array1)+sum(array2) -! array1 is now equal to sum(array1)+sum(array2): compute localsum(2) +! compute localsum(2) localsum(2) = sum(array1) end if -#endif -! the current mpi_rank sends localsum to compute globalsum over all mpi tasks +! the current MPI rank adds localsum to globalsum on rank 0 call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) - +! 
globalsum is 10*n*(n+1)/2 where n is the number of gpu devices if(mpi_rank == 0) then - if (globalsum(1) == globalsum(2)) then + if (globalsum(1) == globalsum(2)) then write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2) write (*,*) "Test Result : OK" else @@ -144,4 +152,4 @@ end subroutine cpp_call call cpp_call(c_loc(fp(1)), n, i) end subroutine call_cpp_std -end program set_openacc_cuda_mpi +end program set_openacc_device From df52de7b973ce4089d608c960e15252c56d3b1e2 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:23:02 +0200 Subject: [PATCH 03/11] Fix long line an d comments --- cscs-checks/mch/set_openacc_device.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 9bc97c1d1e..1e383c5524 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -40,7 +40,8 @@ def __init__(self): } self.executable = 'set_openacc_device' - self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', + self.stdout) self.maintainers = ['LM'] self.tags = {'production', 'mch'} @@ -48,13 +49,13 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] - #if self.current_system.name == 'kesch': - # self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] - # self.build_system.ldflags = [ - # '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', - # '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', - # '-lcublas', '-lcudart' - # ] +# if self.current_system.name == 'kesch': +# self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] +# self.build_system.ldflags = [ +# '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', +# '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', +# '-lcublas', '-lcudart' +# ] if self.current_system.name in ['arolla', 'tsa']: self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] self.build_system.ldflags = [ From 06ca72d27a93683e34842766d23361325408b4bf Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 13 May 2020 00:26:00 +0200 Subject: [PATCH 04/11] Removed trailing white space on line 43 --- cscs-checks/mch/set_openacc_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 1e383c5524..7dabb0d880 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -40,7 +40,7 @@ def __init__(self): } self.executable = 'set_openacc_device' - self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', + self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) self.maintainers = ['LM'] self.tags = {'production', 'mch'} From 80c248f5367f111624a360454a242097f75d160c Mon Sep 17 00:00:00 2001 From: lucamar Date: Thu, 14 May 2020 12:53:10 +0200 Subject: [PATCH 05/11] Update cscs-checks/mch/set_openacc_device.py Co-authored-by: Vasileios Karakasis --- cscs-checks/mch/set_openacc_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/set_openacc_device.py index 7dabb0d880..835b35e978 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/set_openacc_device.py @@ -8,7 +8,7 @@ @rfm.simple_test -class SetOpenaccDevice(rfm.RegressionTest): +class SetOpenaccDeviceTest(rfm.RegressionTest): def __init__(self): self.descr = 'Use 
OpenAcc, CUDA, MPI, and C++ on multi-device nodes' self.valid_systems = ['arolla:cn', 'tsa:cn'] From d223354a9ec58e7ef3b201056e0fd3831ce5e02c Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 11:44:56 +0200 Subject: [PATCH 06/11] Changing name of the test as suggested --- .../mch/{set_openacc_device.py => multi_device_openacc} | 8 ++++---- ...e.set_openacc_device => Makefile.multi_device_openacc} | 4 ++-- .../{set_openacc_device.F90 => multi_device_openacc.F90} | 0 3 files changed, 6 insertions(+), 6 deletions(-) rename cscs-checks/mch/{set_openacc_device.py => multi_device_openacc} (89%) rename cscs-checks/mch/src/{Makefile.set_openacc_device => Makefile.multi_device_openacc} (81%) rename cscs-checks/mch/src/{set_openacc_device.F90 => multi_device_openacc.F90} (100%) diff --git a/cscs-checks/mch/set_openacc_device.py b/cscs-checks/mch/multi_device_openacc similarity index 89% rename from cscs-checks/mch/set_openacc_device.py rename to cscs-checks/mch/multi_device_openacc index 835b35e978..1e5cfaa932 100644 --- a/cscs-checks/mch/set_openacc_device.py +++ b/cscs-checks/mch/multi_device_openacc @@ -8,13 +8,13 @@ @rfm.simple_test -class SetOpenaccDeviceTest(rfm.RegressionTest): +class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Use OpenAcc, CUDA, MPI, and C++ on multi-device nodes' + self.descr = 'Allocate one accelerator per MPI task using OpenAcc with CUDA, MPI, and C++ on multi-device nodes' self.valid_systems = ['arolla:cn', 'tsa:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' - self.build_system.makefile = 'Makefile.set_openacc_device' + self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] # if self.current_system.name == 'kesch': @@ -39,7 +39,7 @@ def __init__(self): 'G2G': '1' } - self.executable = 'set_openacc_device' + self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) self.maintainers = ['LM'] diff --git a/cscs-checks/mch/src/Makefile.set_openacc_device b/cscs-checks/mch/src/Makefile.multi_device_openacc similarity index 81% rename from cscs-checks/mch/src/Makefile.set_openacc_device rename to cscs-checks/mch/src/Makefile.multi_device_openacc index fb0d18c05d..eaaacd697c 100644 --- a/cscs-checks/mch/src/Makefile.set_openacc_device +++ b/cscs-checks/mch/src/Makefile.multi_device_openacc @@ -1,10 +1,10 @@ RM := rm -f -EXECUTABLE := set_openacc_device +EXECUTABLE := multi_device_openacc all: $(EXECUTABLE) LD = $(FC) -OBJS = compute_cuda.o set_openacc_device.o std_cpp_call.o +OBJS = compute_cuda.o multi_device_openacc.o std_cpp_call.o # OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) LIB = diff --git a/cscs-checks/mch/src/set_openacc_device.F90 b/cscs-checks/mch/src/multi_device_openacc.F90 similarity index 100% rename from cscs-checks/mch/src/set_openacc_device.F90 rename to cscs-checks/mch/src/multi_device_openacc.F90 From cb682dc3606a66fa34869095ddf7085c06560956 Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:22:16 +0200 Subject: [PATCH 07/11] Adding python extension to check --- cscs-checks/mch/{multi_device_openacc => multi_device_openacc.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cscs-checks/mch/{multi_device_openacc => multi_device_openacc.py} (100%) diff --git a/cscs-checks/mch/multi_device_openacc b/cscs-checks/mch/multi_device_openacc.py similarity index 100% rename from cscs-checks/mch/multi_device_openacc rename to cscs-checks/mch/multi_device_openacc.py From 
89c10c47739c80e7b7402629849c9f29b2a779bb Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:24:46 +0200 Subject: [PATCH 08/11] Improving test description as suggested --- cscs-checks/mch/multi_device_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index 1e5cfaa932..de5ac46bba 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -10,7 +10,7 @@ @rfm.simple_test class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Allocate one accelerator per MPI task using OpenAcc with CUDA, MPI, and C++ on multi-device nodes' + self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' self.valid_systems = ['arolla:cn', 'tsa:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' From f627414bc8b44870448bade5dfcc3e29d7d4d1fa Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 15 May 2020 12:38:24 +0200 Subject: [PATCH 09/11] Fixing program name and restoring Kesch --- cscs-checks/mch/multi_device_openacc.py | 43 ++++++++++---------- cscs-checks/mch/src/multi_device_openacc.F90 | 4 +- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index de5ac46bba..fb13e03824 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -11,24 +11,24 @@ class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' - self.valid_systems = ['arolla:cn', 'tsa:cn'] + self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] -# if self.current_system.name == 'kesch': -# self.exclusive_access = True -# self.modules = ['cudatoolkit/8.0.61'] -# self.num_tasks = 9 -# self.num_tasks_per_node = 9 -# self.num_gpus_per_node = 8 -# self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] -# self.variables = { -# 'MV2_USE_CUDA': '1', -# 'G2G': '1' -# } - if self.current_system.name in ['arolla', 'tsa']: + if self.current_system.name == 'kesch': + self.exclusive_access = True + self.modules = ['cudatoolkit/8.0.61'] + self.num_tasks = 9 + self.num_tasks_per_node = 9 + self.num_gpus_per_node = 8 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] + self.variables = { + 'MV2_USE_CUDA': '1', + 'G2G': '1' + } + elif self.current_system.name in ['arolla', 'tsa']: self.exclusive_access = True self.modules = ['cuda/10.1.243'] self.num_tasks = 9 @@ -49,14 +49,15 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] -# if self.current_system.name == 'kesch': -# self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] -# self.build_system.ldflags = [ -# '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', -# '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', -# '-lcublas', '-lcudart' -# ] - if self.current_system.name in ['arolla', 'tsa']: + + if self.current_system.name == 'kesch': + self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + 
'-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + '-lcublas', '-lcudart' + ] + elif self.current_system.name in ['arolla', 'tsa']: self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1'] self.build_system.ldflags = [ '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', diff --git a/cscs-checks/mch/src/multi_device_openacc.F90 b/cscs-checks/mch/src/multi_device_openacc.F90 index b397f977b8..8a4ba28531 100644 --- a/cscs-checks/mch/src/multi_device_openacc.F90 +++ b/cscs-checks/mch/src/multi_device_openacc.F90 @@ -1,6 +1,6 @@ ! This code tests MPI tasks communication with GPU devices ! using OpenACC directives and setting one device per task -program set_openacc_device +program multi_device_openacc use openacc implicit none @@ -152,4 +152,4 @@ end subroutine cpp_call call cpp_call(c_loc(fp(1)), n, i) end subroutine call_cpp_std -end program set_openacc_device +end program multi_device_openacc From dff8beca2d239e6d7111a6c889d775da381d334d Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 18 May 2020 10:56:26 +0200 Subject: [PATCH 10/11] Fixing test on Kesch --- cscs-checks/mch/multi_device_openacc.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index fb13e03824..413abcaf3b 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -12,7 +12,7 @@ class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] - self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-pgi'] + self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] @@ -24,10 +24,6 @@ def __init__(self): self.num_tasks_per_node = 9 self.num_gpus_per_node = 8 self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] - self.variables = { - 'MV2_USE_CUDA': '1', - 'G2G': '1' - } elif self.current_system.name in ['arolla', 'tsa']: self.exclusive_access = True self.modules = ['cuda/10.1.243'] @@ -35,9 +31,6 @@ def __init__(self): self.num_tasks_per_node = 9 self.num_gpus_per_node = 8 self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"'] - self.variables = { - 'G2G': '1' - } self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', @@ -49,7 +42,6 @@ def __init__(self): def setflags(self): if self.current_environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] - if self.current_system.name == 'kesch': self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] self.build_system.ldflags = [ @@ -63,3 +55,9 @@ def setflags(self): '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++', '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' ] + elif self.current_environ.name.startswith('PrgEnv-cray'): + self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] + self.variables = { + 'CRAY_ACCEL_TARGET': 'nvidia35', + 'MV2_USE_CUDA': '1' + } From 64e109765792cbb2bc8535706a135da52c1a9942 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 25 May 2020 21:13:31 +0200 Subject: [PATCH 11/11] Address PR comments --- cscs-checks/mch/multi_device_openacc.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cscs-checks/mch/multi_device_openacc.py b/cscs-checks/mch/multi_device_openacc.py index 
413abcaf3b..50f2e5c026 100644 --- a/cscs-checks/mch/multi_device_openacc.py +++ b/cscs-checks/mch/multi_device_openacc.py @@ -10,13 +10,15 @@ @rfm.simple_test class MultiDeviceOpenaccTest(rfm.RegressionTest): def __init__(self): - self.descr = 'Allocate one accelerator per MPI task using OpenAcc on multi-device nodes with additional CUDA, MPI, and C++ calls' + self.descr = ( + 'Allocate one accelerator per MPI task using OpenACC on ' + 'multi-device nodes with additional CUDA, MPI, and C++ calls' + ) self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn'] self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.makefile = 'Makefile.multi_device_openacc' self.build_system.fflags = ['-O2'] - if self.current_system.name == 'kesch': self.exclusive_access = True self.modules = ['cudatoolkit/8.0.61'] @@ -35,7 +37,7 @@ def __init__(self): self.executable = 'multi_device_openacc' self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK', self.stdout) - self.maintainers = ['LM'] + self.maintainers = ['LM', 'AJ'] self.tags = {'production', 'mch'} @rfm.run_before('compile') @@ -56,8 +58,8 @@ def setflags(self): '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart' ] elif self.current_environ.name.startswith('PrgEnv-cray'): - self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] - self.variables = { - 'CRAY_ACCEL_TARGET': 'nvidia35', - 'MV2_USE_CUDA': '1' - } + self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp'] + self.variables = { + 'CRAY_ACCEL_TARGET': 'nvidia35', + 'MV2_USE_CUDA': '1' + }
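
Note (illustrative addition, not part of the series): the Makefile introduced in PATCH 02/11 links compute_cuda.o and std_cpp_call.o, but neither source file is touched by these patches. The sketch below shows what those translation units could look like. The extern "C" signatures are taken from the bind(c) interface blocks in multi_device_openacc.F90; the file layout, kernel launch parameters, and the kernel body (a[i] += b[i], inferred from the host-side check that expects array1 to end up holding the old array1 + array2) are assumptions, not the actual CSCS sources.

    /* compute_cuda.cu (hypothetical) -- illustrative sketch only.
     * Signatures follow the bind(c) interfaces declared in
     * multi_device_openacc.F90. */
    #include <cuda_runtime.h>

    __global__ void add_arrays(float *a, const float *b, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            a[i] += b[i];   /* accumulate b into a, element by element */
        }
    }

    /* Called from inside `!$acc host_data use_device(a, b)`, so a and b are
     * already device pointers and no host<->device copies are needed here. */
    extern "C" void cuda_kernel_no_copy(float *a, float *b, int n)
    {
        const int threads = 128;
        const int blocks = (n + threads - 1) / threads;
        add_arrays<<<blocks, threads>>>(a, b, n);
        cudaDeviceSynchronize();
    }

    /* std_cpp_call.cpp equivalent (shown here for brevity).  The Fortran side
     * only requires the symbol do_smth_with_std with this signature; what it
     * computes is not specified by the patches, so this sketch simply reports
     * the array length through i. */
    extern "C" void do_smth_with_std(const float *f, int n, int *i)
    {
        (void)f;
        *i = n;
    }

Because the Fortran caller wraps the call in !$acc host_data use_device(array1, array2), the pointers received by cuda_kernel_no_copy already refer to device memory, which is why the sketch launches the kernel directly without any cudaMemcpy. The cuda_kernel_with_copy variant declared in the same Fortran file would instead receive host pointers and would have to stage the data to and from the device itself.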