Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions cscs-checks/mch/multi_device_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.simple_test
class MultiDeviceOpenaccTest(rfm.RegressionTest):
    """Check that each MPI task can drive its own accelerator via OpenACC.

    Builds and runs ``src/multi_device_openacc.F90`` (through
    ``Makefile.multi_device_openacc``), which mixes OpenACC, CUDA, MPI and
    a C++ call, and asserts the program reports ``Test Result : OK``.
    """

    def __init__(self):
        self.descr = (
            'Allocate one accelerator per MPI task using OpenACC on '
            'multi-device nodes with additional CUDA, MPI, and C++ calls'
        )
        self.valid_systems = ['arolla:cn', 'tsa:cn', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
        self.build_system = 'Make'
        self.build_system.makefile = 'Makefile.multi_device_openacc'
        self.build_system.fflags = ['-O2']

        # Only the CUDA module and the nvcc target architecture differ
        # between the supported systems; every other setting is common.
        if self.current_system.name == 'kesch':
            cuda_module, nvcc_arch = 'cudatoolkit/8.0.61', 'compute_37'
        elif self.current_system.name in ['arolla', 'tsa']:
            cuda_module, nvcc_arch = 'cuda/10.1.243', 'compute_70'
        else:
            cuda_module, nvcc_arch = None, None

        if cuda_module is not None:
            self.exclusive_access = True
            self.modules = [cuda_module]
            # 9 tasks on a node with 8 GPUs: rank 0 only checks the result,
            # ranks 1-8 each attach to one device (see the Fortran source).
            self.num_tasks = 9
            self.num_tasks_per_node = 9
            self.num_gpus_per_node = 8
            self.build_system.options = [
                'NVCC_FLAGS="-arch={0}"'.format(nvcc_arch)
            ]

        self.executable = 'multi_device_openacc'
        self.sanity_patterns = sn.assert_found(r'Test\sResult\s*:\s+OK',
                                               self.stdout)
        self.maintainers = ['LM', 'AJ']
        self.tags = {'production', 'mch'}

    @rfm.run_before('compile')
    def setflags(self):
        """Inject compiler/system specific OpenACC flags before compiling."""
        if self.current_environ.name.startswith('PrgEnv-pgi'):
            self.build_system.fflags += ['-acc']
            if self.current_system.name == 'kesch':
                self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
                self.build_system.ldflags = [
                    '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++',
                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
                    '-lcublas', '-lcudart'
                ]
            elif self.current_system.name in ['arolla', 'tsa']:
                self.build_system.fflags += ['-ta=tesla,cc70,cuda10.1']
                self.build_system.ldflags = [
                    '-acc', '-ta:tesla:cc70,cuda10.1', '-lstdc++',
                    '-L$EBROOTCUDA/lib64', '-lcublas', '-lcudart'
                ]
        elif self.current_environ.name.startswith('PrgEnv-cray'):
            # NOTE(review): CRAY_ACCEL_TARGET is pinned to nvidia35 for all
            # systems — confirm PrgEnv-cray is only exercised on kesch (K80s).
            self.build_system.fflags += ['-DCRAY', '-hacc', '-hnoomp']
            self.variables = {
                'CRAY_ACCEL_TARGET': 'nvidia35',
                'MV2_USE_CUDA': '1'
            }
29 changes: 29 additions & 0 deletions cscs-checks/mch/src/Makefile.multi_device_openacc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
RM := rm -f
EXECUTABLE := multi_device_openacc

# The main program is Fortran, so link with the Fortran compiler driver.
LD = $(FC)

OBJS = compute_cuda.o multi_device_openacc.o std_cpp_call.o
# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ))
LIB =

# Declare administrative targets phony so a stray file named 'all', 'clean'
# or 'distclean' in the source directory cannot shadow them.
.PHONY: all clean distclean

all: $(EXECUTABLE)

.SUFFIXES: .o .cu .cpp .F90

%.o: %.cu
	$(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@

%.o: %.cpp
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

%.o: %.F90
	$(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@

$(EXECUTABLE): $(OBJS)
	$(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB)

clean:
	-$(RM) $(OBJS)

distclean:
	-$(RM) $(OBJS) $(EXECUTABLE)
155 changes: 155 additions & 0 deletions cscs-checks/mch/src/multi_device_openacc.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
! This code tests MPI tasks communication with GPU devices
! using OpenACC directives and setting one device per task.
!
! Rank 0 only initializes host arrays and checks the final result;
! ranks > 0 each attach to one GPU (round-robin over the available
! devices), fill two arrays on the device, sum them on the host
! (localsum(1)) and on the device via a CUDA kernel (localsum(2)).
! Both partial sums are reduced to rank 0, which reports OK iff the
! CPU and GPU totals agree.
program multi_device_openacc
  use openacc
  implicit none

  include 'mpif.h'
! The integer kind used for the value returned by acc_get_device_type()
! differs between compilers: 8 bytes under the Cray compiler, 4 otherwise.
#ifdef CRAY
  integer, parameter :: ACC_DEVICE_TYPE = 8
#else
  integer, parameter :: ACC_DEVICE_TYPE = 4
#endif
  integer, parameter :: ARRAYSIZE = 10
  integer(kind=ACC_DEVICE_TYPE) :: devicetype
  integer :: mpi_size, mpi_rank
  integer :: ierr, i, gpuid, ngpus, localsum(2), globalsum(2)
  real, allocatable :: array1(:), array2(:)

  call MPI_Init(ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)

  ! each task creates two arrays: the sum of their elements will be 10*mpi_rank
  allocate(array1(ARRAYSIZE))
  allocate(array2(ARRAYSIZE))

  ! Every rank contributes localsum to the reduction below, but rank 0
  ! never computes it; initialize to zero so rank 0's contribution is
  ! well-defined instead of uninitialized stack memory.
  localsum = 0

  ! get number of gpu devices
  devicetype = acc_get_device_type()
  ngpus = acc_get_num_devices(devicetype)

  ! rank 0 prints number of tasks and number of gpu devices
  if (mpi_rank == 0) then
    write(*,*) "MPI test with OpenACC using", mpi_size, "tasks and ", ngpus, "GPU devices"
    ! initialization of the arrays on rank 0
    do i = 1, ARRAYSIZE
      array1(i) = .0
      array2(i) = .0
    end do
  else
    ! each MPI rank different from 0 addresses a different GPU device
    gpuid = mod(mpi_rank, ngpus)
    call acc_set_device_num(gpuid, acc_device_nvidia)
    call acc_init(acc_device_nvidia)
    gpuid = acc_get_device_num(devicetype)
    write(*,*) "MPI task ", mpi_rank, "is using GPU id ", gpuid, "out of ", ngpus

    ! initialization of the arrays on the gpu device used by the current MPI rank
!$acc data pcreate(array1,array2)
!$acc parallel loop
    do i = 1, ARRAYSIZE
      array1(i) = mpi_rank*0.25
      array2(i) = mpi_rank*0.75
    end do
    ! update the arrays on the current MPI rank
!$acc update host(array1,array2)

    ! the current MPI rank computes localsum(1) on the host
    ! (0.25 and 0.75 are exactly representable, so the real->integer
    ! truncation here is exact: 10*mpi_rank)
    localsum(1) = sum(array1)+sum(array2)

    ! call external c++ function
    call call_cpp_std(array1, ARRAYSIZE, i)

    ! compute the sum of the arrays on the GPU using device ptr
    call call_cuda_kernel_no_copy(array1, array2, ARRAYSIZE)
    ! update array1 on the current MPI rank
!$acc update host(array1)
!$acc end data
    ! array1 is now equal to sum(array1)+sum(array2)

    ! compute localsum(2)
    localsum(2) = sum(array1)
  end if

  ! the current MPI rank adds localsum to globalsum on rank 0
  call MPI_Reduce(localsum, globalsum, 2, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
  ! globalsum is 10*n*(n+1)/2 where n is the number of gpu devices
  if(mpi_rank == 0) then
    if (globalsum(1) == globalsum(2)) then
      write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2)
      write (*,*) "Test Result : OK"
    else
      write (*,*) "CPU sum : ", globalsum(1), " GPU sum : ", globalsum(2)
      write (*,*) "Test Result : FAIL"
    end if
  end if

  deallocate(array1)
  deallocate(array2)
  call MPI_Finalize(ierr)


contains

  !> Launch the external CUDA kernel with explicit host<->device copies.
  !> NOTE(review): not called from the main program — kept as the
  !> copy-based counterpart of call_cuda_kernel_no_copy.
  subroutine call_cuda_kernel_with_copy(a,b,n)
    use, intrinsic :: iso_c_binding
    implicit none
    real, intent(inout), target :: a(:)
    real, intent(in), target :: b(:)
    integer, intent(in) :: n

    interface
      subroutine cuda_kernel_with_copy(a,b,n) bind(c,name='cuda_kernel_with_copy')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: a, b
        integer, intent(in), value :: n
      end subroutine cuda_kernel_with_copy
    end interface

    call cuda_kernel_with_copy(c_loc(a(1)), c_loc(b(1)), n)
  end subroutine call_cuda_kernel_with_copy

  !> Launch the external CUDA kernel on data already present on the
  !> device: host_data hands the device addresses of a and b to C.
  subroutine call_cuda_kernel_no_copy(a,b,n)
    use, intrinsic :: iso_c_binding
    implicit none
    real, intent(inout), target :: a(:)
    real, intent(in), target :: b(:)
    integer, intent(in) :: n

    interface
      subroutine cuda_kernel_no_copy(a,b,n) bind(c,name='cuda_kernel_no_copy')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: a, b
        integer, intent(in), value :: n
      end subroutine cuda_kernel_no_copy
    end interface

!$acc data present(a, b)
!$acc host_data use_device(a, b)
    call cuda_kernel_no_copy(c_loc(a(1)), c_loc(b(1)), n)
!$acc end host_data
!$acc end data
  end subroutine call_cuda_kernel_no_copy

  !> Pass the host copy of f to the external C++ routine
  !> 'do_smth_with_std'; i receives its integer result.
  subroutine call_cpp_std(f,n,i)
    use, intrinsic :: iso_c_binding
    implicit none
    real(kind=c_float), intent(in), target :: f(:)
    real(kind=c_float), pointer :: fp(:)
    integer, intent(in) :: n
    integer(kind=c_int), intent(out) :: i

    interface
      subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std')
        use, intrinsic :: iso_c_binding
        type(c_ptr), intent(in), value :: f
        integer, intent(in), value :: n
        integer(kind=c_int), intent(out) :: i
      end subroutine cpp_call
    end interface

    ! pointer alias exercises Fortran pointer -> c_loc interop
    fp => f

    call cpp_call(c_loc(fp(1)), n, i)
  end subroutine call_cpp_std

end program multi_device_openacc