diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py new file mode 100644 index 0000000000..d24efecf14 --- /dev/null +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -0,0 +1,69 @@ +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test([True], [False]) +class OpenaccCudaCpp(rfm.RegressionTest): + def __init__(self, withmpi): + super().__init__() + name_suffix = 'WithMPI' if withmpi else 'WithoutMPI' + self.name = 'OpenaccCudaCPP' + name_suffix + self.descr = 'test for OpenACC, CUDA, MPI, and C++' + self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn'] + self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] + self.build_system = 'Make' + self.build_system.fflags = ['-O2'] + if self.current_system.name in ['daint', 'dom']: + self.modules = ['craype-accel-nvidia60'] + self.variables = { + 'MPICH_RDMA_ENABLED_CUDA': '1', + 'CRAY_CUDA_MPS': '1' + } + self.num_tasks = 12 + self.num_tasks_per_node = 12 + self.num_gpus_per_node = 1 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_60"'] + elif self.current_system.name in ['kesch']: + self.modules = ['craype-accel-nvidia35'] + self.variables = { + 'MV2_USE_CUDA': '1', + 'G2G': '1' + } + self.num_tasks = 8 + self.num_tasks_per_node = 8 + self.num_gpus_per_node = 8 + self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] + + if withmpi: + self.build_system.cppflags = ['-DUSE_MPI'] + else: + if self.current_system.name == 'kesch': + self.valid_prog_environs = ['PrgEnv-cray-nompi', + 'PrgEnv-pgi-nompi'] + + self.num_tasks = 1 + self.num_tasks_per_node = 1 + self.num_gpus_per_node = 1 + + self.executable = 'openacc_cuda_mpi_cppstd' + self.sanity_patterns = sn.assert_found(r'Result:\s+OK', self.stdout) + self.maintainers = ['AJ', 'VK'] + self.tags = {'production'} + + def setup(self, partition, environ, **job_opts): + if environ.name.startswith('PrgEnv-cray'): + self.build_system.fflags += ['-hacc', '-hnoomp'] + elif environ.name.startswith('PrgEnv-pgi'): + self.build_system.fflags += ['-acc'] + if self.current_system.name in ['daint', 'dom']: + self.build_system.fflags += ['-ta:tesla:cc60'] + self.build_system.ldflags = ['-acc', '-ta:tesla:cc60', + '-Mnorpath', '-lstdc++'] + elif self.current_system.name == 'kesch': + self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + '-lcublas', '-lcudart'] + + super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/mch/src/Makefile b/cscs-checks/mch/src/Makefile new file mode 100644 index 0000000000..5de7f6f0be --- /dev/null +++ b/cscs-checks/mch/src/Makefile @@ -0,0 +1,29 @@ +RM := rm -f +EXECUTABLE := openacc_cuda_mpi_cppstd + +all: $(EXECUTABLE) +LD = $(FC) + +OBJS = compute_cuda.o openacc_cuda_mpi_cppstd.o std_cpp_call.o +# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) +LIB = + +.SUFFIXES: .o .cu .cpp .F90 + +%.o: %.cu + $(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@ + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + +%.o: %.F90 + $(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@ + +$(EXECUTABLE): $(OBJS) + $(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB) + +clean: + -$(RM) $(OBJS) + +distclean: + -$(RM) $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/mch/src/compute_cuda.cu b/cscs-checks/mch/src/compute_cuda.cu new file mode 100644 index 0000000000..bd77a2d23c --- /dev/null +++ b/cscs-checks/mch/src/compute_cuda.cu @@ -0,0 +1,59 @@ +#include +#include "cuda.h" + +#define cudaCheckErrors(msg) \ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ + msg, cudaGetErrorString(__err), \ + __FILE__, __LINE__); \ + fprintf(stderr, "*** FAILED - ABORTING\n"); \ + exit(1); \ + } \ + } while (0) + +extern "C" { + +__global__ void simple_add(float* a, float* b, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < n) { + a[i] = a[i] + b[i]; + } +} + +void cuda_kernel_no_copy(float* a, float* b, int n) +{ + const int THREADS_PER_BLOCK = 1; + const int NUMBER_OF_BLOCKS = 10; + + cudaThreadSynchronize(); + simple_add<<>>(a, b, n); + cudaThreadSynchronize(); + + cudaCheckErrors("cuda error"); +} + +void cuda_kernel_with_copy(float* a, float* b, int n) +{ + const int THREADS_PER_BLOCK = 1; + const int NUMBER_OF_BLOCKS = 10; + + float* d_a; + float* d_b; + cudaMalloc(&d_a, n*sizeof(float)); + cudaMalloc(&d_b, n*sizeof(float)); + cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_b, b, n*sizeof(float), cudaMemcpyHostToDevice); + + cudaThreadSynchronize(); + simple_add<<>>(d_a, d_b, n); + cudaThreadSynchronize(); + + cudaMemcpy(a, d_a, n*sizeof(float), cudaMemcpyDeviceToHost); + + cudaCheckErrors("cuda error"); + +} +}; diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 new file mode 100644 index 0000000000..439f493648 --- /dev/null +++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 @@ -0,0 +1,180 @@ +program openacc_cuda_mpi_cppstd + ! This code tests MPI communication on GPU with OpenACC using the + ! host_data directive + CUDA call from Fortran as well as C++ function + ! using std library call + implicit none + + +#ifdef USE_MPI + include 'mpif.h' + integer :: status(MPI_STATUS_SIZE) +#endif + + integer :: ierr, i + integer :: cpp_std_sum ! Sum done with C++ call to STD lib + integer :: mpi_size, mpi_rank + integer(8) :: mydata(1), data_sum(1), ref_val + real, allocatable :: f1(:), f2(:), f3(:) + + ! Test parameter + integer, parameter :: NSIZE = 10 + real, parameter :: EXPECTED_CUDA_SUM = 110.0 + real, parameter :: EXPECTED_CPP_STD_SUM = 55.0 + +#ifdef USE_MPI + call MPI_Init(ierr) + call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) + mydata(1) = mpi_rank +#ifdef _OPENACC + if (mpi_rank == 0) write(*,*) "MPI test on GPU with OpenACC using ",mpi_size,"tasks" +#else + if (mpi_rank == 0) write(*,*) "MPI test on CPU using ",mpi_size,"tasks" +#endif + +#else + + mpi_rank = 0 +#ifdef _OPENACC + write(*,*) "Single node test on GPU with OpenACC" +#else + write(*,*) "Single node test test on CPU" +#endif + +#endif + +#ifdef USE_MPI + !$acc data copy(mydata,data_sum) + !$acc host_data use_device(mydata,data_sum) + call MPI_Reduce(mydata, data_sum, 1, MPI_INTEGER8, MPI_SUM, 0, MPI_COMM_WORLD, ierr) + !$acc end host_data + !$acc end data +#endif + + if(mpi_rank == 0) then + + ! Allocate and initialize arrays on the GPU + allocate(f1(NSIZE)) + allocate(f2(NSIZE)) + allocate(f3(NSIZE)) + + !$acc data pcreate(f1,f2,f3) + !$acc parallel loop + do i = 1, NSIZE + f1(i) = i + f2(i) = i + f3(i) = i + end do + !$acc update host(f1,f2,f3) + + ! Call a CUDA kernel with host arrays + call call_cuda_kernel_with_copy(f1, f2, NSIZE) + + ! Call a CUDA kernel without data copy, use device ptr + call call_cuda_kernel_no_copy(f3, f2, NSIZE) + !$acc update host(f3) + + ! Call a C++ function using STD lib + call call_cpp_std(f2, NSIZE, cpp_std_sum) + !$acc end data + end if + + !Check results + if (mpi_rank == 0) then + ref_val = 0 + do i = 0, mpi_size - 1 + ref_val = ref_val + i + end do + if (sum(f1) /= EXPECTED_CUDA_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value sum(f1): ", EXPECTED_CUDA_SUM, "actual value:", sum(f1) + else if (sum(f3) /= EXPECTED_CUDA_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value sum(f3): ", EXPECTED_CUDA_SUM, "actual value:", sum(f3) +#ifdef USE_MPI + else if (data_sum(1) /= ref_val) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value data_sum: ", ref_val, "actual value:", data_sum(1) +#endif + else if (cpp_std_sum /= EXPECTED_CPP_STD_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value stdres: ", EXPECTED_CPP_STD_SUM, "actual value:", cpp_std_sum + else + write (*,*) "Result : OK" + end if + end if + + if(mpi_rank == 0) then + deallocate(f1) + deallocate(f2) + deallocate(f3) + write (*,*) "Result: OK" + end if + +#ifdef USE_MPI + call MPI_Finalize(ierr); +#endif + +contains + subroutine call_cuda_kernel_with_copy(f1,f2,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout) :: f1(:) + real, intent(in) :: f2(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_with_copy(f1,f2,n) bind(c,name='cuda_kernel_with_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f1, f2 + integer, intent(in), value :: n + end subroutine cuda_kernel_with_copy + end interface + + call cuda_kernel_with_copy(c_loc(f1(1)), c_loc(f2(1)), n) + end subroutine call_cuda_kernel_with_copy + + subroutine call_cuda_kernel_no_copy(f1,f2,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout) :: f1(:) + real, intent(in) :: f2(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_no_copy(f1,f2,n) bind(c,name='cuda_kernel_no_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f1, f2 + integer, intent(in), value :: n + end subroutine cuda_kernel_no_copy + end interface + + !$acc data present(f1, f2) + !$acc host_data use_device(f1, f2) + call cuda_kernel_no_copy(c_loc(f1(1)), c_loc(f2(1)), n) + !$acc end host_data + !$acc end data + end subroutine call_cuda_kernel_no_copy + + subroutine call_cpp_std(f,n,i) + use, intrinsic :: iso_c_binding + implicit none + real(kind=c_float), intent(in), target :: f(:) + real(kind=c_float), pointer :: fp(:) + integer, intent(in) :: n + integer(kind=c_int), intent(out) :: i + + interface + subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f + integer, intent(in), value :: n + integer(kind=c_int), intent(out) :: i + end subroutine cpp_call + end interface + + fp => f + + call cpp_call(c_loc(fp(1)), n, i) + end subroutine call_cpp_std +end program openacc_cuda_mpi_cppstd diff --git a/cscs-checks/mch/src/std_cpp_call.cpp b/cscs-checks/mch/src/std_cpp_call.cpp new file mode 100644 index 0000000000..97c681a4b8 --- /dev/null +++ b/cscs-checks/mch/src/std_cpp_call.cpp @@ -0,0 +1,8 @@ +#include +using namespace std; + +extern"C" void do_smth_with_std(float* x, int n, int* res) +{ + *res = 0; + *res = accumulate(x, x+n, 0); +}