From b2fbe86799b9d56d0b4d787d4d3e8c1b568d9f84 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Fri, 22 Jun 2018 11:34:59 +0200 Subject: [PATCH 1/7] added OpenACC+CUDA+C++ check; small correction of automatic_arrays.py --- cscs-checks/mch/automatic_arrays.py | 4 +- cscs-checks/mch/openacc_cuda_mpi_cppstd.py | 56 ++++++ cscs-checks/mch/src/Makefile | 29 +++ cscs-checks/mch/src/compute_cuda.cu | 59 ++++++ .../mch/src/openacc_cuda_mpi_cppstd.F90 | 182 ++++++++++++++++++ cscs-checks/mch/src/std_cpp_call.cpp | 8 + 6 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 cscs-checks/mch/openacc_cuda_mpi_cppstd.py create mode 100644 cscs-checks/mch/src/Makefile create mode 100644 cscs-checks/mch/src/compute_cuda.cu create mode 100644 cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 create mode 100644 cscs-checks/mch/src/std_cpp_call.cpp diff --git a/cscs-checks/mch/automatic_arrays.py b/cscs-checks/mch/automatic_arrays.py index 7fab45dc49..63c3a4a69c 100644 --- a/cscs-checks/mch/automatic_arrays.py +++ b/cscs-checks/mch/automatic_arrays.py @@ -11,11 +11,11 @@ def __init__(self, **kwargs): 'PrgEnv-gnu'] if self.current_system.name in ['daint', 'dom']: self.modules = ['craype-accel-nvidia60'] - self._pgi_flags = '-acc -ta=tesla:cc60 -Mnorpath' + self._pgi_flags = '-O2 -acc -ta=tesla:cc60 -Mnorpath' self._cray_variables = {} elif self.current_system.name in ['kesch']: self.modules = ['craype-accel-nvidia35'] - self._pgi_flags = '-O2 -ta=tesla,cc35,cuda8.0' + self._pgi_flags = '-O2 -acc -ta=tesla,cc35,cuda8.0' self._cray_variables = {'MV2_USE_CUDA': '1'} self.num_tasks = 1 diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py new file mode 100644 index 0000000000..6220ffa52d --- /dev/null +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -0,0 +1,56 @@ +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test(['mpi'], ['nompi']) +class OpenaccCudaMpiCppstd(rfm.RegressionTest): + def __init__(self, withmpi): + super().__init__() + self.descr = 'test for OpenACC, CUDA, MPI, and C++' + self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn'] + self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] + if self.current_system.name in ['daint', 'dom']: + self.modules = ['craype-accel-nvidia60'] + self._pgi_flags = '-O2 -acc -ta=tesla:cc60 -Mnorpath -lstdc++' + self._env_variables = { + 'MPICH_RDMA_ENABLED_CUDA': '1', + 'CRAY_CUDA_MPS': '1' + } + self.num_tasks = 12 + self.num_tasks_per_node = 12 + self.num_gpus_per_node = 1 + self._nvidia_sm = '60' + elif self.current_system.name in ['kesch']: + self.modules = ['craype-accel-nvidia35'] + self._pgi_flags = '-O2 -acc -ta=tesla,cc35,cuda8.0' + self._env_variables = { + 'MV2_USE_CUDA': '1', + 'G2G': '1' + } + self.num_tasks = 8 + self.num_tasks_per_node = 8 + self.num_gpus_per_node = 8 + self._nvidia_sm = '37' + + if withmpi == 'mpi': + self.mpiflag = ' -DUSEMPI' + else: + self.mpiflag = '' + self.num_tasks_per_node = 1 + + self.executable = 'openacc_cuda_mpi_cppstd' + self.sanity_patterns = sn.assert_found(r'Result:\s+OK', self.stdout) + self.maintainers = ['AJ', 'VK'] + self.tags = {'production'} + + def setup(self, partition, environ, **job_opts): + # Set nvcc flags + environ.cxxflags = '-lcublas -lcudart -arch=sm_%s' % self._nvidia_sm + if environ.name.startswith('PrgEnv-cray'): + environ.fflags = '-O2 -hacc -hnoomp' + elif environ.name.startswith('PrgEnv-pgi'): + environ.fflags = self._pgi_flags + + self.variables = self._env_variables + environ.fflags += self.mpiflag + super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/mch/src/Makefile b/cscs-checks/mch/src/Makefile new file mode 100644 index 0000000000..7fbcc2528d --- /dev/null +++ b/cscs-checks/mch/src/Makefile @@ -0,0 +1,29 @@ +RM := rm -f +EXECUTABLE := openacc_cuda_mpi_cppstd + +all: $(EXECUTABLE) +LD = $(FC) + +OBJS = compute_cuda.o openacc_cuda_mpi_cppstd.o std_cpp_call.o +# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ)) +LIB = + +.SUFFIXES: .o .cu .cpp .F90 + +%.o: %.cu + $(PREP) nvcc $(CXXFLAGS) $(DDTFLAGS) -c $< -o $@ + +%.o: %.cpp + $(PREP) $(CXX) $(CFLAGS) $(DDTFLAGS) -c $< -o $@ + +%.o: %.F90 + $(PREP) $(FC) $(FFLAGS) $(DDTFLAGS) -c $< -o $@ + +$(EXECUTABLE): $(OBJS) + $(PREP) $(LD) $(FFLAGS) $(LDFLAGS) $(OBJS) $(LIB) -o $@ + +clean: + -$(RM) $(OBJS) + +distclean: + -$(RM) $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/mch/src/compute_cuda.cu b/cscs-checks/mch/src/compute_cuda.cu new file mode 100644 index 0000000000..bd77a2d23c --- /dev/null +++ b/cscs-checks/mch/src/compute_cuda.cu @@ -0,0 +1,59 @@ +#include +#include "cuda.h" + +#define cudaCheckErrors(msg) \ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ + msg, cudaGetErrorString(__err), \ + __FILE__, __LINE__); \ + fprintf(stderr, "*** FAILED - ABORTING\n"); \ + exit(1); \ + } \ + } while (0) + +extern "C" { + +__global__ void simple_add(float* a, float* b, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < n) { + a[i] = a[i] + b[i]; + } +} + +void cuda_kernel_no_copy(float* a, float* b, int n) +{ + const int THREADS_PER_BLOCK = 1; + const int NUMBER_OF_BLOCKS = 10; + + cudaThreadSynchronize(); + simple_add<<>>(a, b, n); + cudaThreadSynchronize(); + + cudaCheckErrors("cuda error"); +} + +void cuda_kernel_with_copy(float* a, float* b, int n) +{ + const int THREADS_PER_BLOCK = 1; + const int NUMBER_OF_BLOCKS = 10; + + float* d_a; + float* d_b; + cudaMalloc(&d_a, n*sizeof(float)); + cudaMalloc(&d_b, n*sizeof(float)); + cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_b, b, n*sizeof(float), cudaMemcpyHostToDevice); + + cudaThreadSynchronize(); + simple_add<<>>(d_a, d_b, n); + cudaThreadSynchronize(); + + cudaMemcpy(a, d_a, n*sizeof(float), cudaMemcpyDeviceToHost); + + cudaCheckErrors("cuda error"); + +} +}; diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 new file mode 100644 index 0000000000..b419358c21 --- /dev/null +++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 @@ -0,0 +1,182 @@ +program openacc_cuda_mpi_cppstd + ! This code tests MPI communication on GPU with OpenACC using the + ! host_data directive + CUDA call from Frotran as well as C++ function + ! using std library call + implicit none + + +#ifdef USE_MPI + include 'mpif.h' + integer :: status(MPI_STATUS_SIZE) +#endif + + integer :: ierr, i + integer :: cpp_std_sum ! Sum done with C++ call to STD lib + integer :: mpi_size, mpi_rank + integer(8) :: mydata(1), data_sum(1), ref_val + real, allocatable :: f1(:), f2(:), f3(:) + + ! Test parameter + integer, parameter :: NSIZE = 10 + real, parameter :: EXPECTED_CUDA_SUM = 110.0 + real, parameter :: EXPECTED_CPP_STD_SUM = 55.0 + +#ifdef USE_MPI + call MPI_Init(ierr) + call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr) + mydata(1) = mpi_rank +#ifdef _OPENACC + if (mpi_rank == 0) write(*,*) "MPI test on GPU with OpenACC using ",mpi_size,"tasks" +#else + if (mpi_rank == 0) write(*,*) "MPI test on CPU using ",mpi_size,"tasks" +#endif + +#else + + mpi_rank = 0 +#ifdef _OPENACC + write(*,*) "Single node test on GPU with OpenACC" +#else + write(*,*) "Single node test test on CPU" +#endif + +#endif + + +#ifdef USE_MPI + !$acc data copy(mydata,data_sum) + !$acc host_data use_device(mydata,data_sum) + call MPI_Reduce(mydata, data_sum, 1, MPI_INTEGER8, MPI_SUM, 0, MPI_COMM_WORLD, ierr) + !$acc end host_data + !$acc end data +#endif + + if(mpi_rank == 0) then + + ! Allocate and initialize arrays on the GPU + allocate(f1(NSIZE)) + allocate(f2(NSIZE)) + allocate(f3(NSIZE)) + + !$acc data pcreate(f1,f2,f3) + !$acc parallel loop + do i = 1, NSIZE + f1(i) = i + f2(i) = i + f3(i) = i + end do + !$acc update host(f1,f2,f3) + + ! Call a CUDA kernel with host arrays + call call_cuda_kernel_with_copy(f1, f2, NSIZE) + + ! Call a CUDA kernel without data copy, use device ptr + call call_cuda_kernel_no_copy(f3, f2, NSIZE) + !$acc update host(f3) + + ! Call a C++ function using STD lib + call call_cpp_std(f2, NSIZE, cpp_std_sum) + !$acc end data + end if + + !Check results + if (mpi_rank == 0) then + ref_val = 0 + do i = 0, mpi_size - 1 + ref_val = ref_val + i + end do + if (sum(f1) /= EXPECTED_CUDA_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value sum(f1): ", EXPECTED_CUDA_SUM, "actual value:", sum(f1) + else if (sum(f3) /= EXPECTED_CUDA_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value sum(f3): ", EXPECTED_CUDA_SUM, "actual value:", sum(f3) +#ifdef USE_MPI + else if (data_sum(1) /= ref_val) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value data_sum: ", ref_val, "actual value:", data_sum(1) +#endif + else if (cpp_std_sum /= EXPECTED_CPP_STD_SUM) then + write (*,*) "Result : FAIL" + write (*,*) "Expected value stdres: ", EXPECTED_CPP_STD_SUM, "actual value:", cpp_std_sum + else + write (*,*) "Result : OK" + end if + end if + + if(mpi_rank == 0) then + deallocate(f1) + deallocate(f2) + deallocate(f3) + write (*,*) "Result: OK" + end if + +#ifdef USE_MPI + call MPI_Finalize(ierr); +#endif + +contains + subroutine call_cuda_kernel_with_copy(f1,f2,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout) :: f1(:) + real, intent(in) :: f2(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_with_copy(f1,f2,n) bind(c,name='cuda_kernel_with_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f1, f2 + integer, intent(in), value :: n + end subroutine cuda_kernel_with_copy + end interface + + call cuda_kernel_with_copy(c_loc(f1(1)), c_loc(f2(1)), n) + end subroutine call_cuda_kernel_with_copy + + subroutine call_cuda_kernel_no_copy(f1,f2,n) + use, intrinsic :: iso_c_binding + implicit none + real, intent(inout) :: f1(:) + real, intent(in) :: f2(:) + integer, intent(in) :: n + + interface + subroutine cuda_kernel_no_copy(f1,f2,n) bind(c,name='cuda_kernel_no_copy') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f1, f2 + integer, intent(in), value :: n + end subroutine cuda_kernel_no_copy + end interface + + !$acc data present(f1, f2) + !$acc host_data use_device(f1, f2) + call cuda_kernel_no_copy(c_loc(f1(1)), c_loc(f2(1)), n) + !$acc end host_data + !$acc end data + end subroutine call_cuda_kernel_no_copy + + subroutine call_cpp_std(f,n,i) + use, intrinsic :: iso_c_binding + implicit none + real(kind=c_float), intent(in), target :: f(:) + real(kind=c_float), pointer :: fp(:) + integer, intent(in) :: n + integer(kind=c_int), intent(out) :: i + + interface + subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std') + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in), value :: f + integer, intent(in), value :: n + integer(kind=c_int), intent(out) :: i + end subroutine cpp_call + end interface + + fp => f + + call cpp_call(c_loc(fp(1)), n, i) + end subroutine call_cpp_std +end program openacc_cuda_mpi_cppstd + diff --git a/cscs-checks/mch/src/std_cpp_call.cpp b/cscs-checks/mch/src/std_cpp_call.cpp new file mode 100644 index 0000000000..97c681a4b8 --- /dev/null +++ b/cscs-checks/mch/src/std_cpp_call.cpp @@ -0,0 +1,8 @@ +#include +using namespace std; + +extern"C" void do_smth_with_std(float* x, int n, int* res) +{ + *res = 0; + *res = accumulate(x, x+n, 0); +} From c01e5179dc5cbfa577201f09da9fd46f39010e40 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Thu, 5 Jul 2018 16:26:47 +0200 Subject: [PATCH 2/7] made changes required --- cscs-checks/mch/openacc_cuda_mpi_cppstd.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py index 6220ffa52d..e80e7b9a18 100644 --- a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -2,13 +2,13 @@ import reframe.utility.sanity as sn -@rfm.parameterized_test(['mpi'], ['nompi']) -class OpenaccCudaMpiCppstd(rfm.RegressionTest): +@rfm.parameterized_test([True], [False]) +class OpenaccCudaMpiNoMPICppstd(rfm.RegressionTest): def __init__(self, withmpi): super().__init__() self.descr = 'test for OpenACC, CUDA, MPI, and C++' self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn'] - self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] + self.valid_prog_environs = ['PrgEnv-cray*', 'PrgEnv-pgi*'] if self.current_system.name in ['daint', 'dom']: self.modules = ['craype-accel-nvidia60'] self._pgi_flags = '-O2 -acc -ta=tesla:cc60 -Mnorpath -lstdc++' @@ -24,6 +24,7 @@ def __init__(self, withmpi): self.modules = ['craype-accel-nvidia35'] self._pgi_flags = '-O2 -acc -ta=tesla,cc35,cuda8.0' self._env_variables = { + 'MPICH_RDMA_ENABLED_CUDA': '1', 'MV2_USE_CUDA': '1', 'G2G': '1' } @@ -32,8 +33,8 @@ def __init__(self, withmpi): self.num_gpus_per_node = 8 self._nvidia_sm = '37' - if withmpi == 'mpi': - self.mpiflag = ' -DUSEMPI' + if withmpi: + self.mpiflag = ' -DUSE_MPI' else: self.mpiflag = '' self.num_tasks_per_node = 1 From ed8911d6727c8e30c52310db2ad59f610a4b193d Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Tue, 10 Jul 2018 15:51:30 +0200 Subject: [PATCH 3/7] Address PR comments on check name --- cscs-checks/mch/openacc_cuda_mpi_cppstd.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py index e80e7b9a18..4033d4a0f7 100644 --- a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -3,9 +3,11 @@ @rfm.parameterized_test([True], [False]) -class OpenaccCudaMpiNoMPICppstd(rfm.RegressionTest): +class OpenaccCudaCpp(rfm.RegressionTest): def __init__(self, withmpi): super().__init__() + name_suffix = 'WithMPI' if withmpi else 'WithoutMPI' + self.name = 'OpenaccCudaCPP' + name_suffix self.descr = 'test for OpenACC, CUDA, MPI, and C++' self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn'] self.valid_prog_environs = ['PrgEnv-cray*', 'PrgEnv-pgi*'] From 887e594aabcaf894dbeeaa4cf9fe8da9b23fa487 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Tue, 28 Aug 2018 11:05:26 +0200 Subject: [PATCH 4/7] intermediate commit --- config/cscs.py | 6 +++--- cscs-checks/mch/openacc_cuda_mpi_cppstd.py | 2 ++ cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/config/cscs.py b/config/cscs.py index 7e1a7f477a..aa65eaf0b5 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -126,7 +126,7 @@ class ReframeSettings: 'PrgEnv-pgi', 'PrgEnv-gnu-gdr', 'PrgEnv-pgi_16', 'PrgEnv-pgi_17', 'PrgEnv-pgi_18', 'PrgEnv-pgi_17_aj', - 'PrgEnv-pgi_18_aj', 'PrgEnv-cray_aj', + 'PrgEnv-pgi_18_gdr', 'PrgEnv-cray_aj', 'PrgEnv-cray_aj_b'], 'descr': 'Kesch compute nodes', 'resources': { @@ -240,9 +240,9 @@ class ReframeSettings: 'cxx': 'mpicxx', 'ftn': 'mpif90', }, - 'PrgEnv-pgi_18_aj': { + 'PrgEnv-pgi_18_gdr': { 'type': 'ProgEnvironment', - 'modules': ['PrgEnv-pgi/18.4_aj'], + 'modules': ['PrgEnv-pgi/18.4_gdr'], 'cc': 'mpicc', 'cxx': 'mpicxx', 'ftn': 'mpif90', diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py index e80e7b9a18..ad87f4421c 100644 --- a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -37,7 +37,9 @@ def __init__(self, withmpi): self.mpiflag = ' -DUSE_MPI' else: self.mpiflag = '' + self.num_tasks = 1 self.num_tasks_per_node = 1 + self.num_gpus_per_node = 1 self.executable = 'openacc_cuda_mpi_cppstd' self.sanity_patterns = sn.assert_found(r'Result:\s+OK', self.stdout) diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 index b419358c21..c26f170a1e 100644 --- a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 +++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 @@ -1,6 +1,6 @@ program openacc_cuda_mpi_cppstd ! This code tests MPI communication on GPU with OpenACC using the - ! host_data directive + CUDA call from Frotran as well as C++ function + ! host_data directive + CUDA call from Fortran as well as C++ function ! using std library call implicit none @@ -43,7 +43,6 @@ program openacc_cuda_mpi_cppstd #endif - #ifdef USE_MPI !$acc data copy(mydata,data_sum) !$acc host_data use_device(mydata,data_sum) @@ -60,6 +59,7 @@ program openacc_cuda_mpi_cppstd allocate(f3(NSIZE)) !$acc data pcreate(f1,f2,f3) +stop !$acc parallel loop do i = 1, NSIZE f1(i) = i @@ -69,14 +69,14 @@ program openacc_cuda_mpi_cppstd !$acc update host(f1,f2,f3) ! Call a CUDA kernel with host arrays - call call_cuda_kernel_with_copy(f1, f2, NSIZE) +! call call_cuda_kernel_with_copy(f1, f2, NSIZE) ! Call a CUDA kernel without data copy, use device ptr - call call_cuda_kernel_no_copy(f3, f2, NSIZE) +! call call_cuda_kernel_no_copy(f3, f2, NSIZE) !$acc update host(f3) ! Call a C++ function using STD lib - call call_cpp_std(f2, NSIZE, cpp_std_sum) +! call call_cpp_std(f2, NSIZE, cpp_std_sum) !$acc end data end if From be589f76800dc9419a5431fbe2ff8777cb7ede55 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Tue, 28 Aug 2018 16:07:11 +0200 Subject: [PATCH 5/7] intermediate stage | not working without MPI --- config/cscs.py | 10 +++++----- cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/config/cscs.py b/config/cscs.py index a0de47aaeb..90bab0ca82 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -125,7 +125,7 @@ class ReframeSettings: 'environs': ['PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu-gdr', 'PrgEnv-pgi_17.10_gdr', 'PrgEnv-pgi_18.4_gdr', - 'PrgEnv-cray_aj', 'PrgEnv-cray_aj_b'], + 'PrgEnv-cray_gdr', 'PrgEnv-cray_gdr_2.3'], 'descr': 'Kesch compute nodes', 'resources': { '_rfm_gpu': ['--gres=gpu:{num_gpus_per_node}'] @@ -224,13 +224,13 @@ class ReframeSettings: 'cxx': 'mpicxx', 'ftn': 'mpif90', }, - 'PrgEnv-cray_aj': { + 'PrgEnv-cray_gdr': { 'type': 'ProgEnvironment', - 'modules': ['PrgEnv-cray/1.0.2_aj'], + 'modules': ['PrgEnv-cray/1.0.2_gdr'], }, - 'PrgEnv-cray_aj_b': { + 'PrgEnv-cray_gdr_2.3': { 'type': 'ProgEnvironment', - 'modules': ['PrgEnv-cray/1.0.2_aj_b'], + 'modules': ['PrgEnv-cray/1.0.2_gdr_2.3'], }, 'PrgEnv-gnu-gdr': { 'type': 'ProgEnvironment', diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 index c26f170a1e..cc04343b58 100644 --- a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 +++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 @@ -59,7 +59,6 @@ program openacc_cuda_mpi_cppstd allocate(f3(NSIZE)) !$acc data pcreate(f1,f2,f3) -stop !$acc parallel loop do i = 1, NSIZE f1(i) = i From c2b6e646a8edd991362deb7e4757874b79f7bc9a Mon Sep 17 00:00:00 2001 From: ajocksch Date: Tue, 28 Aug 2018 17:00:16 +0200 Subject: [PATCH 6/7] WIP: only MPI case is running --- cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 index cc04343b58..439f493648 100644 --- a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 +++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 @@ -68,14 +68,14 @@ program openacc_cuda_mpi_cppstd !$acc update host(f1,f2,f3) ! Call a CUDA kernel with host arrays -! call call_cuda_kernel_with_copy(f1, f2, NSIZE) + call call_cuda_kernel_with_copy(f1, f2, NSIZE) ! Call a CUDA kernel without data copy, use device ptr -! call call_cuda_kernel_no_copy(f3, f2, NSIZE) + call call_cuda_kernel_no_copy(f3, f2, NSIZE) !$acc update host(f3) ! Call a C++ function using STD lib -! call call_cpp_std(f2, NSIZE, cpp_std_sum) + call call_cpp_std(f2, NSIZE, cpp_std_sum) !$acc end data end if @@ -178,4 +178,3 @@ end subroutine cpp_call call cpp_call(c_loc(fp(1)), n, i) end subroutine call_cpp_std end program openacc_cuda_mpi_cppstd - From e1f44fb8a0947310868c4ef014f3135f6d26bfc3 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 8 Oct 2018 16:13:58 +0200 Subject: [PATCH 7/7] Fix OpenACC+CUDA+MPI test on Kesch+Dom --- cscs-checks/mch/openacc_cuda_mpi_cppstd.py | 18 +++++++++++++----- cscs-checks/mch/src/Makefile | 8 ++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py index 32b8970982..d24efecf14 100644 --- a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py +++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py @@ -13,7 +13,6 @@ def __init__(self, withmpi): self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi'] self.build_system = 'Make' self.build_system.fflags = ['-O2'] - self.build_system.ldflags = ['-lcublas', '-lcudart'] if self.current_system.name in ['daint', 'dom']: self.modules = ['craype-accel-nvidia60'] self.variables = { @@ -23,7 +22,7 @@ def __init__(self, withmpi): self.num_tasks = 12 self.num_tasks_per_node = 12 self.num_gpus_per_node = 1 - self.build_system.options = ['NVCC_FLAGS="-arch=sm60"'] + self.build_system.options = ['NVCC_FLAGS="-arch=compute_60"'] elif self.current_system.name in ['kesch']: self.modules = ['craype-accel-nvidia35'] self.variables = { @@ -33,11 +32,15 @@ def __init__(self, withmpi): self.num_tasks = 8 self.num_tasks_per_node = 8 self.num_gpus_per_node = 8 - self.build_system.options = ['NVCC_FLAGS="-arch=sm37"'] + self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"'] if withmpi: self.build_system.cppflags = ['-DUSE_MPI'] else: + if self.current_system.name == 'kesch': + self.valid_prog_environs = ['PrgEnv-cray-nompi', + 'PrgEnv-pgi-nompi'] + self.num_tasks = 1 self.num_tasks_per_node = 1 self.num_gpus_per_node = 1 @@ -53,9 +56,14 @@ def setup(self, partition, environ, **job_opts): elif environ.name.startswith('PrgEnv-pgi'): self.build_system.fflags += ['-acc'] if self.current_system.name in ['daint', 'dom']: - self.build_system.fflags += ['-ta:tesla:cc60', '-Mnorpath'] - self.build_system.ldflags += ['-lstdc++'] + self.build_system.fflags += ['-ta:tesla:cc60'] + self.build_system.ldflags = ['-acc', '-ta:tesla:cc60', + '-Mnorpath', '-lstdc++'] elif self.current_system.name == 'kesch': self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0'] + self.build_system.ldflags = [ + '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++', + '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64', + '-lcublas', '-lcudart'] super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/mch/src/Makefile b/cscs-checks/mch/src/Makefile index 5cd05c3d1e..5de7f6f0be 100644 --- a/cscs-checks/mch/src/Makefile +++ b/cscs-checks/mch/src/Makefile @@ -11,16 +11,16 @@ LIB = .SUFFIXES: .o .cu .cpp .F90 %.o: %.cu - $(PREP) nvcc $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@ + $(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@ %.o: %.cpp - $(PREP) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ %.o: %.F90 - $(PREP) $(FC) $(FCFLAGS) -c $< -o $@ + $(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@ $(EXECUTABLE): $(OBJS) - $(PREP) $(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB) + $(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB) clean: -$(RM) $(OBJS)