reframe-hpc · vkarak · Oct 8, 2018 · Jun 22, 2018 · Jul 5, 2018 · Jul 5, 2018
diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py
@@ -0,0 +1,69 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.parameterized_test([True], [False])
+class OpenaccCudaCpp(rfm.RegressionTest):
+    def __init__(self, withmpi):
+        super().__init__()
+        name_suffix = 'WithMPI' if withmpi else 'WithoutMPI'
+        self.name = 'OpenaccCudaCPP' + name_suffix
+        self.descr = 'test for OpenACC, CUDA, MPI, and C++'
+        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
+        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
+        self.build_system = 'Make'
+        self.build_system.fflags = ['-O2']
+        if self.current_system.name in ['daint', 'dom']:
+            self.modules = ['craype-accel-nvidia60']
+            self.variables = {
+                'MPICH_RDMA_ENABLED_CUDA': '1',
+                'CRAY_CUDA_MPS': '1'
+            }
+            self.num_tasks = 12
+            self.num_tasks_per_node = 12
+            self.num_gpus_per_node = 1
+            self.build_system.options = ['NVCC_FLAGS="-arch=compute_60"']
+        elif self.current_system.name in ['kesch']:
+            self.modules = ['craype-accel-nvidia35']
+            self.variables = {
+                'MV2_USE_CUDA': '1',
+                'G2G': '1'
+            }
+            self.num_tasks = 8
+            self.num_tasks_per_node = 8
+            self.num_gpus_per_node = 8
+            self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"']
+
+        if withmpi:
+            self.build_system.cppflags = ['-DUSE_MPI']
+        else:
+            if self.current_system.name == 'kesch':
+                self.valid_prog_environs = ['PrgEnv-cray-nompi',
+                                            'PrgEnv-pgi-nompi']
+
+            self.num_tasks = 1
+            self.num_tasks_per_node = 1
+            self.num_gpus_per_node = 1
+
+        self.executable = 'openacc_cuda_mpi_cppstd'
+        self.sanity_patterns = sn.assert_found(r'Result:\s+OK', self.stdout)
+        self.maintainers = ['AJ', 'VK']
+        self.tags = {'production'}
+
+    def setup(self, partition, environ, **job_opts):
+        if environ.name.startswith('PrgEnv-cray'):
+            self.build_system.fflags += ['-hacc', '-hnoomp']
+        elif environ.name.startswith('PrgEnv-pgi'):
+            self.build_system.fflags += ['-acc']
+            if self.current_system.name in ['daint', 'dom']:
+                self.build_system.fflags += ['-ta:tesla:cc60']
+                self.build_system.ldflags = ['-acc', '-ta:tesla:cc60',
+                                             '-Mnorpath', '-lstdc++']
+            elif self.current_system.name == 'kesch':
+                self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
+                self.build_system.ldflags = [
+                    '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++',
+                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
+                    '-lcublas', '-lcudart']
+
+        super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/mch/src/Makefile b/cscs-checks/mch/src/Makefile
@@ -0,0 +1,29 @@
+RM := rm -f
+EXECUTABLE := openacc_cuda_mpi_cppstd
+
+all: $(EXECUTABLE)
+LD = $(FC)
+
+OBJS = compute_cuda.o openacc_cuda_mpi_cppstd.o std_cpp_call.o
+# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ))
+LIB  =
+
+.SUFFIXES: .o .cu .cpp .F90
+
+%.o: %.cu
+	$(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@
+
+%.o: %.cpp
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
+
+%.o: %.F90
+	$(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@
+
+$(EXECUTABLE): $(OBJS)
+	$(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB)
+
+clean:
+	-$(RM) $(OBJS)
+
+distclean:
+	-$(RM) $(OBJS) $(EXECUTABLE)
diff --git a/cscs-checks/mch/src/compute_cuda.cu b/cscs-checks/mch/src/compute_cuda.cu
@@ -0,0 +1,59 @@
+#include  <stdio.h>
+#include "cuda.h"
+
+#define cudaCheckErrors(msg) \
+    do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
+                msg, cudaGetErrorString(__err), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while (0)
+
+extern "C" {
+
+__global__  void simple_add(float* a, float* b, int n)
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x; 
+  if(i < n) {
+    a[i] = a[i] + b[i];
+  }
+}
+
+void cuda_kernel_no_copy(float* a, float* b, int n)
+{
+  const int THREADS_PER_BLOCK = 1;
+  const int NUMBER_OF_BLOCKS = 10;
+
+  cudaThreadSynchronize();
+  simple_add<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(a, b, n);
+  cudaThreadSynchronize();
+
+  cudaCheckErrors("cuda error");
+}
+
+void cuda_kernel_with_copy(float* a, float* b, int n)
+{
+  const int THREADS_PER_BLOCK = 1;
+  const int NUMBER_OF_BLOCKS = 10;
+
+  float* d_a;
+  float* d_b;
+  cudaMalloc(&d_a, n*sizeof(float));
+  cudaMalloc(&d_b, n*sizeof(float));
+  cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, b, n*sizeof(float), cudaMemcpyHostToDevice);
+
+  cudaThreadSynchronize();
+  simple_add<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, d_b, n);
+  cudaThreadSynchronize();
+
+  cudaMemcpy(a, d_a, n*sizeof(float), cudaMemcpyDeviceToHost);
+
+  cudaCheckErrors("cuda error");
+
+}
+};
diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90
@@ -0,0 +1,180 @@
+program openacc_cuda_mpi_cppstd
+  ! This code tests MPI communication on GPU with OpenACC using the
+  ! host_data directive + CUDA call from Fortran as well as C++ function 
+  ! using std library call
+  implicit none
+
+
+#ifdef USE_MPI
+  include 'mpif.h'
+  integer :: status(MPI_STATUS_SIZE)
+#endif
+
+  integer :: ierr, i
+  integer :: cpp_std_sum ! Sum done with C++ call to STD lib
+  integer :: mpi_size, mpi_rank
+  integer(8) :: mydata(1), data_sum(1), ref_val
+  real, allocatable :: f1(:), f2(:), f3(:) 
+
+  ! Test parameter
+  integer, parameter :: NSIZE = 10
+  real, parameter :: EXPECTED_CUDA_SUM = 110.0
+  real, parameter :: EXPECTED_CPP_STD_SUM = 55.0
+
+#ifdef USE_MPI
+  call MPI_Init(ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)
+  mydata(1) = mpi_rank
+#ifdef _OPENACC
+  if (mpi_rank == 0) write(*,*) "MPI test on GPU with OpenACC using ",mpi_size,"tasks"
+#else
+  if (mpi_rank == 0) write(*,*) "MPI test on CPU using ",mpi_size,"tasks"
+#endif
+
+#else
+
+  mpi_rank = 0
+#ifdef _OPENACC
+  write(*,*) "Single node test on GPU with OpenACC"
+#else
+  write(*,*) "Single node test test on CPU"
+#endif
+
+#endif
+
+#ifdef USE_MPI
+  !$acc data copy(mydata,data_sum)
+  !$acc host_data use_device(mydata,data_sum)
+  call MPI_Reduce(mydata, data_sum, 1, MPI_INTEGER8, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
+  !$acc end host_data
+  !$acc end data
+#endif
+
+  if(mpi_rank == 0) then
+
+    ! Allocate and initialize arrays on the GPU
+    allocate(f1(NSIZE))
+    allocate(f2(NSIZE))
+    allocate(f3(NSIZE))
+
+    !$acc data pcreate(f1,f2,f3)
+    !$acc parallel loop  
+    do i = 1, NSIZE
+      f1(i) = i
+      f2(i) = i
+      f3(i) = i
+    end do 
+    !$acc update host(f1,f2,f3)
+
+    ! Call a CUDA kernel with host arrays 
+    call call_cuda_kernel_with_copy(f1, f2, NSIZE)
+
+    ! Call a CUDA kernel without data copy, use device ptr
+    call call_cuda_kernel_no_copy(f3, f2, NSIZE)
+    !$acc update host(f3)
+
+    ! Call a C++ function using STD lib
+    call call_cpp_std(f2, NSIZE, cpp_std_sum)
+    !$acc end data 
+  end if
+
+  !Check results
+  if (mpi_rank == 0) then
+    ref_val = 0
+    do i = 0, mpi_size - 1
+      ref_val = ref_val + i
+    end do
+    if (sum(f1) /= EXPECTED_CUDA_SUM) then
+      write (*,*) "Result : FAIL"
+      write (*,*) "Expected value sum(f1): ", EXPECTED_CUDA_SUM, "actual value:", sum(f1)
+    else if (sum(f3) /= EXPECTED_CUDA_SUM) then
+      write (*,*) "Result : FAIL"
+      write (*,*) "Expected value sum(f3): ", EXPECTED_CUDA_SUM, "actual value:", sum(f3)
+#ifdef USE_MPI
+    else if (data_sum(1) /= ref_val) then
+      write (*,*) "Result : FAIL"
+      write (*,*) "Expected value data_sum: ", ref_val, "actual value:", data_sum(1)
+#endif
+    else if (cpp_std_sum /= EXPECTED_CPP_STD_SUM) then
+      write (*,*) "Result : FAIL"
+      write (*,*) "Expected value stdres: ", EXPECTED_CPP_STD_SUM, "actual value:", cpp_std_sum
+    else
+      write (*,*) "Result : OK"
+    end if
+  end if
+
+  if(mpi_rank == 0) then
+    deallocate(f1)
+    deallocate(f2)
+    deallocate(f3)
+    write (*,*) "Result: OK"
+  end if
+
+#ifdef USE_MPI
+  call MPI_Finalize(ierr);
+#endif
+
+contains
+  subroutine call_cuda_kernel_with_copy(f1,f2,n)
+    use, intrinsic :: iso_c_binding
+    implicit none
+    real, intent(inout) :: f1(:)
+    real, intent(in) :: f2(:)
+    integer, intent(in) :: n
+
+    interface
+      subroutine cuda_kernel_with_copy(f1,f2,n) bind(c,name='cuda_kernel_with_copy')
+        use, intrinsic :: iso_c_binding
+        type(c_ptr), intent(in), value :: f1, f2
+        integer, intent(in), value :: n
+      end subroutine cuda_kernel_with_copy
+    end interface
+
+    call cuda_kernel_with_copy(c_loc(f1(1)), c_loc(f2(1)), n)
+  end subroutine call_cuda_kernel_with_copy
+
+  subroutine call_cuda_kernel_no_copy(f1,f2,n)
+    use, intrinsic :: iso_c_binding
+    implicit none
+    real, intent(inout) :: f1(:)
+    real, intent(in) :: f2(:)
+    integer, intent(in) :: n
+
+    interface
+      subroutine cuda_kernel_no_copy(f1,f2,n) bind(c,name='cuda_kernel_no_copy')
+        use, intrinsic :: iso_c_binding
+        type(c_ptr), intent(in), value :: f1, f2
+        integer, intent(in), value :: n
+      end subroutine cuda_kernel_no_copy
+    end interface
+
+    !$acc data present(f1, f2)
+    !$acc host_data use_device(f1, f2)
+    call cuda_kernel_no_copy(c_loc(f1(1)), c_loc(f2(1)), n)
+    !$acc end host_data
+    !$acc end data
+  end subroutine call_cuda_kernel_no_copy
+
+  subroutine call_cpp_std(f,n,i)
+    use, intrinsic :: iso_c_binding
+    implicit none
+    real(kind=c_float), intent(in), target :: f(:)
+    real(kind=c_float), pointer :: fp(:)
+    integer, intent(in) :: n
+    integer(kind=c_int), intent(out) :: i
+
+    interface
+      subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std')
+        use, intrinsic :: iso_c_binding
+        type(c_ptr), intent(in), value :: f
+        integer, intent(in), value :: n
+        integer(kind=c_int), intent(out) :: i
+      end subroutine cpp_call
+    end interface
+
+    fp => f   
+
+    call cpp_call(c_loc(fp(1)), n, i)
+  end subroutine call_cpp_std 
+end program openacc_cuda_mpi_cppstd
diff --git a/cscs-checks/mch/src/std_cpp_call.cpp b/cscs-checks/mch/src/std_cpp_call.cpp
@@ -0,0 +1,8 @@
+#include <numeric>
+using namespace std;
+
+extern"C" void do_smth_with_std(float* x, int n, int* res)
+{
+    *res = 0;
+    *res = accumulate(x, x+n, 0);
+}