Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions cscs-checks/mch/openacc_cuda_mpi_cppstd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import reframe as rfm
import reframe.utility.sanity as sn


@rfm.parameterized_test([True], [False])
class OpenaccCudaCpp(rfm.RegressionTest):
def __init__(self, withmpi):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The withmpi should be a boolean, not a string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

super().__init__()
name_suffix = 'WithMPI' if withmpi else 'WithoutMPI'
self.name = 'OpenaccCudaCPP' + name_suffix
self.descr = 'test for OpenACC, CUDA, MPI, and C++'
self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
self.build_system = 'Make'
self.build_system.fflags = ['-O2']
if self.current_system.name in ['daint', 'dom']:
self.modules = ['craype-accel-nvidia60']
self.variables = {
'MPICH_RDMA_ENABLED_CUDA': '1',
'CRAY_CUDA_MPS': '1'
}
self.num_tasks = 12
self.num_tasks_per_node = 12
self.num_gpus_per_node = 1
self.build_system.options = ['NVCC_FLAGS="-arch=compute_60"']
elif self.current_system.name in ['kesch']:
self.modules = ['craype-accel-nvidia35']
self.variables = {
'MV2_USE_CUDA': '1',
'G2G': '1'
}
self.num_tasks = 8
self.num_tasks_per_node = 8
self.num_gpus_per_node = 8
self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"']

if withmpi:
self.build_system.cppflags = ['-DUSE_MPI']
else:
if self.current_system.name == 'kesch':
self.valid_prog_environs = ['PrgEnv-cray-nompi',
'PrgEnv-pgi-nompi']

self.num_tasks = 1
self.num_tasks_per_node = 1
self.num_gpus_per_node = 1

self.executable = 'openacc_cuda_mpi_cppstd'
self.sanity_patterns = sn.assert_found(r'Result:\s+OK', self.stdout)
self.maintainers = ['AJ', 'VK']
self.tags = {'production'}

def setup(self, partition, environ, **job_opts):
if environ.name.startswith('PrgEnv-cray'):
self.build_system.fflags += ['-hacc', '-hnoomp']
elif environ.name.startswith('PrgEnv-pgi'):
self.build_system.fflags += ['-acc']
if self.current_system.name in ['daint', 'dom']:
self.build_system.fflags += ['-ta:tesla:cc60']
self.build_system.ldflags = ['-acc', '-ta:tesla:cc60',
'-Mnorpath', '-lstdc++']
elif self.current_system.name == 'kesch':
self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
self.build_system.ldflags = [
'-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++',
'-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
'-lcublas', '-lcudart']

super().setup(partition, environ, **job_opts)
29 changes: 29 additions & 0 deletions cscs-checks/mch/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
RM := rm -f
EXECUTABLE := openacc_cuda_mpi_cppstd

all: $(EXECUTABLE)
LD = $(FC)

OBJS = compute_cuda.o openacc_cuda_mpi_cppstd.o std_cpp_call.o
# OBJ2 = $(subst _,$(PE_ENV)_,$(OBJ))
LIB =

.SUFFIXES: .o .cu .cpp .F90

%.o: %.cu
$(NVCC) $(CPPFLAGS) $(NVCC_FLAGS) -c $< -o $@

%.o: %.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

%.o: %.F90
$(FC) $(CPPFLAGS) $(FCFLAGS) -c $< -o $@

$(EXECUTABLE): $(OBJS)
$(LD) $(OBJS) -o $@ $(LDFLAGS) $(LIB)

clean:
-$(RM) $(OBJS)

distclean:
-$(RM) $(OBJS) $(EXECUTABLE)
59 changes: 59 additions & 0 deletions cscs-checks/mch/src/compute_cuda.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include <stdio.h>
#include "cuda.h"

#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)

extern "C" {

__global__ void simple_add(float* a, float* b, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n) {
a[i] = a[i] + b[i];
}
}

void cuda_kernel_no_copy(float* a, float* b, int n)
{
const int THREADS_PER_BLOCK = 1;
const int NUMBER_OF_BLOCKS = 10;

cudaThreadSynchronize();
simple_add<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(a, b, n);
cudaThreadSynchronize();

cudaCheckErrors("cuda error");
}

void cuda_kernel_with_copy(float* a, float* b, int n)
{
const int THREADS_PER_BLOCK = 1;
const int NUMBER_OF_BLOCKS = 10;

float* d_a;
float* d_b;
cudaMalloc(&d_a, n*sizeof(float));
cudaMalloc(&d_b, n*sizeof(float));
cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, n*sizeof(float), cudaMemcpyHostToDevice);

cudaThreadSynchronize();
simple_add<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, d_b, n);
cudaThreadSynchronize();

cudaMemcpy(a, d_a, n*sizeof(float), cudaMemcpyDeviceToHost);

cudaCheckErrors("cuda error");

}
};
180 changes: 180 additions & 0 deletions cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
program openacc_cuda_mpi_cppstd
! This code tests MPI communication on GPU with OpenACC using the
! host_data directive + CUDA call from Fortran as well as C++ function
! using std library call
implicit none


#ifdef USE_MPI
include 'mpif.h'
integer :: status(MPI_STATUS_SIZE)
#endif

integer :: ierr, i
integer :: cpp_std_sum ! Sum done with C++ call to STD lib
integer :: mpi_size, mpi_rank
integer(8) :: mydata(1), data_sum(1), ref_val
real, allocatable :: f1(:), f2(:), f3(:)

! Test parameter
integer, parameter :: NSIZE = 10
real, parameter :: EXPECTED_CUDA_SUM = 110.0
real, parameter :: EXPECTED_CPP_STD_SUM = 55.0

#ifdef USE_MPI
call MPI_Init(ierr)
call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)
mydata(1) = mpi_rank
#ifdef _OPENACC
if (mpi_rank == 0) write(*,*) "MPI test on GPU with OpenACC using ",mpi_size,"tasks"
#else
if (mpi_rank == 0) write(*,*) "MPI test on CPU using ",mpi_size,"tasks"
#endif

#else

mpi_rank = 0
#ifdef _OPENACC
write(*,*) "Single node test on GPU with OpenACC"
#else
write(*,*) "Single node test test on CPU"
#endif

#endif

#ifdef USE_MPI
!$acc data copy(mydata,data_sum)
!$acc host_data use_device(mydata,data_sum)
call MPI_Reduce(mydata, data_sum, 1, MPI_INTEGER8, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
!$acc end host_data
!$acc end data
#endif

if(mpi_rank == 0) then

! Allocate and initialize arrays on the GPU
allocate(f1(NSIZE))
allocate(f2(NSIZE))
allocate(f3(NSIZE))

!$acc data pcreate(f1,f2,f3)
!$acc parallel loop
do i = 1, NSIZE
f1(i) = i
f2(i) = i
f3(i) = i
end do
!$acc update host(f1,f2,f3)

! Call a CUDA kernel with host arrays
call call_cuda_kernel_with_copy(f1, f2, NSIZE)

! Call a CUDA kernel without data copy, use device ptr
call call_cuda_kernel_no_copy(f3, f2, NSIZE)
!$acc update host(f3)

! Call a C++ function using STD lib
call call_cpp_std(f2, NSIZE, cpp_std_sum)
!$acc end data
end if

!Check results
if (mpi_rank == 0) then
ref_val = 0
do i = 0, mpi_size - 1
ref_val = ref_val + i
end do
if (sum(f1) /= EXPECTED_CUDA_SUM) then
write (*,*) "Result : FAIL"
write (*,*) "Expected value sum(f1): ", EXPECTED_CUDA_SUM, "actual value:", sum(f1)
else if (sum(f3) /= EXPECTED_CUDA_SUM) then
write (*,*) "Result : FAIL"
write (*,*) "Expected value sum(f3): ", EXPECTED_CUDA_SUM, "actual value:", sum(f3)
#ifdef USE_MPI
else if (data_sum(1) /= ref_val) then
write (*,*) "Result : FAIL"
write (*,*) "Expected value data_sum: ", ref_val, "actual value:", data_sum(1)
#endif
else if (cpp_std_sum /= EXPECTED_CPP_STD_SUM) then
write (*,*) "Result : FAIL"
write (*,*) "Expected value stdres: ", EXPECTED_CPP_STD_SUM, "actual value:", cpp_std_sum
else
write (*,*) "Result : OK"
end if
end if

if(mpi_rank == 0) then
deallocate(f1)
deallocate(f2)
deallocate(f3)
write (*,*) "Result: OK"
end if

#ifdef USE_MPI
call MPI_Finalize(ierr);
#endif

contains
subroutine call_cuda_kernel_with_copy(f1,f2,n)
use, intrinsic :: iso_c_binding
implicit none
real, intent(inout) :: f1(:)
real, intent(in) :: f2(:)
integer, intent(in) :: n

interface
subroutine cuda_kernel_with_copy(f1,f2,n) bind(c,name='cuda_kernel_with_copy')
use, intrinsic :: iso_c_binding
type(c_ptr), intent(in), value :: f1, f2
integer, intent(in), value :: n
end subroutine cuda_kernel_with_copy
end interface

call cuda_kernel_with_copy(c_loc(f1(1)), c_loc(f2(1)), n)
end subroutine call_cuda_kernel_with_copy

subroutine call_cuda_kernel_no_copy(f1,f2,n)
use, intrinsic :: iso_c_binding
implicit none
real, intent(inout) :: f1(:)
real, intent(in) :: f2(:)
integer, intent(in) :: n

interface
subroutine cuda_kernel_no_copy(f1,f2,n) bind(c,name='cuda_kernel_no_copy')
use, intrinsic :: iso_c_binding
type(c_ptr), intent(in), value :: f1, f2
integer, intent(in), value :: n
end subroutine cuda_kernel_no_copy
end interface

!$acc data present(f1, f2)
!$acc host_data use_device(f1, f2)
call cuda_kernel_no_copy(c_loc(f1(1)), c_loc(f2(1)), n)
!$acc end host_data
!$acc end data
end subroutine call_cuda_kernel_no_copy

subroutine call_cpp_std(f,n,i)
use, intrinsic :: iso_c_binding
implicit none
real(kind=c_float), intent(in), target :: f(:)
real(kind=c_float), pointer :: fp(:)
integer, intent(in) :: n
integer(kind=c_int), intent(out) :: i

interface
subroutine cpp_call(f,n,i) bind(c,name='do_smth_with_std')
use, intrinsic :: iso_c_binding
type(c_ptr), intent(in), value :: f
integer, intent(in), value :: n
integer(kind=c_int), intent(out) :: i
end subroutine cpp_call
end interface

fp => f

call cpp_call(c_loc(fp(1)), n, i)
end subroutine call_cpp_std
end program openacc_cuda_mpi_cppstd
8 changes: 8 additions & 0 deletions cscs-checks/mch/src/std_cpp_call.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include <numeric>
using namespace std;

extern"C" void do_smth_with_std(float* x, int n, int* res)
{
*res = 0;
*res = accumulate(x, x+n, 0);
}