diff --git a/config/cscs.py b/config/cscs.py
index 69a8c43a0d..e3a91747cc 100644
--- a/config/cscs.py
+++ b/config/cscs.py
@@ -348,8 +348,10 @@ class ReframeSettings:
             },
             'PrgEnv-pgi': {
                 'type': 'ProgEnvironment',
-                'modules': ['PE/17.06',
-                            'PrgEnv-pgi/18.5'],
+                'modules': [
+                    'PE/17.06', 'pgi/18.5-gcc-5.4.0-2.26',
+                    'openmpi/4.0.1-pgi-18.5-gcc-5.4.0-2.26-cuda-8.0'
+                ],
                 'cc': 'mpicc',
                 'cxx': 'mpicxx',
                 'ftn': 'mpif90',
diff --git a/cscs-checks/mch/automatic_arrays_acc.py b/cscs-checks/mch/automatic_arrays_acc.py
index 910fb9d137..2fc21ed56d 100644
--- a/cscs-checks/mch/automatic_arrays_acc.py
+++ b/cscs-checks/mch/automatic_arrays_acc.py
@@ -7,18 +7,18 @@ class AutomaticArraysCheck(rfm.RegressionTest):
     def __init__(self):
         super().__init__()
         self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
-        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi',
-                                    'PrgEnv-cray-c2sm-gpu',
-                                    'PrgEnv-pgi-c2sm-gpu']
+        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
         if self.current_system.name in ['daint', 'dom']:
             self.modules = ['craype-accel-nvidia60']
         elif self.current_system.name == 'kesch':
             self.exclusive_access = True
-            self.modules = ['craype-accel-nvidia35']
+            self.modules = ['cudatoolkit/8.0.61']
         # FIXME: workaround -- the variable should not be needed since
         # there is no GPUdirect in this check
-        self.variables = {'MV2_USE_CUDA': '1'}
-
+        self.variables = {
+            'CRAY_ACCEL_TARGET': 'nvidia35',
+            'MV2_USE_CUDA': '1'
+        }
         # This test requires an MPI compiler, although it uses a single task
         self.num_tasks = 1
         self.num_gpus_per_node = 1
@@ -56,7 +56,7 @@ def setup(self, partition, environ, **job_opts):
             envname = 'PrgEnv-pgi'
             self.build_system.fflags += ['-acc']
             if self.current_system.name == 'kesch':
-                self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
+                self.build_system.fflags += ['-ta=tesla,cc35']
             elif self.current_system.name in ['daint', 'dom']:
                 self.build_system.fflags += ['-ta=tesla,cc60', '-Mnorpath']
             else:
diff --git a/cscs-checks/mch/collectives_halo.py b/cscs-checks/mch/collectives_halo.py
index 018236660c..9744686a90 100644
--- a/cscs-checks/mch/collectives_halo.py
+++ b/cscs-checks/mch/collectives_halo.py
@@ -2,7 +2,7 @@
 import reframe.utility.sanity as sn
 
 
-class CommunicationTestBase(rfm.RegressionTest):
+class CollectivesBaseTest(rfm.RegressionTest):
     def __init__(self, variant, bench_reference):
         super().__init__()
         self.valid_systems = ['dom:gpu', 'daint:gpu', 'kesch:cn']
@@ -23,7 +23,7 @@ def __init__(self, variant, bench_reference):
             self.num_gpus_per_node = 16
             self.num_tasks_per_node = 16
             self.num_tasks_per_socket = 8
-            self.modules = ['craype-accel-nvidia35', 'cmake']
+            self.modules = ['cmake']
             self.variables['MV2_USE_CUDA'] = '1'
             self.build_system.config_opts += [
                 '-DMPI_VENDOR=mvapich2',
@@ -63,6 +63,7 @@ def __init__(self, variant, bench_reference):
                 'default': 0.0138493
             }
         }
+
         if self.current_system.name == 'dom':
             sysname = 'daint'
         else:
@@ -98,11 +99,8 @@ def setup(self, *args, **kwargs):
                            '--cpu_bind=q']
 
 
-# the values default, nocomm and nocomp refer to the different parts
-# of the check where the time is measured; default == all
-# nocomm == no communication nocomp == no computation
 @rfm.parameterized_test(['default'], ['nocomm'], ['nocomp'])
-class AlltoallvTest(CommunicationTestBase):
+class AlltoallvTest(CollectivesBaseTest):
     def __init__(self, variant):
         super().__init__(variant,
                          {
@@ -117,14 +115,13 @@ def __init__(self, variant):
                              'default': 0.0138493
                          }
                      })
-        self.descr = 'Alltoall communication test'
         self.strict_check = False
         self.sourcesdir = 'https://github.com/cosunae/comm_overlap_bench'
         self.prebuild_cmd = ['git checkout alltoallv']
 
 
 @rfm.parameterized_test(['default'], ['nocomm'], ['nocomp'])
-class HaloExchangeTest(CommunicationTestBase):
+class HaloExchangeTest(CollectivesBaseTest):
     def __init__(self, variant):
         super().__init__(variant,
                          {
@@ -139,6 +136,5 @@ def __init__(self, variant):
                              'default': 2.53509
                          }
                      })
-        self.descr = 'Halo-cell exchange test'
         self.sourcesdir = 'https://github.com/MeteoSwiss-APN/comm_overlap_bench.git'
         self.prebuild_cmd = ['git checkout barebones']
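
Note on the variant parameter used by AlltoallvTest and HaloExchangeTest: the comment deleted above was the only place the values were documented; 'default' times the whole run, 'nocomm' the run with communication disabled, and 'nocomp' the run with computation disabled. A minimal sketch of the parameterization mechanism, assuming the same old-style ReFrame decorator API used in this file (ExampleVariantCheck and its body are hypothetical, not part of this change):

import reframe as rfm


# Each list passed to the decorator becomes one generated test whose
# elements are forwarded to __init__, so this yields three checks,
# one per variant.
@rfm.parameterized_test(['default'], ['nocomm'], ['nocomp'])
class ExampleVariantCheck(rfm.RegressionTest):
    def __init__(self, variant):
        super().__init__()
        # 'default' == full run, 'nocomm' == no communication,
        # 'nocomp' == no computation (wording from the removed comment)
        self.descr = 'measures the {} variant of the benchmark'.format(variant)
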
diff --git a/cscs-checks/mch/cuda_stress_test.py b/cscs-checks/mch/cuda_stress_test.py
index 4caeaefdcc..caac7700ee 100644
--- a/cscs-checks/mch/cuda_stress_test.py
+++ b/cscs-checks/mch/cuda_stress_test.py
@@ -11,7 +11,7 @@ def __init__(self):
         if self.current_system.name == 'kesch':
             self.exclusive_access = True
             self.valid_prog_environs = ['PrgEnv-gnu-nompi']
-            self.modules = ['craype-accel-nvidia35']
+            self.modules = ['cudatoolkit/8.0.61']
         else:
             self.valid_prog_environs = ['PrgEnv-gnu']
             self.modules = ['craype-accel-nvidia60']
@@ -33,7 +33,7 @@ def __init__(self):
                 'time': (1.39758, None, 0.05)
             },
             'kesch:cn': {
-                'time': (2.12769, None, 0.05)
+                'time': (2.25, None, 0.05)
             }
         }
         self.tags = {'production', 'mch'}
diff --git a/cscs-checks/mch/g2g_meteoswiss_check.py b/cscs-checks/mch/g2g_meteoswiss_check.py
index 9b28869163..99cf363f63 100644
--- a/cscs-checks/mch/g2g_meteoswiss_check.py
+++ b/cscs-checks/mch/g2g_meteoswiss_check.py
@@ -14,7 +14,7 @@ def __init__(self, g2g):
         #        'PrgEnv-gnu-c2sm-gpu' will be added later
         self.valid_prog_environs = ['PrgEnv-gnu']
         self.exclusive_access = True
-        self.modules = ['cmake', 'craype-accel-nvidia35']
+        self.modules = ['cmake']
         self.pre_run = ["export EXECUTABLE=$(ls src/ | "
                        "grep 'GNU.*MVAPICH.*CUDA.*kesch.*')"]
         self.executable = 'build/src/comm_overlap_benchmark'
diff --git a/cscs-checks/mch/gpu_direct_acc.py b/cscs-checks/mch/gpu_direct_acc.py
index e6c0024ca9..ab783bc9db 100644
--- a/cscs-checks/mch/gpu_direct_acc.py
+++ b/cscs-checks/mch/gpu_direct_acc.py
@@ -10,10 +10,7 @@ def __init__(self):
         self.descr = 'tests gpu-direct for Fortran OpenACC'
         self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
-        # FIXME: temporary workaround until the mvapich module is fixed;
-        #        'PrgEnv-pgi-c2sm-gpu' will be added later
-        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-cray-c2sm-gpu',
-                                    'PrgEnv-pgi']
+        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
         if self.current_system.name in ['daint', 'dom']:
             self.modules = ['craype-accel-nvidia60']
             self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'}
@@ -22,8 +19,9 @@
             self.num_tasks_per_node = 1
         elif self.current_system.name == 'kesch':
             self.exclusive_access = True
-            self.modules = ['craype-accel-nvidia35']
+            self.modules = ['cudatoolkit/8.0.61']
             self.variables = {
+                'CRAY_ACCEL_TARGET': 'nvidia35',
                 'MV2_USE_CUDA': '1',
                 'G2G': '1'
             }
diff --git a/cscs-checks/mch/gpu_direct_cuda.py b/cscs-checks/mch/gpu_direct_cuda.py
index 5ae22c894a..876f4d5c23 100644
--- a/cscs-checks/mch/gpu_direct_cuda.py
+++ b/cscs-checks/mch/gpu_direct_cuda.py
@@ -22,7 +22,7 @@ def __init__(self):
         elif self.current_system.name == 'kesch':
             self.exclusive_access = True
             self.valid_prog_environs = ['PrgEnv-gnu']
-            self.modules = ['craype-accel-nvidia35']
+            self.modules = ['cudatoolkit/8.0.61']
             self.variables = {
                 'MV2_USE_CUDA': '1',
                 'G2G': '1',
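
The kesch hunks in these checks all make the same substitution: instead of the craype-accel-nvidia35 wrapper module, they load the CUDA toolkit directly and, where OpenACC is involved, export CRAY_ACCEL_TARGET so the Cray compiler still targets the Tesla K80 (sm_35). A minimal sketch of the resulting pattern in isolation (ExampleKeschGpuCheck is hypothetical; the module name and variables are the ones used in the hunks above):

import reframe as rfm


class ExampleKeschGpuCheck(rfm.RegressionTest):
    def __init__(self):
        super().__init__()
        self.valid_systems = ['kesch:cn']
        self.valid_prog_environs = ['PrgEnv-cray']
        self.exclusive_access = True
        # Load the CUDA toolkit explicitly instead of relying on the
        # craype-accel-nvidia35 wrapper module ...
        self.modules = ['cudatoolkit/8.0.61']
        # ... and select the accelerator target by hand, which the
        # wrapper module previously did on the check's behalf.
        self.variables = {
            'CRAY_ACCEL_TARGET': 'nvidia35',
            'MV2_USE_CUDA': '1'
        }
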
diff --git a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py
index 86f5829d0f..e1a0b40257 100644
--- a/cscs-checks/mch/openacc_cuda_mpi_cppstd.py
+++ b/cscs-checks/mch/openacc_cuda_mpi_cppstd.py
@@ -7,33 +7,54 @@ class OpenaccCudaCpp(rfm.RegressionTest):
     def __init__(self):
         super().__init__()
         self.descr = 'test for OpenACC, CUDA, MPI, and C++'
-        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
-        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu']
+        self.valid_systems = ['daint:gpu', 'dom:gpu',
+                              'kesch:cn', 'arolla:cn', 'tsa:cn']
+        self.valid_prog_environs = ['PrgEnv-cce', 'PrgEnv-cray',
+                                    'PrgEnv-pgi', 'PrgEnv-gnu']
         self.build_system = 'Make'
         self.build_system.fflags = ['-O2']
+
         if self.current_system.name in ['daint', 'dom']:
             self.modules = ['craype-accel-nvidia60']
             self.num_tasks = 12
             self.num_tasks_per_node = 12
             self.num_gpus_per_node = 1
             self.build_system.options = ['NVCC_FLAGS="-arch=compute_60"']
+            self.variables = {
+                'MPICH_RDMA_ENABLED_CUDA': '1',
+                'CRAY_CUDA_MPS': '1'
+            }
         elif self.current_system.name == 'kesch':
             self.exclusive_access = True
-            self.modules = ['craype-accel-nvidia35']
+            self.modules = ['cudatoolkit/8.0.61']
             self.num_tasks = 8
             self.num_tasks_per_node = 8
             self.num_gpus_per_node = 8
             self.build_system.options = ['NVCC_FLAGS="-arch=compute_37"']
-        # FIXME: temporary workaround until the mvapich module is fixed;
-        #        'PrgEnv-{pgi,gnu}-c2sm-gpu' will be added later
-        self.valid_prog_environs += ['PrgEnv-cray-c2sm-gpu']
-
-        if self.current_system.name in ['daint', 'dom']:
             self.variables = {
-                'MPICH_RDMA_ENABLED_CUDA': '1',
-                'CRAY_CUDA_MPS': '1'
+                'MV2_USE_CUDA': '1',
+                'G2G': '1'
             }
-        elif self.current_system.name in ['kesch']:
+        elif self.current_system.name == 'arolla':
+            self.exclusive_access = True
+            self.modules = ['cuda92/toolkit/9.2.88',
+                            'craype-accel-nvidia70']
+            self.num_tasks = 8
+            self.num_tasks_per_node = 8
+            self.num_gpus_per_node = 8
+            self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"']
+            self.variables = {
+                'MV2_USE_CUDA': '1',
+                'G2G': '1'
+            }
+        elif self.current_system.name == 'tsa':
+            self.exclusive_access = True
+            self.modules = ['cuda10.0/toolkit/10.0.130',
+                            'craype-accel-nvidia70']
+            self.num_tasks = 8
+            self.num_tasks_per_node = 8
+            self.num_gpus_per_node = 8
+            self.build_system.options = ['NVCC_FLAGS="-arch=compute_70"']
             self.variables = {
                 'MV2_USE_CUDA': '1',
                 'G2G': '1'
             }
@@ -47,6 +68,20 @@ def __init__(self):
     def setup(self, partition, environ, **job_opts):
         if environ.name.startswith('PrgEnv-cray'):
             self.build_system.fflags += ['-hacc', '-hnoomp']
+
+        elif environ.name.startswith('PrgEnv-cce'):
+            self.build_system.fflags += ['-hacc', '-hnoomp']
+            if self.current_system.name == 'arolla':
+                self.build_system.ldflags = [
+                    '-L/cm/shared/apps/cuda92/toolkit/9.2.88/lib64',
+                    '-lcublas', '-lcudart'
+                ]
+            elif self.current_system.name == 'tsa':
+                self.build_system.ldflags = [
+                    '-L/cm/shared/apps/cuda10.0/toolkit/10.0.130/lib64',
+                    '-lcublas', '-lcudart'
+                ]
+
         elif environ.name.startswith('PrgEnv-pgi'):
             self.build_system.fflags += ['-acc']
             if self.current_system.name in ['daint', 'dom']:
@@ -55,19 +90,39 @@
                                              '-Mnorpath', '-lstdc++']
             elif self.current_system.name == 'kesch':
                 self.build_system.fflags += ['-ta=tesla,cc35,cuda8.0']
-                self.build_system.ldflags = ['-acc', '-ta:tesla:cc35,cuda8.0',
-                                             '-lstdc++']
-                if environ.name == 'PrgEnv-pgi-nompi':
-                    self.build_system.ldflags += [
-                        '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
-                        '-lcublas', '-lcudart'
-                    ]
+                self.build_system.ldflags = [
+                    '-acc', '-ta:tesla:cc35,cuda8.0', '-lstdc++',
+                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
+                    '-lcublas', '-lcudart'
+                ]
+            elif self.current_system.name == 'arolla':
+                self.build_system.fflags += ['-ta=tesla,cc70,cuda10.0']
+                self.build_system.ldflags = [
+                    '-acc', '-ta:tesla:cc70,cuda10.0', '-lstdc++',
+                    '-L/cm/shared/apps/cuda92/toolkit/9.2.88/lib64',
+                    '-lcublas', '-lcudart'
+                ]
+            elif self.current_system.name == 'tsa':
+                self.build_system.fflags += ['-ta=tesla,cc70,cuda10.0']
+                self.build_system.ldflags = [
+                    '-acc', '-ta:tesla:cc70,cuda10.0', '-lstdc++',
+                    '-L/cm/shared/apps/cuda10.0/toolkit/10.0.130/lib64',
+                    '-lcublas', '-lcudart'
+                ]
+
         elif environ.name.startswith('PrgEnv-gnu'):
             self.build_system.ldflags = ['-lstdc++']
             if self.current_system.name == 'kesch':
                 self.build_system.ldflags += [
-                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
-                    '-lcublas', '-lcudart'
+                    '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64'
                 ]
+            if self.current_system.name == 'arolla':
+                self.build_system.ldflags += [
+                    '-L/cm/shared/apps/cuda92/toolkit/9.2.88/lib64'
+                ]
+            if self.current_system.name == 'tsa':
+                self.build_system.ldflags += [
+                    '-L/cm/shared/apps/cuda10.0/toolkit/10.0.130/lib64']
+            self.build_system.ldflags += ['-lcublas', '-lcudart']
 
         super().setup(partition, environ, **job_opts)
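
The setup() hunks above follow a two-level dispatch: Fortran OpenACC flags are chosen per programming environment, CUDA library paths per system, and super().setup() is called last so the base class finalizes the build with the complete flag set. A condensed sketch of that control flow, assuming the same ReFrame 2.x-style API as the checks above (ExampleFlagsCheck is hypothetical):

import reframe as rfm


class ExampleFlagsCheck(rfm.RegressionTest):
    def __init__(self):
        super().__init__()
        self.build_system = 'Make'
        self.build_system.fflags = ['-O2']

    def setup(self, partition, environ, **job_opts):
        # Compiler-specific OpenACC switches first ...
        if environ.name.startswith(('PrgEnv-cray', 'PrgEnv-cce')):
            self.build_system.fflags += ['-hacc', '-hnoomp']
        elif environ.name.startswith('PrgEnv-pgi'):
            self.build_system.fflags += ['-acc']
        # ... then system-specific linker paths, e.g. for kesch:
        if self.current_system.name == 'kesch':
            self.build_system.ldflags = [
                '-L/global/opt/nvidia/cudatoolkit/8.0.61/lib64',
                '-lcublas', '-lcudart'
            ]
        # Delegate last, so the base class sees the final flags.
        super().setup(partition, environ, **job_opts)
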
diff --git a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90 b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90
index 4df1293702..4568b9f4f5 100644
--- a/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90
+++ b/cscs-checks/mch/src/openacc_cuda_mpi_cppstd.F90
@@ -35,6 +35,7 @@ program openacc_cuda_mpi_cppstd
 !$acc end host_data
 !$acc end data
 
+
 if(mpi_rank == 0) then
 
   ! Allocate and initialize arrays on the GPU
@@ -92,6 +93,7 @@ program openacc_cuda_mpi_cppstd
   deallocate(f1)
   deallocate(f2)
   deallocate(f3)
+  write (*,*) "Result: OK"
 
 end if
 call MPI_Finalize(ierr);
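
The write statement added above gives rank 0 a deterministic success marker; presumably the check's sanity pattern, which lies outside this diff, keys on it. A sketch of what such a pattern could look like, using the sanity module imported as sn in the checks above (the class and regex here are illustrative, not taken from the repository):

import reframe as rfm
import reframe.utility.sanity as sn


class ExampleSanityCheck(rfm.RegressionTest):
    def __init__(self):
        super().__init__()
        # Succeed only if rank 0 reached the end of the program and
        # printed the marker added in the Fortran hunk above.
        self.sanity_patterns = sn.assert_found(r'Result:\s*OK', self.stdout)
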