import reframe as rfm
import reframe.utility.sanity as sn


@rfm.required_version('>=2.16-dev0')
@rfm.parameterized_test(['sync'], ['async'])
class GPUShmemTest(rfm.RegressionTest):
    """GPU shared-memory bandwidth check (builds and runs ``src/shmem.cu``).

    Reconstructed final state of the patch series.  The patch text had
    been mangled (everything between angle brackets stripped); the regex
    group name ``(?P<bw>...)`` is restored here, and the upper
    performance threshold is corrected (see comment below).
    """

    def __init__(self, kernel_version):
        # NOTE(review): kernel_version ('sync'/'async') is accepted for
        # parameterization but never used in this body -- confirm intent.
        super().__init__()
        self.sourcepath = 'shmem.cu'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        # The benchmark prints one 'Bandwidth(int)' and one
        # 'Bandwidth(double)' line per task.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'Bandwidth', self.stdout)),
            self.num_tasks_assigned * 2)

        # Group name 'bw' restored: extractsingle's third argument
        # refers to it, so the pattern must define (?P<bw>...).
        self.perf_patterns = {
            'bandwidth': sn.extractsingle(
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s',
                self.stdout, 'bw', float)
        }
        # theoretical limit:
        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
        # Upper threshold is the positive fractional headroom up to that
        # limit: 9520/8850 - 1 ~= +0.076.  The patch wrote
        # `1. - 9520/8850`, which is negative (-0.076) and would reject
        # any measurement at or above the reference value.
        self.reference = {
            'dom:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850 - 1., 'GB/s')
            },
            'daint:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850 - 1., 'GB/s')
            },
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic'}

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        # Deferred evaluation: self.job exists only after submission.
        return self.job.num_tasks
// Shared memory bandwidth benchmark
// contributed by Sebastian Keller
//
// Relevant nvprof metrics:
// nvprof -m shared_load_throughput,shared_store_throughput
//
// NOTE(review): the original #include names were lost when the patch
// text was mangled (everything inside <...> was stripped).  The headers
// below cover every symbol used (printf, exit/EXIT_FAILURE, std::cout,
// CUDA runtime API) -- confirm against the original file.

#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>


#define NTHREADS 256
#define NITER 4096
// length of the thread block swap chain (must be even)
#define SHARED_SEGMENTS 4

// Abort with a readable message on any CUDA runtime error.
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Exchange *a and *b through a temporary: 2 loads + 2 stores per call.
template<class T>
__device__ void swap(T* a, T* b)
{
    T tmp;
    tmp = *a;
    *a = *b;
    // +1 isn't needed to prevent code elimination by the
    // compiler, but is added in case it gets smarter in
    // a future version
    *b = tmp + T{1};
}

// Each thread repeatedly swaps its slot between neighbouring shared
// memory segments, then writes one element back to global memory so
// the work cannot be eliminated.
template<class T>
__global__ void test_shmem(T* glob_mem)
{
    __shared__ T smem[NTHREADS*SHARED_SEGMENTS];

    int tid = threadIdx.x;

    // NOTE(review): only segment 0 is initialized; segments 1..N-1 are
    // swapped while still uninitialized.  Harmless for a pure bandwidth
    // measurement, but worth confirming it is intentional.
    smem[tid] = T{0};
    for (int i = 0; i < NITER; ++i)
    {
        // even shared segments
        for (int j = 0; j < SHARED_SEGMENTS-1; j+=2)
            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);

        // uneven shared segments
        for (int j = 1; j < SHARED_SEGMENTS-2; j+=2)
            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);
    }

    glob_mem[blockIdx.x * blockDim.x + tid] = smem[tid];
}

// Time the kernel over a device buffer of `size` bytes and return the
// achieved shared-memory bandwidth in bytes per second.
// (The original also malloc'd and freed an unused host buffer of the
// same size; that dead allocation is removed here.)
template<class T>
double test_bw(long size)
{
    T* dev_buffer;
    HANDLE_ERROR( cudaMalloc((void**)&dev_buffer, size) );
    int nblocks = size / (NTHREADS * sizeof(T));

    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );
    HANDLE_ERROR( cudaEventRecord(start,0) );

    // launch configuration <<<nblocks, NTHREADS>>> restored (stripped
    // in the mangled patch text); T is deduced from dev_buffer
    test_shmem<<<nblocks, NTHREADS>>>(dev_buffer);

    HANDLE_ERROR( cudaEventRecord(stop,0) );
    HANDLE_ERROR( cudaEventSynchronize(stop) );
    float gpu_time;
    HANDLE_ERROR( cudaEventElapsedTime( &gpu_time, start, stop ) );
    // convert to seconds
    gpu_time /= 1000;

    // 2 writes + 2 reads per swap; each of the NITER iterations
    // performs SHARED_SEGMENTS-1 swaps per buffer element
    double nbytes = NITER * size * (SHARED_SEGMENTS-1) * 4;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dev_buffer);

    return nbytes / gpu_time;
}

int main()
{
    long size = 1024 * 1024 * 64; // 64 MB global buffer

    // warmup (template argument was stripped in the patch text; the
    // element type does not matter for a warmup run -- confirm)
    test_bw<int>(size);

    // explicit <int>/<double> restored: the printed labels pin the
    // element type of each measurement
    std::cout << "Bandwidth(int) " << test_bw<int>(size) / 1024 / 1024 / 1024 << " GB/s" << std::endl;
    std::cout << "Bandwidth(double) " << test_bw<double>(size) / 1024 / 1024 / 1024 << " GB/s" << std::endl;
}