diff --git a/cscs-checks/microbenchmarks/shmem/shmem.py b/cscs-checks/microbenchmarks/shmem/shmem.py
new file mode 100644
index 0000000000..101964cafe
--- /dev/null
+++ b/cscs-checks/microbenchmarks/shmem/shmem.py
@@ -0,0 +1,43 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.required_version('>=2.16-dev0')
+@rfm.parameterized_test(['sync'], ['async'])
+class GPUShmemTest(rfm.RegressionTest):
+    def __init__(self, kernel_version):
+        super().__init__()
+        self.sourcepath = 'shmem.cu'
+        self.build_system = 'SingleSource'
+        self.valid_systems = ['daint:gpu', 'dom:gpu']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.num_tasks = 0
+        self.num_tasks_per_node = 1
+
+        self.sanity_patterns = sn.assert_eq(
+            sn.count(sn.findall(r'Bandwidth', self.stdout)),
+            self.num_tasks_assigned * 2)
+
+        self.perf_patterns = {
+            'bandwidth': sn.extractsingle(
+                r'Bandwidth\(double\) (?P<bw>\S+) GB/s',
+                self.stdout, 'bw', float)
+        }
+        # theoretical limit:
+        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
+        self.reference = {
+            'dom:gpu': {
+                'bandwidth': (8850, -0.01, 9520/8850 - 1, 'GB/s')
+            },
+            'daint:gpu': {
+                'bandwidth': (8850, -0.01, 9520/8850 - 1, 'GB/s')
+            },
+        }
+
+        self.maintainers = ['SK']
+        self.tags = {'benchmark', 'diagnostic'}
+
+    @property
+    @sn.sanity_function
+    def num_tasks_assigned(self):
+        return self.job.num_tasks
diff --git a/cscs-checks/microbenchmarks/shmem/src/shmem.cu b/cscs-checks/microbenchmarks/shmem/src/shmem.cu
new file mode 100644
index 0000000000..8a7b6695c5
--- /dev/null
+++ b/cscs-checks/microbenchmarks/shmem/src/shmem.cu
@@ -0,0 +1,107 @@
+
+// Shared memory bandwidth benchmark
+// contributed by Sebastian Keller
+//
+// Relevant nvprof metrics:
+// nvprof -m shared_load_throughput,shared_store_throughput
+
+#include <cuda.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+
+#define NTHREADS 256
+#define NITER 4096
+// length of the thread block swap chain (must be even)
+#define SHARED_SEGMENTS 4
+
+static void HandleError( cudaError_t err,
+                         const char *file,
+                         int line ) {
+    if (err != cudaSuccess) {
+        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
+                file, line );
+        exit( EXIT_FAILURE );
+    }
+}
+#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+
+template<class T>
+__device__ void swap(T* a, T* b)
+{
+    T tmp;
+    tmp = *a;
+    *a = *b;
+    // +1 isn't needed to prevent code elimination by the
+    // compiler, but is added in case it gets smarter in
+    // a future version
+    *b = tmp + T{1};
+}
+
+template<class T>
+__global__ void test_shmem(T* glob_mem)
+{
+    __shared__ T smem[NTHREADS*SHARED_SEGMENTS];
+
+    int tid = threadIdx.x;
+
+    smem[tid] = T{0};
+    for (int i = 0; i < NITER; ++i)
+    {
+        // even shared segments
+        for (int j = 0; j < SHARED_SEGMENTS-1; j+=2)
+            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);
+
+        // uneven shared segments
+        for (int j = 1; j < SHARED_SEGMENTS-2; j+=2)
+            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);
+    }
+
+    glob_mem[blockIdx.x * blockDim.x + tid] = smem[tid];
+}
+
+template<class T>
+double test_bw(long size)
+{
+    T* buffer = (T*)malloc(size);
+    T* dev_buffer;
+    HANDLE_ERROR( cudaMalloc((void**)&dev_buffer, size) );
+    int nblocks = size / (NTHREADS * sizeof(T));
+
+    cudaEvent_t start, stop;
+    HANDLE_ERROR( cudaEventCreate(&start) );
+    HANDLE_ERROR( cudaEventCreate(&stop) );
+    HANDLE_ERROR( cudaEventRecord(start,0) );
+
+    test_shmem<T><<<nblocks, NTHREADS>>>(dev_buffer);
+
+    HANDLE_ERROR( cudaEventRecord(stop,0) );
+    HANDLE_ERROR( cudaEventSynchronize(stop) );
+    float gpu_time;
+    HANDLE_ERROR( cudaEventElapsedTime( &gpu_time, start, stop ) );
+    // convert to seconds
+    gpu_time /= 1000;
+
+    // 2 writes + 2 reads per swap
+    double nbytes = NITER * size * (SHARED_SEGMENTS-1) * 4;
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    free(buffer);
+    cudaFree(dev_buffer);
+
+    return nbytes / gpu_time;
+}
+
+int main()
+{
+    long size = 1024 * 1024 * 64; // 64 MB global buffer
+
+    // warmup
+    test_bw<double>(size);
+
+    std::cout << "Bandwidth(int) " << test_bw<int>(size) / 1024 / 1024 / 1024 << " GB/s" << std::endl;
+    std::cout << "Bandwidth(double) " << test_bw<double>(size) / 1024 / 1024 / 1024 << " GB/s" << std::endl;
+}