Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions cscs-checks/microbenchmarks/shmem/shmem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import reframe as rfm
import reframe.utility.sanity as sn


@rfm.required_version('>=2.16-dev0')
@rfm.parameterized_test(['sync'], ['async'])
class GPUShmemTest(rfm.RegressionTest):
    '''Shared-memory bandwidth microbenchmark for the P100 GPU partitions.

    Builds ``src/shmem.cu`` and checks that every task reports two
    bandwidth lines (int and double); the double-precision figure is
    tracked as the performance variable.
    '''

    def __init__(self, kernel_version):
        # NOTE(review): ``kernel_version`` ('sync'/'async') is not used to
        # select a source file or define a macro here — presumably a
        # leftover or forwarded parameterization; confirm with the author.
        super().__init__()
        self.sourcepath = 'shmem.cu'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        # The benchmark prints one "Bandwidth" line per data type (int and
        # double), hence two matches per assigned task.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'Bandwidth', self.stdout)),
            self.num_tasks_assigned * 2)

        self.perf_patterns = {
            'bandwidth': sn.extractsingle(
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s',
                self.stdout, 'bw', float)
        }
        # theoretical limit:
        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
        #
        # Thresholds are fractional deviations from the reference value:
        # allow 1% below 8850 GB/s and anything up to the theoretical
        # limit above it.  The upper bound must be positive; the original
        # `1. - 9520/8850` was sign-flipped (≈ -0.076).
        reference_bw = (8850, -0.01, 9520 / 8850 - 1, 'GB/s')
        self.reference = {
            'dom:gpu': {'bandwidth': reference_bw},
            'daint:gpu': {'bandwidth': reference_bw},
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic'}

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        # Deferred: the scheduler only knows the task count after
        # ``num_tasks = 0`` (= "use all nodes") has been resolved.
        return self.job.num_tasks
107 changes: 107 additions & 0 deletions cscs-checks/microbenchmarks/shmem/src/shmem.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@

// Shared memory bandwidth benchmark
// contributed by Sebastian Keller
//
// Relevant nvprof metrics:
// nvprof -m shared_load_throughput,shared_store_throughput

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <malloc.h>
#include <cuda.h>
#include <cuda_runtime.h>


#define NTHREADS 256
#define NITER 4096
// length of the thread block swap chain (must be even)
#define SHARED_SEGMENTS 4

// Abort the program with a diagnostic if a CUDA runtime call failed.
//
// err:       return code of the CUDA runtime call
// file/line: call site, injected by the HANDLE_ERROR macro
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        // Diagnostics go to stderr so they cannot be mistaken for
        // benchmark output (the harness parses stdout with regexes).
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ),
                 file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Exchange the values pointed to by a and b.
// The value written back to b is perturbed by +1: not strictly required
// to defeat dead-code elimination with current compilers, but kept as
// insurance against a smarter future optimizer.
template <class T>
__device__ void swap(T* a, T* b)
{
    T old_a = *a;
    *a = *b;
    *b = old_a + T{1};
}

// Shuffle data between SHARED_SEGMENTS chunks of shared memory to generate
// shared-memory load/store traffic.
//
// Launch layout: 1-D grid with blockDim.x == NTHREADS.  Each thread only
// ever accesses its own tid-slot within every segment, so the swaps are
// race-free and no __syncthreads() is required.  Per iteration each thread
// performs SHARED_SEGMENTS-1 swaps (2 loads + 2 stores each); the
// bandwidth formula in test_bw depends on exactly that count.
//
// glob_mem: device buffer with gridDim.x * blockDim.x elements; receives
//           one value per thread so the computation is observable and
//           cannot be eliminated.
template <class T>
__global__ void test_shmem(T* glob_mem)
{
    __shared__ T smem[NTHREADS*SHARED_SEGMENTS];

    int tid = threadIdx.x;

    // Initialize *every* segment this thread touches.  The original code
    // only wrote segment 0, so the swap chain read uninitialized shared
    // memory (flagged by compute-sanitizer initcheck).  The actual values
    // are irrelevant to the timing.
    for (int s = 0; s < SHARED_SEGMENTS; ++s)
        smem[tid + s*blockDim.x] = T{0};

    for (int i = 0; i < NITER; ++i)
    {
        // even pairs: (0,1), (2,3), ...
        for (int j = 0; j < SHARED_SEGMENTS-1; j+=2)
            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);

        // odd pairs: (1,2), (3,4), ...; the tighter bound keeps j+1 in
        // range (last segment has no right-hand neighbour)
        for (int j = 1; j < SHARED_SEGMENTS-2; j+=2)
            swap(smem + tid + j*blockDim.x, smem + tid + (j+1)*blockDim.x);
    }

    // One global store per thread keeps the result live.
    glob_mem[blockIdx.x * blockDim.x + tid] = smem[tid];
}

// Time the test_shmem kernel over a device buffer of `size` bytes and
// return the effective shared-memory bandwidth in bytes per second.
//
// Precondition: size is a multiple of NTHREADS * sizeof(T) so the grid
// covers the buffer exactly.
template <class T>
double test_bw(long size)
{
    // The original version also malloc'ed (and never used) a `size`-byte
    // host buffer; that dead allocation has been removed.
    T* dev_buffer;
    HANDLE_ERROR( cudaMalloc((void**)&dev_buffer, size) );
    int nblocks = size / (NTHREADS * sizeof(T));

    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );
    HANDLE_ERROR( cudaEventRecord(start,0) );

    test_shmem<<<nblocks, NTHREADS>>>(dev_buffer);
    // Launches return no error code themselves; query explicitly so a bad
    // launch configuration is not silently timed as a ~0 ms "success".
    HANDLE_ERROR( cudaGetLastError() );

    HANDLE_ERROR( cudaEventRecord(stop,0) );
    HANDLE_ERROR( cudaEventSynchronize(stop) );
    float gpu_time;
    HANDLE_ERROR( cudaEventElapsedTime( &gpu_time, start, stop ) );
    // convert ms to seconds
    gpu_time /= 1000;

    // Per iteration each element undergoes SHARED_SEGMENTS-1 swaps, each
    // swap doing 2 reads + 2 writes; summed over the whole buffer that is
    // 4 * size bytes per swap in the chain.
    double nbytes = NITER * size * (SHARED_SEGMENTS-1) * 4;

    HANDLE_ERROR( cudaEventDestroy(start) );
    HANDLE_ERROR( cudaEventDestroy(stop) );
    HANDLE_ERROR( cudaFree(dev_buffer) );

    return nbytes / gpu_time;
}

// Run a warmup pass, then report shared-memory bandwidth for int and
// double elements.  The harness greps stdout for the "Bandwidth" lines.
int main()
{
    const long size = 64L * 1024 * 1024;  // 64 MB global buffer

    test_bw<int>(size);  // warmup

    // Powers of two divide exactly in binary floating point, so the
    // chained division matches a single division by 2^30 bit-for-bit.
    const double gib = 1024.0 * 1024.0 * 1024.0;
    std::cout << "Bandwidth(int) " << test_bw<int>(size) / gib
              << " GB/s" << std::endl;
    std::cout << "Bandwidth(double) " << test_bw<double>(size) / gib
              << " GB/s" << std::endl;
}