Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
d5a7f7c
Add first pointer_chase draft.
jjotero Oct 21, 2020
15ecf71
Bugfix on memory access.
jjotero Oct 21, 2020
9efa771
Add random node placement on the linked list.
jjotero Oct 23, 2020
0c365a8
Add support for command line args.
jjotero Oct 23, 2020
5dee5bf
Reword help menu.
jjotero Oct 26, 2020
d90b632
Merge branch 'master' into test/pointer-chase
jjotero Nov 9, 2020
e39c710
Update pointer_chase test to use the XDevice lib.
jjotero Nov 9, 2020
3b1eab4
Pointer chase ported to HIP.
jjotero Nov 10, 2020
8552a5f
Rename src and dst pointers in dev copy functions.
jjotero Nov 10, 2020
055be79
Add node ID to the test prints.
jjotero Nov 10, 2020
58c28d2
Merge branch 'test/mem-bandwidth-ault' into test/pointer-chase
jjotero Nov 11, 2020
2994696
Merge branch 'test/ault' into test/pointer-chase
jjotero Nov 11, 2020
8d09541
Add comments in the source code.
jjotero Nov 11, 2020
1c1ab5e
Add P2P pointer chase.
jjotero Nov 11, 2020
1d6b06b
Extend options to retrieve min latency.
jjotero Nov 11, 2020
20fda05
Add asm XClock and XClock64 functions.
jjotero Nov 13, 2020
ff209db
Restructure pChase algo
jjotero Nov 13, 2020
12033fc
Create pointer_chase reframe test.
jjotero Nov 16, 2020
ec1317b
Update ref for A100s,
jjotero Nov 17, 2020
ee42305
Add XClocks class to Xdevice lib.
jjotero Nov 18, 2020
51567a2
Bugfix in the clocks implementation for hip.
jjotero Nov 18, 2020
345bf9f
Merge branch 'master' into test/pointer-chase
jjotero Nov 18, 2020
525d25b
Expand pointer_chase checks.
jjotero Nov 19, 2020
c481ee9
Add tsa references.
jjotero Nov 19, 2020
b3ea42c
Update a100 refs.
jjotero Nov 19, 2020
8280c18
Update refs for dom/daint.
jjotero Nov 19, 2020
8056cd6
Add clock latency check.
jjotero Nov 20, 2020
c7b23f2
Add refs for daint, dom and tsa.
jjotero Nov 20, 2020
a44c67e
Fix PEP8 issues and comments to the src code.
jjotero Nov 20, 2020
9272d97
Bugfix in the HIP clockLatency function.
jjotero Nov 24, 2020
6566ee1
Port kernel latency test to AMD GPUs.
jjotero Nov 24, 2020
6459d40
Merge branch 'test/Xdevice' into test/pointer-chase
jjotero Nov 26, 2020
900cd19
Update the Xdevice wrappers.
jjotero Dec 2, 2020
6aac2c9
Add dgemm-gpu sources.
jjotero Dec 9, 2020
2a87f57
Cleanup dgemm-gpu output.
jjotero Dec 9, 2020
de7d2d8
Add threading support to dgemm
jjotero Dec 9, 2020
140f91d
Merge branch 'master' into test/dgemm
jjotero Dec 9, 2020
a946e7b
Add dgemm test
jjotero Dec 9, 2020
c6a2d05
Merge branch 'test/pointer-chase' into test/ault-dev
jjotero Dec 9, 2020
f96cc5f
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 9, 2020
fc17c5e
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 10, 2020
fb7d7c1
Add fixme label.
jjotero Dec 10, 2020
a568249
Remove double XMemcpy definition.
jjotero Dec 10, 2020
2c11e84
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 10, 2020
7779679
Merge branch 'master' into test/ault-dev
jjotero Jan 15, 2021
a18a294
Fix PEP8
jjotero Jan 15, 2021
1e48c27
Merge branch 'test/ault-dev' of github.com:jjotero/reframe into test/…
jjotero Jan 19, 2021
1842537
Add cdt-cuda module for dom:gpu
jjotero Jan 19, 2021
e41f14e
Merge branch 'master' into test/ault-dev
jjotero Jan 19, 2021
6f3273a
Remove unnecessary workaround
jjotero Jan 19, 2021
b196503
Update includes after merge
jjotero Jan 19, 2021
c3e8676
Adjust pointer chase refs
jjotero Jan 19, 2021
d484c79
Add benchmark tags
jjotero Jan 19, 2021
82d6085
Fix typo
jjotero Jan 19, 2021
1bd83c1
Replace cudatoolkit module by craype-accel-nvidia60
jjotero Jan 20, 2021
fd582b4
Merge branch 'master' into test/ault-dev
jjotero Jan 29, 2021
0ee2c13
Add consitent naming
jjotero Jan 29, 2021
4c36930
Address PR comments
jjotero Feb 1, 2021
80f3b14
Add missing include
jjotero Feb 1, 2021
a823a4e
Make the chase circular
jjotero Feb 1, 2021
e875d0f
Remove single-jump timing routines
jjotero Mar 1, 2021
dc41d68
Merge branch 'master' into test/ault-dev
jjotero Mar 1, 2021
aef694a
Cleanup source code and remove single-step tests
jjotero Mar 2, 2021
6da6785
Add memory latency tests
jjotero Mar 2, 2021
9c429fb
Update refs for tsa
jjotero Mar 2, 2021
9cfd000
Address PR comments
jjotero Mar 4, 2021
19f3054
Merge branch 'master' into test/ault-dev
jjotero Mar 4, 2021
c89f950
Do chase simultaneously in all devices
jjotero Mar 4, 2021
9884951
Merge branch 'master' into test/ault-dev
Mar 5, 2021
26e7dc5
Merge branch 'master' into test/ault-dev
Mar 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.simple_test
class GPUdgemmTest(rfm.RegressionTest):
    '''Multi-GPU DGEMM microbenchmark.

    Runs one task per node; the executable performs a DGEMM on every GPU
    of the node and reports the per-GPU performance in TF/s.
    '''

    def __init__(self):
        self.valid_systems = ['daint:gpu', 'dom:gpu',
                              'ault:amdv100', 'ault:intelv100',
                              'ault:amda100', 'ault:amdvega']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.build_system = 'Make'
        self.executable = 'dgemm.x'
        self.sanity_patterns = self.assert_num_gpus()

        # The slowest GPU in the node sets the reported figure.
        self.perf_patterns = {
            'perf': sn.min(sn.extractall(
                r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
                self.stdout, 'fp', float))
        }

        # Per-partition performance baselines in TF/s (10% lower bound).
        baselines = {
            'dom:gpu': 3.35,
            'daint:gpu': 3.35,
            'ault:amdv100': 5.25,
            'ault:intelv100': 5.25,
            'ault:amda100': 10.5,
            'ault:amdvega': 3.45,
        }
        self.reference = {
            part: {'perf': (tfs, -0.1, None, 'TF/s')}
            for part, tfs in baselines.items()
        }

        self.maintainers = ['JO', 'SK']
        self.tags = {'benchmark'}

    @sn.sanity_function
    def assert_num_gpus(self):
        # Every task must print its 'Test passed' line.
        return sn.assert_eq(
            sn.count(sn.findall(r'^\s*\[[^\]]*\]\s*Test passed', self.stdout)),
            sn.getattr(self.job, 'num_tasks'))

    @rfm.run_before('compile')
    def select_makefile(self):
        # Only the vega partition builds with HIP; everything else uses CUDA.
        if self.current_partition.fullname == 'ault:amdvega':
            self.build_system.makefile = 'makefile.hip'
        else:
            self.build_system.makefile = 'makefile.cuda'

    @rfm.run_before('compile')
    def set_gpu_arch(self):
        part = self.current_partition.fullname

        # Deal with the NVIDIA options first: map partition -> SM arch.
        sm_map = {
            'tsa:cn': '70',
            'ault:intelv100': '70',
            'ault:amdv100': '70',
            'ault:amda100': '80',
            'dom:gpu': '60',
            'daint:gpu': '60',
        }
        nvidia_sm = sm_map.get(part)
        if nvidia_sm:
            self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
            if part in {'dom:gpu', 'daint:gpu'}:
                self.modules += ['craype-accel-nvidia60']
                if part == 'dom:gpu':
                    self.modules += ['cdt-cuda']
            else:
                self.modules += ['cuda']

        # Deal with the AMD options.
        if part == 'ault:amdvega':
            self.build_system.cxxflags += ['--amdgpu-target=gfx906']
            self.modules += ['rocm']
1 change: 1 addition & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice
161 changes: 161 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* Basic DGEMM test
*
* Multiply two matrices of dimensions SIZE*SIZE filled with ones. Therefore,
* all the elements of the resulting matrix will be just SIZE.
*/

#define SIZE 1024
#define REPEAT 30

#include <iostream>
#include <unistd.h>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <functional>

#include "Xdevice/runtime.hpp"
#include "Xdevice/blas.hpp"


namespace kernels
{
    // Fill an array of `size` elements with the value one.
    // One thread per element; excess threads return immediately.
    template<class T>
    __global__ void init_as_ones(T * arr, size_t size)
    {
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        if (idx >= size)
            return;

        arr[idx] = (T)1.0;
    }

    // Count into *err every element of `arr` that differs from SIZE.
    // One thread per element; excess threads return immediately.
    template<class T>
    __global__ void verify(T * arr, size_t size, int * err)
    {
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        if (idx >= size)
            return;

        if (int(arr[idx]) != SIZE)
            atomicAdd(err, 1);
    }
}

/*
* This code uses a thread per device in the node.
* For simplicity, we define the variables below as global.
*/

#define HOST_NAME_SIZE 128
// Host name, printed in every output line to identify the node.
char hostname[HOST_NAME_SIZE];
// Floating-point operations of one SIZE x SIZE DGEMM, in teraflops.
double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
// Accumulated verification errors across all device threads (guarded by mtx).
int totalErrors = 0;
// Protects totalErrors and serializes the per-device result prints.
std::mutex mtx;

#define BLOCK_SIZE 128
/*
 * Worker routine run by one host thread per device.
 *
 * Initializes two SIZE x SIZE matrices of ones on `device`, times REPEAT
 * DGEMM calls, verifies the result matrix and accumulates errors and
 * prints the per-GPU performance under the global mutex.
 */
void dgemm(int device)
{
    XSetDevice(device);

    double * A;
    double * B;
    double * C;
    const double alpha = 1.0;
    const double beta = 0.0;

    XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
    XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
    XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);

    // Fill the input matrices with ones, one thread per element.
    kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
    kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
    XDeviceSynchronize();

    XStream_t stream;
    XStreamCreate(&stream);
    XblasHandle_t blas_handle;
    XblasCreate(&blas_handle);
    XblasSetStream(blas_handle, stream);

    // Warmup call
    XblasDgemm(blas_handle,
               XBLAS_OP_N, XBLAS_OP_N,
               SIZE, SIZE, SIZE,
               &alpha,
               (const double*)A, SIZE,
               (const double*)B, SIZE,
               &beta,
               C, SIZE);
    XDeviceSynchronize();

    // Time the execution
    XTimer t(stream);
    t.start();
    for (int i = 0; i < REPEAT; i++)
    {
        XblasDgemm(blas_handle,
                   XBLAS_OP_N, XBLAS_OP_N,
                   SIZE, SIZE, SIZE,
                   &alpha,
                   (const double*)A, SIZE,
                   (const double*)B, SIZE,
                   &beta,
                   C, SIZE);
    }

    // Calc the performance data in TFlops/sec.
    // NOTE(review): assumes t.stop() returns elapsed milliseconds — confirm
    // against the XTimer implementation in Xdevice.
    double perf = tflops/(t.stop()/REPEAT/1000.0);

    XblasDestroy(blas_handle);
    XStreamDestroy(stream);

    // Verify that the final values of C are correct.
    int * err, h_err = 0;
    XMalloc((void**)&err, sizeof(int));
    XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);

    // BUGFIX: the grid must cover all SIZE*SIZE elements of C; the previous
    // launch used (SIZE+BLOCK_SIZE-1)/BLOCK_SIZE blocks and only verified
    // the first SIZE elements.
    kernels::verify<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
    XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
    XFree(err);  // BUGFIX: the error counter was previously leaked.

    {
        std::lock_guard<std::mutex> lg(mtx);
        totalErrors += h_err;

        // Print the performance results
        printf("[%s] GPU %d: %4.2f TF/s\n", hostname, device, (float)perf);
    }
    XFree(A);
    XFree(B);
    XFree(C);
}

/*
 * Entry point: spawn one host thread per visible device, run the DGEMM
 * benchmark on each, and report whether any device produced errors.
 */
int main(int argc, char **argv)
{
    gethostname(hostname, sizeof(hostname));

    int num_devices;
    XGetDeviceCount(&num_devices);

    // Print device count
    printf("[%s] Found %d device(s).\n", hostname, num_devices);

    // One worker thread per device; each runs the dgemm routine.
    std::vector<std::thread> workers;
    for (int dev = 0; dev < num_devices; dev++)
        workers.emplace_back(dgemm, dev);

    // Wait for every device to finish.
    for (auto &w : workers)
        w.join();

    // Test if there were any errors and print the test result.
    printf("[%s] Test %s\n", hostname, totalErrors == 0 ? "passed" : "failed");

    return 0;
}
2 changes: 2 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Build the CUDA version of the DGEMM benchmark; extra flags (e.g. the
# -arch=sm_XX option) are injected by the test through CXXFLAGS.
dgemm:
	nvcc $@.cu -o $@.x ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
6 changes: 6 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the HIP version of the DGEMM benchmark. The GPU targets and the
# ROCm install locations can be overridden from the environment.
CXXFLAGS?=--amdgpu-target=gfx906,gfx908
ROCM_ROOT?=/opt/rocm
RSMI_ROOT?=/opt/rocm/rocm_smi

dgemm:
	hipcc -O3 $@.cu -o $@.x -DTARGET_HIP ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ void XblasDestroy(cublasHandle_t handle)
checkError( cublasDestroy(handle) );
}

// Bind a stream to a cuBLAS handle; subsequent Xblas calls made through
// this handle execute in the given stream.
void XblasSetStream(cublasHandle_t h, cudaStream_t s)
{
    checkError ( cublasSetStream(h, s) );
}

// CUDA backends of the generic Xblas GEMM entry points.
auto XblasDgemm = cublasDgemm;
auto XblasSgemm = cublasSgemm;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include <unistd.h>
#include <nvml.h>

/*
* NVML - SMI tools
*/

static inline void nvmlCheck(nvmlReturn_t err)
{
# ifdef DEBUG
Expand Down Expand Up @@ -80,4 +84,96 @@ Smi::~Smi()
}
}


/*
* ASM tools
*/

// Read the per-SM 32-bit cycle counter (%clock special register).
// The "memory" clobber keeps the compiler from moving memory accesses
// across the read.
__device__ __forceinline__ uint32_t XClock()
{
    // Clock counter
    uint32_t x;
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
    return x;
}

// Read the per-SM 64-bit cycle counter (%clock64 special register),
// which does not wrap on realistic timing windows.
__device__ __forceinline__ uint64_t XClock64()
{
    // Clock counter
    uint64_t x;
    asm volatile ("mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
    return x;
}

// Read the 32-bit cycle counter after a block-wide barrier (bar.sync 0),
// so all threads of the block start timing from a common point.
// NOTE(review): the barrier must be reached by all threads of the block.
__device__ __forceinline__ uint32_t XSyncClock()
{
    // Clock counter with a preceding barrier.
    uint32_t x;
    asm volatile ("bar.sync 0;\n\t"
                  "mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
    return x;
}

// 64-bit variant of XSyncClock: block-wide barrier followed by a read
// of the %clock64 counter.
__device__ __forceinline__ uint64_t XSyncClock64()
{
    // Clock counter with a preceding barrier.
    uint64_t x;
    asm volatile ("bar.sync 0;\n\t"
                  "mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
    return x;
}


template<class T = uint32_t>
class __XClocks
{
    /*
     * XClocks timer tool
     * Tracks the number of clock cycles between a call to the start
     * and end member functions.
     *
     * The primary template uses the 32-bit counter; the uint64_t
     * specialization below switches to %clock64. With the 32-bit
     * counter, unsigned subtraction in end() stays correct across a
     * single wrap of the counter.
     */
public:
    // Counter value captured by start().
    T startClock;
    __device__ void start()
    {
        // Barrier first so all threads of the block start together.
        startClock = XSyncClock();
    }
    __device__ T end()
    {
        // Elapsed cycles since start().
        return XClock() - startClock;
    }
};

// 64-bit specializations: read the %clock64 counter instead.
// NOTE(review): the primary template declares these members __device__ but
// the specializations omit the qualifier — confirm the compiler still
// treats them as device functions (an execution-space mismatch would fail
// at the device-side call sites).
template<>
void __XClocks<uint64_t>::start()
{
    this->startClock = XSyncClock64();
}

template<>
uint64_t __XClocks<uint64_t>::end()
{
    return XClock64() - this->startClock;
}

// Convenience aliases: XClocks64 for the 64-bit counter, XClocks for the
// default 32-bit one.
using XClocks64 = __XClocks<uint64_t>;
using XClocks = __XClocks<>;


// Measure the latency (in cycles) of the clock read itself by issuing
// two back-to-back %clock64 reads and returning their difference,
// cast to the requested type T.
template<class T>
__device__ T XClockLatency()
{
    uint64_t start = XClock64();
    uint64_t end = XClock64();
    return (T)(end-start);
}

// Return the ID of the SM executing the calling thread (%smid register).
__device__ __forceinline__ int __smId()
{
    // SM ID
    uint32_t x;
    asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory");
    return (int)x;
}

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ XMemcpyKind XMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
XMemcpyKind XMemcpyHostToHost = cudaMemcpyHostToHost;
XMemcpyKind XMemcpyDefault = cudaMemcpyDefault;

// Flag for host allocations mapped into the device address space
// (CUDA backend of the generic Xdevice constant).
#define XHostAllocMapped cudaHostAllocMapped

#endif
Loading