Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
d5a7f7c
Add first pointer_chase draft.
jjotero Oct 21, 2020
15ecf71
Bugfix on memory access.
jjotero Oct 21, 2020
9efa771
Add random node placement on the linked list.
jjotero Oct 23, 2020
0c365a8
Add support for command line args.
jjotero Oct 23, 2020
5dee5bf
Reword help menu.
jjotero Oct 26, 2020
d90b632
Merge branch 'master' into test/pointer-chase
jjotero Nov 9, 2020
e39c710
Update pointer_chase test to use the XDevice lib.
jjotero Nov 9, 2020
3b1eab4
Pointer chase ported to HIP.
jjotero Nov 10, 2020
8552a5f
Rename src and dst pointers in dev copy functions.
jjotero Nov 10, 2020
055be79
Add node ID to the test prints.
jjotero Nov 10, 2020
58c28d2
Merge branch 'test/mem-bandwidth-ault' into test/pointer-chase
jjotero Nov 11, 2020
2994696
Merge branch 'test/ault' into test/pointer-chase
jjotero Nov 11, 2020
8d09541
Add comments in the source code.
jjotero Nov 11, 2020
1c1ab5e
Add P2P pointer chase.
jjotero Nov 11, 2020
1d6b06b
Extend options to retrieve min latency.
jjotero Nov 11, 2020
20fda05
Add asm XClock and XClock64 functions.
jjotero Nov 13, 2020
ff209db
Restructure pChase algo
jjotero Nov 13, 2020
12033fc
Create pointer_chase reframe test.
jjotero Nov 16, 2020
ec1317b
Update ref for A100s,
jjotero Nov 17, 2020
ee42305
Add XClocks class to Xdevice lib.
jjotero Nov 18, 2020
51567a2
Bugfix in the clocks implementation for hip.
jjotero Nov 18, 2020
345bf9f
Merge branch 'master' into test/pointer-chase
jjotero Nov 18, 2020
525d25b
Expand pointer_chase checks.
jjotero Nov 19, 2020
c481ee9
Add tsa references.
jjotero Nov 19, 2020
b3ea42c
Update a100 refs.
jjotero Nov 19, 2020
8280c18
Update refs for dom/daint.
jjotero Nov 19, 2020
8056cd6
Add clock latency check.
jjotero Nov 20, 2020
c7b23f2
Add refs for daint, dom and tsa.
jjotero Nov 20, 2020
a44c67e
Fix PEP8 issues and comments to the src code.
jjotero Nov 20, 2020
9272d97
Bugfix in the HIP clockLatency function.
jjotero Nov 24, 2020
6566ee1
Port kernel latency test to AMD GPUs.
jjotero Nov 24, 2020
6459d40
Merge branch 'test/Xdevice' into test/pointer-chase
jjotero Nov 26, 2020
900cd19
Update the Xdevice wrappers.
jjotero Dec 2, 2020
6aac2c9
Add dgemm-gpu sources.
jjotero Dec 9, 2020
2a87f57
Cleanup dgemm-gpu output.
jjotero Dec 9, 2020
de7d2d8
Add threading support to dgemm
jjotero Dec 9, 2020
140f91d
Merge branch 'master' into test/dgemm
jjotero Dec 9, 2020
a946e7b
Add dgemm test
jjotero Dec 9, 2020
c6a2d05
Merge branch 'test/pointer-chase' into test/ault-dev
jjotero Dec 9, 2020
f96cc5f
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 9, 2020
fc17c5e
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 10, 2020
fb7d7c1
Add fixme label.
jjotero Dec 10, 2020
a568249
Remove double XMemcpy definition.
jjotero Dec 10, 2020
2c11e84
Merge branch 'test/ault' into test/ault-dev
jjotero Dec 10, 2020
7779679
Merge branch 'master' into test/ault-dev
jjotero Jan 15, 2021
a18a294
Fix PEP8
jjotero Jan 15, 2021
1e48c27
Merge branch 'test/ault-dev' of github.com:jjotero/reframe into test/…
jjotero Jan 19, 2021
1842537
Add cdt-cuda module for dom:gpu
jjotero Jan 19, 2021
e41f14e
Merge branch 'master' into test/ault-dev
jjotero Jan 19, 2021
6f3273a
Remove unnecessary workaround
jjotero Jan 19, 2021
b196503
Update includes after merge
jjotero Jan 19, 2021
c3e8676
Adjust pointer chase refs
jjotero Jan 19, 2021
d484c79
Add benchmark tags
jjotero Jan 19, 2021
82d6085
Fix typo
jjotero Jan 19, 2021
1bd83c1
Replace cudatoolkit module by craype-accel-nvidia60
jjotero Jan 20, 2021
fd582b4
Merge branch 'master' into test/ault-dev
jjotero Jan 29, 2021
0ee2c13
Add consitent naming
jjotero Jan 29, 2021
4c36930
Address PR comments
jjotero Feb 1, 2021
80f3b14
Add missing include
jjotero Feb 1, 2021
a823a4e
Make the chase circular
jjotero Feb 1, 2021
e875d0f
Remove single-jump timing routines
jjotero Mar 1, 2021
dc41d68
Merge branch 'master' into test/ault-dev
jjotero Mar 1, 2021
aef694a
Cleanup source code and remove single-step tests
jjotero Mar 2, 2021
6da6785
Add memory latency tests
jjotero Mar 2, 2021
9c429fb
Update refs for tsa
jjotero Mar 2, 2021
9cfd000
Address PR comments
jjotero Mar 4, 2021
19f3054
Merge branch 'master' into test/ault-dev
jjotero Mar 4, 2021
c89f950
Do chase simultaneously in all devices
jjotero Mar 4, 2021
9884951
Merge branch 'master' into test/ault-dev
Mar 5, 2021
26e7dc5
Merge branch 'master' into test/ault-dev
Mar 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.simple_test
class GPUdgemmTest(rfm.RegressionTest):
    '''Multi-GPU DGEMM microbenchmark.

    Runs one task per node; the executable performs a DGEMM on every GPU
    of the node and reports the per-GPU performance in TF/s.
    '''

    def __init__(self):
        self.valid_systems = ['daint:gpu', 'dom:gpu',
                              'ault:amdv100', 'ault:intelv100',
                              'ault:amda100', 'ault:amdvega']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.build_system = 'Make'
        self.executable = 'dgemm.x'
        self.sanity_patterns = self.assert_num_gpus()

        # The slowest GPU in the node sets the reported figure.
        self.perf_patterns = {
            'perf': sn.min(sn.extractall(
                r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
                self.stdout, 'fp', float))
        }

        # Per-partition performance baselines in TF/s (10% lower bound).
        baselines = {
            'dom:gpu': 3.35,
            'daint:gpu': 3.35,
            'ault:amdv100': 5.25,
            'ault:intelv100': 5.25,
            'ault:amda100': 10.5,
            'ault:amdvega': 3.45,
        }
        self.reference = {
            part: {'perf': (tfs, -0.1, None, 'TF/s')}
            for part, tfs in baselines.items()
        }

        self.maintainers = ['JO', 'SK']
        self.tags = {'benchmark'}

    @sn.sanity_function
    def assert_num_gpus(self):
        # Every task must print its 'Test passed' line.
        return sn.assert_eq(
            sn.count(sn.findall(r'^\s*\[[^\]]*\]\s*Test passed', self.stdout)),
            sn.getattr(self.job, 'num_tasks'))

    @rfm.run_before('compile')
    def select_makefile(self):
        # Only the vega partition builds with HIP; everything else uses CUDA.
        if self.current_partition.fullname == 'ault:amdvega':
            self.build_system.makefile = 'makefile.hip'
        else:
            self.build_system.makefile = 'makefile.cuda'

    @rfm.run_before('compile')
    def set_gpu_arch(self):
        part = self.current_partition.fullname

        # Deal with the NVIDIA options first: map partition -> SM arch.
        sm_map = {
            'tsa:cn': '70',
            'ault:intelv100': '70',
            'ault:amdv100': '70',
            'ault:amda100': '80',
            'dom:gpu': '60',
            'daint:gpu': '60',
        }
        nvidia_sm = sm_map.get(part)
        if nvidia_sm:
            self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
            if part in {'dom:gpu', 'daint:gpu'}:
                self.modules += ['craype-accel-nvidia60']
                if part == 'dom:gpu':
                    self.modules += ['cdt-cuda']
            else:
                self.modules += ['cuda']

        # Deal with the AMD options.
        if part == 'ault:amdvega':
            self.build_system.cxxflags += ['--amdgpu-target=gfx906']
            self.modules += ['rocm']
1 change: 1 addition & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice
161 changes: 161 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* Basic DGEMM test
*
* Multiply two matrices of dimensions SIZE*SIZE filled with ones. Therefore,
* all the elements of the resulting matrix will be just SIZE.
*/

#define SIZE 1024
#define REPEAT 30

#include <iostream>
#include <unistd.h>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <functional>

#include "Xdevice/runtime.hpp"
#include "Xdevice/blas.hpp"


namespace kernels
{
    // Fill an array of `size` elements with the value one.
    // One thread per element; excess threads return immediately.
    template<class T>
    __global__ void init_as_ones(T * arr, size_t size)
    {
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        if (idx >= size)
            return;

        arr[idx] = (T)1.0;
    }

    // Count into *err every element of `arr` that differs from SIZE.
    // One thread per element; excess threads return immediately.
    template<class T>
    __global__ void verify(T * arr, size_t size, int * err)
    {
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        if (idx >= size)
            return;

        if (int(arr[idx]) != SIZE)
            atomicAdd(err, 1);
    }
}

/*
* This code uses a thread per device in the node.
* For simplicity, we define the variables below as global.
*/

#define HOST_NAME_SIZE 128
// Host name, printed in every output line to identify the node.
char hostname[HOST_NAME_SIZE];
// Floating-point operations of one SIZE x SIZE DGEMM, in teraflops.
double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
// Accumulated verification errors across all device threads (guarded by mtx).
int totalErrors = 0;
// Protects totalErrors and serializes the per-device result prints.
std::mutex mtx;

#define BLOCK_SIZE 128
/*
 * Worker routine run by one host thread per device.
 *
 * Initializes two SIZE x SIZE matrices of ones on `device`, times REPEAT
 * DGEMM calls, verifies the result matrix and accumulates errors and
 * prints the per-GPU performance under the global mutex.
 */
void dgemm(int device)
{
    XSetDevice(device);

    double * A;
    double * B;
    double * C;
    const double alpha = 1.0;
    const double beta = 0.0;

    XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
    XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
    XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);

    // Fill the input matrices with ones, one thread per element.
    kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
    kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
    XDeviceSynchronize();

    XStream_t stream;
    XStreamCreate(&stream);
    XblasHandle_t blas_handle;
    XblasCreate(&blas_handle);
    XblasSetStream(blas_handle, stream);

    // Warmup call
    XblasDgemm(blas_handle,
               XBLAS_OP_N, XBLAS_OP_N,
               SIZE, SIZE, SIZE,
               &alpha,
               (const double*)A, SIZE,
               (const double*)B, SIZE,
               &beta,
               C, SIZE);
    XDeviceSynchronize();

    // Time the execution
    XTimer t(stream);
    t.start();
    for (int i = 0; i < REPEAT; i++)
    {
        XblasDgemm(blas_handle,
                   XBLAS_OP_N, XBLAS_OP_N,
                   SIZE, SIZE, SIZE,
                   &alpha,
                   (const double*)A, SIZE,
                   (const double*)B, SIZE,
                   &beta,
                   C, SIZE);
    }

    // Calc the performance data in TFlops/sec.
    // NOTE(review): assumes t.stop() returns elapsed milliseconds — confirm
    // against the XTimer implementation in Xdevice.
    double perf = tflops/(t.stop()/REPEAT/1000.0);

    XblasDestroy(blas_handle);
    XStreamDestroy(stream);

    // Verify that the final values of C are correct.
    int * err, h_err = 0;
    XMalloc((void**)&err, sizeof(int));
    XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);

    // BUGFIX: the grid must cover all SIZE*SIZE elements of C; the previous
    // launch used (SIZE+BLOCK_SIZE-1)/BLOCK_SIZE blocks and only verified
    // the first SIZE elements.
    kernels::verify<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
    XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
    XFree(err);  // BUGFIX: the error counter was previously leaked.

    {
        std::lock_guard<std::mutex> lg(mtx);
        totalErrors += h_err;

        // Print the performance results
        printf("[%s] GPU %d: %4.2f TF/s\n", hostname, device, (float)perf);
    }
    XFree(A);
    XFree(B);
    XFree(C);
}

/*
 * Entry point: spawn one host thread per visible device, run the DGEMM
 * benchmark on each, and report whether any device produced errors.
 */
int main(int argc, char **argv)
{
    gethostname(hostname, sizeof(hostname));

    int num_devices;
    XGetDeviceCount(&num_devices);

    // Print device count
    printf("[%s] Found %d device(s).\n", hostname, num_devices);

    // One worker thread per device; each runs the dgemm routine.
    std::vector<std::thread> workers;
    for (int dev = 0; dev < num_devices; dev++)
        workers.emplace_back(dgemm, dev);

    // Wait for every device to finish.
    for (auto &w : workers)
        w.join();

    // Test if there were any errors and print the test result.
    printf("[%s] Test %s\n", hostname, totalErrors == 0 ? "passed" : "failed");

    return 0;
}
2 changes: 2 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Build the CUDA version of the DGEMM benchmark; extra flags (e.g. the
# -arch=sm_XX option) are injected by the test through CXXFLAGS.
dgemm:
	nvcc $@.cu -o $@.x ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
6 changes: 6 additions & 0 deletions cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the HIP version of the DGEMM benchmark. The GPU targets and the
# ROCm install locations can be overridden from the environment.
CXXFLAGS?=--amdgpu-target=gfx906,gfx908
ROCM_ROOT?=/opt/rocm
RSMI_ROOT?=/opt/rocm/rocm_smi

dgemm:
	hipcc -O3 $@.cu -o $@.x -DTARGET_HIP ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ void XblasDestroy(cublasHandle_t handle)
checkError( cublasDestroy(handle) );
}

// Bind a stream to a cuBLAS handle; subsequent Xblas calls made through
// this handle execute in the given stream.
void XblasSetStream(cublasHandle_t h, cudaStream_t s)
{
    checkError ( cublasSetStream(h, s) );
}

// CUDA backends of the generic Xblas GEMM entry points.
auto XblasDgemm = cublasDgemm;
auto XblasSgemm = cublasSgemm;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include <unistd.h>
#include <nvml.h>

/*
* NVML - SMI tools
*/

static inline void nvmlCheck(nvmlReturn_t err)
{
# ifdef DEBUG
Expand Down Expand Up @@ -80,4 +84,96 @@ Smi::~Smi()
}
}


/*
* ASM tools
*/

// Read the per-SM 32-bit cycle counter (%clock special register).
// The "memory" clobber keeps the compiler from moving memory accesses
// across the read.
__device__ __forceinline__ uint32_t XClock()
{
    // Clock counter
    uint32_t x;
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
    return x;
}

// Read the per-SM 64-bit cycle counter (%clock64 special register),
// which does not wrap on realistic timing windows.
__device__ __forceinline__ uint64_t XClock64()
{
    // Clock counter
    uint64_t x;
    asm volatile ("mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
    return x;
}

// Read the 32-bit cycle counter after a block-wide barrier (bar.sync 0),
// so all threads of the block start timing from a common point.
// NOTE(review): the barrier must be reached by all threads of the block.
__device__ __forceinline__ uint32_t XSyncClock()
{
    // Clock counter with a preceding barrier.
    uint32_t x;
    asm volatile ("bar.sync 0;\n\t"
                  "mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
    return x;
}

// 64-bit variant of XSyncClock: block-wide barrier followed by a read
// of the %clock64 counter.
__device__ __forceinline__ uint64_t XSyncClock64()
{
    // Clock counter with a preceding barrier.
    uint64_t x;
    asm volatile ("bar.sync 0;\n\t"
                  "mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
    return x;
}


template<class T = uint32_t>
class __XClocks
{
    /*
     * XClocks timer tool
     * Tracks the number of clock cycles between a call to the start
     * and end member functions.
     *
     * The primary template uses the 32-bit counter; the uint64_t
     * specialization below switches to %clock64. With the 32-bit
     * counter, unsigned subtraction in end() stays correct across a
     * single wrap of the counter.
     */
public:
    // Counter value captured by start().
    T startClock;
    __device__ void start()
    {
        // Barrier first so all threads of the block start together.
        startClock = XSyncClock();
    }
    __device__ T end()
    {
        // Elapsed cycles since start().
        return XClock() - startClock;
    }
};

// 64-bit specializations: read the %clock64 counter instead.
// NOTE(review): the primary template declares these members __device__ but
// the specializations omit the qualifier — confirm the compiler still
// treats them as device functions (an execution-space mismatch would fail
// at the device-side call sites).
template<>
void __XClocks<uint64_t>::start()
{
    this->startClock = XSyncClock64();
}

template<>
uint64_t __XClocks<uint64_t>::end()
{
    return XClock64() - this->startClock;
}

// Convenience aliases: XClocks64 for the 64-bit counter, XClocks for the
// default 32-bit one.
using XClocks64 = __XClocks<uint64_t>;
using XClocks = __XClocks<>;


// Measure the latency (in cycles) of the clock read itself by issuing
// two back-to-back %clock64 reads and returning their difference,
// cast to the requested type T.
template<class T>
__device__ T XClockLatency()
{
    uint64_t start = XClock64();
    uint64_t end = XClock64();
    return (T)(end-start);
}

// Return the ID of the SM executing the calling thread (%smid register).
__device__ __forceinline__ int __smId()
{
    // SM ID
    uint32_t x;
    asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory");
    return (int)x;
}

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ XMemcpyKind XMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
XMemcpyKind XMemcpyHostToHost = cudaMemcpyHostToHost;
XMemcpyKind XMemcpyDefault = cudaMemcpyDefault;

// Flag for host allocations mapped into the device address space
// (CUDA backend of the generic Xdevice constant).
#define XHostAllocMapped cudaHostAllocMapped

#endif
Loading