From d5a7f7c2d45cc32a97470c7835ef307f65328c4b Mon Sep 17 00:00:00 2001
From: Javier Otero
Date: Wed, 21 Oct 2020 16:31:33 +0200
Subject: [PATCH 01/51] Add first pointer_chase draft.

---
 .../gpu/pointer_chase/src/pointer_chase.cu    | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
new file mode 100644
index 0000000000..c6c0d9f2d2
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
@@ -0,0 +1,85 @@
+#define EXPAND2(x) x; x;
+#define EXPAND4(x) EXPAND2(x) EXPAND2(x)
+#define EXPAND8(x) EXPAND4(x) EXPAND4(x)
+#define EXPAND16(x) EXPAND8(x) EXPAND8(x)
+#define EXPAND32(x) EXPAND16(x) EXPAND16(x)
+#define EXPAND64(x) EXPAND32(x) EXPAND32(x)
+#define EXPAND128(x) EXPAND64(x) EXPAND64(x)
+#define EXPAND256(x) EXPAND128(x) EXPAND128(x)
+
+#include
+#include
+#include
+
+//using namespace std;
+
+#define NUM_NODES 200
+#define BUFFER_SIZE 400
+#define NODE_PADDING 128
+
+#if BUFFER_SIZE < NUM_NODES
+# error "NUM_NODES cannot exceed BUFFER_SIZE."
+#endif
+
+static __device__ __forceinline__ uint32_t __clock()
+{
+    uint32_t x;
+    asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
+    return x;
+}
+
+struct Node
+{
+    Node * next = nullptr;
+    char _padding[NODE_PADDING];
+};
+
+__global__ void initList(Node * head)
+{
+    // Set the head
+    Node * prev = new (head) Node();
+
+    // Init the rest of the list
+    for (int n = 1; n <= NUM_NODES; n++)
+    {
+        Node * temp = new (&(head[n])) Node();
+        prev->next = temp;
+        prev = temp;
+    }
+
+}
+
+__global__ void pointer_chase(Node * head, Node * nodeOut)
+{
+    // Create a pointer to iterate through the list
+    Node * ptr = head;
+
+    // start timer
+    uint32_t start = __clock();
+
+    // Traverse the list
+    EXPAND64(ptr = ptr->next)
+
+    // end cycle count
+    uint32_t end = __clock();
+
+    printf("Chase took %d cycles.\n", end - start);
+    *nodeOut = *ptr;
+}
+
+int main()
+{
+    // Allocate device buffer for the list
+    Node * listBuffer;
+    cudaMalloc((void**)&listBuffer, sizeof(Node)*BUFFER_SIZE);
+    initList<<<1,1>>>(listBuffer);
+    Node * result = nullptr;
+    pointer_chase<<<1,1>>>(listBuffer, result);
+    cudaDeviceSynchronize();
+
+    unsigned char buf[sizeof(int)*2];
+
+    // placement new in buf
+    int *pInt = new (buf) int(3);
+
+}
From 15ecf7193f61ac96929d5e93fed27f3e112ecd8d Mon Sep 17 00:00:00 2001
From: Javier Otero
Date: Wed, 21 Oct 2020 17:50:20 +0200
Subject: [PATCH 02/51] Bugfix on memory access.

---
 .../gpu/pointer_chase/src/pointer_chase.cu    | 67 ++++++++++++-------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
index c6c0d9f2d2..8b2ab26ec1 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
@@ -7,19 +7,12 @@
 #define EXPAND128(x) EXPAND64(x) EXPAND64(x)
 #define EXPAND256(x) EXPAND128(x) EXPAND128(x)
 
-#include
 #include
 #include
 
-//using namespace std;
 
-#define NUM_NODES 200
-#define BUFFER_SIZE 400
-#define NODE_PADDING 128
-
-#if BUFFER_SIZE < NUM_NODES
-# error "NUM_NODES cannot exceed BUFFER_SIZE."
-#endif +#define NODES 200 +#define NODE_PADDING 0 static __device__ __forceinline__ uint32_t __clock() { @@ -28,19 +21,21 @@ static __device__ __forceinline__ uint32_t __clock() return x; } + struct Node { Node * next = nullptr; char _padding[NODE_PADDING]; }; + __global__ void initList(Node * head) { // Set the head Node * prev = new (head) Node(); // Init the rest of the list - for (int n = 1; n <= NUM_NODES; n++) + for (int n = 1; n < NODES+1; n++) { Node * temp = new (&(head[n])) Node(); prev->next = temp; @@ -49,7 +44,19 @@ __global__ void initList(Node * head) } -__global__ void pointer_chase(Node * head, Node * nodeOut) + +template < unsigned int repeat > +__device__ __forceinline__ void ptrChase(Node ** ptr) +{ + (*ptr) = (*ptr)->next; + ptrChase(ptr); +} + +template<> +__device__ __forceinline__ void ptrChase<0>(Node ** ptr){} + + +__global__ void pointer_chase(Node * head) { // Create a pointer to iterate through the list Node * ptr = head; @@ -57,29 +64,41 @@ __global__ void pointer_chase(Node * head, Node * nodeOut) // start timer uint32_t start = __clock(); - // Traverse the list - EXPAND64(ptr = ptr->next) - + ptrChase<10>(&ptr); + // end cycle count uint32_t end = __clock(); - printf("Chase took %d cycles.\n", end - start); - *nodeOut = *ptr; + printf("Chase took %d cycles per node jump.\n", (end - start)/(NODES-1)); + head[0] = (*ptr); } -int main() +void devicePointerChase() { // Allocate device buffer for the list Node * listBuffer; - cudaMalloc((void**)&listBuffer, sizeof(Node)*BUFFER_SIZE); + cudaMalloc((void**)&listBuffer, sizeof(Node)*NODES); + + // Initilize the list initList<<<1,1>>>(listBuffer); - Node * result = nullptr; - pointer_chase<<<1,1>>>(listBuffer, result); + + // Do the chase + pointer_chase<<<1,1>>>(listBuffer); cudaDeviceSynchronize(); - unsigned char buf[sizeof(int)*2] ; - - // placement new in buf - int *pInt = new (buf) int(3); + cudaFree(listBuffer); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + { + std::cerr << cudaGetErrorString(err) << std::endl; + } +} +int main() +{ + for (int i = 0; i < 10; i++) + { + devicePointerChase(); + } + printf("end.\n"); } From 9efa7716b51934b1f501f71d664cb5832d1dc9a0 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Fri, 23 Oct 2020 12:06:28 +0200 Subject: [PATCH 03/51] Add random node placement on the linked list. --- .../gpu/pointer_chase/src/pointer_chase.cu | 245 +++++++++++++++--- 1 file changed, 203 insertions(+), 42 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 8b2ab26ec1..141eedbb4e 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -1,19 +1,34 @@ -#define EXPAND2(x) x; x; -#define EXPAND4(x) EXPAND2(x) EXPAND2(x) -#define EXPAND8(x) EXPAND4(x) EXPAND4(x) -#define EXPAND16(x) EXPAND8(x) EXPAND8(x) -#define EXPAND32(x) EXPAND16(x) EXPAND16(x) -#define EXPAND64(x) EXPAND32(x) EXPAND32(x) -#define EXPAND128(x) EXPAND24(x) EXPAND64(x) -#define EXPAND256(x) EXPAND128(x) EXPAND128(x) - #include #include +#include +#include +#include - -#define NODES 200 +#define NODES 128 #define NODE_PADDING 0 +#ifndef NODE_STRIDE +#define NODE_STRIDE 1 +#endif + +#ifndef BUFFER_SIZE +# define BUFFER_SIZE NODES*NODE_STRIDE +#endif + +#if (BUFFER_SIZE < NODES*NODE_STRIDE) +# error "Buffer size cannot be lower than the number of nodes." 
+#endif + +void checkErrors() +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + { + std::cerr << cudaGetErrorString(err) << std::endl; + } +} + + static __device__ __forceinline__ uint32_t __clock() { uint32_t x; @@ -22,83 +37,229 @@ static __device__ __forceinline__ uint32_t __clock() } +static __device__ __forceinline__ uint32_t __smId() +{ + uint32_t x; + asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory"); + return x; +} + + struct Node { Node * next = nullptr; - char _padding[NODE_PADDING]; + char _padding[8*NODE_PADDING]; }; -__global__ void initList(Node * head) +__global__ void initialize_list(Node * head, int stride = 1) +{ + // Set the head + Node * prev = new (&(head[0])) Node(); + + // Init the rest of the list + for (int n = 1; n < NODES; n++) + { + Node * temp = new (&(head[n*stride])) Node(); + prev->next = temp; + prev = temp; + } + +} + + +__global__ void initialize_random_list(Node * buffer, uint32_t *indices) { // Set the head - Node * prev = new (head) Node(); + Node * prev = new (&(buffer[indices[0]])) Node(); // Init the rest of the list - for (int n = 1; n < NODES+1; n++) + for (int n = 1; n < NODES; n++) { - Node * temp = new (&(head[n])) Node(); + Node * temp = new (&(buffer[indices[n]])) Node(); prev->next = temp; prev = temp; } } +#ifdef VOLATILE +# define __VOLATILE__ volatile +#else +# define __VOLATILE__ +#endif + template < unsigned int repeat > -__device__ __forceinline__ void ptrChase(Node ** ptr) +__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr) { +#ifdef TIME_EACH_STEP + uint32_t t1 = __clock(); +#endif (*ptr) = (*ptr)->next; - ptrChase(ptr); +#ifdef TIME_EACH_STEP + uint32_t t2 = __clock(); + printf("Single jump took %d cycles.\n" , t2-t1); +#endif + nextNode(ptr); } template<> -__device__ __forceinline__ void ptrChase<0>(Node ** ptr){} +__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr){} -__global__ void pointer_chase(Node * head) +__global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) { // Create a pointer to iterate through the list - Node * ptr = head; + __VOLATILE__ Node * ptr = &(buffer[headIndex]); // start timer uint32_t start = __clock(); - ptrChase<10>(&ptr); + nextNode(&ptr); // end cycle count uint32_t end = __clock(); + uint32_t smId = __smId(); + printf("Chase took on average %d cycles per node jump (SM %d).\n", (end - start)/(NODES-1), smId); + + // Join the tail with the head. + if (ptr->next == nullptr) + { + ptr->next = &(buffer[headIndex]); + } - printf("Chase took %d cycles per node jump.\n", (end - start)/(NODES-1)); - head[0] = (*ptr); } -void devicePointerChase() + +struct List { - // Allocate device buffer for the list - Node * listBuffer; - cudaMalloc((void**)&listBuffer, sizeof(Node)*NODES); + Node * buffer = nullptr; + uint32_t headIndex = 0; - // Initilize the list - initList<<<1,1>>>(listBuffer); + static void info(int n) + { + printf("Creating Linked list:\n"); + printf(" - Node size: %d\n", sizeof(Node)); + printf(" - Number of nodes: %d:\n", n); + printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*BUFFER_SIZE)/1024.0/1024); + } - // Do the chase - pointer_chase<<<1,1>>>(listBuffer); - cudaDeviceSynchronize(); + void initialize(int mode=0) + { + if (mode < 0 || mode > 1) + { + printf("Unknown list initialization scheme. 
Default to 0."); + mode = 0; + } - cudaFree(listBuffer); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) + if (mode == 0) + { + initialize_list<<<1,1>>>(buffer, NODE_STRIDE); + cudaDeviceSynchronize(); + } + else + { + // Random number engine. + std::mt19937_64 rng; + uint64_t timeSeed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + std::seed_seq ss{uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed>>32)}; + rng.seed(ss); + std::uniform_real_distribution unif(0, 1); + + uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES); + // Create set to keep track of the assigned indices. + std::set s = {}; + for (int i = 0; i < NODES; i++) + { + // Get a random index. + uint32_t currentIndex = (uint32_t)(unif(rng)*BUFFER_SIZE); + + // If already present in the set, find another alternative index. + if(s.find(currentIndex) != s.end()) + { + while (s.find(currentIndex) != s.end()) + { + if (currentIndex < NODES-1) + { + currentIndex++; + } + else + { + currentIndex = 0; + } + } + + } + nodeIndices[i] = currentIndex; + s.insert(currentIndex); + } + uint32_t * d_nodeIndices; + cudaMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES); + cudaMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, cudaMemcpyHostToDevice); + initialize_random_list<<<1,1>>>(buffer, d_nodeIndices); + headIndex = nodeIndices[0]; + free(nodeIndices); + cudaFree(d_nodeIndices); + } + + cudaDeviceSynchronize(); + checkErrors(); + } + + void traverse() { - std::cerr << cudaGetErrorString(err) << std::endl; + make_circular<<<1,1>>>(buffer, headIndex); + cudaDeviceSynchronize(); + checkErrors(); } -} -int main() +}; + + +struct DeviceList : public List { - for (int i = 0; i < 10; i++) + DeviceList(int n) + { + List::info(n); + cudaMalloc((void**)&buffer, sizeof(Node)*BUFFER_SIZE); + } + ~DeviceList() { - devicePointerChase(); + cudaFree(buffer); } - printf("end.\n"); +}; + +struct HostList : public List +{ + Node * h_buffer; + HostList(int n) + { + List::info(n); + cudaHostAlloc((void**)&h_buffer, sizeof(Node)*BUFFER_SIZE, cudaHostAllocMapped); + cudaHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); + } + ~HostList() + { + cudaFreeHost(buffer); + } +}; + + +template < class LIST > +void devicePointerChase(int m) +{ + LIST l(NODES); + + l.initialize(m); + l.traverse(); + +} + +int main() +{ + devicePointerChase(0); + devicePointerChase(1); + devicePointerChase(0); + devicePointerChase(1); } From 0c365a833354355651151e1f076484bc67164954 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Fri, 23 Oct 2020 17:18:54 +0200 Subject: [PATCH 04/51] Add support for command line args. --- .../gpu/pointer_chase/src/pointer_chase.cu | 220 ++++++++++++++---- 1 file changed, 177 insertions(+), 43 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 141eedbb4e..89ab03ec0a 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -1,22 +1,45 @@ #include +#include #include +#include #include #include #include -#define NODES 128 -#define NODE_PADDING 0 +/* + ~~ GPU Linked list pointer chase algorithm ~~ + Times in clock cycles the time it takes to jump from one node to the next + in a singly linked list. -#ifndef NODE_STRIDE -#define NODE_STRIDE 1 -#endif + The list can be initialized sequentially or with a random node ordering. 
This + can be controlled passing the command line argument "--rand". -#ifndef BUFFER_SIZE -# define BUFFER_SIZE NODES*NODE_STRIDE -#endif + The stride and the full buffer size can be set with "--stride" and "--buffer", + both in number of nodes. + + The macro NODES sets the total number of nodes in the list. Node that the + list traversal is 'unrolled' inlining a recursive template, and this will + not work if you use a large number of nodes. + + The nodes can be padded with an arbitrary size controlled by the NODE_PADDING + macro (in Bytes). -#if (BUFFER_SIZE < NODES*NODE_STRIDE) -# error "Buffer size cannot be lower than the number of nodes." + The LIST_TYPE macro dictates where the list is allocated. If DeviceList is used + (default option) the linked list is allocated in device memory. In contrast, if + HostList is used, the list is allocated as host's pinned memory. + + The links of the list can be made vlatile defining the macro VOLATILE. + + By default, the code returns the aveage number of cycles per jump, but this can + be changed to return the cycle count on a per-jump basis by defining the flag + TIME_EACH_STEP. +*/ + +#define NODES 64 +#define NODE_PADDING 0 + +#ifndef LIST_TYPE +# define LIST_TYPE DeviceList #endif void checkErrors() @@ -45,6 +68,21 @@ static __device__ __forceinline__ uint32_t __smId() } +static __device__ uint32_t __clockLatency() +{ + uint32_t start = __clock(); + uint32_t end = __clock(); + return end-start; +} + + +__global__ void clockLatency() +{ + uint32_t clkLatency = __clockLatency(); + printf(" - Clock latency is %d.\n", clkLatency); +} + + struct Node { Node * next = nullptr; @@ -83,6 +121,27 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) } + +__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) +{ + uint32_t count = 0; + Node * head = &(buffer[headIndex]); + Node * ptr = head; + while(ptr->next != nullptr || count < NODES-1) + { + ptr = ptr->next; + count++; + } + + // Silly dep. to tell the compiler not to throw away this kernel. + if (ptr->next == head) + { + printf("You had a circular list :(\n"); + } + +} + + #ifdef VOLATILE # define __VOLATILE__ volatile #else @@ -91,39 +150,61 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) template < unsigned int repeat > -__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr) +__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs) { #ifdef TIME_EACH_STEP uint32_t t1 = __clock(); #endif (*ptr) = (*ptr)->next; #ifdef TIME_EACH_STEP - uint32_t t2 = __clock(); - printf("Single jump took %d cycles.\n" , t2-t1); + (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. + *timings = __clock() -t1; // Time the jump #endif - nextNode(ptr); + nextNode(ptr, timings+1, ptrs+1); } template<> -__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr){} +__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs){} __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) { + + // These are used to prevent ILP when timing each jump. 
+ __shared__ uint32_t timings[NODES-1]; + __shared__ Node * ptrs[NODES-1]; + uint32_t sum = 0; + // Create a pointer to iterate through the list __VOLATILE__ Node * ptr = &(buffer[headIndex]); +#ifndef TIME_EACH_STEP // start timer uint32_t start = __clock(); +#endif + + nextNode(&ptr, timings, ptrs); - nextNode(&ptr); - +#ifndef TIME_EACH_STEP // end cycle count uint32_t end = __clock(); - uint32_t smId = __smId(); - printf("Chase took on average %d cycles per node jump (SM %d).\n", (end - start)/(NODES-1), smId); + sum = end - start; +#else + printf("Latency for each node jump:\n"); + for (uint32_t i = 0; i < NODES-1; i++) + { + printf("%d\n", timings[i]); + sum += timings[i]; + } + if (ptr == ptrs[0]) + { + printf("This is some data dependency that will never be executed."); + } +#endif + + printf("Chase took on average %d cycles per node jump (SM %d).\n", sum/(NODES-1), __smId()); - // Join the tail with the head. + // Join the tail with the head (just for the data dependency). if (ptr->next == nullptr) { ptr->next = &(buffer[headIndex]); @@ -136,13 +217,19 @@ struct List { Node * buffer = nullptr; uint32_t headIndex = 0; + size_t buffSize; + size_t stride; - static void info(int n) + List(size_t bSize, size_t st) : buffSize(bSize), stride(st) {}; + + static void info(size_t n, size_t buffSize) { printf("Creating Linked list:\n"); printf(" - Node size: %d\n", sizeof(Node)); printf(" - Number of nodes: %d:\n", n); - printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*BUFFER_SIZE)/1024.0/1024); + printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024); + clockLatency<<<1,1>>>(); + cudaDeviceSynchronize(); } void initialize(int mode=0) @@ -155,7 +242,7 @@ struct List if (mode == 0) { - initialize_list<<<1,1>>>(buffer, NODE_STRIDE); + initialize_list<<<1,1>>>(buffer, stride); cudaDeviceSynchronize(); } else @@ -166,15 +253,15 @@ struct List std::seed_seq ss{uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed>>32)}; rng.seed(ss); std::uniform_real_distribution unif(0, 1); - - uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES); + + uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES); // Create set to keep track of the assigned indices. - std::set s = {}; + std::set s = {}; for (int i = 0; i < NODES; i++) { // Get a random index. - uint32_t currentIndex = (uint32_t)(unif(rng)*BUFFER_SIZE); - + uint32_t currentIndex = (uint32_t)(unif(rng)*buffSize); + // If already present in the set, find another alternative index. 
if(s.find(currentIndex) != s.end()) { @@ -200,7 +287,7 @@ struct List initialize_random_list<<<1,1>>>(buffer, d_nodeIndices); headIndex = nodeIndices[0]; free(nodeIndices); - cudaFree(d_nodeIndices); + cudaFree(d_nodeIndices); } cudaDeviceSynchronize(); @@ -208,6 +295,12 @@ struct List } void traverse() + { + simple_traverse<<<1,1>>>(buffer, headIndex); + cudaDeviceSynchronize(); + checkErrors(); + } + void time_traversal() { make_circular<<<1,1>>>(buffer, headIndex); cudaDeviceSynchronize(); @@ -219,10 +312,10 @@ struct List struct DeviceList : public List { - DeviceList(int n) + DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) { - List::info(n); - cudaMalloc((void**)&buffer, sizeof(Node)*BUFFER_SIZE); + List::info(n, buffSize); + cudaMalloc((void**)&buffer, sizeof(Node)*buffSize); } ~DeviceList() { @@ -233,10 +326,10 @@ struct DeviceList : public List struct HostList : public List { Node * h_buffer; - HostList(int n) + HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) { - List::info(n); - cudaHostAlloc((void**)&h_buffer, sizeof(Node)*BUFFER_SIZE, cudaHostAllocMapped); + List::info(n, buffSize); + cudaHostAlloc((void**)&h_buffer, sizeof(Node)*buffSize, cudaHostAllocMapped); cudaHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); } ~HostList() @@ -247,19 +340,60 @@ struct HostList : public List template < class LIST > -void devicePointerChase(int m) +void devicePointerChase(int m, size_t buffSize, size_t stride) { - LIST l(NODES); + LIST l(NODES, buffSize, stride); l.initialize(m); - l.traverse(); + l.traverse(); // warmup kernel + l.time_traversal(); } -int main() +int main(int argc, char ** argv) { - devicePointerChase(0); - devicePointerChase(1); - devicePointerChase(0); - devicePointerChase(1); + // Set program defaults before parsing the command line args. + int list_init = 0; + size_t stride = 1; + size_t buffSize = NODES*stride; + + // Parse the command line args. + for (int i = 0; i < argc; i++) + { + std::string str = argv[i]; + if (str == "--help" || str == "-h") + { + std::cout << "--rand : Initializes the linked list with nodes in random order." << std::endl; + std::cout << "--stride # : Sets the stride between the nodes in the list (in number of nodes)." << std::endl; + std::cout << " If --rand is used, this parameter has no effect." << std::endl; + std::cout << "--buffer # : Sets the size of the buffer where the linked list is allocated on. " << std::endl; + std::cout << " The number indicates the size of the buffer in list nodes." << std::endl; + std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; + return 0; + } + else if (str == "--rand") + { + list_init = 1; + } + else if (str == "--stride") + { + stride = std::stoi((std::string)argv[++i]); + buffSize = NODES*stride; + } + else if (str == "--buffer") + { + buffSize = std::stoi((std::string)argv[++i]); + } + } + + // Sanity of the command line args. + if (buffSize < NODES*stride) + { + std::cerr << "Buffer is not large enough to fit the list." << std::endl; + return 1; + } + + + // Run the pointer chase. + devicePointerChase(list_init, buffSize, stride); } From 5dee5bf9bcab0df856418834ced609e4e2a712c6 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 26 Oct 2020 12:16:37 +0100 Subject: [PATCH 05/51] Reword help menu. 
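The reworded line reflects what the code actually does: even with --rand, the
--stride value still sizes the default allocation (buffSize = NODES*stride in
main), and the random node indices are then drawn from that whole buffer. A
rough sketch of the interaction, using the names from patch 04 (the helper
function itself is hypothetical, not part of the sources):

    // Hypothetical condensation of the argument handling in main().
    size_t effective_buffer(size_t nodes, size_t stride, size_t buffer_arg)
    {
        size_t buffSize = nodes * stride;  // --stride scales the default buffer
        if (buffer_arg != 0)
            buffSize = buffer_arg;         // an explicit --buffer overrides it
        return buffSize;                   // --rand scatters nodes over [0, buffSize)
    }
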
--- .../microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 89ab03ec0a..db392e8cc9 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -365,7 +365,7 @@ int main(int argc, char ** argv) { std::cout << "--rand : Initializes the linked list with nodes in random order." << std::endl; std::cout << "--stride # : Sets the stride between the nodes in the list (in number of nodes)." << std::endl; - std::cout << " If --rand is used, this parameter has no effect." << std::endl; + std::cout << " If --rand is used, this parameter just changes the buffer size." << std::endl; std::cout << "--buffer # : Sets the size of the buffer where the linked list is allocated on. " << std::endl; std::cout << " The number indicates the size of the buffer in list nodes." << std::endl; std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; From e39c71058b6a8595eef50c335382387970d7f200 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Mon, 9 Nov 2020 18:00:46 +0100 Subject: [PATCH 06/51] Update pointer_chase test to use the XDevice lib. --- .../src/Xdevice/cuda/tools.hpp | 26 +++++ .../src/Xdevice/cuda/types.hpp | 2 + .../src/Xdevice/cuda/utils.hpp | 15 +++ .../gpu/pointer_chase/src/pointer_chase.cu | 97 ++++++++++--------- 4 files changed, 94 insertions(+), 46 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp index 3e63b51e8b..e668ec0e32 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp @@ -5,6 +5,10 @@ #include #include +/* + * NVML - SMI tools + */ + static inline void nvmlCheck(nvmlReturn_t err) { # ifdef DEBUG @@ -67,4 +71,26 @@ Smi::~Smi() } } + +/* + * ASM tools + */ + +__device__ __forceinline__ uint32_t __ownClock() +{ + // Clock counter + uint32_t x; + asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory"); + return x; +} + + +static __device__ __forceinline__ uint32_t __smId() +{ + // SM ID + uint32_t x; + asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory"); + return x; +} + #endif diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/types.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/types.hpp index 705be8a53c..c0beed998e 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/types.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/types.hpp @@ -13,4 +13,6 @@ XMemcpyKind XMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice; XMemcpyKind XMemcpyHostToHost = cudaMemcpyHostToHost; XMemcpyKind XMemcpyDefault = cudaMemcpyDefault; +#define XHostAllocMapped cudaHostAllocMapped + #endif diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp index d9f9271462..cbbee8df2b 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp @@ -28,6 +28,11 @@ void XMallocHost(void ** data, size_t size) 
checkError( cudaMallocHost(data, size) ); } +void XHostAlloc(void** pHost, size_t size, unsigned int flags) +{ + checkError( cudaHostAlloc(pHost, size, flags) ); +} + void XFreeHost(void * data) { checkError( cudaFreeHost(data) ); @@ -98,6 +103,16 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( cudaMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } +void XMemcpy(void * in, void * out, size_t size, cudaMemcpyKind dir) +{ + checkError( cudaMemcpy(out, in, size, dir) ); +} + +void XHostGetDevicePointer(void** device, void* host, unsigned int flags) +{ + checkError( cudaHostGetDevicePointer(device, host, flags) ); +} + int XGetLastError() { cudaError_t err = cudaGetLastError(); diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index db392e8cc9..356fe697e1 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -6,6 +6,9 @@ #include #include +// Include the CUDA/HIP wrappers from the other test for now. +#include "../../memory_bandwidth/src/Xdevice/runtime.hpp" + /* ~~ GPU Linked list pointer chase algorithm ~~ Times in clock cycles the time it takes to jump from one node to the next @@ -52,26 +55,10 @@ void checkErrors() } -static __device__ __forceinline__ uint32_t __clock() -{ - uint32_t x; - asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory"); - return x; -} - - -static __device__ __forceinline__ uint32_t __smId() -{ - uint32_t x; - asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory"); - return x; -} - - static __device__ uint32_t __clockLatency() { - uint32_t start = __clock(); - uint32_t end = __clock(); + uint32_t start = __ownClock(); + uint32_t end = __ownClock(); return end-start; } @@ -83,13 +70,18 @@ __global__ void clockLatency() } +/* + * Linked list definitions + */ + +// The node struct Node { Node * next = nullptr; char _padding[8*NODE_PADDING]; }; - +// List serial initializer __global__ void initialize_list(Node * head, int stride = 1) { // Set the head @@ -105,7 +97,7 @@ __global__ void initialize_list(Node * head, int stride = 1) } - +// List random initializer __global__ void initialize_random_list(Node * buffer, uint32_t *indices) { // Set the head @@ -121,7 +113,7 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) } - +// Simple list traverse without any timers __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) { uint32_t count = 0; @@ -148,25 +140,38 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) # define __VOLATILE__ #endif - +/* + * Timed list traversal. This implementation is recursive (because it's less code) so you have to + * watch out to not exceed the recursion limits. The functions are force-inlined, so the PTX code + * looks identical as if you were to unwrap the recursion manually. + * + * Depending on the compiler flags used, the timing can either measure each node jump, or the entire + * list traversal as a whole. + */ template < unsigned int repeat > __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs) { -#ifdef TIME_EACH_STEP - uint32_t t1 = __clock(); -#endif +# ifdef TIME_EACH_STEP + uint32_t t1 = __ownClock(); +# endif (*ptr) = (*ptr)->next; -#ifdef TIME_EACH_STEP +# ifdef TIME_EACH_STEP (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. 
- *timings = __clock() -t1; // Time the jump -#endif + *timings = __ownClock() - t1; // Time the jump +# endif + + // Keep traversing the list. nextNode(ptr, timings+1, ptrs+1); } +// Specialize the function to break the recursion. template<> __device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs){} +/* List traversal to make a singly-linked list circular. This is just to have a data dependency and + * cover from a potential compiler optimization that might throw the list traversal away. + */ __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) { @@ -180,14 +185,14 @@ __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) #ifndef TIME_EACH_STEP // start timer - uint32_t start = __clock(); + uint32_t start = __ownClock(); #endif nextNode(&ptr, timings, ptrs); #ifndef TIME_EACH_STEP // end cycle count - uint32_t end = __clock(); + uint32_t end = __ownClock(); sum = end - start; #else printf("Latency for each node jump:\n"); @@ -229,7 +234,7 @@ struct List printf(" - Number of nodes: %d:\n", n); printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024); clockLatency<<<1,1>>>(); - cudaDeviceSynchronize(); + XDeviceSynchronize(); } void initialize(int mode=0) @@ -243,7 +248,7 @@ struct List if (mode == 0) { initialize_list<<<1,1>>>(buffer, stride); - cudaDeviceSynchronize(); + XDeviceSynchronize(); } else { @@ -282,29 +287,29 @@ struct List s.insert(currentIndex); } uint32_t * d_nodeIndices; - cudaMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES); - cudaMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, cudaMemcpyHostToDevice); + XMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES); + XMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice); initialize_random_list<<<1,1>>>(buffer, d_nodeIndices); headIndex = nodeIndices[0]; free(nodeIndices); - cudaFree(d_nodeIndices); + XFree(d_nodeIndices); } - cudaDeviceSynchronize(); - checkErrors(); + XDeviceSynchronize(); + //checkErrors(); } void traverse() { simple_traverse<<<1,1>>>(buffer, headIndex); - cudaDeviceSynchronize(); - checkErrors(); + XDeviceSynchronize(); + //checkErrors(); } void time_traversal() { make_circular<<<1,1>>>(buffer, headIndex); - cudaDeviceSynchronize(); - checkErrors(); + XDeviceSynchronize(); + //checkErrors(); } }; @@ -315,11 +320,11 @@ struct DeviceList : public List DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) { List::info(n, buffSize); - cudaMalloc((void**)&buffer, sizeof(Node)*buffSize); + XMalloc((void**)&buffer, sizeof(Node)*buffSize); } ~DeviceList() { - cudaFree(buffer); + XFree(buffer); } }; @@ -329,12 +334,12 @@ struct HostList : public List HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) { List::info(n, buffSize); - cudaHostAlloc((void**)&h_buffer, sizeof(Node)*buffSize, cudaHostAllocMapped); - cudaHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); + XHostAlloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); + XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); } ~HostList() { - cudaFreeHost(buffer); + XFreeHost(buffer); } }; From 3b1eab4cc3c73e4a37cdf4fdd188215a6ec7bf42 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 10 Nov 2020 11:59:41 +0100 Subject: [PATCH 07/51] Pointer chase ported to HIP. 
--- .../src/Xdevice/cuda/tools.hpp | 5 +-- .../src/Xdevice/cuda/utils.hpp | 2 +- .../src/Xdevice/hip/tools.hpp | 19 +++++++++ .../src/Xdevice/hip/types.hpp | 3 +- .../src/Xdevice/hip/utils.hpp | 15 +++++++ .../src/makefile_pointerchase.hip | 7 ++++ .../gpu/pointer_chase/src/pointer_chase.cu | 42 ++++++------------- 7 files changed, 59 insertions(+), 34 deletions(-) create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp index e668ec0e32..e50679ad19 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp @@ -84,13 +84,12 @@ __device__ __forceinline__ uint32_t __ownClock() return x; } - -static __device__ __forceinline__ uint32_t __smId() +__device__ __forceinline__ int __smId() { // SM ID uint32_t x; asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory"); - return x; + return (int)x; } #endif diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp index cbbee8df2b..e81a9d864d 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp @@ -28,7 +28,7 @@ void XMallocHost(void ** data, size_t size) checkError( cudaMallocHost(data, size) ); } -void XHostAlloc(void** pHost, size_t size, unsigned int flags) +void XHostMalloc(void** pHost, size_t size, unsigned int flags) { checkError( cudaHostAlloc(pHost, size, flags) ); } diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp index 2704c8427f..b10aa4fa72 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp @@ -68,4 +68,23 @@ Smi::~Smi() } } + +/* + * ASM tools + */ + +__device__ __forceinline__ uint32_t __ownClock() +{ + // Clock counter + uint64_t x; + asm volatile ("s_memtime %0" : "=r"(x)); + return (uint32_t)x; +} + +__device__ __forceinline__ int __smId() +{ + // NOT possible to retrieve the workgroup ID with AMD GPUs + return -1; +} + #endif diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/types.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/types.hpp index c2eb8ccc9c..4e9223ff71 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/types.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/types.hpp @@ -13,6 +13,7 @@ XMemcpyKind XMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; XMemcpyKind XMemcpyHostToHost = hipMemcpyHostToHost; XMemcpyKind XMemcpyDefault = hipMemcpyDefault; - +// This flag is ignored by ROCm +#define XHostAllocMapped 0x02 #endif diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp index 94b8d77c28..14039d9134 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp @@ -28,6 +28,11 @@ void XMallocHost(void ** data, size_t size) checkError( 
hipHostMalloc(data, size) ); } +void XHostMalloc(void** pHost, size_t size, unsigned int flags) +{ + checkError( hipHostMalloc(pHost, size, flags) ); +} + void XFreeHost(void * data) { checkError( hipHostFree(data) ); @@ -98,6 +103,16 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( hipMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } +void XMemcpy(void * in, void * out, size_t size, hipMemcpyKind dir) +{ + checkError( hipMemcpy(out, in, size, dir) ); +} + +void XHostGetDevicePointer(void** device, void* host, unsigned int flags) +{ + checkError( hipHostGetDevicePointer(device, host, flags) ); +} + int XGetLastError() { hipError_t err = hipGetLastError(); diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip new file mode 100644 index 0000000000..d065d38ad7 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip @@ -0,0 +1,7 @@ +ROCBLAS_ROOT=/opt/rocm-3.9.0/rocblas +ROCM_ROOT=/opt/rocm-3.9.0 +RSMI_ROOT=/opt/rocm-3.9.0/rocm_smi +AMDGPU_TARGET=gfx906,gfx908 + +test: + hipcc -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma --amdgpu-target=${AMDGPU_TARGET} -I${ROCM_ROOT}/include -I${ROCTRACER_ROOT}/include -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64 diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 356fe697e1..2003881432 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -45,17 +45,7 @@ # define LIST_TYPE DeviceList #endif -void checkErrors() -{ - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) - { - std::cerr << cudaGetErrorString(err) << std::endl; - } -} - - -static __device__ uint32_t __clockLatency() +__device__ uint32_t __clockLatency() { uint32_t start = __ownClock(); uint32_t end = __ownClock(); @@ -227,11 +217,11 @@ struct List List(size_t bSize, size_t st) : buffSize(bSize), stride(st) {}; - static void info(size_t n, size_t buffSize) + void info(size_t n, size_t buffSize) { printf("Creating Linked list:\n"); - printf(" - Node size: %d\n", sizeof(Node)); - printf(" - Number of nodes: %d:\n", n); + printf(" - Node size: %lu\n", sizeof(Node)); + printf(" - Number of nodes: %lu:\n", n); printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024); clockLatency<<<1,1>>>(); XDeviceSynchronize(); @@ -268,21 +258,18 @@ struct List uint32_t currentIndex = (uint32_t)(unif(rng)*buffSize); // If already present in the set, find another alternative index. 
- if(s.find(currentIndex) != s.end()) + while (s.find(currentIndex) != s.end()) { - while (s.find(currentIndex) != s.end()) + if (currentIndex < NODES-1) { - if (currentIndex < NODES-1) - { - currentIndex++; - } - else - { - currentIndex = 0; - } + currentIndex++; + } + else + { + currentIndex = 0; } - } + nodeIndices[i] = currentIndex; s.insert(currentIndex); } @@ -296,20 +283,17 @@ struct List } XDeviceSynchronize(); - //checkErrors(); } void traverse() { simple_traverse<<<1,1>>>(buffer, headIndex); XDeviceSynchronize(); - //checkErrors(); } void time_traversal() { make_circular<<<1,1>>>(buffer, headIndex); XDeviceSynchronize(); - //checkErrors(); } }; @@ -334,7 +318,7 @@ struct HostList : public List HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) { List::info(n, buffSize); - XHostAlloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); + XHostMalloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); } ~HostList() From 8552a5f60b3defeb9112ebc59f471b712d145545 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 10 Nov 2020 13:46:02 +0100 Subject: [PATCH 08/51] Rename src and dst pointers in dev copy functions. --- .../gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp | 8 ++++---- .../gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp index e81a9d864d..ec1a8b0b83 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp @@ -43,9 +43,9 @@ void XMalloc(void ** data, size_t size) checkError( cudaMalloc(data, size) ); } -void XMemcpyAsync(void * in, void * out, size_t size, cudaMemcpyKind dir, cudaStream_t stream) +void XMemcpyAsync(void * dst, void * src, size_t size, cudaMemcpyKind dir, cudaStream_t stream) { - checkError( cudaMemcpyAsync(out, in, size, dir, stream) ); + checkError( cudaMemcpyAsync(dst, src, size, dir, stream) ); } void XMemset( void * in, int val, size_t size) @@ -103,9 +103,9 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( cudaMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } -void XMemcpy(void * in, void * out, size_t size, cudaMemcpyKind dir) +void XMemcpy(void * dst, void * src, size_t size, cudaMemcpyKind dir) { - checkError( cudaMemcpy(out, in, size, dir) ); + checkError( cudaMemcpy(dst, src, size, dir) ); } void XHostGetDevicePointer(void** device, void* host, unsigned int flags) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp index 14039d9134..5d21299150 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp @@ -43,9 +43,9 @@ void XMalloc(void ** data, size_t size) checkError( hipMalloc(data, size) ); } -void XMemcpyAsync(void * in, void * out, size_t size, hipMemcpyKind dir, hipStream_t stream) +void XMemcpyAsync(void * dst, void * src, size_t size, hipMemcpyKind dir, hipStream_t stream) { - checkError( hipMemcpyAsync(out, in, size, dir, stream) ); + checkError( hipMemcpyAsync(dst, src, size, dir, stream) ); } void XMemset( 
void * in, int val, size_t size) @@ -103,9 +103,9 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( hipMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } -void XMemcpy(void * in, void * out, size_t size, hipMemcpyKind dir) +void XMemcpy(void * dst, void * src, size_t size, hipMemcpyKind dir) { - checkError( hipMemcpy(out, in, size, dir) ); + checkError( hipMemcpy(dst, src, size, dir) ); } void XHostGetDevicePointer(void** device, void* host, unsigned int flags) From 055be799ee29649377af38597c3c7267abfa96d3 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 10 Nov 2020 18:17:06 +0100 Subject: [PATCH 09/51] Add node ID to the test prints. --- .../gpu/pointer_chase/pointer_chase.py | 45 ++++++++++++++++ .../gpu/pointer_chase/src/makefile.cuda | 3 ++ .../gpu/pointer_chase/src/makefile.hip | 5 ++ .../src/makefile_pointerchase.hip | 7 --- .../gpu/pointer_chase/src/pointer_chase.cu | 51 +++++++++++++++---- 5 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.cuda create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip delete mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py new file mode 100644 index 0000000000..025fb1341e --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -0,0 +1,45 @@ +# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe.utility.sanity as sn +import reframe as rfm + + +@rfm.simple_test +class GpuPointerChase(rfm.RegressionTest): + def __init__(self): + self.valid_systems = ['ault:intelv100', 'ault:amdv100', + 'ault:amda100'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.exclusive_access = True + self.pre_build_cmds = ['cp makefile.cuda Makefile'] + self.build_system = 'Make' + self.executable = 'pChase.x' + self.num_tasks = 0 + self.num_tasks_per_node = 1 + + + + @rfm.run_before('compile') + def set_gpu_arch(self): + cp = self.current_partition.fullname + if cp[-4:] == 'v100': + nvidia_sm = '70' + elif cp[-4:] == 'a100': + nvidia_sm = '80' + else + nvidia_sm = None + + self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}'] + + @rfm.run_before('run') + def set_num_gpus_per_node(self): + cp = self.current_partition.fullname + if cp in {'ault:intelv100', 'ault:amda100'}: + self.num_gpus_per_node = 4 + elif cp in {'ault:amdv100'}: + self.num_gpus_per_node = 2 + else: + self.num_gpus_per_node = 1 diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.cuda b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.cuda new file mode 100644 index 0000000000..9ab7114592 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.cuda @@ -0,0 +1,3 @@ +pointerChase: + nvcc pointer_chase.cu -std=c++11 ${CXXFLAGS} -lnvidia-ml -O3 -o pChase.x + diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip new file mode 100644 index 0000000000..7ccc63176f --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip @@ -0,0 +1,5 @@ +RSMI_ROOT=/opt/rocm-3.9.0/rocm_smi +AMDGPU_TARGET=gfx906,gfx908 + +pointerChase: + hipcc -o pChase.x -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma --amdgpu-target=${AMDGPU_TARGET} -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64 diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip deleted file mode 100644 index d065d38ad7..0000000000 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile_pointerchase.hip +++ /dev/null @@ -1,7 +0,0 @@ -ROCBLAS_ROOT=/opt/rocm-3.9.0/rocblas -ROCM_ROOT=/opt/rocm-3.9.0 -RSMI_ROOT=/opt/rocm-3.9.0/rocm_smi -AMDGPU_TARGET=gfx906,gfx908 - -test: - hipcc -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma --amdgpu-target=${AMDGPU_TARGET} -I${ROCM_ROOT}/include -I${ROCTRACER_ROOT}/include -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64 diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 2003881432..321a5fd8d7 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -45,6 +45,9 @@ # define LIST_TYPE DeviceList #endif +#ifndef HOSTNAME_SIZE +# define HOSTNAME_SIZE 80 +#endif __device__ uint32_t __clockLatency() { uint32_t start = __ownClock(); @@ -162,7 +165,7 @@ __device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t /* List traversal to make a singly-linked list circular. This is just to have a data dependency and * cover from a potential compiler optimization that might throw the list traversal away. 
*/ -__global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) +__global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex, int dev_id, char * nid) { // These are used to prevent ILP when timing each jump. @@ -185,10 +188,10 @@ __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) uint32_t end = __ownClock(); sum = end - start; #else - printf("Latency for each node jump:\n"); + printf("[%s] Latency for each node jump (device %d):\n", nid, dev_id); for (uint32_t i = 0; i < NODES-1; i++) { - printf("%d\n", timings[i]); + printf("[%s] %d\n", nid, timings[i]); sum += timings[i]; } if (ptr == ptrs[0]) @@ -197,7 +200,7 @@ __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex) } #endif - printf("Chase took on average %d cycles per node jump (SM %d).\n", sum/(NODES-1), __smId()); + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, dev_id, sum/(NODES-1)); // Join the tail with the head (just for the data dependency). if (ptr->next == nullptr) @@ -290,9 +293,12 @@ struct List simple_traverse<<<1,1>>>(buffer, headIndex); XDeviceSynchronize(); } - void time_traversal() + void time_traversal(int dev_id, char * nid) { - make_circular<<<1,1>>>(buffer, headIndex); + char * d_nid; + XMalloc((void**)&d_nid, sizeof(char)*HOSTNAME_SIZE); + XMemcpy(d_nid, nid, sizeof(char)*HOSTNAME_SIZE, XMemcpyHostToDevice); + make_circular<<<1,1>>>(buffer, headIndex, dev_id, d_nid); XDeviceSynchronize(); } @@ -303,7 +309,9 @@ struct DeviceList : public List { DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) { +# ifdef DEBUG List::info(n, buffSize); +# endif XMalloc((void**)&buffer, sizeof(Node)*buffSize); } ~DeviceList() @@ -329,13 +337,13 @@ struct HostList : public List template < class LIST > -void devicePointerChase(int m, size_t buffSize, size_t stride) +void devicePointerChase(int m, size_t buffSize, size_t stride, int dev_id, char * nid) { LIST l(NODES, buffSize, stride); l.initialize(m); l.traverse(); // warmup kernel - l.time_traversal(); + l.time_traversal(dev_id, nid); } @@ -382,7 +390,30 @@ int main(int argc, char ** argv) return 1; } + // Get the node name + char nid_name[HOSTNAME_SIZE]; + gethostname(nid_name, HOSTNAME_SIZE); + + // Make sure we've got devices aboard. + int num_devices; + XGetDeviceCount(num_devices); + if (num_devices == 0) + { + std::cout << "No devices found on host " << nid_name << std::endl; + return 1; + } + else + { + printf("[%s] Found %d device(s).\n", nid_name, num_devices); + } + + // Run the pointer chase on each device in the node. + for (int i = 0; i < num_devices; i++) + { + XSetDevice(i); + devicePointerChase(list_init, buffSize, stride, i, nid_name); + } - // Run the pointer chase. - devicePointerChase(list_init, buffSize, stride); + printf("[%s] Pointer chase complete.\n", nid_name); + return 0; } From 8d095411f98d7bbdbbf6828d3daed36354cd7e32 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Wed, 11 Nov 2020 12:19:53 +0100 Subject: [PATCH 10/51] Add comments in the source code. 
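The bulk of the new comments document the recursive-template unrolling at the
core of the benchmark. Stripped of the timing and ILP-breaking arguments, the
idiom reduces to the following (simplified from pointer_chase.cu):

    template < unsigned int repeat >
    __device__ __forceinline__ void nextNode(Node ** ptr)
    {
        (*ptr) = (*ptr)->next;    // one dependent load per instantiation
        nextNode<repeat-1>(ptr);  // recursion resolved at compile time
    }

    template <>
    __device__ __forceinline__ void nextNode<0>(Node ** ptr) {}  // base case

Because every level is force-inlined, the traversal compiles to NODES-1
back-to-back loads with no loop overhead, which is also why the header comment
warns against using a large number of nodes.
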
--- .../gpu/pointer_chase/src/pointer_chase.cu | 107 +++++++++++++++--- 1 file changed, 91 insertions(+), 16 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 321a5fd8d7..8dc51f2251 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -67,32 +67,47 @@ __global__ void clockLatency() * Linked list definitions */ -// The node struct Node { + /* The node */ Node * next = nullptr; char _padding[8*NODE_PADDING]; }; -// List serial initializer -__global__ void initialize_list(Node * head, int stride = 1) + +/* + * Kernels and device functions + */ + +__global__ void initialize_list(Node * buffer, int stride = 1) { + /* List serial initializer. + * - buffer: where the list is to be placed. + * - stride: argument controls the number of empty node spaces + * in between two consecutive list nodes. + */ + // Set the head - Node * prev = new (&(head[0])) Node(); + Node * prev = new (&(buffer[0])) Node(); // Init the rest of the list for (int n = 1; n < NODES; n++) { - Node * temp = new (&(head[n*stride])) Node(); + Node * temp = new (&(buffer[n*stride])) Node(); prev->next = temp; prev = temp; } } -// List random initializer __global__ void initialize_random_list(Node * buffer, uint32_t *indices) { + /* List random initializer + * - buffer: where the list is to be placed. + * - indices: array containing the node ordering indices as offsets in + * the buffer. + */ + // Set the head Node * prev = new (&(buffer[indices[0]])) Node(); @@ -106,9 +121,13 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) } -// Simple list traverse without any timers __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) { + /* Simple list traverse - no timing is done here + * - buffer: where the list is + * - headIndex: index in the buffer where the head of the list is + */ + uint32_t count = 0; Node * head = &(buffer[headIndex]); Node * ptr = head; @@ -144,6 +163,10 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) template < unsigned int repeat > __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs) { + /* + * Go to the next node in the list + */ + # ifdef TIME_EACH_STEP uint32_t t1 = __ownClock(); # endif @@ -162,11 +185,11 @@ template<> __device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs){} -/* List traversal to make a singly-linked list circular. This is just to have a data dependency and - * cover from a potential compiler optimization that might throw the list traversal away. - */ -__global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex, int dev_id, char * nid) +__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, int dev_id, char * nid) { + /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and + * prevent from compiler optimisations. + */ // These are used to prevent ILP when timing each jump. 
__shared__ uint32_t timings[NODES-1]; @@ -211,8 +234,24 @@ __global__ void make_circular(Node * __restrict__ buffer, uint32_t headIndex, in } +/* + * List structure definitions + */ + struct List { + /* + * Contains the buffer where the list is stored, the index in this buffer where the head + * of the list is, the buffer size, and the stride in between nodes (this last one is only + * meaningful if the list is not initialised as random). + * + * The member functions are: + * - info: prints the list details. + * - initialize: populatest the buffer with the list nodes. + * - traverse: simple list traversal. + * - timed_traverse: traverses the list and measures the number of cycles per node jump. + */ + Node * buffer = nullptr; uint32_t headIndex = 0; size_t buffSize; @@ -232,6 +271,11 @@ struct List void initialize(int mode=0) { + /* + * mode 0 initializes the list as serial. + * mode 1 initializes the list in a random order. + */ + if (mode < 0 || mode > 1) { printf("Unknown list initialization scheme. Default to 0."); @@ -276,6 +320,8 @@ struct List nodeIndices[i] = currentIndex; s.insert(currentIndex); } + + // Copy the node indices to the device and init the random list uint32_t * d_nodeIndices; XMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES); XMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice); @@ -290,15 +336,26 @@ struct List void traverse() { + /* + * Simple list traversal - NOT timed. + */ simple_traverse<<<1,1>>>(buffer, headIndex); XDeviceSynchronize(); } + void time_traversal(int dev_id, char * nid) { + /* + * Timed list traversal + */ + + // Copy the node id into the device to print the info from the kernel. char * d_nid; XMalloc((void**)&d_nid, sizeof(char)*HOSTNAME_SIZE); XMemcpy(d_nid, nid, sizeof(char)*HOSTNAME_SIZE, XMemcpyHostToDevice); - make_circular<<<1,1>>>(buffer, headIndex, dev_id, d_nid); + + // Time the pointer chase + timed_list_traversal<<<1,1>>>(buffer, headIndex, dev_id, d_nid); XDeviceSynchronize(); } @@ -307,6 +364,10 @@ struct List struct DeviceList : public List { + /* + * List allocated in device memory + */ + DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) { # ifdef DEBUG @@ -314,21 +375,30 @@ struct DeviceList : public List # endif XMalloc((void**)&buffer, sizeof(Node)*buffSize); } + ~DeviceList() { XFree(buffer); } }; + struct HostList : public List { + /* + * List allocated in pinned host memory + */ + Node * h_buffer; HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) { +# ifdef DEBUG List::info(n, buffSize); +# endif XHostMalloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); } + ~HostList() { XFreeHost(buffer); @@ -339,6 +409,10 @@ struct HostList : public List template < class LIST > void devicePointerChase(int m, size_t buffSize, size_t stride, int dev_id, char * nid) { + /* + * Driver to manage the whole allocation, list traversal, etc. + */ + LIST l(NODES, buffSize, stride); l.initialize(m); @@ -350,7 +424,7 @@ void devicePointerChase(int m, size_t buffSize, size_t stride, int dev_id, char int main(int argc, char ** argv) { // Set program defaults before parsing the command line args. 
- int list_init = 0; + int list_init_mode = 0; size_t stride = 1; size_t buffSize = NODES*stride; @@ -370,12 +444,13 @@ int main(int argc, char ** argv) } else if (str == "--rand") { - list_init = 1; + list_init_mode = 1; } else if (str == "--stride") { stride = std::stoi((std::string)argv[++i]); - buffSize = NODES*stride; + if (buffSize < NODES*stride) + buffSize = NODES*stride; } else if (str == "--buffer") { @@ -411,7 +486,7 @@ int main(int argc, char ** argv) for (int i = 0; i < num_devices; i++) { XSetDevice(i); - devicePointerChase(list_init, buffSize, stride, i, nid_name); + devicePointerChase(list_init_mode, buffSize, stride, i, nid_name); } printf("[%s] Pointer chase complete.\n", nid_name); From 1c1ab5e83eebaaa54bca9e10d06866f853b55878 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Wed, 11 Nov 2020 18:25:42 +0100 Subject: [PATCH 11/51] Add P2P pointer chase. --- .../gpu/pointer_chase/src/pointer_chase.cu | 179 ++++++++++++++---- 1 file changed, 145 insertions(+), 34 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 8dc51f2251..e9e655837f 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -5,6 +5,7 @@ #include #include #include +#include // Include the CUDA/HIP wrappers from the other test for now. #include "../../memory_bandwidth/src/Xdevice/runtime.hpp" @@ -161,7 +162,7 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) * list traversal as a whole. */ template < unsigned int repeat > -__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs) +__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs) { /* * Go to the next node in the list @@ -173,28 +174,27 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t (*ptr) = (*ptr)->next; # ifdef TIME_EACH_STEP (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. - *timings = __ownClock() - t1; // Time the jump + *timer = __ownClock() - t1; // Time the jump # endif // Keep traversing the list. - nextNode(ptr, timings+1, ptrs+1); + nextNode(ptr, timer+1, ptrs+1); } // Specialize the function to break the recursion. template<> -__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timings, Node ** ptrs){} +__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){} -__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, int dev_id, char * nid) +__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, uint32_t * timer) { /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and * prevent from compiler optimisations. */ // These are used to prevent ILP when timing each jump. 
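// (Each timed jump below also stores the pointer it just loaded into a shared
// array; that store makes the next load data-dependent on the previous one,
// so neither the compiler nor the hardware can overlap the jumps being timed.)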
- __shared__ uint32_t timings[NODES-1]; + __shared__ uint32_t s_timer[NODES-1]; __shared__ Node * ptrs[NODES-1]; - uint32_t sum = 0; // Create a pointer to iterate through the list __VOLATILE__ Node * ptr = &(buffer[headIndex]); @@ -204,18 +204,16 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn uint32_t start = __ownClock(); #endif - nextNode(&ptr, timings, ptrs); + nextNode(&ptr, s_timer, ptrs); #ifndef TIME_EACH_STEP // end cycle count uint32_t end = __ownClock(); - sum = end - start; + timer[0] = end - start; #else - printf("[%s] Latency for each node jump (device %d):\n", nid, dev_id); for (uint32_t i = 0; i < NODES-1; i++) { - printf("[%s] %d\n", nid, timings[i]); - sum += timings[i]; + timer[i] = s_timer[i]; } if (ptr == ptrs[0]) { @@ -223,8 +221,6 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn } #endif - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, dev_id, sum/(NODES-1)); - // Join the tail with the head (just for the data dependency). if (ptr->next == nullptr) { @@ -254,10 +250,22 @@ struct List Node * buffer = nullptr; uint32_t headIndex = 0; + uint32_t * timer = nullptr; + uint32_t * d_timer = nullptr; size_t buffSize; size_t stride; - List(size_t bSize, size_t st) : buffSize(bSize), stride(st) {}; + List(size_t bSize, size_t st) : buffSize(bSize), stride(st) + { + // Allocate the buffers to store the timings measured in the kernel + timer = new uint32_t[NODES]; + XMalloc((void**)&d_timer, sizeof(uint32_t)*(NODES)); + }; + + virtual ~List() + { + XFree(d_timer); + } void info(size_t n, size_t buffSize) { @@ -278,7 +286,7 @@ struct List if (mode < 0 || mode > 1) { - printf("Unknown list initialization scheme. Default to 0."); + printf("Unknown list initialization scheme. Defaulting back to 0."); mode = 0; } @@ -343,20 +351,17 @@ struct List XDeviceSynchronize(); } - void time_traversal(int dev_id, char * nid) + void time_traversal() { /* * Timed list traversal */ - // Copy the node id into the device to print the info from the kernel. - char * d_nid; - XMalloc((void**)&d_nid, sizeof(char)*HOSTNAME_SIZE); - XMemcpy(d_nid, nid, sizeof(char)*HOSTNAME_SIZE, XMemcpyHostToDevice); - - // Time the pointer chase - timed_list_traversal<<<1,1>>>(buffer, headIndex, dev_id, d_nid); + timed_list_traversal<<<1,1>>>(buffer, headIndex, d_timer); XDeviceSynchronize(); + + // Copy the timing data back to the host + XMemcpy(timer, d_timer, sizeof(uint32_t)*(NODES-1), XMemcpyDeviceToHost); } }; @@ -407,20 +412,130 @@ struct HostList : public List template < class LIST > -void devicePointerChase(int m, size_t buffSize, size_t stride, int dev_id, char * nid) +uint32_t * generalPointerChase(int local_device, int remote_device, int init_mode, size_t buffSize, size_t stride) { /* * Driver to manage the whole allocation, list traversal, etc. + * It returns the array containing the timings. Note that these values will depend on whether the + * flag -DTIME_EACH_STEP was defined or not (see top of the file). + * + * - local_device: ID of the device where the allocation of the list takes place + * - remote_device: ID of the device doing the pointer chase. + * - init_mode: see the class List. + * - buff_size: Size (in nodes) of the buffer. + * - stride: Gap (in nodes) between two consecutive nodes. This only applies if init_mode is 0. */ + XSetDevice(remote_device); LIST l(NODES, buffSize, stride); + l.initialize(init_mode); + + // Check if we have remote memory access. 
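+  // (Peer access lets the chasing device dereference node pointers that live
+  // in the other device's memory, so every node jump in the timed kernel
+  // crosses the interconnect.)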
+ XSetDevice(local_device); + bool peerAccessSet = false; + if (local_device!=remote_device) + { + int hasPeerAccess; + XDeviceCanAccessPeer(&hasPeerAccess, local_device, remote_device); + if (!hasPeerAccess) + { + printf("Devices have no peer access.\n"); + exit(1); + } + + // Enable the peerAccess access. + peerAccessSet = true; + XDeviceEnablePeerAccess(remote_device, 0); + } - l.initialize(m); - l.traverse(); // warmup kernel - l.time_traversal(dev_id, nid); + // Warm-up kernel + l.traverse(); + // Time the pointer chase + l.time_traversal(); + + if (peerAccessSet) + XDeviceDisablePeerAccess(remote_device); + + // Set again the device where the allocations were placed, so it can take care of it's + // own deallocations in the List destructor. + XSetDevice(remote_device); + + return l.timer; } + +template < class LIST > +void localPointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid) +{ + /* + * Specialised pointer chase on a single device. + */ + for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) + { + uint32_t* timer = generalPointerChase< LIST >(gpu_id, gpu_id, init_mode, buffSize, stride); + + // Print the timings of the pointer chase +# ifndef TIME_EACH_STEP + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/(NODES-1)); +# else + printf("[%s] Latency for each node jump (device %d):\n", nid, gpu_id); + for (uint32_t i = 0; i < NODES-1; i++) + { + printf("[%s][device %d] %d\n", nid, gpu_id, timer[i]); + } +# endif + delete [] timer; + } +} + + +template < class LIST > +void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid) +{ + /* + * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. + */ + +#ifdef SYMM +# define LIMITS j +#else +# define LIMITS 0 +#endif + + auto fetch = [](uint32_t* t){return t[0]/(NODES-1);}; + + printf("[%s] Memory latency (cycles) with remote direct memory access\n", nid); + printf("[%s] %10s", nid, "From \\ To "); + for (int ds = 0; ds < num_devices; ds++) + { + printf("%4sGPU %2d", "", ds); + } printf("%10s\n", "Totals"); + + for (int j = 0; j < num_devices; j++) + { + // Track the sum of the latencies + uint32_t totals = 0; + + printf("[%s] GPU %2d%4s", nid, j, " "); + for (int i = 0; i < LIMITS; i++) + { + printf("%10s", "X"); + } + + for (int i = LIMITS; i < num_devices; i++) + { + uint32_t timer = fetch(generalPointerChase< LIST >(i, j, init_mode, buffSize, stride)); + if (i != j) + { + totals += timer; + } + printf("%10d", timer); + } printf("%10d\n", totals); + } +} + + int main(int argc, char ** argv) { // Set program defaults before parsing the command line args. @@ -482,12 +597,8 @@ int main(int argc, char ** argv) printf("[%s] Found %d device(s).\n", nid_name, num_devices); } - // Run the pointer chase on each device in the node. - for (int i = 0; i < num_devices; i++) - { - XSetDevice(i); - devicePointerChase(list_init_mode, buffSize, stride, i, nid_name); - } + localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); printf("[%s] Pointer chase complete.\n", nid_name); return 0; From 1d6b06b832a69e97ee5a3b60d0eb761858d4cac2 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Wed, 11 Nov 2020 18:58:06 +0100 Subject: [PATCH 12/51] Extend options to retrieve min latency. 
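In essence, the per-jump timings are now reduced to a single statistic per
device pair. A standalone sketch of the selection logic (fetch_stat is an
illustrative name only, not part of the diff; the actual code uses lambdas
guarded by the existing TIME_EACH_STEP and MAX_LATENCY compile-time flags):

// Reduce the NODES-1 per-jump cycle counts to the value that gets reported.
// With -DMAX_LATENCY the slowest jump is kept; otherwise the fastest one.
uint32_t fetch_stat(const uint32_t * t)
{
    uint32_t m = t[0];
    for (int i = 1; i < NODES-1; i++)
    {
#ifdef MAX_LATENCY
        if (t[i] > m) m = t[i];
#else
        if (t[i] < m) m = t[i];
#endif
    }
    return m;
}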
--- .../gpu/pointer_chase/src/pointer_chase.cu | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index e9e655837f..d064f1347c 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -503,7 +503,33 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t # define LIMITS 0 #endif +# ifndef TIME_EACH_STEP auto fetch = [](uint32_t* t){return t[0]/(NODES-1);}; +# else +# ifdef MAX_LATENCY + auto fetch = [](uint32_t* t) + { + uint32_t max = 0; + for (int i = 0; i < NODES-1; i++) + { + if (t[i] > max) + max = t[i]; + } + return max; + }; +# else + auto fetch = [](uint32_t* t) + { + uint32_t min = ~0; + for (int i = 0; i < NODES-1; i++) + { + if (t[i] < min) + min = t[i]; + } + return min; + }; +# endif +# endif printf("[%s] Memory latency (cycles) with remote direct memory access\n", nid); printf("[%s] %10s", nid, "From \\ To "); @@ -525,7 +551,10 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t for (int i = LIMITS; i < num_devices; i++) { - uint32_t timer = fetch(generalPointerChase< LIST >(i, j, init_mode, buffSize, stride)); + + uint32_t * timer_ptr = generalPointerChase< LIST >(i, j, init_mode, buffSize, stride); + uint32_t timer = fetch(timer_ptr); + delete [] timer_ptr; if (i != j) { totals += timer; From 20fda053a973a63217cb1a649dce09339573f068 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Fri, 13 Nov 2020 13:30:04 +0100 Subject: [PATCH 13/51] Add asm XClock and XClock64 functions. --- .../src/Xdevice/cuda/tools.hpp | 10 +++++++- .../src/Xdevice/hip/tools.hpp | 23 +++++++++++++++---- .../src/Xdevice/hip/utils.hpp | 1 + .../gpu/pointer_chase/src/pointer_chase.cu | 12 +++++----- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp index e50679ad19..102776fbdb 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp @@ -76,7 +76,7 @@ Smi::~Smi() * ASM tools */ -__device__ __forceinline__ uint32_t __ownClock() +__device__ __forceinline__ uint32_t XClock() { // Clock counter uint32_t x; @@ -84,6 +84,14 @@ __device__ __forceinline__ uint32_t __ownClock() return x; } +__device__ __forceinline__ uint64_t XClock64() +{ + // Clock counter + uint64_t x; + asm volatile ("mov.u64 %0, %%clock64;" : "=l"(x) :: "memory"); + return x; +} + __device__ __forceinline__ int __smId() { // SM ID diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp index b10aa4fa72..27ba7c0302 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp @@ -73,14 +73,29 @@ Smi::~Smi() * ASM tools */ -__device__ __forceinline__ uint32_t __ownClock() +template< class T > +__device__ __forceinline__ T __XClock() { // Clock counter - uint64_t x; - asm volatile ("s_memtime %0" : "=r"(x)); - return (uint32_t)x; + uint64_t x; + asm volatile ("s_memtime %0; \t\n" + "s_waitcnt lgkmcnt(0);" + : "=r"(x) + ); + 
return (T)x; } +__device__ __forceinline__ uint32_t XClock() +{ + return __XClock(); +} + +__device__ __forceinline__ uint64_t XClock64() +{ + return __XClock(); +} + + __device__ __forceinline__ int __smId() { // NOT possible to retrieve the workgroup ID with AMD GPUs diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp index 5d21299150..471733b366 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp @@ -2,6 +2,7 @@ #define __INCLUDED_HIP_UTILS__ #include +#include #include static inline void checkError(hipError_t errorCode) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index d064f1347c..54d4e37457 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -51,8 +51,8 @@ #endif __device__ uint32_t __clockLatency() { - uint32_t start = __ownClock(); - uint32_t end = __ownClock(); + uint32_t start = XClock(); + uint32_t end = XClock(); return end-start; } @@ -169,12 +169,12 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t */ # ifdef TIME_EACH_STEP - uint32_t t1 = __ownClock(); + uint32_t t1 = XClock(); # endif (*ptr) = (*ptr)->next; # ifdef TIME_EACH_STEP (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. - *timer = __ownClock() - t1; // Time the jump + *timer = XClock() - t1; // Time the jump # endif // Keep traversing the list. @@ -201,14 +201,14 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn #ifndef TIME_EACH_STEP // start timer - uint32_t start = __ownClock(); + uint32_t start = XClock(); #endif nextNode(&ptr, s_timer, ptrs); #ifndef TIME_EACH_STEP // end cycle count - uint32_t end = __ownClock(); + uint32_t end = XClock(); timer[0] = end - start; #else for (uint32_t i = 0; i < NODES-1; i++) From ff209dbee73fd9d7e219f3d42dea6daf3520df81 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Fri, 13 Nov 2020 17:49:34 +0100 Subject: [PATCH 14/51] Restructure pChase algo --- .../gpu/pointer_chase/src/pChase_list.hpp | 371 ++++++++++++++ .../gpu/pointer_chase/src/pointer_chase.cu | 485 ++++-------------- 2 files changed, 460 insertions(+), 396 deletions(-) create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp new file mode 100644 index 0000000000..91a44b8450 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp @@ -0,0 +1,371 @@ + + +/* + * + * Singly linked list implementation for GPUs + * + */ + + +__device__ uint32_t __clockLatency() +{ + uint32_t start = XClock(); + uint32_t end = XClock(); + return end-start; +} + + +__global__ void clockLatency() +{ + uint32_t clkLatency = __clockLatency(); + printf(" - Clock latency is %d.\n", clkLatency); +} + + +/* + * Linked list definitions + */ + +struct Node +{ + /* The node */ + Node * next = nullptr; + char _padding[8*NODE_PADDING]; +}; + + +/* + * Kernels and device functions + */ + +__global__ void initialize_list(Node * buffer, int stride = 1) +{ + /* List serial initializer. + * - buffer: where the list is to be placed. 
+ * - stride: argument controls the number of empty node spaces + * in between two consecutive list nodes. + */ + + // Set the head + Node * prev = new (&(buffer[0])) Node(); + + // Init the rest of the list + for (int n = 1; n < NODES; n++) + { + Node * temp = new (&(buffer[n*stride])) Node(); + prev->next = temp; + prev = temp; + } + +} + +__global__ void initialize_random_list(Node * buffer, uint32_t *indices) +{ + /* List random initializer + * - buffer: where the list is to be placed. + * - indices: array containing the node ordering indices as offsets in + * the buffer. + */ + + // Set the head + Node * prev = new (&(buffer[indices[0]])) Node(); + + // Init the rest of the list + for (int n = 1; n < NODES; n++) + { + Node * temp = new (&(buffer[indices[n]])) Node(); + prev->next = temp; + prev = temp; + } + +} + +__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) +{ + /* Simple list traverse - no timing is done here + * - buffer: where the list is + * - headIndex: index in the buffer where the head of the list is + */ + + uint32_t count = 0; + Node * head = &(buffer[headIndex]); + Node * ptr = head; + while(ptr->next != nullptr || count < NODES-1) + { + ptr = ptr->next; + count++; + } + + // Silly dep. to tell the compiler not to throw away this kernel. + if (ptr->next == head) + { + printf("You had a circular list :(\n"); + } + +} + + +#ifdef VOLATILE +# define __VOLATILE__ volatile +#else +# define __VOLATILE__ +#endif + +/* + * Timed list traversal. This implementation is recursive (because it's less code) so you have to + * watch out to not exceed the recursion limits. The functions are force-inlined, so the PTX code + * looks identical as if you were to unwrap the recursion manually. + * + * Depending on the compiler flags used, the timing can either measure each node jump, or the entire + * list traversal as a whole. + */ +template < unsigned int repeat > +__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs) +{ + /* + * Go to the next node in the list + */ + +# ifdef TIME_EACH_STEP + uint32_t t1 = XClock(); +# endif + (*ptr) = (*ptr)->next; +# ifdef TIME_EACH_STEP + (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. + *timer = XClock() - t1; // Time the jump +# endif + + // Keep traversing the list. + nextNode(ptr, timer+1, ptrs+1); +} + +// Specialize the function to break the recursion. +template<> +__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){} + + +__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, uint32_t * timer) +{ + /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and + * prevent from compiler optimisations. + */ + + // These are used to prevent ILP when timing each jump. + __shared__ uint32_t s_timer[NODES-1]; + __shared__ Node * ptrs[NODES-1]; + + // Create a pointer to iterate through the list + __VOLATILE__ Node * ptr = &(buffer[headIndex]); + +#ifndef TIME_EACH_STEP + // start timer + uint32_t start = XClock(); +#endif + + nextNode(&ptr, s_timer, ptrs); + +#ifndef TIME_EACH_STEP + // end cycle count + uint32_t end = XClock(); + timer[0] = end - start; +#else + for (uint32_t i = 0; i < NODES-1; i++) + { + timer[i] = s_timer[i]; + } + if (ptr == ptrs[0]) + { + printf("This is some data dependency that will never be executed."); + } +#endif + + // Join the tail with the head (just for the data dependency). 
+  if (ptr->next == nullptr)
+  {
+    ptr->next = &(buffer[headIndex]);
+  }
+
+}
+
+
+/*
+ * List structure definitions
+ */
+
+struct List
+{
+  /*
+   * Contains the buffer where the list is stored, the index in this buffer where the head
+   * of the list is, the buffer size, and the stride in between nodes (this last one is only
+   * meaningful if the list is not initialised as random).
+   *
+   * The member functions are:
+   *  - info: prints the list details.
+   *  - initialize: populates the buffer with the list nodes.
+   *  - traverse: simple list traversal.
+   *  - time_traversal: traverses the list and measures the number of cycles per node jump.
+   */
+
+  Node * buffer = nullptr;
+  uint32_t headIndex = 0;
+  uint32_t * timer = nullptr;
+  uint32_t * d_timer = nullptr;
+  size_t buffSize;
+  size_t stride;
+
+  List(size_t bSize, size_t st) : buffSize(bSize), stride(st)
+  {
+    // Allocate the buffers to store the timings measured in the kernel
+    timer = new uint32_t[NODES];
+    XMalloc((void**)&d_timer, sizeof(uint32_t)*(NODES));
+  };
+
+  virtual ~List()
+  {
+    XFree(d_timer);
+  }
+
+  void info(size_t n, size_t buffSize)
+  {
+    printf("Creating Linked list:\n");
+    printf(" - Node size: %lu\n", sizeof(Node));
+    printf(" - Number of nodes: %lu:\n", n);
+    printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024);
+    clockLatency<<<1,1>>>();
+    XDeviceSynchronize();
+  }
+
+  void initialize(int mode=0)
+  {
+    /*
+     * mode 0 initializes the list as serial.
+     * mode 1 initializes the list in a random order.
+     */
+
+    if (mode < 0 || mode > 1)
+    {
+      printf("Unknown list initialization scheme. Defaulting back to 0.");
+      mode = 0;
+    }
+
+    if (mode == 0)
+    {
+      initialize_list<<<1,1>>>(buffer, stride);
+      XDeviceSynchronize();
+    }
+    else
+    {
+      // Random number engine.
+      std::mt19937_64 rng;
+      uint64_t timeSeed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+      std::seed_seq ss{uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed>>32)};
+      rng.seed(ss);
+      std::uniform_real_distribution<double> unif(0, 1);
+
+      uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES);
+      // Create set to keep track of the assigned indices.
+      std::set<uint32_t> s = {};
+      for (int i = 0; i < NODES; i++)
+      {
+        // Get a random index.
+        uint32_t currentIndex = (uint32_t)(unif(rng)*buffSize);
+
+        // If already present in the set, find another alternative index.
+        while (s.find(currentIndex) != s.end())
+        {
+          if (currentIndex < NODES-1)
+          {
+            currentIndex++;
+          }
+          else
+          {
+            currentIndex = 0;
+          }
+        }
+
+        nodeIndices[i] = currentIndex;
+        s.insert(currentIndex);
+      }
+
+      // Copy the node indices to the device and init the random list
+      uint32_t * d_nodeIndices;
+      XMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES);
+      XMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice);
+      initialize_random_list<<<1,1>>>(buffer, d_nodeIndices);
+      headIndex = nodeIndices[0];
+      free(nodeIndices);
+      XFree(d_nodeIndices);
+    }
+
+    XDeviceSynchronize();
+  }
+
+  void traverse()
+  {
+    /*
+     * Simple list traversal - NOT timed.
+ */ + simple_traverse<<<1,1>>>(buffer, headIndex); + XDeviceSynchronize(); + } + + void time_traversal() + { + /* + * Timed list traversal + */ + + timed_list_traversal<<<1,1>>>(buffer, headIndex, d_timer); + XDeviceSynchronize(); + + // Copy the timing data back to the host + XMemcpy(timer, d_timer, sizeof(uint32_t)*(NODES-1), XMemcpyDeviceToHost); + } + +}; + + +struct DeviceList : public List +{ + /* + * List allocated in device memory + */ + + DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) + { +# ifdef DEBUG + List::info(n, buffSize); +# endif + XMalloc((void**)&buffer, sizeof(Node)*buffSize); + } + + ~DeviceList() + { + XFree(buffer); + } +}; + + +struct HostList : public List +{ + /* + * List allocated in pinned host memory + */ + + Node * h_buffer; + HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) + { +# ifdef DEBUG + List::info(n, buffSize); +# endif + XHostMalloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); + XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); + } + + ~HostList() + { + XFreeHost(buffer); + } +}; + + diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 54d4e37457..17b2b0ccff 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -6,6 +6,7 @@ #include #include #include +#include // Include the CUDA/HIP wrappers from the other test for now. #include "../../memory_bandwidth/src/Xdevice/runtime.hpp" @@ -21,7 +22,7 @@ The stride and the full buffer size can be set with "--stride" and "--buffer", both in number of nodes. - The macro NODES sets the total number of nodes in the list. Node that the + The macro NODES sets the total number of nodes in the list. Note that the list traversal is 'unrolled' inlining a recursive template, and this will not work if you use a large number of nodes. @@ -32,7 +33,7 @@ (default option) the linked list is allocated in device memory. In contrast, if HostList is used, the list is allocated as host's pinned memory. - The links of the list can be made vlatile defining the macro VOLATILE. + The links of the list can be made volatile defining the macro VOLATILE. By default, the code returns the aveage number of cycles per jump, but this can be changed to return the cycle count on a per-jump basis by defining the flag @@ -49,366 +50,13 @@ #ifndef HOSTNAME_SIZE # define HOSTNAME_SIZE 80 #endif -__device__ uint32_t __clockLatency() -{ - uint32_t start = XClock(); - uint32_t end = XClock(); - return end-start; -} - - -__global__ void clockLatency() -{ - uint32_t clkLatency = __clockLatency(); - printf(" - Clock latency is %d.\n", clkLatency); -} - - -/* - * Linked list definitions - */ - -struct Node -{ - /* The node */ - Node * next = nullptr; - char _padding[8*NODE_PADDING]; -}; - - -/* - * Kernels and device functions - */ - -__global__ void initialize_list(Node * buffer, int stride = 1) -{ - /* List serial initializer. - * - buffer: where the list is to be placed. - * - stride: argument controls the number of empty node spaces - * in between two consecutive list nodes. 
- */ - - // Set the head - Node * prev = new (&(buffer[0])) Node(); - - // Init the rest of the list - for (int n = 1; n < NODES; n++) - { - Node * temp = new (&(buffer[n*stride])) Node(); - prev->next = temp; - prev = temp; - } - -} - -__global__ void initialize_random_list(Node * buffer, uint32_t *indices) -{ - /* List random initializer - * - buffer: where the list is to be placed. - * - indices: array containing the node ordering indices as offsets in - * the buffer. - */ - - // Set the head - Node * prev = new (&(buffer[indices[0]])) Node(); - - // Init the rest of the list - for (int n = 1; n < NODES; n++) - { - Node * temp = new (&(buffer[indices[n]])) Node(); - prev->next = temp; - prev = temp; - } - -} - -__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex) -{ - /* Simple list traverse - no timing is done here - * - buffer: where the list is - * - headIndex: index in the buffer where the head of the list is - */ - - uint32_t count = 0; - Node * head = &(buffer[headIndex]); - Node * ptr = head; - while(ptr->next != nullptr || count < NODES-1) - { - ptr = ptr->next; - count++; - } - - // Silly dep. to tell the compiler not to throw away this kernel. - if (ptr->next == head) - { - printf("You had a circular list :(\n"); - } - -} - - -#ifdef VOLATILE -# define __VOLATILE__ volatile -#else -# define __VOLATILE__ -#endif - -/* - * Timed list traversal. This implementation is recursive (because it's less code) so you have to - * watch out to not exceed the recursion limits. The functions are force-inlined, so the PTX code - * looks identical as if you were to unwrap the recursion manually. - * - * Depending on the compiler flags used, the timing can either measure each node jump, or the entire - * list traversal as a whole. - */ -template < unsigned int repeat > -__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs) -{ - /* - * Go to the next node in the list - */ - -# ifdef TIME_EACH_STEP - uint32_t t1 = XClock(); -# endif - (*ptr) = (*ptr)->next; -# ifdef TIME_EACH_STEP - (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. - *timer = XClock() - t1; // Time the jump -# endif - - // Keep traversing the list. - nextNode(ptr, timer+1, ptrs+1); -} - -// Specialize the function to break the recursion. -template<> -__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){} - - -__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, uint32_t * timer) -{ - /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and - * prevent from compiler optimisations. - */ - - // These are used to prevent ILP when timing each jump. - __shared__ uint32_t s_timer[NODES-1]; - __shared__ Node * ptrs[NODES-1]; - - // Create a pointer to iterate through the list - __VOLATILE__ Node * ptr = &(buffer[headIndex]); - -#ifndef TIME_EACH_STEP - // start timer - uint32_t start = XClock(); -#endif - - nextNode(&ptr, s_timer, ptrs); - -#ifndef TIME_EACH_STEP - // end cycle count - uint32_t end = XClock(); - timer[0] = end - start; -#else - for (uint32_t i = 0; i < NODES-1; i++) - { - timer[i] = s_timer[i]; - } - if (ptr == ptrs[0]) - { - printf("This is some data dependency that will never be executed."); - } -#endif - - // Join the tail with the head (just for the data dependency). 
- if (ptr->next == nullptr) - { - ptr->next = &(buffer[headIndex]); - } - -} - - -/* - * List structure definitions - */ - -struct List -{ - /* - * Contains the buffer where the list is stored, the index in this buffer where the head - * of the list is, the buffer size, and the stride in between nodes (this last one is only - * meaningful if the list is not initialised as random). - * - * The member functions are: - * - info: prints the list details. - * - initialize: populatest the buffer with the list nodes. - * - traverse: simple list traversal. - * - timed_traverse: traverses the list and measures the number of cycles per node jump. - */ - - Node * buffer = nullptr; - uint32_t headIndex = 0; - uint32_t * timer = nullptr; - uint32_t * d_timer = nullptr; - size_t buffSize; - size_t stride; - - List(size_t bSize, size_t st) : buffSize(bSize), stride(st) - { - // Allocate the buffers to store the timings measured in the kernel - timer = new uint32_t[NODES]; - XMalloc((void**)&d_timer, sizeof(uint32_t)*(NODES)); - }; - - virtual ~List() - { - XFree(d_timer); - } - - void info(size_t n, size_t buffSize) - { - printf("Creating Linked list:\n"); - printf(" - Node size: %lu\n", sizeof(Node)); - printf(" - Number of nodes: %lu:\n", n); - printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024); - clockLatency<<<1,1>>>(); - XDeviceSynchronize(); - } - - void initialize(int mode=0) - { - /* - * mode 0 initializes the list as serial. - * mode 1 initializes the list in a random order. - */ - - if (mode < 0 || mode > 1) - { - printf("Unknown list initialization scheme. Defaulting back to 0."); - mode = 0; - } - - if (mode == 0) - { - initialize_list<<<1,1>>>(buffer, stride); - XDeviceSynchronize(); - } - else - { - // Random number engine. - std::mt19937_64 rng; - uint64_t timeSeed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - std::seed_seq ss{uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed>>32)}; - rng.seed(ss); - std::uniform_real_distribution unif(0, 1); - - uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES); - // Create set to keep track of the assigned indices. - std::set s = {}; - for (int i = 0; i < NODES; i++) - { - // Get a random index. - uint32_t currentIndex = (uint32_t)(unif(rng)*buffSize); - - // If already present in the set, find another alternative index. - while (s.find(currentIndex) != s.end()) - { - if (currentIndex < NODES-1) - { - currentIndex++; - } - else - { - currentIndex = 0; - } - } - - nodeIndices[i] = currentIndex; - s.insert(currentIndex); - } - // Copy the node indices to the device and init the random list - uint32_t * d_nodeIndices; - XMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES); - XMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice); - initialize_random_list<<<1,1>>>(buffer, d_nodeIndices); - headIndex = nodeIndices[0]; - free(nodeIndices); - XFree(d_nodeIndices); - } - XDeviceSynchronize(); - } - - void traverse() - { - /* - * Simple list traversal - NOT timed. 
- */ - simple_traverse<<<1,1>>>(buffer, headIndex); - XDeviceSynchronize(); - } - - void time_traversal() - { - /* - * Timed list traversal - */ - - timed_list_traversal<<<1,1>>>(buffer, headIndex, d_timer); - XDeviceSynchronize(); - - // Copy the timing data back to the host - XMemcpy(timer, d_timer, sizeof(uint32_t)*(NODES-1), XMemcpyDeviceToHost); - } - -}; - - -struct DeviceList : public List -{ - /* - * List allocated in device memory - */ - - DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride) - { -# ifdef DEBUG - List::info(n, buffSize); -# endif - XMalloc((void**)&buffer, sizeof(Node)*buffSize); - } - - ~DeviceList() - { - XFree(buffer); - } -}; - - -struct HostList : public List -{ - /* - * List allocated in pinned host memory - */ - - Node * h_buffer; - HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride) - { -# ifdef DEBUG - List::info(n, buffSize); -# endif - XHostMalloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped); - XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); - } +// Include the CUDA/HIP wrappers from the other test for now. +#include "../../memory_bandwidth/src/Xdevice/runtime.hpp" - ~HostList() - { - XFreeHost(buffer); - } -}; +// List structure +#include "pChase_list.hpp" template < class LIST > @@ -490,6 +138,45 @@ void localPointerChase(int num_devices, int init_mode, size_t buffSize, size_t s } +#ifdef SYMM +# define LIMITS j +#else +# define LIMITS 0 +#endif + +void print_device_table(int num_devices, std::queue q, const char * what, const char * nid) +{ + printf("[%s] %s memory latency (in clock cycles) with remote direct memory access\n", nid, what); + printf("[%s] %10s", nid, "From \\ To "); + for (int ds = 0; ds < num_devices; ds++) + { + printf("%4sGPU %2d", "", ds); + } printf("%10s\n", "Totals"); + + for (int j = 0; j < num_devices; j++) + { + // Track the sum of the latencies + uint32_t totals = 0; + + printf("[%s] GPU %2d%4s", nid, j, " "); + for (int i = 0; i < LIMITS; i++) + { + printf("%10s", "X"); + } + + for (int i = LIMITS; i < num_devices; i++) + { + uint32_t timer = q.front(); + q.pop(); + if (i != j) + { + totals += timer; + } + printf("%10d", timer); + } printf("%10d\n", totals); + } +} + template < class LIST > void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid) { @@ -497,17 +184,13 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. 
*/ -#ifdef SYMM -# define LIMITS j -#else -# define LIMITS 0 -#endif - # ifndef TIME_EACH_STEP + std::queue q_average; auto fetch = [](uint32_t* t){return t[0]/(NODES-1);}; # else -# ifdef MAX_LATENCY - auto fetch = [](uint32_t* t) + std::queue q_max; + std::queue q_min; + auto fetchMax = [](uint32_t* t) { uint32_t max = 0; for (int i = 0; i < NODES-1; i++) @@ -517,8 +200,7 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t } return max; }; -# else - auto fetch = [](uint32_t* t) + auto fetchMin = [](uint32_t* t) { uint32_t min = ~0; for (int i = 0; i < NODES-1; i++) @@ -529,39 +211,37 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t return min; }; # endif -# endif - - printf("[%s] Memory latency (cycles) with remote direct memory access\n", nid); - printf("[%s] %10s", nid, "From \\ To "); - for (int ds = 0; ds < num_devices; ds++) - { - printf("%4sGPU %2d", "", ds); - } printf("%10s\n", "Totals"); + // Do the latency measurements for (int j = 0; j < num_devices; j++) { - // Track the sum of the latencies - uint32_t totals = 0; - - printf("[%s] GPU %2d%4s", nid, j, " "); - for (int i = 0; i < LIMITS; i++) - { - printf("%10s", "X"); - } - for (int i = LIMITS; i < num_devices; i++) { - uint32_t * timer_ptr = generalPointerChase< LIST >(i, j, init_mode, buffSize, stride); - uint32_t timer = fetch(timer_ptr); + + // Store the desired values for each element of the matrix in queues +# ifndef TIME_EACH_STEP + q_average.push(fetch(timer_ptr)); +# else + q_min.push(fetchMin(timer_ptr)); + q_max.push(fetchMax(timer_ptr)); +# endif delete [] timer_ptr; - if (i != j) - { - totals += timer; - } - printf("%10d", timer); - } printf("%10d\n", totals); + } } + + std::string what; +# ifndef TIME_EACH_STEP + what = "Average"; + print_device_table(num_devices, q_average, what.c_str(), nid); +# else + what = "Min."; + print_device_table(num_devices, q_min, what.c_str(), nid); + printf("\n"); + what = "Max."; + print_device_table(num_devices, q_max, what.c_str(), nid); +# endif + } @@ -571,6 +251,7 @@ int main(int argc, char ** argv) int list_init_mode = 0; size_t stride = 1; size_t buffSize = NODES*stride; + int multiGPU = 0; // Parse the command line args. for (int i = 0; i < argc; i++) @@ -583,6 +264,8 @@ int main(int argc, char ** argv) std::cout << " If --rand is used, this parameter just changes the buffer size." << std::endl; std::cout << "--buffer # : Sets the size of the buffer where the linked list is allocated on. " << std::endl; std::cout << " The number indicates the size of the buffer in list nodes." << std::endl; + std::cout << "--multiGPU : Runs the pointer chase algo using all device-pair combinations." << std::endl; + std::cout << " This measures the device-to-device memory latency." << std::endl; std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; return 0; } @@ -600,6 +283,10 @@ int main(int argc, char ** argv) { buffSize = std::stoi((std::string)argv[++i]); } + else if (str == "--multiGPU") + { + multiGPU = 1; + } } // Sanity of the command line args. 
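// Note: with the options above in place, typical invocations look like
//   pChase.x --rand            (randomised single-device chase)
//   pChase.x --stride 4        (serial list with a 4-node stride)
//   pChase.x --multiGPU        (all-pairs device-to-device latency table)
// (pChase.x is the executable name produced by the makefiles.)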
@@ -626,8 +313,14 @@ int main(int argc, char ** argv) printf("[%s] Found %d device(s).\n", nid_name, num_devices); } - localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); - remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + if (!multiGPU) + { + localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + } + else + { + remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + } printf("[%s] Pointer chase complete.\n", nid_name); return 0; From 12033fcb884bb48ad5b3d6fd31b1a743da573b61 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Mon, 16 Nov 2020 19:05:41 +0100 Subject: [PATCH 15/51] Create pointer_chase reframe test. --- config/cscs.py | 1 - .../gpu/pointer_chase/pointer_chase.py | 158 +++++++++++++++++- .../gpu/pointer_chase/src/Xdevice | 1 + .../gpu/pointer_chase/src/makefile.hip | 6 +- .../gpu/pointer_chase/src/pointer_chase.cu | 5 +- 5 files changed, 155 insertions(+), 16 deletions(-) create mode 120000 cscs-checks/microbenchmarks/gpu/pointer_chase/src/Xdevice diff --git a/config/cscs.py b/config/cscs.py index 8f5f828a79..d33fc2cc67 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -719,7 +719,6 @@ ], 'modules': [ 'gcc/9.3.0', - 'cuda/11.0', 'openmpi/3.1.6' ], 'cc': 'mpicc', diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 025fb1341e..e3da82b5e2 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -6,33 +6,71 @@ import reframe.utility.sanity as sn import reframe as rfm +import os @rfm.simple_test -class GpuPointerChase(rfm.RegressionTest): +class CompileGpuPointerChase(rfm.CompileOnlyRegressionTest): def __init__(self): self.valid_systems = ['ault:intelv100', 'ault:amdv100', - 'ault:amda100'] + 'ault:amda100', 'ault:amdvega'] self.valid_prog_environs = ['PrgEnv-gnu'] self.exclusive_access = True - self.pre_build_cmds = ['cp makefile.cuda Makefile'] self.build_system = 'Make' - self.executable = 'pChase.x' self.num_tasks = 0 self.num_tasks_per_node = 1 + self.postbuild_cmds = ['ls .'] + self.sanity_patterns = sn.assert_found(r'pChase.x', self.stdout) + self.maintainers = ['JO'] - + @rfm.run_after('setup') + def select_makefile(self): + cp = self.current_partition.fullname + if cp == 'ault:amdvega': + self.prebuild_cmds = ['cp makefile.hip Makefile'] + else: + self.prebuild_cmds = ['cp makefile.cuda Makefile'] @rfm.run_before('compile') def set_gpu_arch(self): cp = self.current_partition.fullname + + # Deal with the NVIDIA options first + nvidia_sm = None if cp[-4:] == 'v100': nvidia_sm = '70' elif cp[-4:] == 'a100': nvidia_sm = '80' - else - nvidia_sm = None - self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}'] + if nvidia_sm: + self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}'] + self.modules += ['cuda'] + + # Deal with the AMD options + amd_trgt = None + if cp == 'ault:amdvega': + amd_trgt = 'gfx908' + + if amd_trgt: + self.build_system.cxxflags += [f'--amdgpu-target={amd_trgt}'] + self.modules += ['rocm'] + + +class GpuPointerChaseBase(rfm.RunOnlyRegressionTest): + def __init__(self): + self.depends_on('CompileGpuPointerChase') + self.valid_systems = ['ault:intelv100', 'ault:amdv100', + 'ault:amda100', 'ault:amdvega'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.num_tasks = 0 + self.num_tasks_per_node = 1 + self.exclusive_access = True + self.sanity_patterns = 
self.do_sanity_check() + self.maintainers = ['JO'] + + @rfm.require_deps + def set_executable(self, CompileGpuPointerChase): + self.executable = os.path.join( + CompileGpuPointerChase().stagedir, 'pChase.x') @rfm.run_before('run') def set_num_gpus_per_node(self): @@ -41,5 +79,109 @@ def set_num_gpus_per_node(self): self.num_gpus_per_node = 4 elif cp in {'ault:amdv100'}: self.num_gpus_per_node = 2 + elif cp in {'ault:amdvega'}: + self.num_gpus_per_node = 3 else: self.num_gpus_per_node = 1 + + @sn.sanity_function + def do_sanity_check(self): + + # Check that every node has the right number of GPUs + healthy_nodes = len(set(sn.extractall( + r'^\s*\[([^,]*)\]\s*Found %d device\(s\).' % self.num_gpus_per_node, + self.stdout, 1))) + + # Check that every node has made it to the end. + nodes_at_end = len(set(sn.extractall( + r'^\s*\[([^,]{1,20})\]\s*Pointer chase complete.', + self.stdout, 1))) + return sn.evaluate(sn.assert_eq( + sn.assert_eq(self.job.num_tasks, healthy_nodes), + sn.assert_eq(self.job.num_tasks, nodes_at_end))) + + +@rfm.parameterized_test([1], [2], [4], [4096]) +class GpuPointerChaseSingle(GpuPointerChaseBase): + def __init__(self, stride): + super().__init__() + + self.perf_patterns = { + 'average': sn.min(sn.extractall(r'^\s*\[[^\]]{1,20}\]\s* On device \d+, ' + r'the chase took on average (\d+) ' + r'cycles per node jump.', + self.stdout, 1, int)), + } + + if stride == 1: + self.reference = { + 'ault:amda100': { + 'average': (76, None, 0.1, 'clock cycles') + }, + 'ault:amdv100': { + 'average': (77, None, 0.1, 'clock cycles') + }, + 'dom:gpu': { + 'average': (143, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average': (143, None, 0.1, 'clock cycles') + }, + 'ault:amdvega': { + 'average': (225, None, 0.1, 'clock cycles') + }, + } + elif stride == 2: + self.reference = { + 'ault:amda100': { + 'average': (116, None, 0.1, 'clock cycles') + }, + 'ault:amdv100': { + 'average': (118, None, 0.1, 'clock cycles') + }, + 'dom:gpu': { + 'average': (181, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average': (181, None, 0.1, 'clock cycles') + }, + 'ault:amdvega': { + 'average': (300, None, 0.1, 'clock cycles') + }, + } + elif stride == 4: + self.reference = { + 'ault:amda100': { + 'average': (118, None, 0.1, 'clock cycles') + }, + 'ault:amdv100': { + 'average': (200, None, 0.1, 'clock cycles') + }, + 'dom:gpu': { + 'average': (260, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average': (260, None, 0.1, 'clock cycles') + }, + 'ault:amdvega': { + 'average': (470, None, 0.1, 'clock cycles') + }, + } + elif stride == 4096: + self.reference = { + 'ault:amda100': { + 'average': (206, None, 0.1, 'clock cycles') + }, + 'ault:amdv100': { + 'average': (220, None, 0.1, 'clock cycles') + }, + 'dom:gpu': { + 'average': (260, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average': (260, None, 0.1, 'clock cycles') + }, + 'ault:amdvega': { + 'average': (800, None, 0.1, 'clock cycles') + }, + } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/Xdevice b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/Xdevice new file mode 120000 index 0000000000..68ede93327 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/Xdevice @@ -0,0 +1 @@ +../../memory_bandwidth/src/Xdevice \ No newline at end of file diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip index 7ccc63176f..f10578c035 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip +++ 
b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip @@ -1,5 +1,5 @@ -RSMI_ROOT=/opt/rocm-3.9.0/rocm_smi -AMDGPU_TARGET=gfx906,gfx908 +RSMI_ROOT?=/opt/rocm-3.9.0/rocm_smi +CXXFLAGS?=--amdgpu-target=gfx906,gfx908 pointerChase: - hipcc -o pChase.x -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma --amdgpu-target=${AMDGPU_TARGET} -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64 + hipcc -o pChase.x -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64 diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 17b2b0ccff..aa1ac35e45 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -8,9 +8,6 @@ #include #include -// Include the CUDA/HIP wrappers from the other test for now. -#include "../../memory_bandwidth/src/Xdevice/runtime.hpp" - /* ~~ GPU Linked list pointer chase algorithm ~~ Times in clock cycles the time it takes to jump from one node to the next @@ -53,7 +50,7 @@ // Include the CUDA/HIP wrappers from the other test for now. -#include "../../memory_bandwidth/src/Xdevice/runtime.hpp" +#include "Xdevice/runtime.hpp" // List structure #include "pChase_list.hpp" From ec1317b7bb12cb7bdabc3ad23b098ad1d9a34ac3 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 17 Nov 2020 13:49:47 +0100 Subject: [PATCH 16/51] Update ref for A100s, --- .../gpu/pointer_chase/pointer_chase.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index e3da82b5e2..ee393cdc25 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -8,12 +8,21 @@ import os + +class Pchase: + ''' + Public storage class to avoid writing the parameters below multiple times. + ''' + valid_systems = ['ault:intelv100', 'ault:amdv100', + 'ault:amda100', 'ault:amdvega'] + valid_prog_environs = ['PrgEnv-gnu'] + + @rfm.simple_test class CompileGpuPointerChase(rfm.CompileOnlyRegressionTest): def __init__(self): - self.valid_systems = ['ault:intelv100', 'ault:amdv100', - 'ault:amda100', 'ault:amdvega'] - self.valid_prog_environs = ['PrgEnv-gnu'] + self.valid_systems = Pchase.valid_systems + self.valid_prog_environs = Pchase.valid_prog_environs self.exclusive_access = True self.build_system = 'Make' self.num_tasks = 0 @@ -58,9 +67,8 @@ def set_gpu_arch(self): class GpuPointerChaseBase(rfm.RunOnlyRegressionTest): def __init__(self): self.depends_on('CompileGpuPointerChase') - self.valid_systems = ['ault:intelv100', 'ault:amdv100', - 'ault:amda100', 'ault:amdvega'] - self.valid_prog_environs = ['PrgEnv-gnu'] + self.valid_systems = Pchase.valid_systems + self.valid_prog_environs = Pchase.valid_prog_environs self.num_tasks = 0 self.num_tasks_per_node = 1 self.exclusive_access = True @@ -89,12 +97,12 @@ def do_sanity_check(self): # Check that every node has the right number of GPUs healthy_nodes = len(set(sn.extractall( - r'^\s*\[([^,]*)\]\s*Found %d device\(s\).' % self.num_gpus_per_node, + r'^\s*\[([^\]]*)\]\s*Found %d device\(s\).' % self.num_gpus_per_node, self.stdout, 1))) # Check that every node has made it to the end. 
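        # (A node that crashes or hangs never prints the completion message,
        # so comparing both counts against job.num_tasks catches unhealthy
        # nodes as well as incomplete runs.)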
nodes_at_end = len(set(sn.extractall(
-            r'^\s*\[([^,]{1,20})\]\s*Pointer chase complete.',
+            r'^\s*\[([^\]]*)\]\s*Pointer chase complete.',
             self.stdout, 1)))
         return sn.evaluate(sn.assert_eq(
             sn.assert_eq(self.job.num_tasks, healthy_nodes),
@@ -105,9 +113,9 @@ class GpuPointerChaseSingle(GpuPointerChaseBase):
     def __init__(self, stride):
         super().__init__()
-
+        self.executable_opts = ['--stride', f'{stride}']
         self.perf_patterns = {
-            'average': sn.min(sn.extractall(r'^\s*\[[^\]]{1,20}\]\s* On device \d+, '
+            'average': sn.min(sn.extractall(r'^\s*\[[^\]]*\]\s* On device \d+, '
                                             r'the chase took on average (\d+) '
                                             r'cycles per node jump.',
                                             self.stdout, 1, int)),
@@ -152,7 +160,7 @@ def __init__(self, stride):
     elif stride == 4:
         self.reference = {
             'ault:amda100': {
-                'average': (118, None, 0.1, 'clock cycles')
+                'average': (198, None, 0.1, 'clock cycles')
             },
             'ault:amdv100': {
                 'average': (200, None, 0.1, 'clock cycles')

From ee42305a5fd05d7616afadde8c14dcdcc9f080de Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Wed, 18 Nov 2020 13:55:21 +0100
Subject: [PATCH 17/51] Add XClocks class to Xdevice lib.

---
 .../src/Xdevice/cuda/tools.hpp             | 57 ++++++++++++++++++
 .../src/Xdevice/hip/tools.hpp              | 59 ++++++++++++++++---
 .../gpu/pointer_chase/pointer_chase.py     | 44 +++++++++++---
 .../gpu/pointer_chase/src/pChase_list.hpp  | 11 ++--
 4 files changed, 151 insertions(+), 20 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
index 102776fbdb..ae628746f6 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
@@ -92,6 +92,63 @@ __device__ __forceinline__ uint64_t XClock64()
   return x;
 }
 
+__device__ __forceinline__ uint32_t XSyncClock()
+{
+  // Clock counter with a preceding barrier.
+  uint32_t x;
+  asm volatile ("bar.sync 0;\n\t"
+                "mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
+  return x;
+}
+
+__device__ __forceinline__ uint64_t XSyncClock64()
+{
+  // Clock counter with a preceding barrier.
+  uint64_t x;
+  asm volatile ("bar.sync 0;\n\t"
+                "mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
+  return x;
+}
+
+
+template < class T = uint32_t >
+class __XClocks
+{
+  /*
+   * XClocks timer tool
+   * Tracks the number of clock cycles between a call to the start
+   * and end member functions.
+   */
+public:
+  T startClock;
+  __device__ void start()
+  {
+    startClock = XSyncClock();
+  }
+  __device__ T end()
+  {
+    return XClock() - startClock;
+  }
+  // Use a data dependency (i.e. store the address of a given variable) to force the compiler to wait.
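+  // Intended use, for illustration:
+  //   XClocks c;
+  //   c.start();
+  //   ... timed work ...
+  //   uint32_t cycles = c.end();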
+
+};
+
+template<>
+void __XClocks<uint64_t>::start()
+{
+  this->startClock = XSyncClock64();
+}
+
+template<>
+uint64_t __XClocks<uint64_t>::end()
+{
+  return XClock64() - this->startClock;
+}
+
+using XClocks64 = __XClocks<uint64_t>;
+using XClocks = __XClocks<>;
+
+
 __device__ __forceinline__ int __smId()
 {
   // SM ID
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
index 27ba7c0302..cc6dbe8450 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
@@ -74,27 +74,70 @@
  */
 
 template< class T >
-__device__ __forceinline__ T __XClock()
+__device__ __forceinline__ T __XSyncClock()
 {
-  // Clock counter
+  // Force the completion of other pending operations before requesting the
+  // clock counter. The clock counter is read "asynchronously" and its value
+  // is not guaranteed to be present in "x" on return.
   uint64_t x;
-  asm volatile ("s_memtime %0; \t\n"
-                "s_waitcnt lgkmcnt(0);"
+  asm volatile (
+    "s_waitcnt vmcnt(0) & vscnt(0) & lgkmcnt(0) & expcnt(0);\n\t"
+    "s_memtime %0;"
     : "=r"(x)
   );
   return (T)x;
 }
 
-__device__ __forceinline__ uint32_t XClock()
+__device__ __forceinline__ uint32_t XSyncClock()
 {
-  return __XClock();
+  return __XSyncClock();
 }
 
-__device__ __forceinline__ uint64_t XClock64()
+__device__ __forceinline__ uint64_t XSyncClock64()
 {
-  return __XClock();
+  return __XSyncClock();
 }
 
+template< class T >
+__device__ __forceinline__ T __XClock()
+{
+  // Retrieve the clock counter and force a wait on the associated
+  // memory operation.
+  uint64_t x;
+  asm volatile ("s_memtime %0; \t\n"
+                "s_waitcnt lgkmcnt(0);"
+                : "=r"(x)
+  );
+  return (T)x;
+}
+
+using XClock = __XClock;
+using XClock64 = __XClock;
+
+
+template < class T = uint32_t>
+class __Xclocks
+{
+  /*
+   * XClocks timer tool
+   * Tracks the number of clock cycles between a call to the start
+   * and end member functions.
+   */
+public:
+  T startClock;
+  __device__ void start()
+  {
+    startClock = __XSyncClock();
+  }
+  __device__ T end()
+  {
+    return __XClock() - startClock;
+  }
+};
+
+using XClocks = __XClocks<>;
+using XClocks64 = __XClocks<uint64_t>;
 
 __device__ __forceinline__ int __smId()
 {
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index ee393cdc25..180ab42021 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -13,8 +13,11 @@ class Pchase:
     '''
     Public storage class to avoid writing the parameters below multiple times.
'''
-    valid_systems = ['ault:intelv100', 'ault:amdv100',
-                     'ault:amda100', 'ault:amdvega']
+    single_device = ['daint:gpu', 'dom:gpu']
+    multi_device = ['ault:intelv100', 'ault:amdv100',
+                    'ault:amda100', 'ault:amdvega',
+                    'tsa:cn']
+    valid_systems = single_device+multi_device
     valid_prog_environs = ['PrgEnv-gnu']
@@ -57,7 +60,7 @@ def set_gpu_arch(self):
         # Deal with the AMD options
         amd_trgt = None
         if cp == 'ault:amdvega':
-            amd_trgt = 'gfx908'
+            amd_trgt = 'gfx906,gfx908'
 
         if amd_trgt:
             self.build_system.cxxflags += [f'--amdgpu-target={amd_trgt}']
@@ -67,7 +70,6 @@ class GpuPointerChaseBase(rfm.RunOnlyRegressionTest):
     def __init__(self):
-        self.depends_on('CompileGpuPointerChase')
         self.valid_prog_environs = Pchase.valid_prog_environs
         self.num_tasks = 0
         self.num_tasks_per_node = 1
@@ -106,16 +108,17 @@ def do_sanity_check(self):
             self.stdout, 1)))
         return sn.evaluate(sn.assert_eq(
             sn.assert_eq(self.job.num_tasks, healthy_nodes),
-           sn.assert_eq(self.job.num_tasks, nodes_at_end)))
+            sn.assert_eq(self.job.num_tasks, nodes_at_end)))
 
-@rfm.parameterized_test([1], [2], [4], [4096])
+#@rfm.parameterized_test([1], [2], [4], [4096])
 class GpuPointerChaseSingle(GpuPointerChaseBase):
     def __init__(self, stride):
         super().__init__()
+        self.valid_systems = Pchase.valid_systems
         self.executable_opts = ['--stride', f'{stride}']
         self.perf_patterns = {
-            'average': sn.min(sn.extractall(r'^\s*\[[^\]]*\]\s* On device \d+, '
+            'average': sn.max(sn.extractall(r'^\s*\[[^\]]*\]\s* On device \d+, '
                                             r'the chase took on average (\d+) '
                                             r'cycles per node jump.',
                                             self.stdout, 1, int)),
@@ -193,3 +196,30 @@ def __init__(self, stride):
             'average': (800, None, 0.1, 'clock cycles')
         },
     }
+
+
+@rfm.simple_test
+class GpuPointerChaseMulti(GpuPointerChaseBase):
+    def __init__(self):
+        super().__init__()
+        self.valid_systems = Pchase.multi_device
+        self.executable_opts = ['--multiGPU']
+        self.perf_patterns = {
+            'average': sn.max(sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+',
+                                            self.stdout, 1, int)),
+        }
+
+        self.reference = {
+            'ault:amda100': {
+                'average': (668, None, 0.1, 'clock cycles')
+            },
+            'ault:amdv100': {
+                'average': (611, None, 0.1, 'clock cycles')
+            },
+            'ault:amdvega': {
+                'average': (1010, None, 0.1, 'clock cycles')
+            },
+            'tsa:cn': {
+                'average': (2760, None, 0.1, 'clock cycles')
+            },
+        }
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
index 91a44b8450..f678e9c3bb 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
@@ -127,12 +127,13 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t
    */
 
 # ifdef TIME_EACH_STEP
-  uint32_t t1 = XClock();
+  XClocks clocks;
+  clocks.start();
 # endif
   (*ptr) = (*ptr)->next;
 # ifdef TIME_EACH_STEP
   (*ptrs) = (Node*)(*ptr);  // Data dep. to prevent ILP.
-  *timer = XClock() - t1;   // Time the jump
+  *timer = clocks.end();    // Time the jump
 # endif
 
   // Keep traversing the list.
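For reference, the recursive template touched in this hunk unrolls completely
at compile time; a hand-unrolled equivalent of nextNode<3>(&ptr, timer, ptrs)
is roughly the following (sketch only, with the TIME_EACH_STEP bookkeeping
elided):

  (*ptr) = (*ptr)->next;   // nextNode<3>
  (*ptr) = (*ptr)->next;   // nextNode<2>
  (*ptr) = (*ptr)->next;   // nextNode<1>
                           // nextNode<0> is the empty specialisation that
                           // stops the recursion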
@@ -159,15 +160,15 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn
 
 #ifndef TIME_EACH_STEP
   // start timer
-  uint32_t start = XClock();
+  XClocks clocks;
+  clocks.start();
 #endif
 
   nextNode(&ptr, s_timer, ptrs);
 
 #ifndef TIME_EACH_STEP
   // end cycle count
-  uint32_t end = XClock();
-  timer[0] = end - start;
+  timer[0] = clocks.end();
 #else
   for (uint32_t i = 0; i < NODES-1; i++)
   {

From 51567a2fad6ab23db1650818e921b8bd85efb4f6 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Wed, 18 Nov 2020 14:54:56 +0100
Subject: [PATCH 18/51] Bugfix in the clocks implementation for hip.

---
 .../memory_bandwidth/src/Xdevice/cuda/tools.hpp |  2 --
 .../memory_bandwidth/src/Xdevice/hip/tools.hpp  | 15 +++++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
index ae628746f6..7f7eed93a4 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
@@ -129,8 +129,6 @@ class __XClocks
   {
     return XClock() - startClock;
   }
-  // Use a data dependency (i.e. store the address of a given variable) to force the compiler to wait.
-
 };
 
 template<>
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
index cc6dbe8450..889bab865d 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
@@ -81,7 +81,7 @@ __device__ __forceinline__ T __XSyncClock()
   // is not guaranteed to be present in "x" on return.
   uint64_t x;
   asm volatile (
-        "s_waitcnt vmcnt(0) & vscnt(0) & lgkmcnt(0) & expcnt(0);\n\t"
+        "s_waitcnt vmcnt(0) & lgkmcnt(0) & expcnt(0);\n\t"
         "s_memtime %0;"
         : "=r"(x)
@@ -111,12 +111,19 @@ __device__ __forceinline__ T __XClock()
   return (T)x;
 }
 
-using XClock = __XClock<uint32_t>;
-using XClock64 = __XClock<uint64_t>;
+__device__ uint32_t XClock()
+{
+  return __XClock<uint32_t>();
+}
+
+__device__ uint64_t XClock64()
+{
+  return __XClock<uint64_t>();
+}
 
 
 template < class T = uint32_t>
-class __Xclocks
+class __XClocks
 {
   /*
    * XClocks timer tool

From 525d25bb68656de7bfe390a736d90bf365ca4f27 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Thu, 19 Nov 2020 13:59:27 +0100
Subject: [PATCH 19/51] Expand pointer_chase checks.
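
These checks build on the per-jump timing fixed in the previous two
patches: every node jump is bracketed by two clock reads, with a data
dependency on the loaded pointer so the compiler cannot overlap
consecutive jumps. A minimal CUDA sketch of that pattern (XClocks64
and Node as defined in pChase_list.hpp; the sink argument is only
there to illustrate the data dependency):

    __device__ uint64_t timed_jump(Node **ptr, Node **sink)
    {
        XClocks64 clocks;
        clocks.start();       // synchronised clock read
        *ptr = (*ptr)->next;  // the jump being timed
        *sink = *ptr;         // data dep. that blocks ILP across jumps
        return clocks.end();  // plain clock read; returns the delta
    }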
---
 .../gpu/pointer_chase/pointer_chase.py        | 285 ++++++++++++++++--
 .../gpu/pointer_chase/src/pointer_chase.cu    |  38 ++-
 2 files changed, 297 insertions(+), 26 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index 180ab42021..50f661ff9c 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -7,7 +7,7 @@
 import reframe as rfm
 import os
-
+from math import ceil
 
 class Pchase:
     '''
@@ -21,6 +21,11 @@ class Pchase:
     valid_prog_environs = ['PrgEnv-gnu']
 
 
+#
+# PChase tests tracking the averaged latencies for all node jumps
+#
+
+
 @rfm.simple_test
 class CompileGpuPointerChase(rfm.CompileOnlyRegressionTest):
     def __init__(self):
@@ -48,10 +53,12 @@ def set_gpu_arch(self):
 
         # Deal with the NVIDIA options first
         nvidia_sm = None
-        if cp[-4:] == 'v100':
+        if cp in {'tsa:cn', 'ault:intelv100', 'ault:amdv100'}:
             nvidia_sm = '70'
-        elif cp[-4:] == 'a100':
+        elif cp == 'ault:amda100':
             nvidia_sm = '80'
+        elif cp in {'dom:gpu', 'daint:gpu'}:
+            nvidia_sm = '60'
 
         if nvidia_sm:
             self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
@@ -69,7 +76,6 @@
 
 class GpuPointerChaseBase(rfm.RunOnlyRegressionTest):
     def __init__(self):
-        self.depends_on('CompileGpuPointerChase')
         self.valid_prog_environs = Pchase.valid_prog_environs
         self.num_tasks = 0
         self.num_tasks_per_node = 1
@@ -77,11 +83,6 @@ def __init__(self):
         self.sanity_patterns = self.do_sanity_check()
         self.maintainers = ['JO']
 
-    @rfm.require_deps
-    def set_executable(self, CompileGpuPointerChase):
-        self.executable = os.path.join(
-            CompileGpuPointerChase().stagedir, 'pChase.x')
-
     @rfm.run_before('run')
     def set_num_gpus_per_node(self):
         cp = self.current_partition.fullname
@@ -98,21 +99,33 @@ def set_num_gpus_per_node(self):
 
     def do_sanity_check(self):
         # Check that every node has the right number of GPUs
-        healthy_nodes = len(set(sn.extractall(
+        # Store these nodes in case they're used later by the perf functions.
+        self.my_nodes = set(sn.extractall(
             r'^\s*\[([^\]]*)\]\s*Found %d device\(s\).' %
             self.num_gpus_per_node,
-            self.stdout, 1)))
+            self.stdout, 1))
 
         # Check that every node has made it to the end.
nodes_at_end = len(set(sn.extractall( r'^\s*\[([^\]]*)\]\s*Pointer chase complete.', self.stdout, 1))) return sn.evaluate(sn.assert_eq( - sn.assert_eq(self.job.num_tasks, healthy_nodes), + sn.assert_eq(self.job.num_tasks, len(self.my_nodes)), sn.assert_eq(self.job.num_tasks, nodes_at_end))) -#@rfm.parameterized_test([1], [2], [4], [4096]) -class GpuPointerChaseSingle(GpuPointerChaseBase): +class GpuPointerChaseDep(GpuPointerChaseBase): + def __init__(self): + super().__init__() + self.depends_on('CompileGpuPointerChase') + + @rfm.require_deps + def set_executable(self, CompileGpuPointerChase): + self.executable = os.path.join( + CompileGpuPointerChase().stagedir, 'pChase.x') + + +@rfm.parameterized_test([1], [2], [4], [4096]) +class GpuPointerChaseSingle(GpuPointerChaseDep): def __init__(self, stride): super().__init__() self.valid_systems = Pchase.valid_systems @@ -166,7 +179,7 @@ def __init__(self, stride): 'average': (198, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (200, None, 0.1, 'clock cycles') + 'average': (204, None, 0.1, 'clock cycles') }, 'dom:gpu': { 'average': (260, None, 0.1, 'clock cycles') @@ -199,10 +212,10 @@ def __init__(self, stride): @rfm.simple_test -class GpuPointerChaseMulti(GpuPointerChaseBase): +class GpuPointerChaseMultiAgg(GpuPointerChaseDep): def __init__(self): super().__init__() - self.valid_systems = Pchase.mulit_device + self.valid_systems = Pchase.multi_device self.executable_opts = ['--multiGPU'] self.perf_patterns = { 'average': sn.max(sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+', @@ -223,3 +236,241 @@ def __init__(self): 'average': (2760, None, 0.1, 'clock cycles') }, } + + +# +# PChase tests tracking the individual latencies of each node jump +# + + +@rfm.simple_test +class CompileGpuPointerChaseFine(CompileGpuPointerChase): + ''' + Compile the pChase code to time each node jump. + ''' + def __init__(self): + super().__init__() + + @rfm.run_before('compile') + def set_cxxflags(self): + self.build_system.cxxflags += ['-DTIME_EACH_STEP'] + + +class GpuPointerChaseFineDep(GpuPointerChaseBase): + def __init__(self): + super().__init__() + self.depends_on('CompileGpuPointerChaseFine') + + @rfm.require_deps + def set_executable(self, CompileGpuPointerChaseFine): + self.executable = os.path.join( + CompileGpuPointerChaseFine().stagedir, 'pChase.x') + + @sn.sanity_function + def get_all_latencies(self, pattern): + return sn.extractall(pattern, self.stdout, 1, int) + + +class L1_filter: + def filter_out_L1_hits(self, L1, all_latencies): + ''' + Return a list with the latencies that are above 20% L1. + ''' + return list(filter(lambda x: x>1.2*L1, all_latencies)) + + +@rfm.simple_test +class GpuPointerChaseL1(GpuPointerChaseFineDep, L1_filter): + ''' + Check L1 latency, L1 miss rate and average latency of an L1 miss. 
+ ''' + def __init__(self): + super().__init__() + self.valid_systems = Pchase.valid_systems + self.perf_patterns = { + 'L1_latency': self.max_L1_latency(), + 'L1_miss_rate': self.L1_miss_rate(), + 'L1_miss_latency': self.L1_miss_latency(), + } + + self.reference = { + 'dom:gpu': { + 'L1_latency': (112, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'L1_latency': (112, None, 0.1, 'clock cycles') + }, + 'ault:amda100': { + 'L1_latency': (70, None, 0.1, 'clock cycles'), + 'L1_misses': (25.4, None, 0.1, '%'), + }, + 'ault:amdv100': { + 'L1_latency': (39, None, 0.1, 'clock cycles'), + 'L1_misses': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (208, None, 0.1, 'clock cycles'), + }, + 'ault:amdvega': { + 'L1_latency': (164, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (23.8, None, 0.1, '%'), + 'L1_miss_latency': (840, None, 0.1, 'clock cycles'), + }, + } + + @staticmethod + def target_str(node, device): + return r'^\s*\[%s\]\[device %d\]\s*(\d+)' % (node, device) + + @sn.sanity_function + def max_L1_latency(self): + ''' + Max. L1 latency amongst all devices. + ''' + l1_latency = [] + for n in self.my_nodes: + for d in range(self.num_gpus_per_node): + l1_latency.append( + sn.min(self.get_all_latencies(self.target_str(n,d))) + ) + + # Return the data from the worst performing device + return sn.max(l1_latency) + + def get_L1_misses(self, n, d, all_latencies=None): + ''' + The idea here is to get the lowest value and model the L1 hits as the + values with a latency up to 20% higher than this lowest value. Every + other node jump with a higher latency will be counted as an L1 miss. + ''' + if all_latencies is None: + all_latencies = self.get_all_latencies(self.target_str(n,d)) + + L1 = sn.min(all_latencies) + return self.filter_out_L1_hits(L1, all_latencies) + + @sn.sanity_function + def L1_miss_rate(self): + ''' + Calculate the rate of L1 misses based on the model implemented by the + get_L1_misses sanity function. Return the worst performing rate from + all nodes/devices. + ''' + l1_miss_rate = [] + for n in self.my_nodes: + for d in range(self.num_gpus_per_node): + all_lat = sn.evaluate( + self.get_all_latencies(self.target_str(n,d)) + ) + l1_miss_rate.append( + len(self.get_L1_misses(n,d,all_lat))/len(all_lat) + ) + + return max(l1_miss_rate)*100 + + @sn.sanity_function + def L1_miss_latency(self): + ''' + Count the average number of cycles taken only by the node jumps + with an L1 miss. Return the worst performing values for all + nodes/devices. + ''' + l1_miss_latency = [] + for n in self.my_nodes: + for d in range(self.num_gpus_per_node): + l1_miss_latency.append( + ceil(sn.evaluate(sn.avg(self.get_L1_misses(n,d)))) + ) + + return max(l1_miss_latency) + + +@rfm.simple_test +class GpuPointerChaseL1P2P(GpuPointerChaseFineDep, L1_filter): + ''' + Pointer chase through P2P, checking L1 miss rates and L1 miss + latency averaged amogst all devices in each node. 
+ ''' + def __init__(self): + super().__init__() + self.valid_systems = Pchase.multi_device + self.executable_opts = ['--multiGPU'] + self.perf_patterns = { + 'L1_latency': self.max_L1_latency(), + 'L1_miss_rate': self.L1_miss_rate(), + 'L1_miss_latency': self.L1_miss_latency() + } + self.reference = { + 'ault:amda100': { + 'L1_latency': (70, None, 0.1, 'clock cycles'), + }, + 'ault:amdv100': { + 'L1_latency': (39, None, 0.1, 'clock cycles'), + }, + 'ault:amdvega': { + 'L1_latency': (164, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (19.3, None, 0.1, '%'), + 'L1_miss_latency': (2200, None, 0.1, 'clock cycles'), + }, + } + + @staticmethod + def target_str(node, d1, d2): + return r'^\s*\[%s\]\[device %d\]\[device %d\]\s*(\d+)' % (node, d1, d2) + + @sn.sanity_function + def max_L1_latency(self): + ''' + Max. L1 latency amongst all devices. + ''' + l1_latency = [] + for n in self.my_nodes: + for d1 in range(self.num_gpus_per_node): + for d2 in range(self.num_gpus_per_node): + l1_latency.append( + sn.min(self.get_all_latencies( + self.target_str(n, d1, d2)) + ) + ) + + # Return the data from the worst performing device + return sn.max(l1_latency) + + @sn.sanity_function + def L1_miss_rate(self): + ''' + Calculates the L1 miss rate across P2P list traversals. + ''' + total_node_jumps = 0 + total_L1_misses = 0 + for n in self.my_nodes: + for d1 in range(self.num_gpus_per_node): + for d2 in range(self.num_gpus_per_node): + if(d1 != d2): + all_lat = sn.evaluate(self.get_all_latencies( + self.target_str(n, d1, d2) + )) + L1 = min(all_lat) + total_L1_misses += len( + self.filter_out_L1_hits(L1, all_lat) + ) + total_node_jumps += len(all_lat) + + return total_L1_misses/total_node_jumps + + @sn.sanity_function + def L1_miss_latency(self): + ''' + Calculate the latency of all L1 misses across all P2P list traversals + ''' + L1_misses = [] + for n in self.my_nodes: + for d1 in range(self.num_gpus_per_node): + for d2 in range(self.num_gpus_per_node): + if (d1 != d2): + all_lat = sn.evaluate(self.get_all_latencies( + self.target_str(n, d1, d2) + )) + L1 = min(all_lat) + L1_misses += self.filter_out_L1_hits(L1, all_lat) + + return int(sn.evaluate(sn.avg(L1_misses))) + diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index aa1ac35e45..431beadfb8 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -175,7 +175,7 @@ void print_device_table(int num_devices, std::queue q, const char * wh } template < class LIST > -void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid) +void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid, int summarize) { /* * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. 
@@ -220,8 +220,18 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t # ifndef TIME_EACH_STEP q_average.push(fetch(timer_ptr)); # else - q_min.push(fetchMin(timer_ptr)); - q_max.push(fetchMax(timer_ptr)); + if (summarize) + { + q_min.push(fetchMin(timer_ptr)); + q_max.push(fetchMax(timer_ptr)); + } + else + { + for (int n = 0; n < NODES-1; n++) + { + printf("[%s][device %d][device %d] %d\n", nid, j, i, timer_ptr[n]); + } + } # endif delete [] timer_ptr; } @@ -232,11 +242,14 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t what = "Average"; print_device_table(num_devices, q_average, what.c_str(), nid); # else - what = "Min."; - print_device_table(num_devices, q_min, what.c_str(), nid); - printf("\n"); - what = "Max."; - print_device_table(num_devices, q_max, what.c_str(), nid); + if (summarize) + { + what = "Min."; + print_device_table(num_devices, q_min, what.c_str(), nid); + printf("\n"); + what = "Max."; + print_device_table(num_devices, q_max, what.c_str(), nid); + } # endif } @@ -249,6 +262,7 @@ int main(int argc, char ** argv) size_t stride = 1; size_t buffSize = NODES*stride; int multiGPU = 0; + int print_mode = 0; // Parse the command line args. for (int i = 0; i < argc; i++) @@ -263,6 +277,8 @@ int main(int argc, char ** argv) std::cout << " The number indicates the size of the buffer in list nodes." << std::endl; std::cout << "--multiGPU : Runs the pointer chase algo using all device-pair combinations." << std::endl; std::cout << " This measures the device-to-device memory latency." << std::endl; + std::cout << "--summary : When timing each node jump individually and used alongside --multiGPU, " << std::endl; + std::cout << " this collapses the output into two tables with the min and max latencies." << std::endl; std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; return 0; } @@ -284,6 +300,10 @@ int main(int argc, char ** argv) { multiGPU = 1; } + else if (str == "--summary") + { + print_mode = 1; + } } // Sanity of the command line args. @@ -316,7 +336,7 @@ int main(int argc, char ** argv) } else { - remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name, print_mode); } printf("[%s] Pointer chase complete.\n", nid_name); From c481ee9a5bc3f6fe506b2cedb96963a891787d3d Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 19 Nov 2020 14:47:44 +0100 Subject: [PATCH 20/51] Add tsa references. 
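
A note on how the averaged P2P figures added here are derived: the
benchmark prints a cumulative latency table in which each row sums the
per-jump cost against the other (num_gpus_per_node - 1) peer devices,
and average_P2P_latency divides the worst row by that count. As a
worked example with the numbers in this patch: the ault:amda100
partition has 4 devices, so the old cumulative reference of 668 cycles
becomes 668 / (4 - 1), roughly 223 cycles per peer.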
--- .../gpu/pointer_chase/pointer_chase.py | 125 ++++++++++++------ 1 file changed, 85 insertions(+), 40 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 50f661ff9c..2e33523743 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -86,7 +86,9 @@ def __init__(self): @rfm.run_before('run') def set_num_gpus_per_node(self): cp = self.current_partition.fullname - if cp in {'ault:intelv100', 'ault:amda100'}: + if cp == 'tsa:cn': + self.num_gpus_per_node = 8 + elif cp in {'ault:intelv100', 'ault:amda100'}: self.num_gpus_per_node = 4 elif cp in {'ault:amdv100'}: self.num_gpus_per_node = 2 @@ -126,117 +128,148 @@ def set_executable(self, CompileGpuPointerChase): @rfm.parameterized_test([1], [2], [4], [4096]) class GpuPointerChaseSingle(GpuPointerChaseDep): + ''' + Pointer chase on a single device with increasing stride. + ''' def __init__(self, stride): super().__init__() self.valid_systems = Pchase.valid_systems self.executable_opts = ['--stride', f'{stride}'] self.perf_patterns = { - 'average': sn.max(sn.extractall(r'^\s*\[[^\]]*\]\s* On device \d+, ' - r'the chase took on average (\d+) ' - r'cycles per node jump.', - self.stdout, 1, int)), + 'average_latency': sn.max(sn.extractall( + r'^\s*\[[^\]]*\]\s* On device \d+, ' + r'the chase took on average (\d+) ' + r'cycles per node jump.', self.stdout, 1, int) + ), } if stride == 1: self.reference = { + 'tsa:cn': { + 'average_latency': (80, None, 0.1, 'clock cycles') + }, 'ault:amda100': { - 'average': (76, None, 0.1, 'clock cycles') + 'average_latency': (76, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (77, None, 0.1, 'clock cycles') + 'average_latency': (77, None, 0.1, 'clock cycles') }, 'dom:gpu': { - 'average': (143, None, 0.1, 'clock cycles') + 'average_latency': (143, None, 0.1, 'clock cycles') }, 'daint:gpu': { - 'average': (143, None, 0.1, 'clock cycles') + 'average_latency': (143, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 'average': (225, None, 0.1, 'clock cycles') + 'average_latency': (225, None, 0.1, 'clock cycles') }, } elif stride == 2: self.reference = { + 'tsa:cn': { + 'average_latency': (120, None, 0.1, 'clock cycles') + }, 'ault:amda100': { - 'average': (116, None, 0.1, 'clock cycles') + 'average_latency': (116, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (118, None, 0.1, 'clock cycles') + 'average_latency': (118, None, 0.1, 'clock cycles') }, 'dom:gpu': { - 'average': (181, None, 0.1, 'clock cycles') + 'average_latency': (181, None, 0.1, 'clock cycles') }, 'daint:gpu': { - 'average': (181, None, 0.1, 'clock cycles') + 'average_latency': (181, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 'average': (300, None, 0.1, 'clock cycles') + 'average_latency': (300, None, 0.1, 'clock cycles') }, } elif stride == 4: self.reference = { + 'tsa:cn': { + 'average_latency': (204, None, 0.1, 'clock cycles') + }, 'ault:amda100': { - 'average': (198, None, 0.1, 'clock cycles') + 'average_latency': (198, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (204, None, 0.1, 'clock cycles') + 'average_latency': (204, None, 0.1, 'clock cycles') }, 'dom:gpu': { - 'average': (260, None, 0.1, 'clock cycles') + 'average_latency': (260, None, 0.1, 'clock cycles') }, 'daint:gpu': { - 'average': (260, None, 0.1, 'clock cycles') + 'average_latency': (260, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 
'average': (470, None, 0.1, 'clock cycles') + 'average_latency': (470, None, 0.1, 'clock cycles') }, } elif stride == 4096: self.reference = { + 'tsa:cn': { + 'average_latency': (220, None, 0.1, 'clock cycles') + }, 'ault:amda100': { - 'average': (206, None, 0.1, 'clock cycles') + 'average_latency': (206, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (220, None, 0.1, 'clock cycles') + 'average_latency': (220, None, 0.1, 'clock cycles') }, 'dom:gpu': { - 'average': (260, None, 0.1, 'clock cycles') + 'average_latency': (260, None, 0.1, 'clock cycles') }, 'daint:gpu': { - 'average': (260, None, 0.1, 'clock cycles') + 'average_latency': (260, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 'average': (800, None, 0.1, 'clock cycles') + 'average_latency': (800, None, 0.1, 'clock cycles') }, } @rfm.simple_test -class GpuPointerChaseMultiAgg(GpuPointerChaseDep): +class GpuPointerChaseAverageP2PLatency(GpuPointerChaseDep): + ''' + Average inter-node P2P latency. + ''' def __init__(self): super().__init__() self.valid_systems = Pchase.multi_device self.executable_opts = ['--multiGPU'] self.perf_patterns = { - 'average': sn.max(sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+', - self.stdout, 1, int)), + 'average_latency': self.average_P2P_latency(), } self.reference = { 'ault:amda100': { - 'average': (668, None, 0.1, 'clock cycles') + 'average_latency': (223, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'average': (611, None, 0.1, 'clock cycles') + 'average_latency': (611, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 'average': (1010, None, 0.1, 'clock cycles') + 'average_latency': (336, None, 0.1, 'clock cycles') }, 'tsa:cn': { - 'average': (2760, None, 0.1, 'clock cycles') + 'average_latency': (394, None, 0.1, 'clock cycles') }, } + @sn.sanity_function + def average_P2P_latency(self): + ''' + Extract the average P2P latency. Note that the pChase code + returns a table with the cummulative latency for all P2P + list traversals. + ''' + return int(sn.evaluate(sn.max(sn.extractall( + r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+', + self.stdout, 1, int) + ))/(self.num_gpus_per_node-1) + ) + # # PChase tests tracking the individual latencies of each node jump @@ -272,17 +305,19 @@ def get_all_latencies(self, pattern): class L1_filter: - def filter_out_L1_hits(self, L1, all_latencies): + def filter_out_L1_hits(self, threshold, all_latencies): ''' - Return a list with the latencies that are above 20% L1. + Return a list with the latencies that are above 20% threshold. ''' - return list(filter(lambda x: x>1.2*L1, all_latencies)) + return list(filter(lambda x: x>1.2*threshold, all_latencies)) @rfm.simple_test class GpuPointerChaseL1(GpuPointerChaseFineDep, L1_filter): ''' - Check L1 latency, L1 miss rate and average latency of an L1 miss. + Pointer chase for all the devices present on each node. + The traversal is done with unit stride, checking the L1 latency, + L1 miss rate and average latency of an L1 miss. 
''' def __init__(self): super().__init__() @@ -300,6 +335,11 @@ def __init__(self): 'daint:gpu': { 'L1_latency': (112, None, 0.1, 'clock cycles') }, + 'tsa:cn': { + 'L1_latency': (38, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (240, None, 0.1, 'clock cycles'), + }, 'ault:amda100': { 'L1_latency': (70, None, 0.1, 'clock cycles'), 'L1_misses': (25.4, None, 0.1, '%'), @@ -337,9 +377,9 @@ def max_L1_latency(self): def get_L1_misses(self, n, d, all_latencies=None): ''' - The idea here is to get the lowest value and model the L1 hits as the - values with a latency up to 20% higher than this lowest value. Every - other node jump with a higher latency will be counted as an L1 miss. + The idea here is to get the lowest value and model the L1 hits as + implemented in the self.filter_out_L1_hits function. Every + node jump returned by this function will be counted as an L1 miss. ''' if all_latencies is None: all_latencies = self.get_all_latencies(self.target_str(n,d)) @@ -399,6 +439,11 @@ def __init__(self): 'L1_miss_latency': self.L1_miss_latency() } self.reference = { + 'tsa:cn': { + 'L1_latency': (38, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (1463, None, 0.1, 'clock cycles'), + }, 'ault:amda100': { 'L1_latency': (70, None, 0.1, 'clock cycles'), }, @@ -454,7 +499,7 @@ def L1_miss_rate(self): ) total_node_jumps += len(all_lat) - return total_L1_misses/total_node_jumps + return (total_L1_misses/total_node_jumps)*100 @sn.sanity_function def L1_miss_latency(self): From b3ea42c558d4e8cc857e6e0f9e4af0add1c255c6 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Thu, 19 Nov 2020 14:57:26 +0100 Subject: [PATCH 21/51] Update a100 refs. --- .../microbenchmarks/gpu/pointer_chase/pointer_chase.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 2e33523743..8e45a0911a 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -343,6 +343,7 @@ def __init__(self): 'ault:amda100': { 'L1_latency': (70, None, 0.1, 'clock cycles'), 'L1_misses': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (243, None, 0.1, 'clock cycles'), }, 'ault:amdv100': { 'L1_latency': (39, None, 0.1, 'clock cycles'), @@ -446,9 +447,13 @@ def __init__(self): }, 'ault:amda100': { 'L1_latency': (70, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (822, None, 0.1, 'clock cycles'), }, 'ault:amdv100': { 'L1_latency': (39, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (25.4, None, 0.1, '%'), + 'L1_miss_latency': (2620, None, 0.1, 'clock cycles'), }, 'ault:amdvega': { 'L1_latency': (164, None, 0.1, 'clock cycles'), From 8280c183916ad35a0c55f067de52be0634b51148 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 19 Nov 2020 15:11:34 +0100 Subject: [PATCH 22/51] Update refs for dom/daint. 
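
These reference updates lean on the L1 model used by the fine-grained
checks: the fastest observed jump is taken as the L1 hit latency, and
any jump more than 20% slower counts as an L1 miss. A self-contained
C++ sketch of that classification (illustrative only; the checks
implement the same filter in Python in the L1_filter class):

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <vector>

    // Jumps more than 20% above the best observed latency are misses.
    std::vector<uint32_t> l1_misses(const std::vector<uint32_t> &lat)
    {
        uint32_t l1 = *std::min_element(lat.begin(), lat.end());
        std::vector<uint32_t> misses;
        std::copy_if(lat.begin(), lat.end(), std::back_inserter(misses),
                     [=](uint32_t x) { return x > 1.2 * l1; });
        return misses;
    }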
---
 .../gpu/pointer_chase/pointer_chase.py        | 17 +++++++++++++---
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index 8e45a0911a..40fac67c5a 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -47,7 +47,7 @@ def select_makefile(self):
         else:
             self.prebuild_cmds = ['cp makefile.cuda Makefile']
 
-    @rfm.run_before('compile')
+    @rfm.run_after('setup')
     def set_gpu_arch(self):
         cp = self.current_partition.fullname
 
@@ -62,7 +62,10 @@ def set_gpu_arch(self):
 
         if nvidia_sm:
             self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
-            self.modules += ['cuda']
+            if cp in {'dom:gpu', 'daint:gpu'}:
+                self.modules += ['cudatoolkit']
+            else:
+                self.modules += ['cuda']
 
         # Deal with the AMD options
         amd_trgt = None
@@ -330,10 +333,16 @@ def __init__(self):
 
         self.reference = {
             'dom:gpu': {
-                'L1_latency': (112, None, 0.1, 'clock cycles')
+                'L1_latency': (112, None, 0.1, 'clock cycles'),
+                'L1_miss_rate': (33.3, None, 0.1, '%'),
+                'L1_miss_latency': (268, None, 0.1, 'clock cycles'),
+
             },
             'daint:gpu': {
-                'L1_latency': (112, None, 0.1, 'clock cycles')
+                'L1_latency': (112, None, 0.1, 'clock cycles'),
+                'L1_miss_rate': (33.3, None, 0.1, '%'),
+                'L1_miss_latency': (268, None, 0.1, 'clock cycles'),
+
             },
             'tsa:cn': {
                 'L1_latency': (38, None, 0.1, 'clock cycles'),

From 8056cd6a50104202df1a0c3db3b19715f417fcf2 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Fri, 20 Nov 2020 11:33:45 +0100
Subject: [PATCH 23/51] Add clock latency check.

---
 .../src/Xdevice/cuda/tools.hpp                |  8 +++++
 .../src/Xdevice/hip/tools.hpp                 | 32 +++++++++++++++--
 .../gpu/pointer_chase/pointer_chase.py        | 35 ++++++++++++++++---
 .../gpu/pointer_chase/src/pChase_list.hpp     | 24 +++++++------
 .../gpu/pointer_chase/src/pointer_chase.cu    | 22 ++++++++++--
 5 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
index 7f7eed93a4..b3e6b8f4af 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/tools.hpp
@@ -147,6 +147,14 @@ using XClocks64 = __XClocks<uint64_t>;
 using XClocks = __XClocks<>;
 
 
+template <class T>
+__device__ T XClockLatency()
+{
+  uint64_t start = XClock64();
+  uint64_t end = XClock64();
+  return (T)(end-start);
+}
+
 __device__ __forceinline__ int __smId()
 {
   // SM ID
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
index 889bab865d..efa1075a77 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp
@@ -83,7 +83,7 @@ __device__ __forceinline__ T __XSyncClock()
   asm volatile (
         "s_waitcnt vmcnt(0) & lgkmcnt(0) & expcnt(0);\n\t"
         "s_memtime %0;"
-        : "=r"(x)
+        : "=s"(x)
   );
   return (T)x;
 }
@@ -106,7 +106,7 @@ __device__ __forceinline__ T __XClock()
   uint64_t x;
   asm volatile ("s_memtime %0; \t\n"
                 "s_waitcnt lgkmcnt(0);"
-                : "=r"(x)
+                : "=s"(x)
   );
   return (T)x;
 }
@@ -146,6 +146,34 @@ using XClocks = __XClocks<>;
 using XClocks64 = __XClocks<uint64_t>;
 
 
+__device__ void __clockLatency64( uint64_t * clk)
+{
+  /*
+   * There's a bit of a weird compiler behaviour when computing the
+   * clock latency by doing 2 consecutive calls to XClock. To go
+   * around this issue, we implement this straight with inline asm.
+   */
+  uint64_t c0, c1;
+  asm volatile ("s_memtime %[a];\n\t"
+                "s_waitcnt lgkmcnt(0);\n\t"
+                "s_memtime %[b];\n\t"
+                "s_waitcnt lgkmcnt(0);\n\t"
+                "s_mov_b64 %[c] %[a];\n\t"
+                "s_mov_b64 %[d] %[b];\n\t"
+                "s_waitcnt lgkmcnt(0);\n\t"
+                :[a]"=s"(c0), [b]"=s"(c1), [c]"=r"(clk[0]), [d]"=r"(clk[1]) :: "memory");
+}
+
+
+template <class T>
+__device__ T XClockLatency()
+{
+  uint64_t c[2];
+  __clockLatency64(c);
+  return (T)(c[1]-c[0]);
+}
+
+
 __device__ __forceinline__ int __smId()
 {
   // NOT possible to retrieve the workgroup ID with AMD GPUs
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index 40fac67c5a..88a9168e60 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -129,6 +129,33 @@ def set_executable(self, CompileGpuPointerChase):
         CompileGpuPointerChase().stagedir, 'pChase.x')
 
 
+@rfm.simple_test
+class GpuPointerChaseClockLatency(GpuPointerChaseDep):
+    '''
+    Check the clock latencies. This can be thought of the
+    measuring error.
+    '''
+    def __init__(self):
+        super().__init__()
+        self.valid_systems = Pchase.valid_systems
+        self.executable_opts = ['--clock']
+        self.perf_patterns = {
+            'clock_latency': sn.max(sn.extractall(
+                r'^\s*\[[^\]]*\]\s*The clock latency on device \d+ '
+                r'is (\d+) cycles.', self.stdout, 1, int)
+            ),
+        }
+
+        self.reference = {
+            'ault:amda100': {
+                'clock_latency': (7, None, 0.1, 'cycles'),
+            },
+            'ault:amdv100': {
+                'clock_latency': (8, None, 0.1, 'cycles'),
+            },
+        }
+
+
 @rfm.parameterized_test([1], [2], [4], [4096])
 class GpuPointerChaseSingle(GpuPointerChaseDep):
     '''
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
index f678e9c3bb..1333ee446c 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp
@@ -7,18 +7,23 @@
  */
 
 
-__device__ uint32_t __clockLatency()
+__global__ void clockLatency(int * clk)
 {
-  uint32_t start = XClock();
-  uint32_t end = XClock();
-  return end-start;
+  clk[0] = XClockLatency<int>();
 }
 
-__global__ void clockLatency()
+void printClockLatency(char * nid, int dev)
 {
-  uint32_t clkLatency = __clockLatency();
-  printf(" - Clock latency is %d.\n", clkLatency);
+  int * clk_d;
+  int clk;
+  XSetDevice(dev);
+  XMalloc((void**)&clk_d, sizeof(int));
+  clockLatency<<<1,1>>>(clk_d);
+  XDeviceSynchronize();
+
XMemcpy(&clk, clk_d, sizeof(int), XMemcpyDeviceToHost); + XFree(clk_d); + printf("[%s] The clock latency on device %d is %d cycles.\n", nid, dev, clk); } @@ -127,7 +132,7 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t */ # ifdef TIME_EACH_STEP - XClocks clocks; + XClocks64 clocks; clocks.start(); # endif (*ptr) = (*ptr)->next; @@ -160,7 +165,7 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn #ifndef TIME_EACH_STEP // start timer - XClocks clocks; + XClocks64 clocks; clocks.start(); #endif @@ -232,7 +237,6 @@ struct List printf(" - Node size: %lu\n", sizeof(Node)); printf(" - Number of nodes: %lu:\n", n); printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024); - clockLatency<<<1,1>>>(); XDeviceSynchronize(); } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 431beadfb8..28b6b9f421 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -263,6 +263,7 @@ int main(int argc, char ** argv) size_t buffSize = NODES*stride; int multiGPU = 0; int print_mode = 0; + int clock = 0; // Parse the command line args. for (int i = 0; i < argc; i++) @@ -279,6 +280,7 @@ int main(int argc, char ** argv) std::cout << " This measures the device-to-device memory latency." << std::endl; std::cout << "--summary : When timing each node jump individually and used alongside --multiGPU, " << std::endl; std::cout << " this collapses the output into two tables with the min and max latencies." << std::endl; + std::cout << "--clock : Skip all the above and just print the clock latency for all devices." << std::endl; std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; return 0; } @@ -304,6 +306,10 @@ int main(int argc, char ** argv) { print_mode = 1; } + else if (str == "--clock") + { + clock = 1; + } } // Sanity of the command line args. @@ -330,13 +336,23 @@ int main(int argc, char ** argv) printf("[%s] Found %d device(s).\n", nid_name, num_devices); } - if (!multiGPU) + if (clock) { - localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + for (int i = 0; i < num_devices; i++) + { + printClockLatency(nid_name,i); + } } else { - remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name, print_mode); + if (!multiGPU) + { + localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + } + else + { + remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name, print_mode); + } } printf("[%s] Pointer chase complete.\n", nid_name); From c7b23f2b7bcb997a01054aa763f082a0b33de570 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Fri, 20 Nov 2020 11:51:38 +0100 Subject: [PATCH 24/51] Add refs for daint, dom and tsa. 
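
These references record the cost of one clock read, which is the
measurement floor for all the timed chases: no node jump can appear
faster than a back-to-back pair of reads. A stand-alone CUDA sketch of
the same measurement using the native clock64() counter instead of the
Xdevice wrappers (hypothetical, for illustration only):

    #include <cstdio>

    __global__ void clock_read_latency(long long *out)
    {
        long long c0 = clock64();  // first read
        long long c1 = clock64();  // read again immediately
        *out = c1 - c0;            // cycles consumed by a single read
    }

    int main()
    {
        long long *d_lat, h_lat;
        cudaMalloc(&d_lat, sizeof(long long));
        clock_read_latency<<<1, 1>>>(d_lat);
        cudaMemcpy(&h_lat, d_lat, sizeof(long long),
                   cudaMemcpyDeviceToHost);
        std::printf("clock read latency: %lld cycles\n", h_lat);
        cudaFree(d_lat);
        return 0;
    }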
--- .../gpu/pointer_chase/pointer_chase.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 88a9168e60..90e96e5fd5 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -147,12 +147,24 @@ def __init__(self): } self.reference = { + 'daint:gpu': { + 'clock_latency': (56, None, 0.1, 'cycles'), + }, + 'dom:gpu': { + 'clock_latency': (56, None, 0.1, 'cycles'), + }, + 'tsa:cn': { + 'clock_latency': (8, None, 0.1, 'cycles'), + }, 'ault:amda100': { 'clock_latency': (7, None, 0.1, 'cycles'), }, 'ault:amdv100': { 'clock_latency': (8, None, 0.1, 'cycles'), }, + 'ault:amdvega': { + 'clock_latency': (40, None, 0.1, 'cycles'), + }, } From a44c67e070a1c5e1269f4d4627618e99862b4dbb Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Fri, 20 Nov 2020 17:40:23 +0100 Subject: [PATCH 25/51] Fix PEP8 issues and comments to the src code. --- .../gpu/pointer_chase/pointer_chase.py | 30 +++++++++++-------- .../src/{pChase_list.hpp => linked_list.hpp} | 15 +++++++--- .../gpu/pointer_chase/src/pointer_chase.cu | 11 +++++-- 3 files changed, 38 insertions(+), 18 deletions(-) rename cscs-checks/microbenchmarks/gpu/pointer_chase/src/{pChase_list.hpp => linked_list.hpp} (93%) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 90e96e5fd5..fdde47c306 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -9,6 +9,7 @@ import os from math import ceil + class Pchase: ''' Public storage class to avoid writing the parameters below multiple times. @@ -115,7 +116,7 @@ def do_sanity_check(self): self.stdout, 1))) return sn.evaluate(sn.assert_eq( sn.assert_eq(self.job.num_tasks, len(self.my_nodes)), - sn.assert_eq(self.job.num_tasks, nodes_at_end))) + sn.assert_eq(self.job.num_tasks, nodes_at_end))) class GpuPointerChaseDep(GpuPointerChaseBase): @@ -135,6 +136,7 @@ class GpuPointerChaseClockLatency(GpuPointerChaseDep): Check the clock latencies. This can be thought of the measuring error. ''' + def __init__(self): super().__init__() self.valid_systems = Pchase.valid_systems @@ -173,6 +175,7 @@ class GpuPointerChaseSingle(GpuPointerChaseDep): ''' Pointer chase on a single device with increasing stride. ''' + def __init__(self, stride): super().__init__() self.valid_systems = Pchase.valid_systems @@ -276,6 +279,7 @@ class GpuPointerChaseAverageP2PLatency(GpuPointerChaseDep): ''' Average inter-node P2P latency. ''' + def __init__(self): super().__init__() self.valid_systems = Pchase.multi_device @@ -309,8 +313,8 @@ def average_P2P_latency(self): return int(sn.evaluate(sn.max(sn.extractall( r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+', self.stdout, 1, int) - ))/(self.num_gpus_per_node-1) - ) + ))/(self.num_gpus_per_node-1) + ) # @@ -323,6 +327,7 @@ class CompileGpuPointerChaseFine(CompileGpuPointerChase): ''' Compile the pChase code to time each node jump. ''' + def __init__(self): super().__init__() @@ -351,7 +356,7 @@ def filter_out_L1_hits(self, threshold, all_latencies): ''' Return a list with the latencies that are above 20% threshold. 
''' - return list(filter(lambda x: x>1.2*threshold, all_latencies)) + return list(filter(lambda x: x > 1.2*threshold, all_latencies)) @rfm.simple_test @@ -361,6 +366,7 @@ class GpuPointerChaseL1(GpuPointerChaseFineDep, L1_filter): The traversal is done with unit stride, checking the L1 latency, L1 miss rate and average latency of an L1 miss. ''' + def __init__(self): super().__init__() self.valid_systems = Pchase.valid_systems @@ -371,13 +377,13 @@ def __init__(self): } self.reference = { - 'dom:gpu': { + 'dom:gpu': { 'L1_latency': (112, None, 0.1, 'clock cycles'), 'L1_miss_rate': (33.3, None, 0.1, '%'), 'L1_miss_latency': (268, None, 0.1, 'clock cycles'), }, - 'daint:gpu': { + 'daint:gpu': { 'L1_latency': (112, None, 0.1, 'clock cycles'), 'L1_miss_rate': (33.3, None, 0.1, '%'), 'L1_miss_latency': (268, None, 0.1, 'clock cycles'), @@ -418,7 +424,7 @@ def max_L1_latency(self): for n in self.my_nodes: for d in range(self.num_gpus_per_node): l1_latency.append( - sn.min(self.get_all_latencies(self.target_str(n,d))) + sn.min(self.get_all_latencies(self.target_str(n, d))) ) # Return the data from the worst performing device @@ -431,7 +437,7 @@ def get_L1_misses(self, n, d, all_latencies=None): node jump returned by this function will be counted as an L1 miss. ''' if all_latencies is None: - all_latencies = self.get_all_latencies(self.target_str(n,d)) + all_latencies = self.get_all_latencies(self.target_str(n, d)) L1 = sn.min(all_latencies) return self.filter_out_L1_hits(L1, all_latencies) @@ -447,10 +453,10 @@ def L1_miss_rate(self): for n in self.my_nodes: for d in range(self.num_gpus_per_node): all_lat = sn.evaluate( - self.get_all_latencies(self.target_str(n,d)) + self.get_all_latencies(self.target_str(n, d)) ) l1_miss_rate.append( - len(self.get_L1_misses(n,d,all_lat))/len(all_lat) + len(self.get_L1_misses(n, d, all_lat))/len(all_lat) ) return max(l1_miss_rate)*100 @@ -466,7 +472,7 @@ def L1_miss_latency(self): for n in self.my_nodes: for d in range(self.num_gpus_per_node): l1_miss_latency.append( - ceil(sn.evaluate(sn.avg(self.get_L1_misses(n,d)))) + ceil(sn.evaluate(sn.avg(self.get_L1_misses(n, d)))) ) return max(l1_miss_latency) @@ -478,6 +484,7 @@ class GpuPointerChaseL1P2P(GpuPointerChaseFineDep, L1_filter): Pointer chase through P2P, checking L1 miss rates and L1 miss latency averaged amogst all devices in each node. ''' + def __init__(self): super().__init__() self.valid_systems = Pchase.multi_device @@ -571,4 +578,3 @@ def L1_miss_latency(self): L1_misses += self.filter_out_L1_hits(L1, all_lat) return int(sn.evaluate(sn.avg(L1_misses))) - diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp similarity index 93% rename from cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp rename to cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index 1333ee446c..fa6c2ee5a8 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pChase_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -9,12 +9,14 @@ __global__ void clockLatency(int * clk) { + // This returns the clock latency when reading the 64-bit clock counter. 
     clk[0] = XClockLatency<int>();
 }
 
 
 void printClockLatency(char * nid, int dev)
 {
+  /* Prints the latency of reading the clock cycles */
   int * clk_d;
   int clk;
   XSetDevice(dev);
@@ -128,7 +130,11 @@ template < unsigned int repeat >
 __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs)
 {
   /*
-   * Go to the next node in the list
+   * Recursive function to traverse the list.
+   * - ptr: Pointer of a pointer to a node in the linked list.
+   * - timer: Array to store the timings of each individual node jump.
+   *          Only used if this option is activated (-DTIME_EACH_STEP)
+   * - ptrs: Just used to have a data dependency to block ILP.
    */
 
 # ifdef TIME_EACH_STEP
@@ -138,7 +144,7 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t
   (*ptr) = (*ptr)->next;
 # ifdef TIME_EACH_STEP
   (*ptrs) = (Node*)(*ptr);  // Data dep. to prevent ILP.
-  *timer = clocks.end(); // Time the jump
+  *timer = clocks.end();  // Time the jump
 # endif
 
   // Keep traversing the list.
@@ -153,7 +159,7 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn
 {
   /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and
-   * prevent from compiler optimisations.
+   * guard against compiler optimisations.
    */
 
   // These are used to prevent ILP when timing each jump.
@@ -169,6 +175,7 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn
   clocks.start();
 #endif
 
+  // Traverse the list
   nextNode(&ptr, s_timer, ptrs);
 
 #ifndef TIME_EACH_STEP
@@ -207,7 +214,7 @@ struct List
  *
  * The member functions are:
  * - info: prints the list details.
- * - initialize: populatest the buffer with the list nodes.
+ * - initialize: populate the buffer with the list nodes.
 * - traverse: simple list traversal.
 * - timed_traverse: traverses the list and measures the number of cycles per node jump.
 */
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
index 28b6b9f421..36cf7da72d 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
@@ -53,7 +53,7 @@
 #include "Xdevice/runtime.hpp"
 
 // List structure
-#include "pChase_list.hpp"
+#include "linked_list.hpp"
 
 
 template < class LIST >
@@ -66,7 +66,7 @@ uint32_t * generalPointerChase(int local_device, int remote_device, int init_mod
  *
  * - local_device: ID of the device where the allocation of the list takes place
 * - remote_device: ID of the device doing the pointer chase.
- * - init_mode: see the class List.
+ * - init_mode: see the List class.
 * - buff_size: Size (in nodes) of the buffer.
 * - stride: Gap (in nodes) between two consecutive nodes. This only applies if init_mode is 0.
 */
@@ -143,6 +143,10 @@ void localPointerChase(int num_devices, int init_mode, size_t buffSize, size_t s
 
 void print_device_table(int num_devices, std::queue<uint32_t> q, const char * what, const char * nid)
 {
+  /*
+   * Print the data in a table format - useful when doing P2P list traversals.
+ */ + printf("[%s] %s memory latency (in clock cycles) with remote direct memory access\n", nid, what); printf("[%s] %10s", nid, "From \\ To "); for (int ds = 0; ds < num_devices; ds++) @@ -174,11 +178,14 @@ void print_device_table(int num_devices, std::queue q, const char * wh } } + template < class LIST > void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid, int summarize) { /* * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. + * - summarize: if different than zero, the results will be printed in a table format with the function above. + * Otherwise, every single result will be printed out. */ # ifndef TIME_EACH_STEP From 9272d9756f52f379eca94a68e23e9849eb49c9f3 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 24 Nov 2020 15:32:13 +0100 Subject: [PATCH 26/51] Bugfix in the HIP clockLatency function. --- .../gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp index efa1075a77..3a6b6cac4e 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/tools.hpp @@ -149,19 +149,15 @@ using XClocks64 = __XClocks; __device__ void __clockLatency64( uint64_t * clk) { /* - * There's a bit of a weird compiler behaviour when computing the - * clock latency by doing 2 consecutive calls to XClock. To go - * around this issue, we implement this straight with inline asm. + * Expose the latency of a clock read. */ - uint64_t c0, c1; - asm volatile ("s_memtime %[a];\n\t" + asm volatile ( "s_waitcnt lgkmcnt(0);\n\t" - "s_memtime %[b];\n\t" + "s_memtime %[a];\n\t" "s_waitcnt lgkmcnt(0);\n\t" - "s_mov_b64 %[c] %[a];\n\t" - "s_mov_b64 %[d] %[b];\n\t" + "s_memtime %[b];\n\t" "s_waitcnt lgkmcnt(0);\n\t" - :[a]"=s"(c0), [b]"=s"(c1), [c]"=r"(clk[0]), [d]"=r"(clk[1]) :: "memory"); + :[a]"=r"(clk[0]), [b]"=r"(clk[1]) :: "memory"); } From 6566ee13aae2f13508a3cda65ee636dbc7109cb5 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 24 Nov 2020 18:04:26 +0100 Subject: [PATCH 27/51] Port kernel latency test to AMD GPUs. 
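
The repaired sequence drains pending scalar-memory operations with
s_waitcnt lgkmcnt(0) before the first s_memtime and again after each
read, so each counter value has landed before the next instruction and
the difference measures exactly one fenced read. A small HIP-side
usage sketch of the fixed helper (hypothetical; __clockLatency64 as
defined in hip/tools.hpp):

    __global__ void report_clock_latency(uint64_t *out)
    {
        uint64_t c[2];
        __clockLatency64(c);   // two fenced s_memtime reads
        out[0] = c[1] - c[0];  // cycles taken by one fenced read
    }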
--- .../gpu/kernel_latency/kernel_latency.py | 110 +++++++++++------- .../gpu/kernel_latency/src/Xdevice | 1 + .../gpu/kernel_latency/src/kernel_latency.cu | 36 +++--- .../gpu/kernel_latency/src/makefile.cuda | 2 + .../gpu/kernel_latency/src/makefile.hip | 5 + .../src/Xdevice/cuda/utils.hpp | 4 +- .../src/Xdevice/hip/utils.hpp | 4 +- .../memory_bandwidth/src/memory_bandwidth.cu | 2 +- .../gpu/memory_bandwidth/src/p2p_bandwidth.cu | 2 +- 9 files changed, 101 insertions(+), 65 deletions(-) create mode 120000 cscs-checks/microbenchmarks/gpu/kernel_latency/src/Xdevice create mode 100644 cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.cuda create mode 100644 cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.hip diff --git a/cscs-checks/microbenchmarks/gpu/kernel_latency/kernel_latency.py b/cscs-checks/microbenchmarks/gpu/kernel_latency/kernel_latency.py index 61c1323852..9a419c6a07 100644 --- a/cscs-checks/microbenchmarks/gpu/kernel_latency/kernel_latency.py +++ b/cscs-checks/microbenchmarks/gpu/kernel_latency/kernel_latency.py @@ -14,42 +14,26 @@ def __init__(self, kernel_version): self.valid_systems = ['daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100', 'ault:intelv100', - 'ault:amda100'] - self.num_tasks = 0 - self.num_tasks_per_node = 1 - self.sourcepath = 'kernel_latency.cu' - self.build_system = 'SingleSource' - self.build_system.cxxflags = ['-std=c++11', '-O3'] - if self.current_system.name in {'dom', 'daint'}: - self.modules = ['craype-accel-nvidia60'] + 'ault:amda100', 'ault:amdvega'] + cs = self.current_system.name + if cs in {'dom', 'daint'}: self.valid_prog_environs = ['PrgEnv-cray_classic', 'PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu'] - elif self.current_system.name in ['arolla', 'tsa']: + elif cs in {'arolla', 'tsa'}: self.valid_prog_environs = ['PrgEnv-pgi'] - self.modules = ['cuda/10.1.243'] - elif self.current_system.name in ['ault']: + elif cs in {'ault'}: self.valid_prog_environs = ['PrgEnv-gnu'] - self.modules = ['cuda'] - else: - self.num_gpus_per_node = 1 - self.valid_systems = ['*'] - self.valid_prog_environs = ['*'] + self.num_tasks = 0 + self.num_tasks_per_node = 1 + self.build_system = 'Make' + self.executable = 'kernel_latency.x' if kernel_version == 'sync': self.build_system.cppflags = ['-D SYNCKERNEL=1'] else: self.build_system.cppflags = ['-D SYNCKERNEL=0'] - self.sanity_patterns = sn.all([ - sn.assert_eq( - sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)', - self.stdout)), - self.num_tasks_assigned), - sn.assert_eq( - sn.count(sn.findall(r'\[\S+\] \[gpu \d+\] Kernel launch ' - r'latency: \S+ us', self.stdout)), - self.num_tasks_assigned * self.num_gpus_per_node) - ]) + self.sanity_patterns = self.count_gpus() self.perf_patterns = { 'latency': sn.max(sn.extractall( @@ -73,6 +57,9 @@ def __init__(self, kernel_version): 'ault:amda100': { 'latency': (9.65, None, 0.10, 'us') }, + 'ault:amdv100': { + 'latency': (15.1, None, 0.10, 'us') + }, }, 'async': { 'dom:gpu': { @@ -87,9 +74,12 @@ def __init__(self, kernel_version): 'ault:amdv100': { 'latency': (1.83, None, 0.10, 'us') }, - 'ault:amda100': { + 'ault:amda100': { 'latency': (2.7, None, 0.10, 'us') }, + 'ault:amdvega': { + 'latency': (2.64, None, 0.10, 'us') + }, }, } @@ -104,23 +94,42 @@ def __init__(self, kernel_version): def num_tasks_assigned(self): return self.job.num_tasks - @rfm.run_before('compile') + @rfm.run_after('setup') + def select_makefile(self): + cp = self.current_partition.fullname + if cp == 'ault:amdvega': + self.prebuild_cmds = ['cp makefile.hip Makefile'] + else: + 
self.prebuild_cmds = ['cp makefile.cuda Makefile']
+
+    @rfm.run_after('setup')
     def set_gpu_arch(self):
         cp = self.current_partition.fullname
-        cs = self.current_system.name
-        gpu_arch = None
 
-        if cs in {'dom', 'daint'}:
-            gpu_arch = '60'
-        elif (cs in {'arola', 'tsa'} or
-              cp in {'ault:amdv100', 'ault:intelv100'}):
-            gpu_arch = '70'
-        elif cp in {'ault:amda100'}:
-            gpu_arch = '80'
 
-        if gpu_arch:
-            self.build_system.cxxflags += ['-arch=compute_%s' % gpu_arch,
-                                           '-code=sm_%s' % gpu_arch]
+        # Deal with the NVIDIA options first
+        nvidia_sm = None
+        if cp in {'tsa:cn', 'ault:intelv100', 'ault:amdv100'}:
+            nvidia_sm = '70'
+        elif cp == 'ault:amda100':
+            nvidia_sm = '80'
+        elif cp in {'dom:gpu', 'daint:gpu'}:
+            nvidia_sm = '60'
+
+        if nvidia_sm:
+            self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
+            if cp in {'dom:gpu', 'daint:gpu'}:
+                self.modules += ['cudatoolkit']
+            else:
+                self.modules += ['cuda']
 
+        # Deal with the AMD options
+        amd_trgt = None
+        if cp == 'ault:amdvega':
+            amd_trgt = 'gfx906,gfx908'
+
+        if amd_trgt:
+            self.build_system.cxxflags += [f'--amdgpu-target={amd_trgt}']
+            self.modules += ['rocm']
 
     @rfm.run_before('run')
     def set_num_gpus_per_node(self):
@@ -134,3 +143,24 @@ def set_num_gpus_per_node(self):
             self.num_gpus_per_node = 4
         elif cp in {'ault:amdav100'}:
             self.num_gpus_per_node = 2
+        elif cp in {'ault:amdvega'}:
+            self.num_gpus_per_node = 3
+
+    @sn.sanity_function
+    def count_gpus(self):
+        return sn.all([
+            sn.assert_eq(
+                sn.count(
+                    sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
+                               self.stdout)
+                ),
+                self.num_tasks_assigned
+            ),
+            sn.assert_eq(
+                sn.count(
+                    sn.findall(r'\[\S+\] \[gpu \d+\] Kernel launch '
+                               r'latency: \S+ us', self.stdout)
+                ),
+                self.num_tasks_assigned * self.num_gpus_per_node
+            )
+        ])
diff --git a/cscs-checks/microbenchmarks/gpu/kernel_latency/src/Xdevice b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/Xdevice
new file mode 120000
index 0000000000..297b870666
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/Xdevice
@@ -0,0 +1 @@
+../../memory_bandwidth/src/Xdevice/
\ No newline at end of file
diff --git a/cscs-checks/microbenchmarks/gpu/kernel_latency/src/kernel_latency.cu b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/kernel_latency.cu
index 461fd282a6..87f912611d 100644
--- a/cscs-checks/microbenchmarks/gpu/kernel_latency/src/kernel_latency.cu
+++ b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/kernel_latency.cu
@@ -2,7 +2,8 @@
 #include <iostream>
 #include <chrono>
 #include <unistd.h>
-#include <cuda.h>
+
+#include "Xdevice/runtime.hpp"
 
 __global__ void null_kernel() { };
 
@@ -13,27 +14,20 @@ int main(int argc, char* argv[]) {
   hostname[255]='\0';
   gethostname(hostname, 255);
 
-  cudaError_t error;
   int gpu_count = 0;
+  XGetDeviceCount(&gpu_count);
 
-  error = cudaGetDeviceCount(&gpu_count);
-
-  if (error == cudaSuccess) {
-    if (gpu_count <= 0) {
-      std::cout << "[" << hostname << "] " << "Could not find any gpu\n";
-      return 1;
-    }
-    std::cout << "[" << hostname << "] " << "Found " << gpu_count << " gpu(s)\n";
-  }
-  else{
-    std::cout << "[" << hostname << "] " << "Error getting gpu count, exiting...\n";
+  if (gpu_count <= 0) {
+    std::cout << "[" << hostname << "] " << "Could not find any gpu\n";
     return 1;
   }
+  std::cout << "[" << hostname << "] " << "Found " << gpu_count << " gpu(s)\n";
 
-  for (int i = 0; i < gpu_count; i++) {
+  for (int i = 0; i < gpu_count; i++)
+  {
+    XSetDevice(i);
 
-    cudaSetDevice(i);
-    // Single kernel launch to initialize cuda runtime
+    // Warm-up kernel
     null_kernel<<<1, 1>>>();
 
     auto t_start = std::chrono::system_clock::now();
@@ -42,16 +36,20 @@ int main(int argc, char* argv[]) {
     for (int j = 0; j < kernel_count; ++j) {
       null_kernel<<<1, 1>>>();
 #if SYNCKERNEL == 1
-      cudaDeviceSynchronize();
+      XDeviceSynchronize();
 #endif
     }
 #if SYNCKERNEL != 1
-    cudaDeviceSynchronize();
+    XDeviceSynchronize();
 #endif
 
+    // End the timing
     auto t_end = std::chrono::system_clock::now();
 
-    std::cout << "[" << hostname << "] " << "[gpu " << i << "] " << "Kernel launch latency: " << std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count << " us\n";
+    std::cout << "[" << hostname << "] " << "[gpu " << i << "] " <<
+                 "Kernel launch latency: " <<
+                 std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count <<
+                 " us\n";
   }
 
   return 0;
diff --git a/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.cuda b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.cuda
new file mode 100644
index 0000000000..68bd22236b
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.cuda
@@ -0,0 +1,2 @@
+test:
+	nvcc -O3 -std=c++11 ${CXXFLAGS} -lnvidia-ml kernel_latency.cu -o kernel_latency.x
diff --git a/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.hip b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.hip
new file mode 100644
index 0000000000..1132ce3645
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/kernel_latency/src/makefile.hip
@@ -0,0 +1,5 @@
+RSMI_ROOT?=/opt/rocm-3.9.0/rocm_smi
+CXXFLAGS?=--amdgpu-target=gfx906,gfx908
+
+test:
+	hipcc -O3 kernel_latency.cu -o kernel_latency.x -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp
index ec1a8b0b83..9b356cbc8f 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp
@@ -63,9 +63,9 @@ void XDeviceSynchronize()
   checkError( cudaDeviceSynchronize() );
 }
 
-void XGetDeviceCount(int &devices)
+void XGetDeviceCount(int * devices)
 {
-  checkError( cudaGetDeviceCount(&devices) );
+  checkError( cudaGetDeviceCount(devices) );
 }
 
 void XSetDevice(int device)
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp
index 471733b366..67c90664a6 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp
@@ -64,9 +64,9 @@ void XDeviceSynchronize()
   checkError( hipDeviceSynchronize() );
 }
 
-void XGetDeviceCount(int &devices)
+void XGetDeviceCount(int * devices)
 {
-  checkError( hipGetDeviceCount(&devices) );
+  checkError( hipGetDeviceCount(devices) );
 }
 
 void XSetDevice(int device)
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/memory_bandwidth.cu b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/memory_bandwidth.cu
index dd5fe12d21..e3155bf8e6 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/memory_bandwidth.cu
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/memory_bandwidth.cu
@@ -7,7 +7,7 @@ int main()
   gethostname(nid_name, HOSTNAME_SIZE);
 
   int number_of_devices;
-  XGetDeviceCount(number_of_devices);
+  XGetDeviceCount(&number_of_devices);
 
   // Initialise the Smi to manage the devices.
Smi smiHandle; diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/p2p_bandwidth.cu b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/p2p_bandwidth.cu index b297b551dd..06968c0b54 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/p2p_bandwidth.cu +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/p2p_bandwidth.cu @@ -28,7 +28,7 @@ int main() gethostname(nid_name, HOSTNAME_SIZE); int number_of_devices; - XGetDeviceCount(number_of_devices); + XGetDeviceCount(&number_of_devices); // Make sure we've got devices aboard. if (number_of_devices == 0) From 6aac2c9bf832c2a1cbba074c673a0f0e478faf00 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Wed, 9 Dec 2020 13:43:10 +0100 Subject: [PATCH 28/51] Add dgemm-gpu sources. --- .../microbenchmarks/gpu/dgemm/src/Xdevice | 1 + .../microbenchmarks/gpu/dgemm/src/dgemm.cu | 106 ++++++++++++++++++ .../gpu/dgemm/src/makefile.cuda | 2 + .../gpu/dgemm/src/makefile.hip | 6 + .../src/Xdevice/cuda/blas.hpp | 4 + .../memory_bandwidth/src/Xdevice/hip/blas.hpp | 5 + 6 files changed, 124 insertions(+) create mode 120000 cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice create mode 100644 cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu create mode 100644 cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda create mode 100644 cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice b/cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice new file mode 120000 index 0000000000..68ede93327 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/Xdevice @@ -0,0 +1 @@ +../../memory_bandwidth/src/Xdevice \ No newline at end of file diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu new file mode 100644 index 0000000000..6a62ba297e --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu @@ -0,0 +1,106 @@ +/* + * Basic DGEMM test + * + * Multiply two matrices of dimensions SIZE*SIZE filled with ones. Therefore, + * all the elements of the resulting matrix will be just SIZE. 
+ */
+
+#define SIZE 2048
+#define REPEAT 30
+
+#include <iostream>
+
+#include "Xdevice/runtime.hpp"
+#include "Xdevice/blas.hpp"
+
+
+namespace kernels
+{
+  template <class T>
+  __global__ void init_as_ones(T * arr, size_t size)
+  {
+    unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
+    if (tid < size)
+    {
+      arr[tid] = (T)1.0;
+    }
+  }
+
+  template <class T>
+  __global__ void verify(T * arr, size_t size, int * err)
+  {
+    unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
+    if (tid < size)
+    {
+      if (int(arr[tid]) != SIZE)
+        atomicAdd(err, 1);
+    }
+  }
+}
+
+#define BLOCK_SIZE 128
+int main(int argc, char **argv)
+{
+  double * A;
+  double * B;
+  double * C;
+
+  XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
+  XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
+  XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
+
+  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
+  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
+  XDeviceSynchronize();
+
+  XStream_t stream;
+  XStreamCreate(&stream);
+  XblasHandle_t blas_handle;
+  XblasCreate(&blas_handle);
+  XblasSetStream(blas_handle, stream);
+  const double alpha = 1.0;
+  const double beta = 0.0;
+
+  // Warmup call
+  XblasDgemm(blas_handle,
+             XBLAS_OP_N, XBLAS_OP_N,
+             SIZE, SIZE, SIZE,
+             &alpha,
+             (const double*)A, SIZE,
+             (const double*)B, SIZE,
+             &beta,
+             C, SIZE);
+
+  // Time the execution
+  XTimer t(stream);
+  t.start();
+
+  for (int i = 0; i < REPEAT; i++)
+  {
+    XblasDgemm(blas_handle,
+               XBLAS_OP_N, XBLAS_OP_N,
+               SIZE, SIZE, SIZE,
+               &alpha,
+               (const double*)A, SIZE,
+               (const double*)B, SIZE,
+               &beta,
+               C, SIZE);
+  }
+
+  std::cout << "Elapsed time: " << t.stop() << std::endl;
+
+  XblasDestroy(blas_handle);
+  XStreamDestroy(stream);
+
+  int * err, h_err = 0;
+  XMalloc((void**)&err, sizeof(int));
+  XMemset(err, 0, sizeof(int));
+  kernels::verify<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
+
+  XDeviceSynchronize();
+  XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
+  XDeviceSynchronize();
+  std::cout << "Number of errors: " << h_err << std::endl;
+
+  return 0;
+}
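Because A and B contain only ones, every element of C = A*B equals SIZE, so the result can be verified entirely on the device: each thread checks one element and bumps a shared error counter with atomicAdd, and the host only copies back a single int. The same pattern in plain CUDA, as a self-contained sketch (all names here are illustrative, not from the sources):

    #include <cstdio>

    // Count mismatches against an expected value; the counter accumulates
    // across all threads via atomicAdd.
    __global__ void count_mismatches(const double *v, size_t n, double expected, int *errors)
    {
        size_t tid = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
        if (tid < n && v[tid] != expected)
            atomicAdd(errors, 1);
    }

    int main()
    {
        const size_t n = 1 << 20;
        double *v;
        int *errors, h_errors = 0;
        cudaMalloc(&v, n * sizeof(double));
        cudaMalloc(&errors, sizeof(int));
        cudaMemset(v, 0, n * sizeof(double));      // v is all zero bytes, i.e. 0.0
        cudaMemset(errors, 0, sizeof(int));
        // One thread per element; note the grid covers all n elements.
        count_mismatches<<<(n + 127) / 128, 128>>>(v, n, 0.0, errors);
        cudaMemcpy(&h_errors, errors, sizeof(int), cudaMemcpyDeviceToHost);
        printf("%d mismatches\n", h_errors);
        return 0;
    }

Note that the grid must be sized for the full element count: launching only SIZE/BLOCK_SIZE blocks for a SIZE*SIZE array would silently verify just the first SIZE elements.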
diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda b/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda
new file mode 100644
index 0000000000..c7519f9732
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.cuda
@@ -0,0 +1,2 @@
+dgemm:
+	nvcc $@.cu -o $@.x ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip b/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip
new file mode 100644
index 0000000000..ff91418d93
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/makefile.hip
@@ -0,0 +1,6 @@
+CXXFLAGS?=--amdgpu-target=gfx906,gfx908
+ROCM_ROOT?=/opt/rocm
+RSMI_ROOT?=/opt/rocm/rocm_smi
+
+dgemm:
+	hipcc -O3 $@.cu -o $@.x -DTARGET_HIP ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/blas.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/blas.hpp
index 7d280f658d..3839c17596 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/blas.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/blas.hpp
@@ -39,6 +39,10 @@ void XblasDestroy(cublasHandle_t handle)
   checkError( cublasDestroy(handle) );
 }
 
+void XblasSetStream(cublasHandle_t h, cudaStream_t s)
+{
+  checkError ( cublasSetStream(h, s) );
+}
 
 auto XblasDgemm = cublasDgemm;
 auto XblasSgemm = cublasSgemm;
diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/blas.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/blas.hpp
index d4ab598c9c..b79cfac93a 100644
--- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/blas.hpp
+++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/blas.hpp
@@ -39,6 +39,11 @@ void XblasDestroy(XblasHandle_t handle)
   checkError( rocblas_destroy_handle(handle) );
 }
 
+void XblasSetStream(XblasHandle_t handle, hipStream_t stream)
+{
+  checkError( rocblas_set_stream(handle, stream) );
+}
+
 auto XblasDgemm = rocblas_dgemm;
 auto XblasSgemm = rocblas_sgemm;
 
From 2a87f57a3919acc2899b8190eac7024345f01bcd Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" 
Date: Wed, 9 Dec 2020 15:34:41 +0100
Subject: [PATCH 29/51] Cleanup dgemm-gpu output.

---
 .../microbenchmarks/gpu/dgemm/src/dgemm.cu    | 129 +++++++++++------
 1 file changed, 79 insertions(+), 50 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
index 6a62ba297e..ba478b357d 100644
--- a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
@@ -5,10 +5,11 @@
  * all the elements of the resulting matrix will be just SIZE.
  */
 
-#define SIZE 2048
+#define SIZE 1024
 #define REPEAT 30
 
 #include <iostream>
+#include <unistd.h>
 
 #include "Xdevice/runtime.hpp"
 #include "Xdevice/blas.hpp"
@@ -39,44 +40,47 @@ namespace kernels
 }
 
 #define BLOCK_SIZE 128
+#define HOST_NAME_SIZE 128
 int main(int argc, char **argv)
 {
-  double * A;
-  double * B;
-  double * C;
-
-  XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
-  XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
-  XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
-
-  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
-  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
-  XDeviceSynchronize();
-
-  XStream_t stream;
-  XStreamCreate(&stream);
-  XblasHandle_t blas_handle;
-  XblasCreate(&blas_handle);
-  XblasSetStream(blas_handle, stream);
-  const double alpha = 1.0;
-  const double beta = 0.0;
-
-  // Warmup call
-  XblasDgemm(blas_handle,
-             XBLAS_OP_N, XBLAS_OP_N,
-             SIZE, SIZE, SIZE,
-             &alpha,
-             (const double*)A, SIZE,
-             (const double*)B, SIZE,
-             &beta,
-             C, SIZE);
-
-  // Time the execution
-  XTimer t(stream);
-  t.start();
-
-  for (int i = 0; i < REPEAT; i++)
+
+  char hostname[HOST_NAME_SIZE];
+  gethostname(hostname, sizeof(hostname));
+
+  double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
+  int num_devices, totalErrors = 0;
+  XGetDeviceCount(&num_devices);
+
+  // Print device count
+  printf("[%s] Found %d device(s).\n", hostname, num_devices);
+
+  // Do the dgemm for all devices in the node.
+  for (int device = 0; device < num_devices; device++)
+  {
+
+    XSetDevice(device);
+
+    double * A;
+    double * B;
+    double * C;
+    const double alpha = 1.0;
+    const double beta = 0.0;
+
+    XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
+    XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
+    XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
+
+    kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
+    kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
+    XDeviceSynchronize();
+
+    XStream_t stream;
+    XStreamCreate(&stream);
+    XblasHandle_t blas_handle;
+    XblasCreate(&blas_handle);
+    XblasSetStream(blas_handle, stream);
+
+    // Warmup call
     XblasDgemm(blas_handle,
                XBLAS_OP_N, XBLAS_OP_N,
                SIZE, SIZE, SIZE,
                &alpha,
                (const double*)A, SIZE,
                (const double*)B, SIZE,
                &beta,
                C, SIZE);
@@ -85,22 +89,47 @@ int main(int argc, char **argv)
+    XDeviceSynchronize();
+
+    // Time the execution
+    XTimer t(stream);
+    t.start();
+    for (int i = 0; i < REPEAT; i++)
+    {
+      XblasDgemm(blas_handle,
+                 XBLAS_OP_N, XBLAS_OP_N,
+                 SIZE, SIZE, SIZE,
+                 &alpha,
+                 (const double*)A, SIZE,
+                 (const double*)B, SIZE,
+                 &beta,
+                 C, SIZE);
+    }
+
+    // Calc the performance data in TFlops/sec
+    double perf = tflops/(t.stop()/REPEAT/1000.0);
+
+    XblasDestroy(blas_handle);
+    XStreamDestroy(stream);
+
+    // Verify that the final values of C are correct.
+    int * err, h_err = 0;
+    XMalloc((void**)&err, sizeof(int));
+    XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);
+    kernels::verify<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
+    XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
+    totalErrors += h_err;
+
+    // Print the performance results
+    printf("[%s][GPU %d] %4.2f TF/s\n", hostname, device, (float)perf);
+
+    XFree(A);
+    XFree(B);
+    XFree(C);
   }
 
-  std::cout << "Elapsed time: " << t.stop() << std::endl;
-
-  XblasDestroy(blas_handle);
-  XStreamDestroy(stream);
-
-  int * err, h_err = 0;
-  XMalloc((void**)&err, sizeof(int));
-  XMemset(err, 0, sizeof(int));
-  kernels::verify<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
-
-  XDeviceSynchronize();
-  XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
-  XDeviceSynchronize();
-  std::cout << "Number of errors: " << h_err << std::endl;
+  // Test if there were any errors and print the test result.
+  printf("[%s] Test %s\n", hostname, totalErrors == 0 ? "passed" : "failed");
 
   return 0;
 }
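For reference, the TF/s figure printed above follows from the classic 2*N^3 flop count of an N-by-N matrix multiply: with SIZE = 1024 one dgemm call performs about 2.15 GFLOP, and dividing by the averaged per-call time gives TFlop/s. The division by 1000.0 in the patch implies that t.stop() reports milliseconds over the whole REPEAT loop; the arithmetic in isolation, with a hypothetical elapsed time:

    #include <cstdio>

    int main()
    {
        const double n = 1024.0;            // matrix dimension (SIZE)
        const int repeat = 30;              // timed dgemm calls (REPEAT)
        const double elapsed_ms = 64.5;     // hypothetical t.stop() result
        // Flops per dgemm call, expressed in Tflop.
        const double tflop = 2.0 * n * n * n * 1e-12;
        // Per-call time in seconds is elapsed_ms / repeat / 1000.
        const double tf_per_s = tflop / (elapsed_ms / repeat / 1000.0);
        printf("%.2f TF/s\n", tf_per_s);    // ~1.00 TF/s for these numbers
        return 0;
    }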
From de7d2d8b168563cbf2c49598d9af3bc3bbbb2354 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" 
Date: Wed, 9 Dec 2020 16:01:18 +0100
Subject: [PATCH 30/51] Add threading support to dgemm

---
 .../microbenchmarks/gpu/dgemm/src/dgemm.cu    | 172 ++++++++++--------
 1 file changed, 99 insertions(+), 73 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
index ba478b357d..326e94336e 100644
--- a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
@@ -10,6 +10,11 @@
 
 #include <iostream>
 #include <unistd.h>
+#include <thread>
+#include <mutex>
+#include <vector>
+#include <algorithm>
+#include <functional>
 
 #include "Xdevice/runtime.hpp"
 #include "Xdevice/blas.hpp"
@@ -39,95 +44,116 @@ namespace kernels
   }
 }
 
-#define BLOCK_SIZE 128
+/*
+ * This code uses a thread per device in the node.
+ * For simplicity, we define the variables below as global.
+ */
+
+#define HOST_NAME_SIZE 128
+char hostname[HOST_NAME_SIZE];
+double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
+int totalErrors = 0;
+std::mutex mtx;
+
+#define BLOCK_SIZE 128
+void dgemm(int device)
+{
+  XSetDevice(device);
+
+  double * A;
+  double * B;
+  double * C;
+  const double alpha = 1.0;
+  const double beta = 0.0;
+
+  XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
+  XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
+  XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
+
+  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
+  kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
+  XDeviceSynchronize();
+
+  XStream_t stream;
+  XStreamCreate(&stream);
+  XblasHandle_t blas_handle;
+  XblasCreate(&blas_handle);
+  XblasSetStream(blas_handle, stream);
+
+  // Warmup call
+  XblasDgemm(blas_handle,
+             XBLAS_OP_N, XBLAS_OP_N,
+             SIZE, SIZE, SIZE,
+             &alpha,
+             (const double*)A, SIZE,
+             (const double*)B, SIZE,
+             &beta,
+             C, SIZE);
+  XDeviceSynchronize();
+
+  // Time the execution
+  XTimer t(stream);
+  t.start();
+  for (int i = 0; i < REPEAT; i++)
+  {
+    XblasDgemm(blas_handle,
+               XBLAS_OP_N, XBLAS_OP_N,
+               SIZE, SIZE, SIZE,
+               &alpha,
+               (const double*)A, SIZE,
+               (const double*)B, SIZE,
+               &beta,
+               C, SIZE);
+  }
+
+  // Calc the performance data in TFlops/sec
+  double perf = tflops/(t.stop()/REPEAT/1000.0);
+
+  XblasDestroy(blas_handle);
+  XStreamDestroy(stream);
+
+  // Verify that the final values of C are correct.
+  int * err, h_err = 0;
+  XMalloc((void**)&err, sizeof(int));
+  XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);
+  kernels::verify<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
+  XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
+  {
+    std::lock_guard<std::mutex> lg(mtx);
+    totalErrors += h_err;
+
+    // Print the performance results
+    printf("[%s][GPU %d] %4.2f TF/s\n", hostname, device, (float)perf);
+  }
+  XFree(A);
+  XFree(B);
+  XFree(C);
+
+}
+
 int main(int argc, char **argv)
 {
-  char hostname[HOST_NAME_SIZE];
   gethostname(hostname, sizeof(hostname));
 
-  double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
-  int num_devices, totalErrors = 0;
+  int num_devices;
   XGetDeviceCount(&num_devices);
 
   // Print device count
   printf("[%s] Found %d device(s).\n", hostname, num_devices);
 
+  // Create vector of threads.
+  std::vector<std::thread> threads;
+
   // Do the dgemm for all devices in the node.
   for (int device = 0; device < num_devices; device++)
   {
-
-    XSetDevice(device);
-
-    double * A;
-    double * B;
-    double * C;
-    const double alpha = 1.0;
-    const double beta = 0.0;
-
-    XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
-    XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
-    XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
-
-    kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
-    kernels::init_as_ones<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
-    XDeviceSynchronize();
-
-    XStream_t stream;
-    XStreamCreate(&stream);
-    XblasHandle_t blas_handle;
-    XblasCreate(&blas_handle);
-    XblasSetStream(blas_handle, stream);
-
-    // Warmup call
-    XblasDgemm(blas_handle,
-               XBLAS_OP_N, XBLAS_OP_N,
-               SIZE, SIZE, SIZE,
-               &alpha,
-               (const double*)A, SIZE,
-               (const double*)B, SIZE,
-               &beta,
-               C, SIZE);
-    XDeviceSynchronize();
-
-    // Time the execution
-    XTimer t(stream);
-    t.start();
-    for (int i = 0; i < REPEAT; i++)
-    {
-      XblasDgemm(blas_handle,
-                 XBLAS_OP_N, XBLAS_OP_N,
-                 SIZE, SIZE, SIZE,
-                 &alpha,
-                 (const double*)A, SIZE,
-                 (const double*)B, SIZE,
-                 &beta,
-                 C, SIZE);
-    }
-
-    // Calc the performance data in TFlops/sec
-    double perf = tflops/(t.stop()/REPEAT/1000.0);
-
-    XblasDestroy(blas_handle);
-    XStreamDestroy(stream);
-
-    // Verify that the final values of C are correct.
-    int * err, h_err = 0;
-    XMalloc((void**)&err, sizeof(int));
-    XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);
-    kernels::verify<<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
-    XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
-    totalErrors += h_err;
-
-    // Print the performance results
-    printf("[%s][GPU %d] %4.2f TF/s\n", hostname, device, (float)perf);
-
-    XFree(A);
-    XFree(B);
-    XFree(C);
+    threads.push_back(std::thread(dgemm,device));
   }
 
+  // Join all threads
+  std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
   // Test if there were any errors and print the test result.
   printf("[%s] Test %s\n", hostname, totalErrors == 0 ? "passed" : "failed");
 
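Running one host thread per device lets the dgemm runs on different GPUs overlap instead of executing back to back; each thread pins its own device, and the only shared state (the global error counter and stdout) is serialized with a mutex. The pattern in isolation, as a hedged sketch with a placeholder per-device workload:

    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    std::mutex mtx;
    int total_errors = 0;   // shared across worker threads

    // Placeholder for the real per-device benchmark body.
    void run_on_device(int device)
    {
        // cudaSetDevice(device); ... allocate, benchmark, verify ...
        int local_errors = 0;
        std::lock_guard<std::mutex> lg(mtx);   // guard the shared counter
        total_errors += local_errors;
        printf("[device %d] done\n", device);
    }

    int main()
    {
        int num_devices = 4;   // stand-in for the queried device count
        std::vector<std::thread> threads;
        for (int d = 0; d < num_devices; d++)
            threads.emplace_back(run_on_device, d);
        for (auto &t : threads)
            t.join();
        printf("total errors: %d\n", total_errors);
        return 0;
    }

Device contexts are per-thread in the CUDA runtime, which is why each worker must call the set-device function itself before touching its GPU.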
From a946e7bf271816fddf7d75d6a010cffe3b312328 Mon Sep 17 00:00:00 2001
From: Javier Otero 
Date: Wed, 9 Dec 2020 17:01:46 +0100
Subject: [PATCH 31/51] Add dgemm test

---
 .../microbenchmarks/gpu/dgemm/dgmemm.py       | 101 ++++++++++++++++++
 .../microbenchmarks/gpu/dgemm/src/dgemm.cu    |   2 +-
 2 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py

diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
new file mode 100644
index 0000000000..0af49ed4db
--- /dev/null
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
@@ -0,0 +1,101 @@
+# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.simple_test
+class GPUdgemmTest(rfm.RegressionTest):
+    def __init__(self):
+        self.valid_systems = ['daint:gpu', 'dom:gpu',
+                              'ault:amdv100', 'ault:intelv100',
+                              'ault:amda100', 'ault:amdvega']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.num_tasks = 0
+        self.num_tasks_per_node = 1
+        self.build_system = 'Make'
+        self.executable = 'dgemm.x'
+
+        # Mark the Xdevice symlink as read-only
+        self.readonly_files = ['Xdevice']
+
+        self.sanity_patterns = self.assert_num_gpus()
+        self.perf_patterns = {
+            'perf': sn.min(sn.extractall(
+                r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
+                self.stdout, 'fp', float))
+        }
+        self.reference = {
+            'dom:gpu': {
+                'perf': (3.35, -0.1, None, 'TF/s')
+            },
+            'daint:gpu': {
+                'perf': (3.35, -0.1, None, 'TF/s')
+            },
+            'ault:amdv100': {
+                'perf': (5.25, -0.1, None, 'TF/s')
+            },
+            'ault:intelv100': {
+                'perf': (5.25, -0.1, None, 'TF/s')
+            },
+            'ault:amda100': {
+                'perf': (10.5, -0.1, None, 'TF/s')
+            },
+            'ault:amdvega': {
+                'perf': (3.45, -0.1, None, 'TF/s')
+            }
+        }
+
+        self.maintainers = ['JO']
+        self.tags = {'benchmark'}
+
+    @property
+    @sn.sanity_function
+    def num_tasks_assigned(self):
+        return self.job.num_tasks
+
+    @sn.sanity_function
+    def assert_num_gpus(self):
+        return sn.assert_eq(
+            sn.count(sn.findall(r'^\s*\[[^\]]*\]\s*Test passed', self.stdout)),
+            self.num_tasks_assigned)
+
+    @rfm.run_after('setup')
+    def select_makefile(self):
+        cp = self.current_partition.fullname
+        if cp == 'ault:amdvega':
+            self.build_system.makefile = 'makefile.hip'
+        else:
+            self.build_system.makefile = 'makefile.cuda'
+
+    @rfm.run_after('setup')
+    def set_gpu_arch(self):
+        cp = self.current_partition.fullname
+
+        # Deal with the NVIDIA options first
+        nvidia_sm = None
+        if cp in {'tsa:cn', 'ault:intelv100', 'ault:amdv100'}:
+            nvidia_sm = '70'
+        elif cp == 'ault:amda100':
+            nvidia_sm = '80'
+        elif cp in {'dom:gpu', 'daint:gpu'}:
+            nvidia_sm = '60'
+
+        if nvidia_sm:
+            self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
+            if cp in {'dom:gpu', 'daint:gpu'}:
+                self.modules += ['craype-accel-nvidia60']
+            else:
+                self.modules += ['cuda']
+
+        # Deal with the AMD options
+        amd_trgt = None
+        if cp == 'ault:amdvega':
+            amd_trgt = 'gfx906'
+
+        if amd_trgt:
+            self.build_system.cxxflags += [f'--amdgpu-target={amd_trgt}']
+            self.modules += ['rocm']
diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
index 326e94336e..3ce46c927b 100644
--- a/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/src/dgemm.cu
@@ -123,7 +123,7 @@ void dgemm(int device)
     totalErrors += h_err;
 
     // Print the performance results
-    printf("[%s][GPU %d] %4.2f TF/s\n", hostname, device, (float)perf);
+    printf("[%s] GPU %d: %4.2f TF/s\n", hostname, device, (float)perf);
 
From fb7d7c1171b0d0e75559e2466724aec86fe02a4f Mon Sep 17 00:00:00 2001
From: Javier Otero 
Date: Thu, 10 Dec 2020 17:03:37 +0100
Subject: [PATCH 32/51] Add fixme label.
--- cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py | 2 +- .../microbenchmarks/gpu/pointer_chase/pointer_chase.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py index 0af49ed4db..90d7734a7d 100644 --- a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py +++ b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py @@ -19,7 +19,7 @@ def __init__(self): self.build_system = 'Make' self.executable = 'dgemm.x' - # Mark the Xdevice symlink as read-only + # FIXME workaround due to issue #1639. self.readonly_files = ['Xdevice'] self.sanity_patterns = self.assert_num_gpus() diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index fdde47c306..601d12b4a6 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -38,6 +38,10 @@ def __init__(self): self.num_tasks_per_node = 1 self.postbuild_cmds = ['ls .'] self.sanity_patterns = sn.assert_found(r'pChase.x', self.stdout) + + # FIXME workaround due to issue #1639. + self.readonly_files = ['Xdevice'] + self.maintainers = ['JO'] @rfm.run_after('setup') From a5682491f8657e906d51abbf02eabab4d48f8c34 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 10 Dec 2020 17:15:06 +0100 Subject: [PATCH 33/51] Remove double XMemcpy definition. --- .../gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp | 5 ----- .../gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp index 1bebe7420e..0d402e7412 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/utils.hpp @@ -108,11 +108,6 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( cudaMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } -void XMemcpy(void * dst, void * src, size_t size, cudaMemcpyKind dir) -{ - checkError( cudaMemcpy(dst, src, size, dir) ); -} - void XHostGetDevicePointer(void** device, void* host, unsigned int flags) { checkError( cudaHostGetDevicePointer(device, host, flags) ); diff --git a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp index 8113e5d4c2..0bedf13c3b 100644 --- a/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp +++ b/cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/hip/utils.hpp @@ -109,11 +109,6 @@ void XMemcpyPeerAsync(void * dst, int peerDevId, void * src, int srcDevId, size_ checkError( hipMemcpyPeerAsync(dst, peerDevId, src, srcDevId, size, stream) ); } -void XMemcpy(void * dst, void * src, size_t size, hipMemcpyKind dir) -{ - checkError( hipMemcpy(dst, src, size, dir) ); -} - void XHostGetDevicePointer(void** device, void* host, unsigned int flags) { checkError( hipHostGetDevicePointer(device, host, flags) ); From a18a2948ae2bb69bbed4240a585f7aae29c970e8 Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Fri, 15 Jan 2021 13:45:08 +0100 Subject: [PATCH 34/51] Fix PEP8 --- cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 601d12b4a6..2563e2ea78 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -111,7 +111,7 @@ def do_sanity_check(self): # Check that every node has the right number of GPUs # Store this nodes in case they're used later by the perf functions. self.my_nodes = set(sn.extractall( - r'^\s*\[([^\]]*)\]\s*Found %d device\(s\).' % self.num_gpus_per_node, + rf'^\s*\[([^\]]*)\]\s*Found {self.num_gpus_per_node} device\(s\).', self.stdout, 1)) # Check that every node has made it to the end. From 184253796048f8062bf948cd80c84e84161d6ead Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 16:02:53 +0100 Subject: [PATCH 35/51] Add cdt-cuda module for dom:gpu --- cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py | 3 +++ .../microbenchmarks/gpu/pointer_chase/pointer_chase.py | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py index 90d7734a7d..cf5c94419d 100644 --- a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py +++ b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py @@ -88,6 +88,9 @@ def set_gpu_arch(self): self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}'] if cp in {'dom:gpu', 'daint:gpu'}: self.modules += ['craype-accel-nvidia60'] + if cp == 'dom:gpu': + self.modules += ['cdt-cuda'] + else: self.modules += ['cuda'] diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 2563e2ea78..643c6d3d0f 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -48,9 +48,9 @@ def __init__(self): def select_makefile(self): cp = self.current_partition.fullname if cp == 'ault:amdvega': - self.prebuild_cmds = ['cp makefile.hip Makefile'] + self.build_system.makefile = 'makefile.hip' else: - self.prebuild_cmds = ['cp makefile.cuda Makefile'] + self.build_system.makefile = 'makefile.cuda' @rfm.run_after('setup') def set_gpu_arch(self): @@ -63,12 +63,15 @@ def set_gpu_arch(self): elif cp == 'ault:amda100': nvidia_sm = '80' elif cp in {'dom:gpu', 'daint:gpu'}: - nvidia_sm == '60' + nvidia_sm = '60' if nvidia_sm: self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}'] if cp in {'dom:gpu', 'daint:gpu'}: self.modules += ['cudatoolkit'] + if cp == 'dom:gpu': + self.modules += ['cdt-cuda'] + else: self.modules += ['cuda'] From 6f3273ac76b86d89891785558e2ce79db3fa6ab6 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 16:04:55 +0100 Subject: [PATCH 36/51] Remove unnecessary workaround --- cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py | 4 ---- .../microbenchmarks/gpu/pointer_chase/pointer_chase.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py index cf5c94419d..111bfe604d 100644 --- a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py +++ b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py @@ -18,10 +18,6 @@ def __init__(self): self.num_tasks_per_node = 1 
self.build_system = 'Make' self.executable = 'dgemm.x' - - # FIXME workaround due to issue #1639. - self.readonly_files = ['Xdevice'] - self.sanity_patterns = self.assert_num_gpus() self.perf_patterns = { 'perf': sn.min(sn.extractall( diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 643c6d3d0f..1b89b29689 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -38,10 +38,6 @@ def __init__(self): self.num_tasks_per_node = 1 self.postbuild_cmds = ['ls .'] self.sanity_patterns = sn.assert_found(r'pChase.x', self.stdout) - - # FIXME workaround due to issue #1639. - self.readonly_files = ['Xdevice'] - self.maintainers = ['JO'] @rfm.run_after('setup') From b1965031461d32507667aa596e02d1da4c785016 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 17:50:50 +0100 Subject: [PATCH 37/51] Update includes after merge --- .../microbenchmarks/gpu/pointer_chase/src/linked_list.hpp | 2 +- .../microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index fa6c2ee5a8..4e453118e1 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -1,4 +1,4 @@ - +#include "Xdevice/smi.hpp" /* * diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 36cf7da72d..d8a88e3fce 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -332,7 +332,7 @@ int main(int argc, char ** argv) // Make sure we've got devices aboard. 
int num_devices; - XGetDeviceCount(num_devices); + XGetDeviceCount(&num_devices); if (num_devices == 0) { std::cout << "No devices found on host " << nid_name << std::endl; From c3e8676c22be051ed53e1a7c9765ae6753275d01 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 17:52:58 +0100 Subject: [PATCH 38/51] Adjust pointer chase refs --- .../gpu/pointer_chase/pointer_chase.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 1b89b29689..3cbc873661 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -381,15 +381,15 @@ def __init__(self): self.reference = { 'dom:gpu': { - 'L1_latency': (112, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (33.3, None, 0.1, '%'), - 'L1_miss_latency': (268, None, 0.1, 'clock cycles'), + 'L1_latency': (148, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (74.6, None, 0.1, '%'), + 'L1_miss_latency': (407, None, 0.1, 'clock cycles'), }, 'daint:gpu': { - 'L1_latency': (112, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (33.3, None, 0.1, '%'), - 'L1_miss_latency': (268, None, 0.1, 'clock cycles'), + 'L1_latency': (148, None, 0.1, 'clock cycles'), + 'L1_miss_rate': (74.6, None, 0.1, '%'), + 'L1_miss_latency': (407, None, 0.1, 'clock cycles'), }, 'tsa:cn': { From d484c7975b883adcf00ae8af03868141d81aca81 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 18:29:11 +0100 Subject: [PATCH 39/51] Add benchmark tags --- cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 3cbc873661..932c2969e7 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -39,6 +39,7 @@ def __init__(self): self.postbuild_cmds = ['ls .'] self.sanity_patterns = sn.assert_found(r'pChase.x', self.stdout) self.maintainers = ['JO'] + self.tags = {'benchmark'} @rfm.run_after('setup') def select_makefile(self): @@ -89,6 +90,7 @@ def __init__(self): self.exclusive_access = True self.sanity_patterns = self.do_sanity_check() self.maintainers = ['JO'] + self.tag = {'benchmark'} @rfm.run_before('run') def set_num_gpus_per_node(self): From 82d60856fde3494fe8013dc91b5fcdae3f1592f4 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 19 Jan 2021 18:34:25 +0100 Subject: [PATCH 40/51] Fix typo --- cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 932c2969e7..e2d7ae035b 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -90,7 +90,7 @@ def __init__(self): self.exclusive_access = True self.sanity_patterns = self.do_sanity_check() self.maintainers = ['JO'] - self.tag = {'benchmark'} + self.tags = {'benchmark'} @rfm.run_before('run') def set_num_gpus_per_node(self): From 1bd83c19c23b4cf45206a5069bc3179c375c804c Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Wed, 20 Jan 2021 10:15:46 +0100 Subject: [PATCH 41/51] Replace cudatoolkit module by 
craype-accel-nvidia60

---
 cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index e2d7ae035b..c7005445a6 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -65,7 +65,7 @@ def set_gpu_arch(self):
         if nvidia_sm:
             self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
             if cp in {'dom:gpu', 'daint:gpu'}:
-                self.modules += ['cudatoolkit']
+                self.modules += ['craype-accel-nvidia60']
                 if cp == 'dom:gpu':
                     self.modules += ['cdt-cuda']
 
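The pointer-chase benchmarks that the next patches keep refining all rest on one idea: a linked-list traversal is a chain of dependent loads, so the hardware cannot overlap them, and the time per jump is the raw memory latency at whatever level of the hierarchy the list happens to live in. A minimal single-thread sketch of the technique (this one uses CUDA's clock64() intrinsic rather than the inline-PTX clock reader in the sources):

    struct Node { Node *next; };

    // Walk `jumps` dependent links; the final pointer value is written out
    // so the traversal cannot be optimized away.
    __global__ void chase(Node *head, int jumps, long long *elapsed, Node **out)
    {
        Node *p = head;
        long long start = clock64();
        for (int i = 0; i < jumps; i++)
            p = p->next;          // dependent load: no two jumps can overlap
        *elapsed = clock64() - start;
        *out = p;
    }

The loop itself adds a few cycles per iteration, which is one reason the actual sources unroll the traversal with a recursive template instead of a counted loop.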
From 0ee2c131dbb8ebcfc0158544e1e9d77aff8c2774 Mon Sep 17 00:00:00 2001
From: Javier Otero 
Date: Fri, 29 Jan 2021 17:58:19 +0100
Subject: [PATCH 42/51] Add consistent naming

---
 .../gpu/pointer_chase/pointer_chase.py        |  4 +-
 .../gpu/pointer_chase/src/linked_list.hpp     | 82 +++++++++----------
 .../gpu/pointer_chase/src/pointer_chase.cu    | 68 +++++++--------
 3 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index c7005445a6..9e3af06bc7 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -288,7 +288,7 @@ class GpuPointerChaseAverageP2PLatency(GpuPointerChaseDep):
     def __init__(self):
         super().__init__()
         self.valid_systems = Pchase.multi_device
-        self.executable_opts = ['--multiGPU']
+        self.executable_opts = ['--multi-gpu']
         self.perf_patterns = {
             'average_latency': self.average_P2P_latency(),
         }
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp
index 4e453118e1..9dffd2c357 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp
@@ -7,21 +7,21 @@
  */
 
 
-__global__ void clockLatency(int * clk)
+__global__ void clock_latency(int * clk)
 {
   // This returns the clock latency when reading the 64-bit clock counter.
   clk[0] = XClockLatency();
 }
 
-void printClockLatency(char * nid, int dev)
+void print_clock_latency(char * nid, int dev)
 {
   /* Prints the latency of reading the clock cycles */
   int * clk_d;
   int clk;
   XSetDevice(dev);
   XMalloc((void**)&clk_d, sizeof(int));
-  clockLatency<<<1,1>>>(clk_d);
+  clock_latency<<<1,1>>>(clk_d);
   XDeviceSynchronize();
   XMemcpy(&clk, clk_d, sizeof(int), XMemcpyDeviceToHost);
   XFree(clk_d);
@@ -87,15 +87,15 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices)
 }
 
 
-__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex)
+__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t head_index)
 {
   /* Simple list traverse - no timing is done here
    * - buffer: where the list is
-   * - headIndex: index in the buffer where the head of the list is
+   * - head_index: index in the buffer where the head of the list is
    */
 
   uint32_t count = 0;
-  Node * head = &(buffer[headIndex]);
+  Node * head = &(buffer[head_index]);
   Node * ptr = head;
   while(ptr->next != nullptr || count < NODES-1)
   {
     ptr = ptr->next;
     count++;
@@ -127,7 +127,7 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t headIndex)
  * list traversal as a whole.
 */
 template < unsigned int repeat >
-__device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs)
+__device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs)
 {
   /*
    * Recursive function to traverse the list.
@@ -148,15 +148,15 @@ __device__ __forceinline__ void nextNode( __VOLATILE__ Node ** ptr, uint32_t * t
 # endif
 
   // Keep traversing the list.
-  nextNode<repeat-1>(ptr, timer+1, ptrs+1);
+  next_node<repeat-1>(ptr, timer+1, ptrs+1);
 }
 
 // Specialize the function to break the recursion.
 template<>
-__device__ __forceinline__ void nextNode<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){}
+__device__ __forceinline__ void next_node<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){}
 
 
-__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIndex, uint32_t * timer)
+__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_index, uint32_t * timer)
 {
   /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and
    * cover from compiler optimisations.
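The traversal in this header is unrolled at compile time: each instantiation of the recursive template emits one dependent load and then recurses with repeat-1, and the explicit specialization for 0 terminates the recursion, so the chain expands into straight-line code with no loop counter. The same device-side pattern in a stripped-down, hedged form:

    struct Node { Node *next; };

    // Recursive template: one dependent load per instantiation,
    // fully unrolled by the compiler.
    template <unsigned int repeat>
    __device__ __forceinline__ void follow(Node **ptr)
    {
        *ptr = (*ptr)->next;
        follow<repeat - 1>(ptr);
    }

    // Specialization that ends the recursion.
    template <>
    __device__ __forceinline__ void follow<0>(Node **ptr) { }

    __global__ void unrolled_chase(Node *head, Node **out)
    {
        Node *p = head;
        follow<32>(&p);    // expands to 32 back-to-back jumps
        *out = p;          // keep the result live
    }

This is also why the node count must stay small: every extra node adds another template instantiation to the inlined chain.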
@@ -167,7 +167,7 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn
   __shared__ Node * ptrs[NODES-1];
 
   // Create a pointer to iterate through the list
-  __VOLATILE__ Node * ptr = &(buffer[headIndex]);
+  __VOLATILE__ Node * ptr = &(buffer[head_index]);
 
 #ifndef TIME_EACH_STEP
   // start timer
@@ -176,7 +176,7 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t headIn
 #endif
 
   // Traverse the list
-  nextNode<NODES-1>(&ptr, s_timer, ptrs);
+  next_node<NODES-1>(&ptr, s_timer, ptrs);
 
 #ifndef TIME_EACH_STEP
   // end cycle count
   timer[0] = clocks.end();
 #else
   for (uint32_t i = 0; i < NODES-1; i++)
   {
     timer[i] = s_timer[i];
   }
 
   // Join the tail with the head (just for the data dependency).
   if (ptr->next == nullptr)
   {
-    ptr->next = &(buffer[headIndex]);
+    ptr->next = &(buffer[head_index]);
   }
 }
@@ -220,13 +220,13 @@ struct List
 */
   Node * buffer = nullptr;
-  uint32_t headIndex = 0;
+  uint32_t head_index = 0;
   uint32_t * timer = nullptr;
   uint32_t * d_timer = nullptr;
-  size_t buffSize;
+  size_t buff_size;
   size_t stride;
 
-  List(size_t bSize, size_t st) : buffSize(bSize), stride(st)
+  List(size_t bsize, size_t st) : buff_size(bsize), stride(st)
   {
     // Allocate the buffers to store the timings measured in the kernel
     timer = new uint32_t[NODES];
     XMalloc((void**)&d_timer, sizeof(uint32_t)*(NODES));
   };
 
   virtual ~List()
   {
     XFree(d_timer);
   }
 
-  void info(size_t n, size_t buffSize)
+  void info(size_t n, size_t buff_size)
   {
     printf("Creating Linked list:\n");
     printf(" - Node size: %lu\n", sizeof(Node));
     printf(" - Number of nodes: %lu:\n", n);
-    printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buffSize)/1024.0/1024);
+    printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buff_size)/1024.0/1024);
     XDeviceSynchronize();
   }
 
@@ -274,39 +274,39 @@ struct List
     rng.seed(ss);
     std::uniform_real_distribution<double> unif(0, 1);
 
-    uint32_t * nodeIndices = (uint32_t*)malloc(sizeof(uint32_t)*NODES);
+    uint32_t * node_indices = (uint32_t*)malloc(sizeof(uint32_t)*NODES);
     // Create set to keep track of the assigned indices.
     std::set<uint32_t> s = {};
     for (int i = 0; i < NODES; i++)
     {
       // Get a random index.
-      uint32_t currentIndex = (uint32_t)(unif(rng)*buffSize);
+      uint32_t current_index = (uint32_t)(unif(rng)*buff_size);
 
       // If already present in the set, find another alternative index.
-      while (s.find(currentIndex) != s.end())
+      while (s.find(current_index) != s.end())
       {
-        if (currentIndex < NODES-1)
+        if (current_index < NODES-1)
         {
-          currentIndex++;
+          current_index++;
         }
         else
         {
-          currentIndex = 0;
+          current_index = 0;
         }
       }
 
-      nodeIndices[i] = currentIndex;
-      s.insert(currentIndex);
+      node_indices[i] = current_index;
+      s.insert(current_index);
     }
 
     // Copy the node indices to the device and init the random list
-    uint32_t * d_nodeIndices;
-    XMalloc((void**)&d_nodeIndices, sizeof(uint32_t)*NODES);
-    XMemcpy(d_nodeIndices, nodeIndices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice);
-    initialize_random_list<<<1,1>>>(buffer, d_nodeIndices);
-    headIndex = nodeIndices[0];
-    free(nodeIndices);
-    XFree(d_nodeIndices);
+    uint32_t * d_node_indices;
+    XMalloc((void**)&d_node_indices, sizeof(uint32_t)*NODES);
+    XMemcpy(d_node_indices, node_indices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice);
+    initialize_random_list<<<1,1>>>(buffer, d_node_indices);
+    head_index = node_indices[0];
+    free(node_indices);
+    XFree(d_node_indices);
   }
 
   XDeviceSynchronize();
@@ -317,7 +317,7 @@ struct List
   /*
    * Simple list traversal - NOT timed.
 */
-    simple_traverse<<<1,1>>>(buffer, headIndex);
+    simple_traverse<<<1,1>>>(buffer, head_index);
     XDeviceSynchronize();
   }
 
@@ -327,7 +327,7 @@ struct List
      * Timed list traversal
      */
 
-    timed_list_traversal<<<1,1>>>(buffer, headIndex, d_timer);
+    timed_list_traversal<<<1,1>>>(buffer, head_index, d_timer);
     XDeviceSynchronize();
 
     // Copy the timing data back to the host
@@ -343,12 +343,12 @@ struct DeviceList : public List
    * List allocated in device memory
    */
 
-  DeviceList(size_t n, size_t buffSize, size_t stride) : List(buffSize, stride)
+  DeviceList(size_t n, size_t buff_size, size_t stride) : List(buff_size, stride)
   {
 # ifdef DEBUG
-    List::info(n, buffSize);
+    List::info(n, buff_size);
 # endif
-    XMalloc((void**)&buffer, sizeof(Node)*buffSize);
+    XMalloc((void**)&buffer, sizeof(Node)*buff_size);
   }
 
   ~DeviceList()
@@ -365,12 +365,12 @@ struct HostList : public List
    */
   Node * h_buffer;
 
-  HostList(size_t n, size_t buffSize, size_t stride) : List(buffSize,stride)
+  HostList(size_t n, size_t buff_size, size_t stride) : List(buff_size,stride)
   {
 # ifdef DEBUG
-    List::info(n, buffSize);
+    List::info(n, buff_size);
 # endif
-    XHostMalloc((void**)&h_buffer, sizeof(Node)*buffSize, XHostAllocMapped);
+    XHostMalloc((void**)&h_buffer, sizeof(Node)*buff_size, XHostAllocMapped);
     XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0);
   }
 
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
index d8a88e3fce..ec3a5e7318 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
@@ -57,7 +57,7 @@
 
 
 template < class LIST >
-uint32_t * generalPointerChase(int local_device, int remote_device, int init_mode, size_t buffSize, size_t stride)
+uint32_t * general_pointer_chase(int local_device, int remote_device, int init_mode, size_t buff_size, size_t stride)
 {
   /*
    * Driver to manage the whole allocation, list traversal, etc.
@@ -72,24 +72,24 @@ uint32_t * generalPointerChase(int local_device, int remote_device, int init_mod
    */
 
   XSetDevice(remote_device);
-  LIST l(NODES, buffSize, stride);
+  LIST l(NODES, buff_size, stride);
   l.initialize(init_mode);
 
   // Check if we have remote memory access.
   XSetDevice(local_device);
-  bool peerAccessSet = false;
+  bool peer_access_set = false;
   if (local_device!=remote_device)
   {
-    int hasPeerAccess;
-    XDeviceCanAccessPeer(&hasPeerAccess, local_device, remote_device);
-    if (!hasPeerAccess)
+    int has_peer_access;
+    XDeviceCanAccessPeer(&has_peer_access, local_device, remote_device);
+    if (!has_peer_access)
    {
       printf("Devices have no peer access.\n");
       exit(1);
     }
 
     // Enable the peerAccess access.
-    peerAccessSet = true;
+    peer_access_set = true;
     XDeviceEnablePeerAccess(remote_device, 0);
   }
 
@@ -99,7 +99,7 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m
   // Time the pointer chase
   l.time_traversal();
 
-  if (peerAccessSet)
+  if (peer_access_set)
     XDeviceDisablePeerAccess(remote_device);
 
   // Set again the device where the allocations were placed, so it can take care of its
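When the list lives on one device and the chase runs on another, the traversal only works if the chasing device can map its peer's memory, which is what the can-access-peer/enable-peer-access pair above checks and enables before timing. In plain CUDA the same guard looks roughly like this (a sketch; error handling omitted):

    // Enable peer access from `local` to memory owned by `remote`,
    // returning false if the hardware cannot do it.
    bool enable_peer(int local, int remote)
    {
        if (local == remote)
            return true;            // nothing to map
        int can_access = 0;
        cudaDeviceCanAccessPeer(&can_access, local, remote);
        if (!can_access)
            return false;
        cudaSetDevice(local);
        cudaDeviceEnablePeerAccess(remote, 0);   // flags must be 0
        return true;
    }

Disabling peer access again after the measurement, as the patch does, keeps each device pair's mapping state independent across iterations.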
@@ -111,14 +111,14 @@
 
 
 template < class LIST >
-void localPointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid)
+void local_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid)
 {
   /*
    * Specialised pointer chase on a single device.
   */
   for (int gpu_id = 0; gpu_id < num_devices; gpu_id++)
   {
-    uint32_t* timer = generalPointerChase< LIST >(gpu_id, gpu_id, init_mode, buffSize, stride);
+    uint32_t* timer = general_pointer_chase< LIST >(gpu_id, gpu_id, init_mode, buff_size, stride);
 
     // Print the timings of the pointer chase
 # ifndef TIME_EACH_STEP
@@ -180,7 +180,7 @@ void print_device_table(int num_devices, std::queue<uint32_t> q, const char * wh
 }
 
 
 template < class LIST >
-void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t stride, char * nid, int summarize)
+void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid, int summarize)
 {
   /*
    * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device.
@@ -194,7 +194,7 @@ void remotePointerChase(int num_devices, int init_mode, size_t buffSize, size_t
 # else
   std::queue<uint32_t> q_max;
   std::queue<uint32_t> q_min;
-  auto fetchMax = [](uint32_t* t)
+  auto fetch_max = [](uint32_t* t)
   {
     uint32_t max = 0;
     for (int i = 0; i < NODES-1; i++)
     {
       if (t[i] > max)
         max = t[i];
     }
     return max;
   };
-  auto fetchMin = [](uint32_t* t)
+  auto fetch_min = [](uint32_t* t)
   {
     uint32_t min = ~0;
     for (int i = 0; i < NODES-1; i++)
     {
       if (t[i] < min)
         min = t[i];
@@ -221,7 +221,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size
   {
     for (int i = LIMITS; i < num_devices; i++)
     {
-      uint32_t * timer_ptr = generalPointerChase< LIST >(i, j, init_mode, buffSize, stride);
+      uint32_t * timer_ptr = general_pointer_chase< LIST >(i, j, init_mode, buff_size, stride);
 
       // Store the desired values for each element of the matrix in queues
 # ifndef TIME_EACH_STEP
@@ -229,8 +229,8 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size
 # else
       if (summarize)
       {
-        q_min.push(fetchMin(timer_ptr));
-        q_max.push(fetchMax(timer_ptr));
+        q_min.push(fetch_min(timer_ptr));
+        q_max.push(fetch_max(timer_ptr));
       }
       else
       {
@@ -265,11 +265,11 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size
 int main(int argc, char ** argv)
 {
   // Set program defaults before parsing the command line args.
-  int list_init_mode = 0;
+  int list_init_random = 0;
   size_t stride = 1;
-  size_t buffSize = NODES*stride;
-  int multiGPU = 0;
-  int print_mode = 0;
+  size_t buff_size = NODES*stride;
+  int multi_gpu = 0;
+  int print_summary_only = 0;
   int clock = 0;
 
   // Parse the command line args.
@@ -283,9 +283,9 @@ int main(int argc, char ** argv)
       std::cout << " If --rand is used, this parameter just changes the buffer size." << std::endl;
       std::cout << "--buffer # : Sets the size of the buffer where the linked list is allocated on. " << std::endl;
       std::cout << " The number indicates the size of the buffer in list nodes." << std::endl;
-      std::cout << "--multiGPU : Runs the pointer chase algo using all device-pair combinations." << std::endl;
+      std::cout << "--multi-gpu : Runs the pointer chase algo using all device-pair combinations." << std::endl;
       std::cout << " This measures the device-to-device memory latency." << std::endl;
-      std::cout << "--summary : When timing each node jump individually and used alongside --multiGPU, " << std::endl;
+      std::cout << "--summary : When timing each node jump individually and used alongside --multi-gpu, " << std::endl;
       std::cout << " this collapses the output into two tables with the min and max latencies." << std::endl;
      std::cout << "--clock : Skip all the above and just print the clock latency for all devices."
<< std::endl; std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; @@ -293,25 +293,25 @@ int main(int argc, char ** argv) } else if (str == "--rand") { - list_init_mode = 1; + list_init_random = 1; } else if (str == "--stride") { stride = std::stoi((std::string)argv[++i]); - if (buffSize < NODES*stride) - buffSize = NODES*stride; + if (buff_size < NODES*stride) + buff_size = NODES*stride; } else if (str == "--buffer") { - buffSize = std::stoi((std::string)argv[++i]); + buff_size = std::stoi((std::string)argv[++i]); } - else if (str == "--multiGPU") + else if (str == "--multi-gpu") { - multiGPU = 1; + multi_gpu = 1; } else if (str == "--summary") { - print_mode = 1; + print_summary_only = 1; } else if (str == "--clock") { @@ -320,7 +320,7 @@ int main(int argc, char ** argv) } // Sanity of the command line args. - if (buffSize < NODES*stride) + if (buff_size < NODES*stride) { std::cerr << "Buffer is not large enough to fit the list." << std::endl; return 1; @@ -347,18 +347,18 @@ int main(int argc, char ** argv) { for (int i = 0; i < num_devices; i++) { - printClockLatency(nid_name,i); + print_clock_latency(nid_name,i); } } else { - if (!multiGPU) + if (!multi_gpu) { - localPointerChase(num_devices, list_init_mode, buffSize, stride, nid_name); + local_pointer_chase(num_devices, list_init_random, buff_size, stride, nid_name); } else { - remotePointerChase(num_devices, list_init_mode, buffSize, stride, nid_name, print_mode); + remote_pointer_chase(num_devices, list_init_random, buff_size, stride, nid_name, print_summary_only); } } From 4c36930840bb3c4df19f84a9e555f26eb20cd0fb Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 1 Feb 2021 15:24:20 +0100 Subject: [PATCH 43/51] Address PR comments --- .../gpu/pointer_chase/pointer_chase.py | 2 +- .../gpu/pointer_chase/src/linked_list.hpp | 67 +++++-------- .../gpu/pointer_chase/src/pointer_chase.cu | 94 ++++++++----------- 3 files changed, 62 insertions(+), 101 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 9e3af06bc7..692ccb18b8 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -184,7 +184,7 @@ class GpuPointerChaseSingle(GpuPointerChaseDep): def __init__(self, stride): super().__init__() self.valid_systems = Pchase.valid_systems - self.executable_opts = ['--stride', f'{stride}'] + self.executable_opts = ['--sparsity', f'{stride}'] self.perf_patterns = { 'average_latency': sn.max(sn.extractall( r'^\s*\[[^\]]*\]\s* On device \d+, ' diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index 9dffd2c357..b6c798f632 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -54,12 +54,12 @@ __global__ void initialize_list(Node * buffer, int stride = 1) */ // Set the head - Node * prev = new (&(buffer[0])) Node(); + Node * prev = buffer; // Init the rest of the list - for (int n = 1; n < NODES; n++) + for (int n = 1; n < num_nodes; n++) { - Node * temp = new (&(buffer[n*stride])) Node(); + Node * temp = buffer + n*stride; prev->next = temp; prev = temp; } @@ -75,12 +75,12 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) */ // Set the head - Node * prev = new (&(buffer[indices[0]])) Node(); + 
 Node * prev = buffer + indices[0];
 
   // Init the rest of the list
-  for (int n = 1; n < NODES; n++)
+  for (int n = 1; n < num_nodes; n++)
   {
-    Node * temp = new (&(buffer[indices[n]])) Node();
+    Node * temp = buffer + indices[n];
     prev->next = temp;
     prev = temp;
   }
@@ -97,7 +97,7 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t head_index)
   uint32_t count = 0;
   Node * head = &(buffer[head_index]);
   Node * ptr = head;
-  while(ptr->next != nullptr || count < NODES-1)
+  while(ptr->next != nullptr || count < num_nodes-1)
   {
     ptr = ptr->next;
     count++;
@@ -163,8 +163,8 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_i
   */
 
   // These are used to prevent ILP when timing each jump.
-  __shared__ uint32_t s_timer[NODES-1];
-  __shared__ Node * ptrs[NODES-1];
+  __shared__ uint32_t s_timer[num_nodes-1];
+  __shared__ Node * ptrs[num_nodes-1];
 
   // Create a pointer to iterate through the list
   __VOLATILE__ Node * ptr = &(buffer[head_index]);
@@ -176,13 +176,13 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_i
 #endif
 
   // Traverse the list
-  next_node<NODES-1>(&ptr, s_timer, ptrs);
+  next_node<num_nodes-1>(&ptr, s_timer, ptrs);
 
 #ifndef TIME_EACH_STEP
   // end cycle count
   timer[0] = clocks.end();
 #else
-  for (uint32_t i = 0; i < NODES-1; i++)
+  for (uint32_t i = 0; i < num_nodes-1; i++)
   {
     timer[i] = s_timer[i];
   }
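Storing one reading per jump in s_timer, and parking each intermediate pointer in ptrs, forces the compiler to keep every jump's measurement separate instead of letting independent instructions overlap (the instruction-level parallelism the comment above refers to). The core of that per-step measurement, reduced to a hedged sketch with a plain loop instead of the unrolled template:

    struct Node { Node *next; };

    // Time each jump separately: read the clock, do one dependent load,
    // read the clock again. The intermediate pointer is written to `slots`
    // so consecutive measurements cannot be fused or reordered away.
    __global__ void timed_chase(Node *head, int jumps, long long *cycles, Node **slots)
    {
        Node *p = head;
        for (int i = 0; i < jumps; i++) {
            long long t0 = clock64();
            p = p->next;
            slots[i] = p;               // data dependency on the load
            cycles[i] = clock64() - t0;
        }
    }

The per-jump series is what lets the checks later in this series separate cache hits (fast jumps) from misses (slow jumps), rather than seeing only a blended average.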
@@ -229,8 +229,8 @@ struct List
   List(size_t bsize, size_t st) : buff_size(bsize), stride(st)
   {
     // Allocate the buffers to store the timings measured in the kernel
-    timer = new uint32_t[NODES];
-    XMalloc((void**)&d_timer, sizeof(uint32_t)*(NODES));
+    timer = new uint32_t[num_nodes];
+    XMalloc((void**)&d_timer, sizeof(uint32_t)*(num_nodes));
   };
@@ -268,41 +268,16 @@ struct List
     else
     {
       // Random number engine.
-      std::mt19937_64 rng;
-      uint64_t timeSeed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
-      std::seed_seq ss{uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed>>32)};
-      rng.seed(ss);
-      std::uniform_real_distribution<double> unif(0, 1);
-
-      uint32_t * node_indices = (uint32_t*)malloc(sizeof(uint32_t)*NODES);
-      // Create set to keep track of the assigned indices.
-      std::set<uint32_t> s = {};
-      for (int i = 0; i < NODES; i++)
-      {
-        // Get a random index.
-        uint32_t current_index = (uint32_t)(unif(rng)*buff_size);
-
-        // If already present in the set, find another alternative index.
-        while (s.find(current_index) != s.end())
-        {
-          if (current_index < NODES-1)
-          {
-            current_index++;
-          }
-          else
-          {
-            current_index = 0;
-          }
-        }
-
-        node_indices[i] = current_index;
-        s.insert(current_index);
-      }
+      std::random_device rd;
+      std::mt19937_64 gen(rd());
 
+      uint32_t * node_indices = (uint32_t*)malloc(sizeof(uint32_t)*num_nodes);
+      std::iota(node_indices, node_indices + num_nodes, 0);
+      std::shuffle(node_indices, node_indices + num_nodes, gen);
 
       // Copy the node indices to the device and init the random list
       uint32_t * d_node_indices;
-      XMalloc((void**)&d_node_indices, sizeof(uint32_t)*NODES);
-      XMemcpy(d_node_indices, node_indices, sizeof(uint32_t)*NODES, XMemcpyHostToDevice);
+      XMalloc((void**)&d_node_indices, sizeof(uint32_t)*num_nodes);
+      XMemcpy(d_node_indices, node_indices, sizeof(uint32_t)*num_nodes, XMemcpyHostToDevice);
       initialize_random_list<<<1,1>>>(buffer, d_node_indices);
       head_index = node_indices[0];
       free(node_indices);
       XFree(d_node_indices);
     }
 
     XDeviceSynchronize();
@@ -331,7 +306,7 @@ struct List
     // Copy the timing data back to the host
-    XMemcpy(timer, d_timer, sizeof(uint32_t)*(NODES-1), XMemcpyDeviceToHost);
+    XMemcpy(timer, d_timer, sizeof(uint32_t)*(num_nodes-1), XMemcpyDeviceToHost);
   }
 };
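With this patch the random layout comes from a proper uniform permutation: fill the index array with 0..num_nodes-1 and shuffle it, instead of drawing random indices and linearly probing for the next free slot, a scheme that biases node placement toward clusters. A host-side sketch of the new initialization path, with illustrative names:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>
    #include <random>

    int main()
    {
        const int num_nodes = 8;
        uint32_t indices[num_nodes];

        // 0, 1, ..., num_nodes-1, then an unbiased shuffle of that range.
        std::iota(indices, indices + num_nodes, 0);
        std::random_device rd;
        std::mt19937_64 gen(rd());
        std::shuffle(indices, indices + num_nodes, gen);

        // indices[n] is the buffer slot of the n-th list node;
        // indices[0] is the head.
        for (int n = 0; n < num_nodes; n++)
            printf("node %d -> buffer slot %u\n", n, indices[n]);
        return 0;
    }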
@@ -110,7 +107,7 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m } -template < class LIST > +template < class List > void local_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid) { /* @@ -118,14 +115,14 @@ void local_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_ */ for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) { - uint32_t* timer = general_pointer_chase< LIST >(gpu_id, gpu_id, init_mode, buff_size, stride); + uint32_t* timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, buff_size, stride); // Print the timings of the pointer chase # ifndef TIME_EACH_STEP - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/(NODES-1)); + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/(num_nodes-1)); # else printf("[%s] Latency for each node jump (device %d):\n", nid, gpu_id); - for (uint32_t i = 0; i < NODES-1; i++) + for (uint32_t i = 0; i < num_nodes-1; i++) { printf("[%s][device %d] %d\n", nid, gpu_id, timer[i]); } @@ -179,7 +176,7 @@ void print_device_table(int num_devices, std::queue q, const char * wh } -template < class LIST > +template < class List > void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid, int summarize) { /* @@ -190,14 +187,14 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size # ifndef TIME_EACH_STEP std::queue q_average; - auto fetch = [](uint32_t* t){return t[0]/(NODES-1);}; + auto fetch = [](uint32_t* t){return t[0]/(num_nodes-1);}; # else std::queue q_max; std::queue q_min; auto fetch_max = [](uint32_t* t) { uint32_t max = 0; - for (int i = 0; i < NODES-1; i++) + for (int i = 0; i < num_nodes-1; i++) { if (t[i] > max) max = t[i]; @@ -207,7 +204,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size auto fetch_min = [](uint32_t* t) { uint32_t min = ~0; - for (int i = 0; i < NODES-1; i++) + for (int i = 0; i < num_nodes-1; i++) { if (t[i] < min) min = t[i]; @@ -221,7 +218,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size { for (int i = LIMITS; i < num_devices; i++) { - uint32_t * timer_ptr = general_pointer_chase< LIST >(i, j, init_mode, buff_size, stride); + uint32_t * timer_ptr = general_pointer_chase< List >(i, j, init_mode, buff_size, stride); // Store the desired values for each element of the matrix in queues # ifndef TIME_EACH_STEP @@ -234,7 +231,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size } else { - for (int n = 0; n < NODES-1; n++) + for (int n = 0; n < num_nodes-1; n++) { printf("[%s][device %d][device %d] %d\n", nid, j, i, timer_ptr[n]); } @@ -266,8 +263,8 @@ int main(int argc, char ** argv) { // Set program defaults before parsing the command line args. int list_init_random = 0; - size_t stride = 1; - size_t buff_size = NODES*stride; + size_t sparsity = 1; + size_t buff_size = num_nodes*sparsity; int multi_gpu = 0; int print_summary_only = 0; int clock = 0; @@ -278,32 +275,28 @@ int main(int argc, char ** argv) std::string str = argv[i]; if (str == "--help" || str == "-h") { - std::cout << "--rand : Initializes the linked list with nodes in random order." << std::endl; - std::cout << "--stride # : Sets the stride between the nodes in the list (in number of nodes)." 
<< std::endl; - std::cout << " If --rand is used, this parameter just changes the buffer size." << std::endl; - std::cout << "--buffer # : Sets the size of the buffer where the linked list is allocated on. " << std::endl; - std::cout << " The number indicates the size of the buffer in list nodes." << std::endl; - std::cout << "--multi-gpu : Runs the pointer chase algo using all device-pair combinations." << std::endl; - std::cout << " This measures the device-to-device memory latency." << std::endl; - std::cout << "--summary : When timing each node jump individually and used alongside --multi-gpu, " << std::endl; - std::cout << " this collapses the output into two tables with the min and max latencies." << std::endl; - std::cout << "--clock : Skip all the above and just print the clock latency for all devices." << std::endl; - std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; + std::cout << "--rand : Places the linked list nodes into the buffer in random order (i.e." << std::endl; + std::cout << " consecutive list nodes are not consecutive in memory). If this option" << std::endl; + std::cout << " is not used, the nodes are placed in sequential order." << std::endl; + std::cout << "--sparsity # : Controls the sparsity of the list nodes in the buffer. This sets the" << std::endl; + std::cout << " buffer size where the list is placed as sparsity*num_nodes. If the" << std::endl; + std::cout << " list is initialized in sequential order, this effectively sets the stride." << std::endl; + std::cout << "--multi-gpu : Runs the pointer chase algo using all device-pair combinations." << std::endl; + std::cout << " This measures the device-to-device memory latency." << std::endl; + std::cout << "--summary : When timing each node jump individually and used alongside --multi-gpu, " << std::endl; + std::cout << " this collapses the output into two tables with the min and max latencies." << std::endl; + std::cout << "--clock : Skip all the above and just print the clock latency for all devices." << std::endl; + std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; return 0; } else if (str == "--rand") { list_init_random = 1; } - else if (str == "--stride") + else if (str == "--sparsity") { - stride = std::stoi((std::string)argv[++i]); - if (buff_size < NODES*stride) - buff_size = NODES*stride; - } - else if (str == "--buffer") - { - buff_size = std::stoi((std::string)argv[++i]); + sparsity = std::stoi((std::string)argv[++i]); + buff_size = num_nodes*sparsity; } else if (str == "--multi-gpu") { @@ -319,13 +312,6 @@ int main(int argc, char ** argv) } } - // Sanity of the command line args. - if (buff_size < NODES*stride) - { - std::cerr << "Buffer is not large enough to fit the list." 
<< std::endl; - return 1; - } - // Get the node name char nid_name[HOSTNAME_SIZE]; gethostname(nid_name, HOSTNAME_SIZE); @@ -354,11 +340,11 @@ int main(int argc, char ** argv) { if (!multi_gpu) { - local_pointer_chase(num_devices, list_init_random, buff_size, stride, nid_name); + local_pointer_chase(num_devices, list_init_random, buff_size, sparsity, nid_name); } else { - remote_pointer_chase(num_devices, list_init_random, buff_size, stride, nid_name, print_summary_only); + remote_pointer_chase(num_devices, list_init_random, buff_size, sparsity, nid_name, print_summary_only); } } From 80f3b14a1ade1f4338b84ef730bde6e8d2642d5a Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 1 Feb 2021 15:36:55 +0100 Subject: [PATCH 44/51] Add missing include --- .../microbenchmarks/gpu/pointer_chase/src/linked_list.hpp | 2 +- .../microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index b6c798f632..7c3399d460 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -271,7 +271,7 @@ struct List std::random_device rd; std::mt19937_64 gen(rd()); uint32_t * node_indices = (uint32_t*)malloc(sizeof(uint32_t)*num_nodes); - std::iota(node_ndices, node_indices + num_nodes, 0); + std::iota(node_indices, node_indices + num_nodes, 0); std::shuffle(node_indices, node_indices + num_nodes, gen); // Copy the node indices to the device and init the random list diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 78e49c142a..d1b9fea334 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -4,8 +4,8 @@ #include #include #include -#include #include +#include #include /* From a823a4e2b03763335edb13e126548ee4ba315c40 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 1 Feb 2021 18:48:55 +0100 Subject: [PATCH 45/51] Make the chase circular --- .../gpu/pointer_chase/src/linked_list.hpp | 76 +++++++++---------- .../gpu/pointer_chase/src/pointer_chase.cu | 53 ++++++++----- 2 files changed, 72 insertions(+), 57 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index 7c3399d460..45af52ba64 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -45,7 +45,7 @@ struct Node * Kernels and device functions */ -__global__ void initialize_list(Node * buffer, int stride = 1) +__global__ void initialize_list(Node * buffer, int num_nodes, int stride = 1) { /* List serial initializer. * - buffer: where the list is to be placed. @@ -63,10 +63,11 @@ __global__ void initialize_list(Node * buffer, int stride = 1) prev->next = temp; prev = temp; } + prev->next = buffer; } -__global__ void initialize_random_list(Node * buffer, uint32_t *indices) +__global__ void initialize_random_list(Node * buffer, int num_nodes, uint32_t *indices) { /* List random initializer * - buffer: where the list is to be placed. 
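 * (Editorial sketch, not part of the patch: both initializers now close the
 *  chain, so the defining invariant is that exactly num_nodes jumps from the
 *  head lead back to the head. A host-side check, assuming the nodes were
 *  readable from host code, could look like this:
 *
 *      bool is_single_cycle(const Node *head, int num_nodes)
 *      {
 *          const Node *ptr = head;
 *          for (int jump = 0; jump < num_nodes; jump++)
 *          {
 *              ptr = ptr->next;
 *              if (ptr == head)
 *                  return jump == num_nodes - 1;  // closed exactly on time
 *          }
 *          return false;  // never wrapped around
 *      }
 *
 *  A chain that closed early would make the timed traversal spin on a few
 *  cached nodes and understate the memory latency.)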
@@ -84,6 +85,7 @@ __global__ void initialize_random_list(Node * buffer, uint32_t *indices) prev->next = temp; prev = temp; } + prev->next = buffer + indices[0]; } @@ -94,19 +96,17 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t head_index) * - head_index: index in the buffer where the head of the list is */ - uint32_t count = 0; Node * head = &(buffer[head_index]); Node * ptr = head; - while(ptr->next != nullptr || count < num_nodes-1) + while(ptr->next != head) { ptr = ptr->next; - count++; } // Silly dep. to tell the compiler not to throw away this kernel. - if (ptr->next == head) + if (ptr->next->next == head) { - printf("You had a circular list :(\n"); + printf("The impossible just happened\n"); } } @@ -126,8 +126,7 @@ __global__ void simple_traverse(Node * __restrict__ buffer, uint32_t head_index) * Depending on the compiler flags used, the timing can either measure each node jump, or the entire * list traversal as a whole. */ -template < unsigned int repeat > -__device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs) +__device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs, int & jump) { /* * Recursive function to traverse the list. @@ -135,6 +134,7 @@ __device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * * - timer: Array to store the timings of each individual node jump. * Only used if this option is activated (-DTIME_EACH_STEP) * - ptrs: Just used to have a data dependency to block ILP. + * - jump: Int to keep track of the number of jumps */ # ifdef TIME_EACH_STEP @@ -143,18 +143,13 @@ __device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * # endif (*ptr) = (*ptr)->next; # ifdef TIME_EACH_STEP - (*ptrs) = (Node*)(*ptr); // Data dep. to prevent ILP. - *timer = clocks.end(); // Time the jump + *(ptrs+jump) = (*ptr); // Data dep. to prevent ILP. + *(timer+jump) = clocks.end(); // Time the jump # endif - // Keep traversing the list. - next_node(ptr, timer+1, ptrs+1); + jump++; } -// Specialize the function to break the recursion. -template<> -__device__ __forceinline__ void next_node<0>( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs){} - __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_index, uint32_t * timer) { @@ -163,12 +158,15 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_i */ // These are used to prevent ILP when timing each jump. - __shared__ uint32_t s_timer[num_nodes-1]; - __shared__ Node * ptrs[num_nodes-1]; + __shared__ uint32_t s_timer[JUMPS]; + __shared__ Node * ptrs[JUMPS]; // Create a pointer to iterate through the list __VOLATILE__ Node * ptr = &(buffer[head_index]); + // Node jump counter + int jump = 0; + #ifndef TIME_EACH_STEP // start timer XClocks64 clocks; @@ -176,23 +174,23 @@ __global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_i #endif // Traverse the list - next_node(&ptr, s_timer, ptrs); + REPEAT_JUMPS(next_node(&ptr, s_timer, ptrs, jump);) #ifndef TIME_EACH_STEP // end cycle count timer[0] = clocks.end(); #else - for (uint32_t i = 0; i < num_nodes-1; i++) + for (uint32_t i = 0; i < JUMPS; i++) { timer[i] = s_timer[i]; } - if (ptr == ptrs[0]) + if (ptrs[1] == ptrs[0]) { printf("This is some data dependency that will never be executed."); } #endif - // Join the tail with the head (just for the data dependency). + // Just for the data dependency - the list is already circular. 
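    // (This branch is dead on a well-formed circular list: no link is nullptr
    // once the initializers close the cycle. It exists purely so the final
    // pointer value feeds a possible store, which keeps the compiler from
    // eliminating the traversal as dead code.)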
if (ptr->next == nullptr) { ptr->next = &(buffer[head_index]); @@ -219,18 +217,18 @@ struct List * - timed_traverse: traverses the list and measures the number of cycles per node jump. */ + uint32_t num_nodes; Node * buffer = nullptr; uint32_t head_index = 0; uint32_t * timer = nullptr; uint32_t * d_timer = nullptr; - size_t buff_size; size_t stride; - List(size_t bsize, size_t st) : buff_size(bsize), stride(st) + List(int n, size_t st) : num_nodes(n), stride(st) { // Allocate the buffers to store the timings measured in the kernel - timer = new uint32_t[num_nodes]; - XMalloc((void**)&d_timer, sizeof(uint32_t)*(num_nodes)); + timer = new uint32_t[JUMPS]; + XMalloc((void**)&d_timer, sizeof(uint32_t)*(JUMPS)); }; virtual ~List() @@ -238,12 +236,12 @@ struct List XFree(d_timer); } - void info(size_t n, size_t buff_size) + void info() { printf("Creating Linked list:\n"); printf(" - Node size: %lu\n", sizeof(Node)); - printf(" - Number of nodes: %lu:\n", n); - printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*buff_size)/1024.0/1024); + printf(" - Number of nodes: %lu:\n", num_nodes); + printf(" - Total buffer size: %10.2f MB:\n", float(sizeof(Node)*num_nodes*stride)/1024.0/1024); XDeviceSynchronize(); } @@ -262,7 +260,7 @@ struct List if (mode == 0) { - initialize_list<<<1,1>>>(buffer, stride); + initialize_list<<<1,1>>>(buffer, num_nodes, stride); XDeviceSynchronize(); } else @@ -278,7 +276,7 @@ struct List uint32_t * d_node_indices; XMalloc((void**)&d_node_indices, sizeof(uint32_t)*num_nodes); XMemcpy(d_node_indices, node_indices, sizeof(uint32_t)*num_nodes, XMemcpyHostToDevice); - initialize_random_list<<<1,1>>>(buffer, d_node_indices); + initialize_random_list<<<1,1>>>(buffer, num_nodes, d_node_indices); head_index = node_indices[0]; free(node_indices); XFree(d_node_indices); @@ -306,7 +304,7 @@ struct List XDeviceSynchronize(); // Copy the timing data back to the host - XMemcpy(timer, d_timer, sizeof(uint32_t)*(num_nodes-1), XMemcpyDeviceToHost); + XMemcpy(timer, d_timer, sizeof(uint32_t)*JUMPS, XMemcpyDeviceToHost); } }; @@ -318,12 +316,12 @@ struct DeviceList : public List * List allocated in device memory */ - DeviceList(size_t n, size_t buff_size, size_t stride) : List(buff_size, stride) + DeviceList(size_t num_nodes, size_t stride) : List(num_nodes, stride) { # ifdef DEBUG - List::info(n, buff_size); + List::info(); # endif - XMalloc((void**)&buffer, sizeof(Node)*buff_size); + XMalloc((void**)&buffer, sizeof(Node)*num_nodes*stride); } ~DeviceList() @@ -340,12 +338,12 @@ struct HostList : public List */ Node * h_buffer; - HostList(size_t n, size_t buff_size, size_t stride) : List(buff_size,stride) + HostList(size_t num_nodes, size_t stride) : List(num_nodes, stride) { # ifdef DEBUG - List::info(n, buff_size); + List::info(); # endif - XHostMalloc((void**)&h_buffer, sizeof(Node)*buff_size, XHostAllocMapped); + XHostMalloc((void**)&h_buffer, sizeof(Node)*num_nodes*stride, XHostAllocMapped); XHostGetDevicePointer((void**)&buffer, (void*)h_buffer, 0); } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index d1b9fea334..a1601b68e1 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -32,7 +32,19 @@ TIME_EACH_STEP. 
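  An illustrative invocation at this revision (values arbitrary):

      ./pChase.x --nodes 256 --sparsity 4 --rand

  chases 256 randomly ordered nodes in a 1024-node buffer, always timing the
  compile-time constant of JUMPS node jumps. Note that the shuffled indices
  span only the first num_nodes buffer slots, so the sparsity setting mainly
  matters for sequential placement.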
*/ -constexpr int num_nodes = 64; + +#define REPEAT2(x) x; x; +#define REPEAT4(x) REPEAT2(x) REPEAT2(x) +#define REPEAT8(x) REPEAT4(x) REPEAT4(x) +#define REPEAT16(x) REPEAT8(x) REPEAT8(x) +#define REPEAT32(x) REPEAT16(x) REPEAT16(x) +#define REPEAT64(x) REPEAT32(x) REPEAT32(x) +#define REPEAT128(x) REPEAT64(x) REPEAT64(x) +#define REPEAT256(x) REPEAT128(x) REPEAT128(x) + +#define JUMPS 256 +#define REPEAT_JUMPS(x) REPEAT256(x) + #define NODE_PADDING 0 #ifndef HOSTNAME_SIZE @@ -54,7 +66,7 @@ using list_type = DeviceList; template < class List > -uint32_t * general_pointer_chase(int local_device, int remote_device, int init_mode, size_t buff_size, size_t stride) +uint32_t * general_pointer_chase(int local_device, int remote_device, int init_mode, size_t num_nodes, size_t stride) { /* * Driver to manage the whole allocation, list traversal, etc. @@ -64,12 +76,12 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m * - local_device: ID of the device where the allocation of the list takes place * - remote_device: ID of the device doing the pointer chase. * - init_mode: see the List class. - * - buff_size: Size (in nodes) of the buffer. + * - num_nodes: nodes in the liked list. * - stride: Gap (in nodes) between two consecutive nodes. This only applies if init_mode is 0. */ XSetDevice(remote_device); - List l(num_nodes, buff_size, stride); + List l(num_nodes, stride); l.initialize(init_mode); // Check if we have remote memory access. @@ -108,21 +120,21 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m template < class List > -void local_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid) +void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, char * nid) { /* * Specialised pointer chase on a single device. */ for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) { - uint32_t* timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, buff_size, stride); + uint32_t* timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride); // Print the timings of the pointer chase # ifndef TIME_EACH_STEP - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/(num_nodes-1)); + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/JUMPS); # else printf("[%s] Latency for each node jump (device %d):\n", nid, gpu_id); - for (uint32_t i = 0; i < num_nodes-1; i++) + for (uint32_t i = 0; i < JUMPS; i++) { printf("[%s][device %d] %d\n", nid, gpu_id, timer[i]); } @@ -177,7 +189,7 @@ void print_device_table(int num_devices, std::queue q, const char * wh template < class List > -void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size_t stride, char * nid, int summarize) +void remote_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, char * nid, int summarize) { /* * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. 
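An aside on the REPEAT ladder defined at the top of this file: each level doubles the pasted statement count, so the unroll factor is fixed at preprocessing time and, unlike the old recursive template, needs no recursion-depth headroom from the compiler. A stand-alone sketch of the mechanism (same macro shape, trivial payload):

    #include <cstdio>

    // Each level pastes its argument twice, so REPEAT256 emits 256 copies.
    #define REPEAT2(x) x; x;
    #define REPEAT4(x) REPEAT2(x) REPEAT2(x)
    #define REPEAT8(x) REPEAT4(x) REPEAT4(x)
    #define REPEAT16(x) REPEAT8(x) REPEAT8(x)
    #define REPEAT32(x) REPEAT16(x) REPEAT16(x)
    #define REPEAT64(x) REPEAT32(x) REPEAT32(x)
    #define REPEAT128(x) REPEAT64(x) REPEAT64(x)
    #define REPEAT256(x) REPEAT128(x) REPEAT128(x)

    int main()
    {
        int jumps = 0;
        REPEAT256(jumps++)                   // expands to 256 copies of "jumps++;"
        std::printf("jumps = %d\n", jumps);  // prints "jumps = 256"
        return 0;
    }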
@@ -187,14 +199,14 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size # ifndef TIME_EACH_STEP std::queue q_average; - auto fetch = [](uint32_t* t){return t[0]/(num_nodes-1);}; + auto fetch = [](uint32_t* t){return t[0]/(JUMPS);}; # else std::queue q_max; std::queue q_min; auto fetch_max = [](uint32_t* t) { uint32_t max = 0; - for (int i = 0; i < num_nodes-1; i++) + for (int i = 0; i < JUMPS; i++) { if (t[i] > max) max = t[i]; @@ -204,7 +216,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size auto fetch_min = [](uint32_t* t) { uint32_t min = ~0; - for (int i = 0; i < num_nodes-1; i++) + for (int i = 0; i < JUMPS; i++) { if (t[i] < min) min = t[i]; @@ -218,7 +230,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size { for (int i = LIMITS; i < num_devices; i++) { - uint32_t * timer_ptr = general_pointer_chase< List >(i, j, init_mode, buff_size, stride); + uint32_t * timer_ptr = general_pointer_chase< List >(i, j, init_mode, num_nodes, stride); // Store the desired values for each element of the matrix in queues # ifndef TIME_EACH_STEP @@ -231,7 +243,7 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t buff_size, size } else { - for (int n = 0; n < num_nodes-1; n++) + for (int n = 0; n < JUMPS; n++) { printf("[%s][device %d][device %d] %d\n", nid, j, i, timer_ptr[n]); } @@ -264,7 +276,7 @@ int main(int argc, char ** argv) // Set program defaults before parsing the command line args. int list_init_random = 0; size_t sparsity = 1; - size_t buff_size = num_nodes*sparsity; + size_t num_nodes = JUMPS; int multi_gpu = 0; int print_summary_only = 0; int clock = 0; @@ -275,6 +287,8 @@ int main(int argc, char ** argv) std::string str = argv[i]; if (str == "--help" || str == "-h") { + std::cout << "--nodes # : Number of nodes in the linked list. If no value is specified, it " << std::endl; + std::cout << " defaults to the number of node jumps set at compile time." << std::endl; std::cout << "--rand : Places the linked list nodes into the buffer in random order (i.e." << std::endl; std::cout << " consecutive list nodes are not consecutive in memory). If this option" << std::endl; std::cout << " is not used, the nodes are placed in sequential order." << std::endl; @@ -289,6 +303,10 @@ int main(int argc, char ** argv) std::cout << "--help (-h) : I guess you figured what this does already ;)" << std::endl; return 0; } + else if (str == "--nodes") + { + num_nodes = std::stoi((std::string)argv[++i]); + } else if (str == "--rand") { list_init_random = 1; @@ -296,7 +314,6 @@ int main(int argc, char ** argv) else if (str == "--sparsity") { sparsity = std::stoi((std::string)argv[++i]); - buff_size = num_nodes*sparsity; } else if (str == "--multi-gpu") { @@ -340,11 +357,11 @@ int main(int argc, char ** argv) { if (!multi_gpu) { - local_pointer_chase(num_devices, list_init_random, buff_size, sparsity, nid_name); + local_pointer_chase(num_devices, list_init_random, num_nodes, sparsity, nid_name); } else { - remote_pointer_chase(num_devices, list_init_random, buff_size, sparsity, nid_name, print_summary_only); + remote_pointer_chase(num_devices, list_init_random, num_nodes, sparsity, nid_name, print_summary_only); } } From e875d0f6f8d255ace762703739b1d5c02cb7d98c Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Mon, 1 Mar 2021 17:17:47 +0100 Subject: [PATCH 46/51] Remove single-jump timing routines --- .../gpu/pointer_chase/src/linked_list.hpp | 113 +++---------- .../gpu/pointer_chase/src/pointer_chase.cu | 148 ++++-------------- 2 files changed, 50 insertions(+), 211 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index 45af52ba64..4ca289d1be 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -86,116 +86,44 @@ __global__ void initialize_random_list(Node * buffer, int num_nodes, uint32_t *i prev = temp; } prev->next = buffer + indices[0]; - } -__global__ void simple_traverse(Node * __restrict__ buffer, uint32_t head_index) +__global__ void simple_traversal(Node * __restrict__ buffer) { - /* Simple list traverse - no timing is done here - * - buffer: where the list is - * - head_index: index in the buffer where the head of the list is - */ - - Node * head = &(buffer[head_index]); - Node * ptr = head; - while(ptr->next != head) + Node * ptr = buffer; + while(ptr->next != buffer) { ptr = ptr->next; } // Silly dep. to tell the compiler not to throw away this kernel. - if (ptr->next->next == head) + if (ptr->next->next == buffer) { printf("The impossible just happened\n"); } - } -#ifdef VOLATILE -# define __VOLATILE__ volatile -#else -# define __VOLATILE__ -#endif - -/* - * Timed list traversal. This implementation is recursive (because it's less code) so you have to - * watch out to not exceed the recursion limits. The functions are force-inlined, so the PTX code - * looks identical as if you were to unwrap the recursion manually. - * - * Depending on the compiler flags used, the timing can either measure each node jump, or the entire - * list traversal as a whole. - */ -__device__ __forceinline__ void next_node( __VOLATILE__ Node ** ptr, uint32_t * timer, Node ** ptrs, int & jump) +__global__ void timed_traversal(Node * __restrict__ buffer, size_t num_jumps, uint64_t * timer) { - /* - * Recursive function to traverse the list. - * - ptr: Pointer of a pointer to a node in the linked list. - * - timer: Array to store the timings of each individual node jump. - * Only used if this option is activated (-DTIME_EACH_STEP) - * - ptrs: Just used to have a data dependency to block ILP. - * - jump: Int to keep track of the number of jumps - */ - -# ifdef TIME_EACH_STEP - XClocks64 clocks; - clocks.start(); -# endif - (*ptr) = (*ptr)->next; -# ifdef TIME_EACH_STEP - *(ptrs+jump) = (*ptr); // Data dep. to prevent ILP. - *(timer+jump) = clocks.end(); // Time the jump -# endif - - jump++; -} - - -__global__ void timed_list_traversal(Node * __restrict__ buffer, uint32_t head_index, uint32_t * timer) -{ - /* Timed List traversal - we make a singly-linked list circular just to have a data dep. and - * cover from compiler optimisations. - */ - - // These are used to prevent ILP when timing each jump. 
- __shared__ uint32_t s_timer[JUMPS]; - __shared__ Node * ptrs[JUMPS]; - - // Create a pointer to iterate through the list - __VOLATILE__ Node * ptr = &(buffer[head_index]); - - // Node jump counter - int jump = 0; - -#ifndef TIME_EACH_STEP // start timer XClocks64 clocks; clocks.start(); -#endif // Traverse the list - REPEAT_JUMPS(next_node(&ptr, s_timer, ptrs, jump);) + while(num_jumps--) + { + buffer = buffer->next; + } -#ifndef TIME_EACH_STEP // end cycle count timer[0] = clocks.end(); -#else - for (uint32_t i = 0; i < JUMPS; i++) - { - timer[i] = s_timer[i]; - } - if (ptrs[1] == ptrs[0]) - { - printf("This is some data dependency that will never be executed."); - } -#endif // Just for the data dependency - the list is already circular. - if (ptr->next == nullptr) + if (buffer->next == nullptr) { - ptr->next = &(buffer[head_index]); + buffer->next = buffer; } - } @@ -219,16 +147,14 @@ struct List uint32_t num_nodes; Node * buffer = nullptr; - uint32_t head_index = 0; - uint32_t * timer = nullptr; - uint32_t * d_timer = nullptr; + uint64_t timer; + uint64_t * d_timer = nullptr; size_t stride; List(int n, size_t st) : num_nodes(n), stride(st) { // Allocate the buffers to store the timings measured in the kernel - timer = new uint32_t[JUMPS]; - XMalloc((void**)&d_timer, sizeof(uint32_t)*(JUMPS)); + XMalloc((void**)&d_timer, sizeof(uint64_t)); }; virtual ~List() @@ -277,7 +203,6 @@ struct List XMalloc((void**)&d_node_indices, sizeof(uint32_t)*num_nodes); XMemcpy(d_node_indices, node_indices, sizeof(uint32_t)*num_nodes, XMemcpyHostToDevice); initialize_random_list<<<1,1>>>(buffer, num_nodes, d_node_indices); - head_index = node_indices[0]; free(node_indices); XFree(d_node_indices); } @@ -290,21 +215,21 @@ struct List /* * Simple list traversal - NOT timed. */ - simple_traverse<<<1,1>>>(buffer, head_index); + simple_traversal<<<1,1>>>(buffer); XDeviceSynchronize(); } - void time_traversal() + void time_traversal(size_t num_jumps) { /* * Timed list traversal */ - timed_list_traversal<<<1,1>>>(buffer, head_index, d_timer); + timed_traversal<<<1,1>>>(buffer, num_jumps, d_timer); XDeviceSynchronize(); // Copy the timing data back to the host - XMemcpy(timer, d_timer, sizeof(uint32_t)*JUMPS, XMemcpyDeviceToHost); + XMemcpy(&timer, d_timer, sizeof(uint64_t), XMemcpyDeviceToHost); } }; @@ -352,5 +277,3 @@ struct HostList : public List XFreeHost(buffer); } }; - - diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index a1601b68e1..e2c0d41793 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -13,38 +13,20 @@ Times in clock cycles the time it takes to jump from one node to the next in a singly linked list. - The list can be initialized sequentially or with a random node ordering. This - can be controlled passing the command line argument "--rand". + The number of nodes in the list and the stride amongst nodes can be set with + "--nodes" and "--stride". - The stride and the full buffer size can be set with "--stride" and "--buffer", - both in number of nodes. + The the nodes can be placed in the list in either sequential or random order. + This can be controlled passing the command line argument "--rand". The nodes can be padded with an arbitrary size controlled by the NODE_PADDING macro (in Bytes). If the ALLOC_ON_HOST macro is defined, the list will be allocated in host pinned memory. 
Otherwise, the list is allocated in device memory. - - The links of the list can be made volatile defining the macro VOLATILE. - - By default, the code returns the aveage number of cycles per jump, but this can - be changed to return the cycle count on a per-jump basis by defining the flag - TIME_EACH_STEP. */ -#define REPEAT2(x) x; x; -#define REPEAT4(x) REPEAT2(x) REPEAT2(x) -#define REPEAT8(x) REPEAT4(x) REPEAT4(x) -#define REPEAT16(x) REPEAT8(x) REPEAT8(x) -#define REPEAT32(x) REPEAT16(x) REPEAT16(x) -#define REPEAT64(x) REPEAT32(x) REPEAT32(x) -#define REPEAT128(x) REPEAT64(x) REPEAT64(x) -#define REPEAT256(x) REPEAT128(x) REPEAT128(x) - -#define JUMPS 256 -#define REPEAT_JUMPS(x) REPEAT256(x) - #define NODE_PADDING 0 #ifndef HOSTNAME_SIZE @@ -66,13 +48,10 @@ using list_type = DeviceList; template < class List > -uint32_t * general_pointer_chase(int local_device, int remote_device, int init_mode, size_t num_nodes, size_t stride) +uint64_t general_pointer_chase(int local_device, int remote_device, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps) { /* * Driver to manage the whole allocation, list traversal, etc. - * It returns the array containing the timings. Note that these values will depend on whether the - * flag -DTIME_EACH_STEP was defined or not (see top of the file). - * * - local_device: ID of the device where the allocation of the list takes place * - remote_device: ID of the device doing the pointer chase. * - init_mode: see the List class. @@ -106,7 +85,7 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m l.traverse(); // Time the pointer chase - l.time_traversal(); + l.time_traversal(num_jumps); if (peer_access_set) XDeviceDisablePeerAccess(remote_device); @@ -120,26 +99,17 @@ uint32_t * general_pointer_chase(int local_device, int remote_device, int init_m template < class List > -void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, char * nid) +void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid) { /* * Specialised pointer chase on a single device. */ for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) { - uint32_t* timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride); + uint64_t timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps); // Print the timings of the pointer chase -# ifndef TIME_EACH_STEP - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer[0]/JUMPS); -# else - printf("[%s] Latency for each node jump (device %d):\n", nid, gpu_id); - for (uint32_t i = 0; i < JUMPS; i++) - { - printf("[%s][device %d] %d\n", nid, gpu_id, timer[i]); - } -# endif - delete [] timer; + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer/num_jumps); } } @@ -150,13 +120,13 @@ void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_ # define LIMITS 0 #endif -void print_device_table(int num_devices, std::queue q, const char * what, const char * nid) +void print_device_table(int num_devices, std::queue q, const char * nid) { /* * Print the data in a table format - useful when doing P2P list traversals. 
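 * Illustrative output shape (hypothetical two-device node, invented numbers;
 * the Totals column sums each row excluding the diagonal):
 *
 *   [nid01] Average memory latency (in clock cycles) with remote direct memory access
 *   [nid01] From \ To     GPU  0    GPU  1    Totals
 *   [nid01] GPU  0           350       900       900
 *   [nid01] GPU  1           910       360       910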
*/ - printf("[%s] %s memory latency (in clock cycles) with remote direct memory access\n", nid, what); + printf("[%s] Average memory latency (in clock cycles) with remote direct memory access\n", nid); printf("[%s] %10s", nid, "From \\ To "); for (int ds = 0; ds < num_devices; ds++) { @@ -189,7 +159,7 @@ void print_device_table(int num_devices, std::queue q, const char * wh template < class List > -void remote_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, char * nid, int summarize) +void remote_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid, int summarize) { /* * Specialised pointer chase to allocate the list in one device, and do the pointer chase from another device. @@ -197,77 +167,21 @@ void remote_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size * Otherwise, every single result will be printed out. */ -# ifndef TIME_EACH_STEP - std::queue q_average; - auto fetch = [](uint32_t* t){return t[0]/(JUMPS);}; -# else - std::queue q_max; - std::queue q_min; - auto fetch_max = [](uint32_t* t) - { - uint32_t max = 0; - for (int i = 0; i < JUMPS; i++) - { - if (t[i] > max) - max = t[i]; - } - return max; - }; - auto fetch_min = [](uint32_t* t) - { - uint32_t min = ~0; - for (int i = 0; i < JUMPS; i++) - { - if (t[i] < min) - min = t[i]; - } - return min; - }; -# endif + std::queue timings; // Do the latency measurements for (int j = 0; j < num_devices; j++) { for (int i = LIMITS; i < num_devices; i++) { - uint32_t * timer_ptr = general_pointer_chase< List >(i, j, init_mode, num_nodes, stride); + uint64_t total_cycles = general_pointer_chase< List >(i, j, init_mode, num_nodes, stride, num_jumps); // Store the desired values for each element of the matrix in queues -# ifndef TIME_EACH_STEP - q_average.push(fetch(timer_ptr)); -# else - if (summarize) - { - q_min.push(fetch_min(timer_ptr)); - q_max.push(fetch_max(timer_ptr)); - } - else - { - for (int n = 0; n < JUMPS; n++) - { - printf("[%s][device %d][device %d] %d\n", nid, j, i, timer_ptr[n]); - } - } -# endif - delete [] timer_ptr; + timings.push(total_cycles/num_jumps); } } - std::string what; -# ifndef TIME_EACH_STEP - what = "Average"; - print_device_table(num_devices, q_average, what.c_str(), nid); -# else - if (summarize) - { - what = "Min."; - print_device_table(num_devices, q_min, what.c_str(), nid); - printf("\n"); - what = "Max."; - print_device_table(num_devices, q_max, what.c_str(), nid); - } -# endif - + print_device_table(num_devices, timings, nid); } @@ -275,8 +189,9 @@ int main(int argc, char ** argv) { // Set program defaults before parsing the command line args. int list_init_random = 0; - size_t sparsity = 1; - size_t num_nodes = JUMPS; + size_t stride = 1; + size_t num_nodes = 1024; + size_t num_jumps = num_nodes; int multi_gpu = 0; int print_summary_only = 0; int clock = 0; @@ -287,14 +202,11 @@ int main(int argc, char ** argv) std::string str = argv[i]; if (str == "--help" || str == "-h") { - std::cout << "--nodes # : Number of nodes in the linked list. If no value is specified, it " << std::endl; - std::cout << " defaults to the number of node jumps set at compile time." << std::endl; - std::cout << "--rand : Places the linked list nodes into the buffer in random order (i.e." << std::endl; - std::cout << " consecutive list nodes are not consecutive in memory). If this option" << std::endl; - std::cout << " is not used, the nodes are placed in sequential order." 
<< std::endl; - std::cout << "--sparsity # : Controls the sparsity of the list nodes in the buffer. This sets the" << std::endl; - std::cout << " buffer size where the list is placed as sparsity*num_nodes. If the" << std::endl; - std::cout << " list is initialized in sequential order, this effectively sets the stride." << std::endl; + std::cout << "--nodes # : Number of nodes in the linked list. Default value is set to 1024." << std::endl; + std::cout << "--stride # : Distance (in number of nodes) between two consecutive nodes. Default is 1." << std::endl; + std::cout << " This effectively sets the buffer size where the list is allocated to nodes*stride." << std::endl; + std::cout << "--num-jumps #: Number of jumps during the timing of the list traversal. Default is 1024." << std::endl; + std::cout << "--rand : Place the nodes in the buffer in random order (default is sequential ordering)." << std::endl; std::cout << "--multi-gpu : Runs the pointer chase algo using all device-pair combinations." << std::endl; std::cout << " This measures the device-to-device memory latency." << std::endl; std::cout << "--summary : When timing each node jump individually and used alongside --multi-gpu, " << std::endl; @@ -311,9 +223,13 @@ int main(int argc, char ** argv) { list_init_random = 1; } - else if (str == "--sparsity") + else if (str == "--stride") + { + stride = std::stoi((std::string)argv[++i]); + } + else if (str == "--num-jumps") { - sparsity = std::stoi((std::string)argv[++i]); + num_jumps = std::stoi((std::string)argv[++i]); } else if (str == "--multi-gpu") { @@ -357,11 +273,11 @@ int main(int argc, char ** argv) { if (!multi_gpu) { - local_pointer_chase(num_devices, list_init_random, num_nodes, sparsity, nid_name); + local_pointer_chase(num_devices, list_init_random, num_nodes, stride, num_jumps, nid_name); } else { - remote_pointer_chase(num_devices, list_init_random, num_nodes, sparsity, nid_name, print_summary_only); + remote_pointer_chase(num_devices, list_init_random, num_nodes, stride, num_jumps, nid_name, print_summary_only); } } From aef694ad1f4d06d7f48156220d3e6a01322e4c25 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 2 Mar 2021 13:55:33 +0100 Subject: [PATCH 47/51] Cleanup source code and remove single-step tests --- .../gpu/pointer_chase/pointer_chase.py | 266 +----------------- .../gpu/pointer_chase/src/linked_list.hpp | 2 +- .../gpu/pointer_chase/src/pointer_chase.cu | 8 +- .../gpu/pointer_chase/src/run_full_chase.sh | 14 + 4 files changed, 22 insertions(+), 268 deletions(-) create mode 100644 cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 692ccb18b8..a108cc056d 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -138,8 +138,7 @@ def set_executable(self, CompileGpuPointerChase): @rfm.simple_test class GpuPointerChaseClockLatency(GpuPointerChaseDep): ''' - Check the clock latencies. This can be thought of the - measuring error. + Check the clock latencies. ''' def __init__(self): @@ -320,266 +319,3 @@ def average_P2P_latency(self): self.stdout, 1, int) ))/(self.num_gpus_per_node-1) ) - - -# -# PChase tests tracking the individual latencies of each node jump -# - - -@rfm.simple_test -class CompileGpuPointerChaseFine(CompileGpuPointerChase): - ''' - Compile the pChase code to time each node jump. 
- ''' - - def __init__(self): - super().__init__() - - @rfm.run_before('compile') - def set_cxxflags(self): - self.build_system.cxxflags += ['-DTIME_EACH_STEP'] - - -class GpuPointerChaseFineDep(GpuPointerChaseBase): - def __init__(self): - super().__init__() - self.depends_on('CompileGpuPointerChaseFine') - - @rfm.require_deps - def set_executable(self, CompileGpuPointerChaseFine): - self.executable = os.path.join( - CompileGpuPointerChaseFine().stagedir, 'pChase.x') - - @sn.sanity_function - def get_all_latencies(self, pattern): - return sn.extractall(pattern, self.stdout, 1, int) - - -class L1_filter: - def filter_out_L1_hits(self, threshold, all_latencies): - ''' - Return a list with the latencies that are above 20% threshold. - ''' - return list(filter(lambda x: x > 1.2*threshold, all_latencies)) - - -@rfm.simple_test -class GpuPointerChaseL1(GpuPointerChaseFineDep, L1_filter): - ''' - Pointer chase for all the devices present on each node. - The traversal is done with unit stride, checking the L1 latency, - L1 miss rate and average latency of an L1 miss. - ''' - - def __init__(self): - super().__init__() - self.valid_systems = Pchase.valid_systems - self.perf_patterns = { - 'L1_latency': self.max_L1_latency(), - 'L1_miss_rate': self.L1_miss_rate(), - 'L1_miss_latency': self.L1_miss_latency(), - } - - self.reference = { - 'dom:gpu': { - 'L1_latency': (148, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (74.6, None, 0.1, '%'), - 'L1_miss_latency': (407, None, 0.1, 'clock cycles'), - - }, - 'daint:gpu': { - 'L1_latency': (148, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (74.6, None, 0.1, '%'), - 'L1_miss_latency': (407, None, 0.1, 'clock cycles'), - - }, - 'tsa:cn': { - 'L1_latency': (38, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (240, None, 0.1, 'clock cycles'), - }, - 'ault:amda100': { - 'L1_latency': (42, None, 0.1, 'clock cycles'), - 'L1_misses': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (215, None, 0.1, 'clock cycles'), - }, - 'ault:amdv100': { - 'L1_latency': (39, None, 0.1, 'clock cycles'), - 'L1_misses': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (208, None, 0.1, 'clock cycles'), - }, - 'ault:amdvega': { - 'L1_latency': (164, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (23.8, None, 0.1, '%'), - 'L1_miss_latency': (840, None, 0.1, 'clock cycles'), - }, - } - - @staticmethod - def target_str(node, device): - return r'^\s*\[%s\]\[device %d\]\s*(\d+)' % (node, device) - - @sn.sanity_function - def max_L1_latency(self): - ''' - Max. L1 latency amongst all devices. - ''' - l1_latency = [] - for n in self.my_nodes: - for d in range(self.num_gpus_per_node): - l1_latency.append( - sn.min(self.get_all_latencies(self.target_str(n, d))) - ) - - # Return the data from the worst performing device - return sn.max(l1_latency) - - def get_L1_misses(self, n, d, all_latencies=None): - ''' - The idea here is to get the lowest value and model the L1 hits as - implemented in the self.filter_out_L1_hits function. Every - node jump returned by this function will be counted as an L1 miss. - ''' - if all_latencies is None: - all_latencies = self.get_all_latencies(self.target_str(n, d)) - - L1 = sn.min(all_latencies) - return self.filter_out_L1_hits(L1, all_latencies) - - @sn.sanity_function - def L1_miss_rate(self): - ''' - Calculate the rate of L1 misses based on the model implemented by the - get_L1_misses sanity function. Return the worst performing rate from - all nodes/devices. 
- ''' - l1_miss_rate = [] - for n in self.my_nodes: - for d in range(self.num_gpus_per_node): - all_lat = sn.evaluate( - self.get_all_latencies(self.target_str(n, d)) - ) - l1_miss_rate.append( - len(self.get_L1_misses(n, d, all_lat))/len(all_lat) - ) - - return max(l1_miss_rate)*100 - - @sn.sanity_function - def L1_miss_latency(self): - ''' - Count the average number of cycles taken only by the node jumps - with an L1 miss. Return the worst performing values for all - nodes/devices. - ''' - l1_miss_latency = [] - for n in self.my_nodes: - for d in range(self.num_gpus_per_node): - l1_miss_latency.append( - ceil(sn.evaluate(sn.avg(self.get_L1_misses(n, d)))) - ) - - return max(l1_miss_latency) - - -@rfm.simple_test -class GpuPointerChaseL1P2P(GpuPointerChaseFineDep, L1_filter): - ''' - Pointer chase through P2P, checking L1 miss rates and L1 miss - latency averaged amogst all devices in each node. - ''' - - def __init__(self): - super().__init__() - self.valid_systems = Pchase.multi_device - self.executable_opts = ['--multi-gpu'] - self.perf_patterns = { - 'L1_latency': self.max_L1_latency(), - 'L1_miss_rate': self.L1_miss_rate(), - 'L1_miss_latency': self.L1_miss_latency() - } - self.reference = { - 'tsa:cn': { - 'L1_latency': (38, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (1463, None, 0.1, 'clock cycles'), - }, - 'ault:amda100': { - 'L1_latency': (42, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (792, None, 0.1, 'clock cycles'), - }, - 'ault:amdv100': { - 'L1_latency': (39, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (25.4, None, 0.1, '%'), - 'L1_miss_latency': (2620, None, 0.1, 'clock cycles'), - }, - 'ault:amdvega': { - 'L1_latency': (164, None, 0.1, 'clock cycles'), - 'L1_miss_rate': (19.3, None, 0.1, '%'), - 'L1_miss_latency': (2200, None, 0.1, 'clock cycles'), - }, - } - - @staticmethod - def target_str(node, d1, d2): - return r'^\s*\[%s\]\[device %d\]\[device %d\]\s*(\d+)' % (node, d1, d2) - - @sn.sanity_function - def max_L1_latency(self): - ''' - Max. L1 latency amongst all devices. - ''' - l1_latency = [] - for n in self.my_nodes: - for d1 in range(self.num_gpus_per_node): - for d2 in range(self.num_gpus_per_node): - l1_latency.append( - sn.min(self.get_all_latencies( - self.target_str(n, d1, d2)) - ) - ) - - # Return the data from the worst performing device - return sn.max(l1_latency) - - @sn.sanity_function - def L1_miss_rate(self): - ''' - Calculates the L1 miss rate across P2P list traversals. 
- ''' - total_node_jumps = 0 - total_L1_misses = 0 - for n in self.my_nodes: - for d1 in range(self.num_gpus_per_node): - for d2 in range(self.num_gpus_per_node): - if(d1 != d2): - all_lat = sn.evaluate(self.get_all_latencies( - self.target_str(n, d1, d2) - )) - L1 = min(all_lat) - total_L1_misses += len( - self.filter_out_L1_hits(L1, all_lat) - ) - total_node_jumps += len(all_lat) - - return (total_L1_misses/total_node_jumps)*100 - - @sn.sanity_function - def L1_miss_latency(self): - ''' - Calculate the latency of all L1 misses across all P2P list traversals - ''' - L1_misses = [] - for n in self.my_nodes: - for d1 in range(self.num_gpus_per_node): - for d2 in range(self.num_gpus_per_node): - if (d1 != d2): - all_lat = sn.evaluate(self.get_all_latencies( - self.target_str(n, d1, d2) - )) - L1 = min(all_lat) - L1_misses += self.filter_out_L1_hits(L1, all_lat) - - return int(sn.evaluate(sn.avg(L1_misses))) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp index 4ca289d1be..88b5821de1 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/linked_list.hpp @@ -99,7 +99,7 @@ __global__ void simple_traversal(Node * __restrict__ buffer) // Silly dep. to tell the compiler not to throw away this kernel. if (ptr->next->next == buffer) { - printf("The impossible just happened\n"); + printf("The list has only 1 node\n"); } } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index 02510f0afc..0e99040df3 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -54,11 +54,15 @@ uint64_t general_pointer_chase(int local_device, int remote_device, int init_mod { /* * Driver to manage the whole allocation, list traversal, etc. + * Before any timings are done, this function traverses the full list. This "fills up" the device + * caches and removes any spurious latencies on the first few node jumps. This means that there is no + * need to even traverse the full list when performing the timed traversal. * - local_device: ID of the device where the allocation of the list takes place * - remote_device: ID of the device doing the pointer chase. * - init_mode: see the List class. * - num_nodes: nodes in the liked list. * - stride: Gap (in nodes) between two consecutive nodes. This only applies if init_mode is 0. + * - num_jumps: Number of node jumps to carry out on the timed traversal. 
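 * The return value is the total cycle count over num_jumps jumps; callers
 * divide by num_jumps to report a per-jump average. Rough scale check with
 * hypothetical numbers: 400000 jumps at ~500 cycles each is 2e8 cycles,
 * i.e. about 0.15 s of kernel time at a ~1.3 GHz SM clock, long enough to
 * swamp launch overheads and warm-up noise.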
*/ XSetDevice(remote_device); @@ -108,10 +112,10 @@ void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_ */ for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) { - uint64_t timer = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps); + uint64_t total_cycles = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps); // Print the timings of the pointer chase - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, timer/num_jumps); + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, total_cycles/num_jumps); } } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh new file mode 100644 index 0000000000..8af748a825 --- /dev/null +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +mkdir -p data + +for stride in 1 2 4 8 16 32 +do + echo "Running chase with stride of ${stride}" + n=8 + while [ $n -le 268435456 ] # Up to 2GB worth of list (8 Bytes per node) + do + srun -n 1 ./pChase.x --stride $stride --num-jumps 20000 --nodes $n > data/out_${n}_${stride}.dat + n=$(( $n * 2 )) + done +done From 6da67850d0d921b281133c5d6cbf1e327e210d07 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 2 Mar 2021 19:32:11 +0100 Subject: [PATCH 48/51] Add memory latency tests --- .../gpu/pointer_chase/pointer_chase.py | 367 ++++++++++-------- .../gpu/pointer_chase/src/makefile.hip | 4 +- .../gpu/pointer_chase/src/pointer_chase.cu | 16 +- .../gpu/pointer_chase/src/run_full_chase.sh | 2 +- 4 files changed, 207 insertions(+), 182 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index a108cc056d..e4636fdfaa 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -7,31 +7,32 @@ import reframe as rfm import os -from math import ceil -class Pchase: +class PchaseGlobal(rfm.RegressionMixin): + '''Handy class to store common test settings. ''' - Public storage class to avoid writing the parameters below multiple times. 
- ''' - single_device = ['daint:gpu', 'dom:gpu'] - multi_device = ['ault:intelv100', 'ault:amdv100', - 'ault:amda100', 'ault:amdvega', - 'tsa:cn'] - valid_systems = single_device+multi_device - valid_prog_environs = ['PrgEnv-gnu'] - - -# -# PChase tests tracking the averaged latencies for all node jumps -# + single_device_systems = variable( + list, + value=['daint:gpu', 'dom:gpu'] + ) + multi_device_systems = variable( + list, + value=[ + 'ault:intelv100', 'ault:amdv100', + 'ault:amda100', 'ault:amdvega', 'tsa:cn' + ] + ) + global_prog_environs = variable(list, value=['PrgEnv-gnu']) @rfm.simple_test -class CompileGpuPointerChase(rfm.CompileOnlyRegressionTest): +class CompileGpuPointerChase(rfm.CompileOnlyRegressionTest, PchaseGlobal): def __init__(self): - self.valid_systems = Pchase.valid_systems - self.valid_prog_environs = Pchase.valid_prog_environs + self.valid_systems = ( + self.single_device_systems + self.multi_device_systems + ) + self.valid_prog_environs = self.global_prog_environs self.exclusive_access = True self.build_system = 'Make' self.num_tasks = 0 @@ -82,9 +83,30 @@ def set_gpu_arch(self): self.modules += ['rocm'] -class GpuPointerChaseBase(rfm.RunOnlyRegressionTest): +class GpuPointerChaseBase(rfm.RunOnlyRegressionTest, PchaseGlobal): + '''Base RunOnly class. + + This runs the pointer chase algo on the linked list from the code compiled + in the executable from the test above. The list is fully customisable + through the command line, so the number of nodes, and the stride size for + each jump will determine where the memory hits occur. This stride is set to + 32 node lengths (a node is 8 Bytes) to ensure that there is only a single + node per cache line. The number of node jumps is set relatively large to + ensure that the background effects are averaged out. + + Derived tests MUST set the number of list nodes. + ''' + num_list_nodes = variable(int) + + # Use a large stride to ensure there's only a single node per cache line + stride = variable(int, value=32) + + # Set a large number of node jumps to smooth out spurious effects + num_node_jumps = variable(int, value=400000) + def __init__(self): - self.valid_prog_environs = Pchase.valid_prog_environs + self.depends_on('CompileGpuPointerChase') + self.valid_prog_environs = self.global_prog_environs self.num_tasks = 0 self.num_tasks_per_node = 1 self.exclusive_access = True @@ -92,6 +114,19 @@ def __init__(self): self.maintainers = ['JO'] self.tags = {'benchmark'} + @rfm.require_deps + def set_executable(self, CompileGpuPointerChase): + self.executable = os.path.join( + CompileGpuPointerChase().stagedir, 'pChase.x') + + @rfm.run_before('run') + def set_exec_opts(self): + self.executable_opts += [ + f'--stride {self.stride}', + f'--nodes {self.num_list_nodes}', + f'--num-jumps {self.num_node_jumps}' + ] + @rfm.run_before('run') def set_num_gpus_per_node(self): cp = self.current_partition.fullname @@ -108,7 +143,6 @@ def set_num_gpus_per_node(self): @sn.sanity_function def do_sanity_check(self): - # Check that every node has the right number of GPUs # Store this nodes in case they're used later by the perf functions. 
self.my_nodes = set(sn.extractall( @@ -124,198 +158,189 @@ def do_sanity_check(self): sn.assert_eq(self.job.num_tasks, nodes_at_end))) -class GpuPointerChaseDep(GpuPointerChaseBase): +class GpuPointerChaseSingle(GpuPointerChaseBase): + '''Base class for the single-GPU latency tests.''' def __init__(self): super().__init__() - self.depends_on('CompileGpuPointerChase') - - @rfm.require_deps - def set_executable(self, CompileGpuPointerChase): - self.executable = os.path.join( - CompileGpuPointerChase().stagedir, 'pChase.x') - + self.valid_systems = ( + self.single_device_systems + self.multi_device_systems + ) + self.perf_patterns = { + 'average_latency': sn.max(sn.extractall( + r'^\s*\[[^\]]*\]\s* On device \d+, ' + r'the chase took on average (\d+) ' + r'cycles per node jump.', self.stdout, 1, int) + ), + } @rfm.simple_test -class GpuPointerChaseClockLatency(GpuPointerChaseDep): - ''' - Check the clock latencies. +class GpuL1Latency(GpuPointerChaseSingle): + '''Measure L1 latency. + + The linked list fits in L1. The stride is set pretty large, but that does + not matter for this case since everything is in L1. ''' + num_list_nodes = 16 def __init__(self): super().__init__() - self.valid_systems = Pchase.valid_systems - self.executable_opts = ['--clock'] - self.perf_patterns = { - 'clock_latency': sn.max(sn.extractall( - r'^\s*\[[^\]]*\]\s*The clock latency on device \d+ ' - r'is (\d+) cycles.', self.stdout, 1, int) - ), - } - self.reference = { - 'daint:gpu': { - 'clock_latency': (56, None, 0.1, 'cycles'), - }, 'dom:gpu': { - 'clock_latency': (56, None, 0.1, 'cycles'), + 'average_latency': (103, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average_latency': (103, None, 0.1, 'clock cycles') }, 'tsa:cn': { - 'clock_latency': (8, None, 0.1, 'cycles'), + 'average_latency': (28, None, 0.1, 'clock cycles') }, 'ault:amda100': { - 'clock_latency': (7, None, 0.1, 'cycles'), + 'average_latency': (33, None, 0.1, 'clock cycles') }, 'ault:amdv100': { - 'clock_latency': (8, None, 0.1, 'cycles'), + 'average_latency': (28, None, 0.1, 'clock cycles') }, - 'ault:amdvega': { - 'clock_latency': (40, None, 0.1, 'cycles'), + 'ault:amdvega': { + 'average_latency': (140, None, 0.1, 'clock cycles') }, } -@rfm.parameterized_test([1], [2], [4], [4096]) -class GpuPointerChaseSingle(GpuPointerChaseDep): - ''' - Pointer chase on a single device with increasing stride. +@rfm.simple_test +class GpuL2Latency(GpuPointerChaseSingle): + '''Measure the L2 latency. + + The linked list is larger than L1, but it fits in L2. The stride is set + to be larger than L1's cache line to avoid any hits in L1. 
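    For scale: with 8 B nodes and the default 32-node stride, consecutive
    nodes sit 256 B apart, so 5000 nodes span about 1.3 MB, larger than any
    L1 on these devices yet comfortably inside their L2 caches.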
''' + num_list_nodes = 5000 - def __init__(self, stride): + def __init__(self): super().__init__() - self.valid_systems = Pchase.valid_systems - self.executable_opts = ['--sparsity', f'{stride}'] - self.perf_patterns = { - 'average_latency': sn.max(sn.extractall( - r'^\s*\[[^\]]*\]\s* On device \d+, ' - r'the chase took on average (\d+) ' - r'cycles per node jump.', self.stdout, 1, int) - ), + self.reference = { + 'dom:gpu': { + 'average_latency': (290, None, 0.1, 'clock cycles') + }, + 'daint:gpu': { + 'average_latency': (258, None, 0.1, 'clock cycles') + }, + 'tsa:cn': { + 'average_latency': (215, None, 0.1, 'clock cycles') + }, + 'ault:amda100': { + 'average_latency': (204, None, 0.1, 'clock cycles') + }, + 'ault:amdv100': { + 'average_latency': (215, None, 0.1, 'clock cycles') + }, + 'ault:amdvega': { + 'average_latency': (290, None, 0.1, 'clock cycles') + }, } - if stride == 1: - self.reference = { - 'tsa:cn': { - 'average_latency': (80, None, 0.1, 'clock cycles') - }, - 'ault:amda100': { - 'average_latency': (76, None, 0.1, 'clock cycles') - }, - 'ault:amdv100': { - 'average_latency': (77, None, 0.1, 'clock cycles') - }, - 'dom:gpu': { - 'average_latency': (143, None, 0.1, 'clock cycles') - }, - 'daint:gpu': { - 'average_latency': (143, None, 0.1, 'clock cycles') - }, - 'ault:amdvega': { - 'average_latency': (225, None, 0.1, 'clock cycles') - }, - } - elif stride == 2: - self.reference = { - 'tsa:cn': { - 'average_latency': (120, None, 0.1, 'clock cycles') - }, - 'ault:amda100': { - 'average_latency': (116, None, 0.1, 'clock cycles') - }, - 'ault:amdv100': { - 'average_latency': (118, None, 0.1, 'clock cycles') - }, - 'dom:gpu': { - 'average_latency': (181, None, 0.1, 'clock cycles') - }, - 'daint:gpu': { - 'average_latency': (181, None, 0.1, 'clock cycles') - }, - 'ault:amdvega': { - 'average_latency': (300, None, 0.1, 'clock cycles') - }, - } - elif stride == 4: - self.reference = { - 'tsa:cn': { - 'average_latency': (204, None, 0.1, 'clock cycles') - }, - 'ault:amda100': { - 'average_latency': (198, None, 0.1, 'clock cycles') - }, - 'ault:amdv100': { - 'average_latency': (204, None, 0.1, 'clock cycles') - }, - 'dom:gpu': { - 'average_latency': (260, None, 0.1, 'clock cycles') - }, - 'daint:gpu': { - 'average_latency': (260, None, 0.1, 'clock cycles') - }, - 'ault:amdvega': { - 'average_latency': (470, None, 0.1, 'clock cycles') - }, - } - elif stride == 4096: - self.reference = { - 'tsa:cn': { - 'average_latency': (220, None, 0.1, 'clock cycles') - }, - 'ault:amda100': { - 'average_latency': (206, None, 0.1, 'clock cycles') - }, - 'ault:amdv100': { - 'average_latency': (220, None, 0.1, 'clock cycles') - }, - 'dom:gpu': { - 'average_latency': (260, None, 0.1, 'clock cycles') - }, - 'daint:gpu': { - 'average_latency': (260, None, 0.1, 'clock cycles') - }, - 'ault:amdvega': { - 'average_latency': (800, None, 0.1, 'clock cycles') - }, - } - - @rfm.simple_test -class GpuPointerChaseAverageP2PLatency(GpuPointerChaseDep): - ''' - Average inter-node P2P latency. +class GpuDRAMLatency(GpuPointerChaseSingle): + '''Measure the DRAM latency. + + The linked list is large enough to fill the last cache level. Also, the + stride during the traversal must me large enough that there are no + cache hits at all. 
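    For scale: 2000000 nodes at the default 32-node stride occupy a buffer
    of roughly 512 MB (2000000 x 32 x 8 B), an order of magnitude beyond
    the largest last-level cache on these devices, so steady-state jumps
    are served from DRAM.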
     '''
+    num_list_nodes = 2000000
 
     def __init__(self):
         super().__init__()
-        self.valid_systems = Pchase.multi_device
-        self.executable_opts = ['--multi-gpu']
-        self.perf_patterns = {
-            'average_latency': self.average_P2P_latency(),
-        }
-
         self.reference = {
+            'dom:gpu': {
+                'average_latency': (506, None, 0.1, 'clock cycles')
+            },
+            'daint:gpu': {
+                'average_latency': (506, None, 0.1, 'clock cycles')
+            },
+            'tsa:cn': {
+                'average_latency': (425, None, 0.1, 'clock cycles')
+            },
             'ault:amda100': {
-                'average_latency': (223, None, 0.1, 'clock cycles')
+                'average_latency': (560, None, 0.1, 'clock cycles')
             },
             'ault:amdv100': {
-                'average_latency': (611, None, 0.1, 'clock cycles')
+                'average_latency': (425, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega': {
-                'average_latency': (336, None, 0.1, 'clock cycles')
-            },
-            'tsa:cn': {
-                'average_latency': (394, None, 0.1, 'clock cycles')
+            'ault:amdvega':  {
+                'average_latency': (625, None, 0.1, 'clock cycles')
             },
         }
 
+
+class GpuP2PLatency(GpuPointerChaseBase):
+    '''List traversal is done from a remote GPU.'''
+    num_list_nodes = required
+
+    def __init__(self):
+        super().__init__()
+        self.valid_systems = self.multi_device_systems
+        self.executable_opts += ['--multi-gpu']
+        self.perf_patterns = {
+            'average_latency': self.average_P2P_latency(),
+        }
+
     @sn.sanity_function
     def average_P2P_latency(self):
         '''
         Extract the average P2P latency. Note that the pChase code
         returns a table with the cumulative latency for all P2P
-        list traversals.
+        list traversals, and the last column of this table has the max
+        values for each device.
         '''
-        return int(sn.evaluate(sn.max(sn.extractall(
-            r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+',
-            self.stdout, 1, int)
-        ))/(self.num_gpus_per_node-1)
-        )
+        return int(sn.evaluate(
+            sn.max(sn.extractall(
+                r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+',
+                self.stdout, 1, int)
+            )
+        ))
+
+
+@rfm.simple_test
+class GpuL2LatencyP2P(GpuP2PLatency):
+    '''The traversal is cached on the remote device's L2.'''
+    num_list_nodes = 5000
+
+    def __init__(self):
+        super().__init__()
+        self.reference = {
+            'tsa:cn': {
+                'average_latency': (425, None, 0.1, 'clock cycles')
+            },
+            'ault:amda100': {
+                'average_latency': (760, None, 0.1, 'clock cycles')
+            },
+            'ault:amdv100': {
+                'average_latency': (760, None, 0.1, 'clock cycles')
+            },
+            'ault:amdvega':  {
+                'average_latency': (315, None, 0.1, 'clock cycles')
+            },
+        }
+
+
+@rfm.simple_test
+class GpuDRAMLatencyP2P(GpuP2PLatency):
+    '''Measure the latency with remote access to DRAM.'''
+    num_list_nodes = 2000000
+
+    def __init__(self):
+        super().__init__()
+        self.reference = {
+            'tsa:cn': {
+                'average_latency': (425, None, 0.1, 'clock cycles')
+            },
+            'ault:amda100': {
+                'average_latency': (1120, None, 0.1, 'clock cycles')
+            },
+            'ault:amdv100': {
+                'average_latency': (760, None, 0.1, 'clock cycles')
+            },
+            'ault:amdvega':  {
+                'average_latency': (3550, None, 0.1, 'clock cycles')
+            },
+        }
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip
index f10578c035..bd7b7270b5 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/makefile.hip
@@ -1,5 +1,5 @@
-RSMI_ROOT?=/opt/rocm-3.9.0/rocm_smi
-CXXFLAGS?=--amdgpu-target=gfx906,gfx908
+RSMI_ROOT?=/opt/rocm/rocm_smi
+CXXFLAGS?=--amdgpu-target=gfx906,gfx908,gfx900
 
 pointerChase:
 	hipcc -o pChase.x -O3 pointer_chase.cu -DTARGET_HIP ${CXXFLAGS} -std=c++11 -lnuma -I${RSMI_ROOT}/include -L${RSMI_ROOT}/lib -lrocm_smi64
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
index 0e99040df3..f649e9356f 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu
@@ -137,12 +137,12 @@ void print_device_table(int num_devices, std::queue<uint32_t> q, const char * ni
   for (int ds = 0; ds < num_devices; ds++)
   {
     printf("%4sGPU %2d", "", ds);
-  } printf("%10s\n", "Totals");
+  } printf("%10s\n", "Max");
 
   for (int j = 0; j < num_devices; j++)
   {
-    // Track the sum of the latencies
-    uint32_t totals = 0;
+    // Track the max latency
+    uint32_t max_latency = 0;
     printf("[%s] GPU %2d%4s", nid, j, " ");
     for (int i = 0; i < LIMITS; i++)
@@ -152,14 +152,14 @@ void print_device_table(int num_devices, std::queue<uint32_t> q, const char * ni
     for (int i = LIMITS; i < num_devices; i++)
     {
-      uint32_t timer = q.front();
+      uint32_t cycles = q.front();
       q.pop();
-      if (i != j)
+      if (cycles > max_latency)
       {
-        totals += timer;
+        max_latency = cycles;
       }
-      printf("%10d", timer);
-    } printf("%10d\n", totals);
+      printf("%10d", cycles);
+    } printf("%10d\n", max_latency);
   }
 }
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh
index 8af748a825..85977ad9d7 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/run_full_chase.sh
@@ -8,7 +8,7 @@ do
   n=8
   while [ $n -le 268435456 ]  # Up to 2GB worth of list (8 Bytes per node)
   do
-    srun -n 1 ./pChase.x --stride $stride --num-jumps 20000 --nodes $n > data/out_${n}_${stride}.dat
+    srun -n 1 ./pChase.x --stride $stride --num-jumps 400000 --nodes $n > data/out_${n}_${stride}.dat
     n=$(( $n * 2 ))
   done
 done
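Note: the `reference` tuples used throughout the patch above follow ReFrame's `(target, lower_thres, upper_thres, unit)` convention, where the thresholds are fractional deviations from the target and `None` means unbounded. A minimal sketch of the implied check (the helper and the sample values are illustrative, not part of the suite):

    # (103, None, 0.1, 'clock cycles') -> no lower bound, at most +10%.
    # In ReFrame a lower threshold would be given as a negative fraction.
    def within_reference(measured, target, lower_frac, upper_frac):
        lo = None if lower_frac is None else target * (1 + lower_frac)
        hi = None if upper_frac is None else target * (1 + upper_frac)
        return (lo is None or measured >= lo) and (hi is None or measured <= hi)

    assert within_reference(110, 103, None, 0.1)       # 110 <= 113.3
    assert not within_reference(120, 103, None, 0.1)   # 120 >  113.3

The `Max` column that `print_device_table` now emits is also what `average_P2P_latency` ends up reading: since the regex group `(\s*\d+.\s+)+` is repeated, only its last repetition (the rightmost column, i.e. the maximum) is captured. A rough sketch against a made-up output line:

    import re

    line = '[nid02357] GPU  0       389       412       425\n'  # hypothetical
    m = re.search(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+', line)
    print(int(m.group(1)))  # -> 425, the Max column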
From 9c429fb56e90cf5119fde6ea32694ab9d459d7a2 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Tue, 2 Mar 2021 20:09:37 +0100
Subject: [PATCH 49/51] Update refs for tsa

---
 .../gpu/pointer_chase/pointer_chase.py        | 75 +++++++++----------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index e4636fdfaa..ac8bed6c1f 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -301,46 +301,45 @@ def average_P2P_latency(self):
 
 @rfm.simple_test
-class GpuL2LatencyP2P(GpuP2PLatency):
-    '''The traversal is cached on the remote device's L2.'''
-    num_list_nodes = 5000
-
-    def __init__(self):
-        super().__init__()
-        self.reference = {
-            'tsa:cn': {
-                'average_latency': (425, None, 0.1, 'clock cycles')
-            },
-            'ault:amda100': {
-                'average_latency': (760, None, 0.1, 'clock cycles')
-            },
-            'ault:amdv100': {
-                'average_latency': (760, None, 0.1, 'clock cycles')
-            },
-            'ault:amdvega':  {
-                'average_latency': (315, None, 0.1, 'clock cycles')
-            },
-        }
-
-
-@rfm.simple_test
-class GpuDRAMLatencyP2P(GpuP2PLatency):
-    '''Measure the latency with remote access to DRAM.'''
+class GpuP2PLatencyP2P(GpuP2PLatency):
+    '''Measure the latency to a remote device.
+
+    Depending on the list size, the data might be cached in different places.
+    A list_size of 2000000 will place the list on the DRAM of the remote device.
+    '''
+    list_size = parameter([5000, 2000000])
     num_list_nodes = 2000000
 
     def __init__(self):
         super().__init__()
-        self.reference = {
-            'tsa:cn': {
-                'average_latency': (425, None, 0.1, 'clock cycles')
-            },
-            'ault:amda100': {
-                'average_latency': (1120, None, 0.1, 'clock cycles')
-            },
-            'ault:amdv100': {
-                'average_latency': (760, None, 0.1, 'clock cycles')
-            },
-            'ault:amdvega':  {
-                'average_latency': (3550, None, 0.1, 'clock cycles')
-            },
-        }
+        self.num_list_nodes = self.list_size
+        if self.list_size == 5000:
+            self.reference = {
+                'tsa:cn': {
+                    'average_latency': (2981, None, 0.1, 'clock cycles')
+                },
+                'ault:amda100': {
+                    'average_latency': (760, None, 0.1, 'clock cycles')
+                },
+                'ault:amdv100': {
+                    'average_latency': (760, None, 0.1, 'clock cycles')
+                },
+                'ault:amdvega':  {
+                    'average_latency': (315, None, 0.1, 'clock cycles')
+                },
+            }
+        elif self.list_size == 2000000:
+            self.reference = {
+                'tsa:cn': {
+                    'average_latency': (3219, None, 0.1, 'clock cycles')
+                },
+                'ault:amda100': {
+                    'average_latency': (1120, None, 0.1, 'clock cycles')
+                },
+                'ault:amdv100': {
+                    'average_latency': (760, None, 0.1, 'clock cycles')
+                },
+                'ault:amdvega':  {
+                    'average_latency': (3550, None, 0.1, 'clock cycles')
+                },
+            }
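Note: `list_size = parameter([5000, 2000000])` uses ReFrame's built-in test parameterization; assuming the usual ReFrame 3.x semantics, the framework generates one concrete test per value and `self.list_size` resolves to that value in each variant, which is why a single class can now carry both the L2 and the DRAM references. A sketch of the pattern (the class name and body are illustrative):

    import reframe as rfm

    class MyP2PChase(rfm.RegressionTest):
        # Two test variants are generated, one per value.
        list_size = parameter([5000, 2000000])

        def __init__(self):
            # Pick per-variant settings, as GpuP2PLatencyP2P does above.
            self.num_list_nodes = self.list_size

Both variants then show up as separate test cases when listing the checks, e.g. with `reframe -l`.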
From 9cfd0009b21f364505a9463b18d1d4c73a24a73a Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez"
Date: Thu, 4 Mar 2021 14:49:57 +0100
Subject: [PATCH 50/51] Address PR comments

---
 .../microbenchmarks/gpu/dgemm/dgmemm.py       | 13 ++++-------
 .../gpu/pointer_chase/pointer_chase.py        | 22 +++++++++++--------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
index 111bfe604d..15667959da 100644
--- a/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
+++ b/cscs-checks/microbenchmarks/gpu/dgemm/dgmemm.py
@@ -45,21 +45,16 @@ def __init__(self):
             }
         }
 
-        self.maintainers = ['JO']
+        self.maintainers = ['JO', 'SK']
         self.tags = {'benchmark'}
 
-    @property
-    @sn.sanity_function
-    def num_tasks_assigned(self):
-        return self.job.num_tasks
-
     @sn.sanity_function
     def assert_num_gpus(self):
         return sn.assert_eq(
             sn.count(sn.findall(r'^\s*\[[^\]]*\]\s*Test passed', self.stdout)),
-            self.num_tasks_assigned)
+            sn.getattr(self.job, 'num_tasks'))
 
-    @rfm.run_after('setup')
+    @rfm.run_before('compile')
     def select_makefile(self):
         cp = self.current_partition.fullname
         if cp == 'ault:amdvega':
@@ -67,7 +62,7 @@ def select_makefile(self):
         else:
             self.build_system.makefile = 'makefile.cuda'
 
-    @rfm.run_after('setup')
+    @rfm.run_before('compile')
     def set_gpu_arch(self):
         cp = self.current_partition.fullname
diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
index ac8bed6c1f..468dd679d4 100644
--- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
+++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import reframe.utility.sanity as sn
+import reframe.utility.typecheck as typ
 import reframe as rfm
 import os
 
@@ -13,11 +14,11 @@ class PchaseGlobal(rfm.RegressionMixin):
     '''Handy class to store common test settings.
     '''
     single_device_systems = variable(
-        list,
+        typ.List[str],
         value=['daint:gpu', 'dom:gpu']
     )
     multi_device_systems = variable(
-        list,
+        typ.List[str],
         value=[
             'ault:intelv100', 'ault:amdv100', 'ault:amda100', 'ault:amdvega',
             'tsa:cn'
@@ -39,7 +40,7 @@ def __init__(self):
         self.num_tasks_per_node = 1
         self.postbuild_cmds = ['ls .']
         self.sanity_patterns = sn.assert_found(r'pChase.x', self.stdout)
-        self.maintainers = ['JO']
+        self.maintainers = ['JO', 'SK']
         self.tags = {'benchmark'}
 
     @rfm.run_after('setup')
@@ -111,7 +112,7 @@ def __init__(self):
         self.num_tasks_per_node = 1
         self.exclusive_access = True
         self.sanity_patterns = self.do_sanity_check()
-        self.maintainers = ['JO']
+        self.maintainers = ['JO', 'SK']
         self.tags = {'benchmark'}
 
     @rfm.require_deps
@@ -160,6 +161,7 @@ def do_sanity_check(self):
 
 class GpuPointerChaseSingle(GpuPointerChaseBase):
     '''Base class for the single-GPU latency tests.'''
+
     def __init__(self):
         super().__init__()
         self.valid_systems = (
@@ -173,6 +175,7 @@ def __init__(self):
             ),
         }
 
+
 @rfm.simple_test
 class GpuL1Latency(GpuPointerChaseSingle):
     '''Measure L1 latency.
@@ -200,7 +203,7 @@ def __init__(self):
             'ault:amdv100': {
                 'average_latency': (28, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega':  {
+            'ault:amdvega': {
                 'average_latency': (140, None, 0.1, 'clock cycles')
             },
         }
@@ -233,11 +236,12 @@ def __init__(self):
             'ault:amdv100': {
                 'average_latency': (215, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega':  {
+            'ault:amdvega': {
                 'average_latency': (290, None, 0.1, 'clock cycles')
             },
         }
 
+
 @rfm.simple_test
 class GpuDRAMLatency(GpuPointerChaseSingle):
     '''Measure the DRAM latency.
@@ -266,7 +270,7 @@ def __init__(self):
             'ault:amdv100': {
                 'average_latency': (425, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega':  {
+            'ault:amdvega': {
                 'average_latency': (625, None, 0.1, 'clock cycles')
             },
         }
@@ -324,7 +328,7 @@ def __init__(self):
             'ault:amdv100': {
                 'average_latency': (760, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega':  {
+            'ault:amdvega': {
                 'average_latency': (315, None, 0.1, 'clock cycles')
             },
         }
@@ -339,7 +343,7 @@ def __init__(self):
             'ault:amdv100': {
                 'average_latency': (760, None, 0.1, 'clock cycles')
             },
-            'ault:amdvega':  {
+            'ault:amdvega': {
                 'average_latency': (3550, None, 0.1, 'clock cycles')
             },
         }
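Note: replacing `list` with `typ.List[str]` in the `variable()` declarations tightens the accepted values; assuming the documented `reframe.utility.typecheck` semantics, the check is element-wise and the declared type also supports `isinstance` directly. A small sketch:

    import reframe.utility.typecheck as typ

    # typ.List[str] accepts only lists whose elements are all strings.
    assert isinstance(['daint:gpu', 'dom:gpu'], typ.List[str])
    assert not isinstance([42, 'dom:gpu'], typ.List[str])

A plain `list` annotation would have accepted the second value as well.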
Otero Perez" Date: Thu, 4 Mar 2021 16:04:41 +0100 Subject: [PATCH 51/51] Do chase simultaneously in all devices --- .../gpu/pointer_chase/pointer_chase.py | 4 ++- .../gpu/pointer_chase/src/pointer_chase.cu | 32 ++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py index 468dd679d4..1b8834b521 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py @@ -344,6 +344,8 @@ def __init__(self): 'average_latency': (760, None, 0.1, 'clock cycles') }, 'ault:amdvega': { - 'average_latency': (3550, None, 0.1, 'clock cycles') + 'average_latency': ( + 3550, None, 0.1, 'clock cycles' + ) }, } diff --git a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu index f649e9356f..8f7c2680d3 100644 --- a/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu +++ b/cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu @@ -7,6 +7,10 @@ #include #include #include +#include +#include +#include +#include /* ~~ GPU Linked list pointer chase algorithm ~~ @@ -103,6 +107,21 @@ uint64_t general_pointer_chase(int local_device, int remote_device, int init_mod return l.timer; } +std::mutex mtx; +template < class L > +void loc_ptr_ch(int gpu_id, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid) +{ + /* + * Low-level thread-safe local pointer chase function. + */ + uint64_t total_cycles = general_pointer_chase< L >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps); + + // Print the timings of the pointer chase + { + std::lock_guard lg(mtx); + printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, total_cycles/num_jumps); + } +} template < class List > void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid) @@ -110,13 +129,18 @@ void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_ /* * Specialised pointer chase on a single device. */ + std::vector threads; for (int gpu_id = 0; gpu_id < num_devices; gpu_id++) { - uint64_t total_cycles = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps); - - // Print the timings of the pointer chase - printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, total_cycles/num_jumps); + threads.push_back(std::thread(loc_ptr_ch, + gpu_id, init_mode, + num_nodes, stride, num_jumps, nid + ) + ); } + + // Join all threads + std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join)); }