From 423be682bf6a9b05802df1fcfa2a3e7ee49df5d8 Mon Sep 17 00:00:00 2001
From: Evgeny Fiksman <efiksman@fb.com>
Date: Tue, 10 Dec 2019 07:56:30 -0800
Subject: [PATCH] Add additional execution arguments to the benchmark (#207)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/207

Add initial benchmark command-line arguments to avoid rebuilding when benchmarking various use cases

Reviewed By: jianyuh

Differential Revision: D18863198

fbshipit-source-id: 32452a125e86b857f5cf96f41bc321d8d5a85eb9
---
 bench/BenchUtils.cc    |  34 ++++++++
 bench/BenchUtils.h     |  13 +++
 bench/FP16Benchmark.cc | 186 ++++++++++++++++++++---------------------
 3 files changed, 138 insertions(+), 95 deletions(-)
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index 694c0d109d..89f93a1f61 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <random>
 #include <type_traits>
+#include <string.h>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -89,4 +90,37 @@ int fbgemm_get_thread_num() {
 #endif
 }
 
+int parseArgumentInt(
+    int argc,
+    const char* argv[],
+    const char* arg,
+    int non_exist_val,
+    int def_val) {
+  // First argv[1..] entry containing `arg` decides: def_val unless `arg` ends
+  // in '=' and an integer follows it; non_exist_val when nothing matches.
+  const size_t arg_len = strlen(arg);
+  for (int i = 1; i < argc; ++i) {
+    const char* ptr = strstr(argv[i], arg);
+    if (ptr) {
+      int res = 0; // only read when sscanf reports one successful conversion
+      const bool has_eq = arg_len > 0 && ptr[arg_len - 1] == '=';
+      return (has_eq && sscanf(ptr + arg_len, "%d", &res) == 1) ? res : def_val;
+    }
+  }
+  return non_exist_val;
+}
+
+bool parseArgumentBool(
+    int argc,
+    const char* argv[],
+    const char* arg,
+    bool def_val) {
+  // True as soon as any argv[1..argc-1] contains `arg` as a substring;
+  // otherwise the caller-supplied def_val.
+  int idx = 1;
+  while (idx < argc && strstr(argv[idx], arg) == nullptr) {
+    ++idx;
+  }
+  return idx < argc ? true : def_val;
+}
 } // namespace fbgemm
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 90504fdf54..45e5b31df4 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -42,6 +42,19 @@ void cache_evict(const T& vec) {
   }
 }
 
+/**
+ * Scan argv[1..argc-1] for substring arg. Int: non_exist_val if absent, def_val
+ * if arg has no trailing '=', else the int read after it. Bool: true if found.
+ */
+int parseArgumentInt(
+    int argc,
+    const char* argv[],
+    const char* arg,
+    int non_exist_val,
+    int def_val);
+bool parseArgumentBool(
+    int argc, const char* argv[], const char* arg, bool def_val);
+
 /**
  * @param Fn functor to execute
  * @param Fe data eviction functor
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index 2fa7b63071..faba8c893d 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -35,7 +35,8 @@ void test_xerbla(char* srname, const int* info, int){
   printf("\nXERBLA(MKL Error) is called :%s: %d\n", srname, *info);
 }
 
-void performance_test(int num_instances, bool flush) {
+void performance_test(
+    int num_instances, bool flush, int repetitions, bool is_mkl) {
 
 #if defined(USE_MKL)
   mkl_set_xerbla((XerblaEntry)test_xerbla);
@@ -217,77 +218,81 @@ void performance_test(int num_instances, bool flush) {
     }
 
 #if defined(USE_MKL)
-    // Gold via MKL sgemm
-    type = "MKL_FP32";
+    if (is_mkl) {
+      // Gold via MKL sgemm
+      type = "MKL_FP32";
 #elif defined(USE_BLAS)
-    type = "BLAS_FP32";
+      type = "BLAS_FP32";
 #else
-    type = "REF_FP32";
+      type = "REF_FP32";
 #endif
 
-    ttot = measureWithWarmup(
-        [&]() {
-          int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
+      ttot = measureWithWarmup(
+          [&]() {
+            int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
+            for(int i = 0; i < repetitions; ++i) {
 #if defined(USE_MKL) || defined(USE_BLAS)
-          cblas_sgemm(
-            CblasRowMajor,
-            CblasNoTrans,
-            CblasNoTrans,
-            m,
-            n,
-            k,
-            1.0,
-            A[copy].data(),
-            k,
-            Bt[copy].data(),
-            btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
-            beta,
-            C_ref[copy].data(),
-            n);
+              cblas_sgemm(
+                CblasRowMajor,
+                CblasNoTrans,
+                CblasNoTrans,
+                m,
+                n,
+                k,
+                1.0,
+                A[copy].data(),
+                k,
+                Bt[copy].data(),
+                btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
+                beta,
+                C_ref[copy].data(),
+                n);
 #else
-          cblas_sgemm_ref(
-              matrix_op_t::NoTranspose,
-              btran,
-              m,
-              n,
-              k,
-              alpha,
-              A[copy].data(),
-              k,
-              B[copy].data(),
-              (btran == matrix_op_t::NoTranspose) ? n : k,
-              beta,
-              C_ref[copy].data(),
-              n);
+              cblas_sgemm_ref(
+                  matrix_op_t::NoTranspose,
+                  btran,
+                  m,
+                  n,
+                  k,
+                  alpha,
+                  A[copy].data(),
+                  k,
+                  B[copy].data(),
+                  (btran == matrix_op_t::NoTranspose) ? n : k,
+                  beta,
+                  C_ref[copy].data(),
+                  n);
 #endif
-        },
-        3,
-        NITER,
-        [&]() {
-          if (flush) {
-            int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
-            cache_evict(A[copy]);
+            }
+          },
+          3,
+          NITER,
+          [&]() {
+            if (flush) {
+              int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
+              cache_evict(A[copy]);
 #if defined(USE_MKL) || defined(USE_BLAS)
-            cache_evict(Bt[copy]);
+              cache_evict(Bt[copy]);
 #else
-            cache_evict(B[copy]);
+              cache_evict(B[copy]);
 #endif
-            cache_evict(C_ref[copy]);
-          }
-        },
-        // Use OpenMP if num instances > 1
-        num_instances > 1);
-
-    gflops = nflops / ttot / 1e9;
-    gbs = nbytes / ttot / 1e9;
-    printf(
-        "\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
-        type.c_str(),
-        m,
-        n,
-        k,
-        gflops,
-        gbs);
+              cache_evict(C_ref[copy]);
+            }
+          },
+          // Use OpenMP if num instances > 1
+          num_instances > 1);
+
+      gflops = nflops / ttot / 1e9;
+      gbs = nbytes / ttot / 1e9;
+      printf(
+          "\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
+          type.c_str(),
+          m,
+          n,
+          k,
+          gflops * repetitions,
+          gbs * repetitions);
+    }
 
     type = "FBP_" + std::string(typeid(btype).name());
 
@@ -306,15 +311,17 @@ void performance_test(int num_instances, bool flush) {
           int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1;
           int tid = num_instances == 1 ? fbgemm_get_thread_num() : 0;
 
-          cblas_gemm_compute(
-              matrix_op_t::NoTranspose,
-              m,
-              A[copy].data(),
-              *Bp[copy],
-              beta,
-              C_fb[copy].data(),
-              tid,
-              num_threads);
+          for(int i = 0; i < repetitions; ++i) {
+            cblas_gemm_compute(
+                matrix_op_t::NoTranspose,
+                m,
+                A[copy].data(),
+                *Bp[copy],
+                beta,
+                C_fb[copy].data(),
+                tid,
+                num_threads);
+            }
         },
         3,
         NITER,
@@ -336,27 +343,20 @@ void performance_test(int num_instances, bool flush) {
         m,
         n,
         k,
-        gflops,
-        gbs);
+        gflops * repetitions,
+        gbs * repetitions);
   }
 }
 
-int main(int argc, char** argv) {
+int main(int argc, const char* argv[]) {
+  int num_instances = 1;
 #ifdef _OPENMP
   const char* inst = getenv("GEMMBENCH_NUM_INSTANCES");
-  int num_instances = 1;
   if (inst != nullptr && *inst) {
     num_instances = std::max(atoi(inst), num_instances);
   }
-
-  for (auto i = 1; i < argc; ++i) {
-    static const char param[] = "--inst=";
-    const char* ptr = strstr(argv[i], param);
-    if (ptr) {
-      ptr += sizeof(param) - 1; // null terminated
-      num_instances = std::max(atoi(ptr), num_instances);
-    }
-  }
+  num_instances = parseArgumentInt(
+      argc, argv, "--inst=", num_instances, num_instances);
   printf("Running %d instances\n", num_instances);
   if (num_instances > 1) {
       // Set-up execution for multi-instance mode
@@ -372,23 +372,19 @@ int main(int argc, char** argv) {
   } else {
     // When running single instance use OMP_NUM_THREADS to determine
     // parallelism. Default behaviour is using a single thread.
-    // Use 1 thread unless OMP_NUM_THREADS is explicit set.
+    int num_threads = parseArgumentInt(
+        argc, argv, "--num_threads=", 1, 1);
     const char* val = getenv("OMP_NUM_THREADS");
     if (val == nullptr || !*val) {
-      omp_set_num_threads(1);
+      omp_set_num_threads(num_threads);
     }
   }
 
 #endif
 
-  bool flush = true;
-  for (auto i = 1; i < argc; ++i) {
-    static const char param[] = "--no-flush";
-    const char* ptr = strstr(argv[i], param);
-    if (ptr) {
-      flush = false;
-    }
-  }
+  int repetitions = parseArgumentInt(argc, argv, "--repit=", 1, 1);
+  bool no_flush = parseArgumentBool(argc, argv, "--no-flush", false);
+  bool no_mkl = parseArgumentBool(argc, argv, "--no-mkl", false);
 
-  performance_test(num_instances, flush);
+  performance_test(num_instances, !no_flush, repetitions, !no_mkl);
 }