From 423be682bf6a9b05802df1fcfa2a3e7ee49df5d8 Mon Sep 17 00:00:00 2001 From: Evgeny Fiksman Date: Tue, 10 Dec 2019 07:56:30 -0800 Subject: [PATCH] Add additional execution arguments to the benchmark (#207) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/207 Add initial benchmark command-line arguments to avoid rebuilding when benchmarking various use cases Reviewed By: jianyuh Differential Revision: D18863198 fbshipit-source-id: 32452a125e86b857f5cf96f41bc321d8d5a85eb9 --- bench/BenchUtils.cc | 34 ++++++++ bench/BenchUtils.h | 13 +++ bench/FP16Benchmark.cc | 186 ++++++++++++++++++++--------------------- 3 files changed, 138 insertions(+), 95 deletions(-) diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc index 694c0d109d..89f93a1f61 100644 --- a/bench/BenchUtils.cc +++ b/bench/BenchUtils.cc @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef _OPENMP #include @@ -89,4 +90,37 @@ int fbgemm_get_thread_num() { #endif } +int parseArgumentInt( + int argc, + const char* argv[], + const char* arg, + int non_exist_val, + int def_val) { + int val = non_exist_val; + int arg_len = strlen(arg); + for(auto i = 1; i < argc; ++i) { + const char* ptr = strstr(argv[i], arg); + if (ptr) { + int res; + sscanf(ptr + arg_len, "%d", &res); + val = (*(ptr + arg_len - 1) == '=') ? 
res : def_val; + break; + } + } + return val; +} + +bool parseArgumentBool( + int argc, + const char* argv[], + const char* arg, + bool def_val) { + for(auto i = 1; i < argc; ++i) { + const char* ptr = strstr(argv[i], arg); + if (ptr) { + return true; + } + } + return def_val; +} } // namespace fbgemm diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h index 90504fdf54..45e5b31df4 100644 --- a/bench/BenchUtils.h +++ b/bench/BenchUtils.h @@ -42,6 +42,19 @@ void cache_evict(const T& vec) { } } +/** + * Parse application command line arguments + * + */ +int parseArgumentInt( + int argc, + const char* argv[], + const char* arg, + int non_exist_val, + int def_val); +bool parseArgumentBool( + int argc, const char* argv[], const char* arg, bool def_val); + /** * @param Fn functor to execute * @param Fe data eviction functor diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc index 2fa7b63071..faba8c893d 100644 --- a/bench/FP16Benchmark.cc +++ b/bench/FP16Benchmark.cc @@ -35,7 +35,8 @@ void test_xerbla(char* srname, const int* info, int){ printf("\nXERBLA(MKL Error) is called :%s: %d\n", srname, *info); } -void performance_test(int num_instances, bool flush) { +void performance_test( + int num_instances, bool flush, int repetitions, bool is_mkl) { #if defined(USE_MKL) mkl_set_xerbla((XerblaEntry)test_xerbla); @@ -217,77 +218,81 @@ void performance_test(int num_instances, bool flush) { } #if defined(USE_MKL) - // Gold via MKL sgemm - type = "MKL_FP32"; + if (is_mkl) { + // Gold via MKL sgemm + type = "MKL_FP32"; #elif defined(USE_BLAS) - type = "BLAS_FP32"; + type = "BLAS_FP32"; #else - type = "REF_FP32"; + type = "REF_FP32"; #endif - ttot = measureWithWarmup( - [&]() { - int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num(); + ttot = measureWithWarmup( + [&]() { + int copy = num_instances == 1 ? 
0 : fbgemm_get_thread_num(); + for(int i = 0; i < repetitions; ++i) { #if defined(USE_MKL) || defined(USE_BLAS) - cblas_sgemm( - CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - m, - n, - k, - 1.0, - A[copy].data(), - k, - Bt[copy].data(), - btran == matrix_op_t::NoTranspose ? kAligned : nAligned, - beta, - C_ref[copy].data(), - n); + cblas_sgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + 1.0, + A[copy].data(), + k, + Bt[copy].data(), + btran == matrix_op_t::NoTranspose ? kAligned : nAligned, + beta, + C_ref[copy].data(), + n); #else - cblas_sgemm_ref( - matrix_op_t::NoTranspose, - btran, - m, - n, - k, - alpha, - A[copy].data(), - k, - B[copy].data(), - (btran == matrix_op_t::NoTranspose) ? n : k, - beta, - C_ref[copy].data(), - n); + cblas_sgemm_ref( + matrix_op_t::NoTranspose, + btran, + m, + n, + k, + alpha, + A[copy].data(), + k, + B[copy].data(), + (btran == matrix_op_t::NoTranspose) ? n : k, + beta, + C_ref[copy].data(), + n); #endif - }, - 3, - NITER, - [&]() { - if (flush) { - int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num(); - cache_evict(A[copy]); + } + }, + 3, + NITER, + [&]() { + if (flush) { + int copy = num_instances == 1 ? 
0 : fbgemm_get_thread_num(); + cache_evict(A[copy]); #if defined(USE_MKL) || defined(USE_BLAS) - cache_evict(Bt[copy]); + cache_evict(Bt[copy]); #else - cache_evict(B[copy]); + cache_evict(B[copy]); #endif - cache_evict(C_ref[copy]); - } - }, - // Use OpenMP if num instances > 1 - num_instances > 1); - - gflops = nflops / ttot / 1e9; - gbs = nbytes / ttot / 1e9; - printf( - "\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n", - type.c_str(), - m, - n, - k, - gflops, - gbs); + cache_evict(C_ref[copy]); + } + }, + // Use OpenMP if num instances > 1 + num_instances > 1); + + gflops = nflops / ttot / 1e9; + gbs = nbytes / ttot / 1e9; + printf( + "\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n", + type.c_str(), + m, + n, + k, + gflops * repetitions, + gbs * repetitions); + } type = "FBP_" + std::string(typeid(btype).name()); @@ -306,15 +311,17 @@ void performance_test(int num_instances, bool flush) { int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1; int tid = num_instances == 1 ? 
fbgemm_get_thread_num() : 0; - cblas_gemm_compute( - matrix_op_t::NoTranspose, - m, - A[copy].data(), - *Bp[copy], - beta, - C_fb[copy].data(), - tid, - num_threads); + for(int i = 0; i < repetitions; ++i) { + cblas_gemm_compute( + matrix_op_t::NoTranspose, + m, + A[copy].data(), + *Bp[copy], + beta, + C_fb[copy].data(), + tid, + num_threads); + } }, 3, NITER, @@ -336,27 +343,20 @@ void performance_test(int num_instances, bool flush) { m, n, k, - gflops, - gbs); + gflops * repetitions, + gbs * repetitions); } } -int main(int argc, char** argv) { +int main(int argc, const char* argv[]) { + int num_instances = 1; #ifdef _OPENMP const char* inst = getenv("GEMMBENCH_NUM_INSTANCES"); - int num_instances = 1; if (inst != nullptr && *inst) { num_instances = std::max(atoi(inst), num_instances); } - - for (auto i = 1; i < argc; ++i) { - static const char param[] = "--inst="; - const char* ptr = strstr(argv[i], param); - if (ptr) { - ptr += sizeof(param) - 1; // null terminated - num_instances = std::max(atoi(ptr), num_instances); - } - } + num_instances = parseArgumentInt( + argc, argv, "--inst=", num_instances, num_instances); printf("Running %d instances\n", num_instances); if (num_instances > 1) { // Set-up execution for multi-instance mode @@ -372,23 +372,19 @@ int main(int argc, char** argv) { } else { // When running single instance use OMP_NUM_THREADS to determine // parallelism. Default behaviour is using a single thread. - // Use 1 thread unless OMP_NUM_THREADS is explicit set. 
+ int num_threads = parseArgumentInt( + argc, argv, "--num_threads=", 1, 1); const char* val = getenv("OMP_NUM_THREADS"); if (val == nullptr || !*val) { - omp_set_num_threads(1); + omp_set_num_threads(num_threads); } } #endif - bool flush = true; - for (auto i = 1; i < argc; ++i) { - static const char param[] = "--no-flush"; - const char* ptr = strstr(argv[i], param); - if (ptr) { - flush = false; - } - } + int repetitions = parseArgumentInt(argc, argv, "--repit=", 1, 1); + bool no_flush = parseArgumentBool(argc, argv, "--no-flush", false); + bool no_mkl = parseArgumentBool(argc, argv, "--no-mkl", false); - performance_test(num_instances, flush); + performance_test(num_instances, !no_flush, repetitions, !no_mkl); }