From 6bb573fb035139f89deab47688d85556675416ca Mon Sep 17 00:00:00 2001 From: Ake Sandgren Date: Tue, 14 Dec 2021 17:19:41 +0100 Subject: [PATCH] gpu_burn.cu: Time the warmup phase and print minimum required duration. - Calculate the time it takes to run compare+getErrors and subtract it from the elapsed time used in the performance calculation. - Read the device temperature after each repetition and keep the maximum. - Error out if the duration was too short to complete a single repetition. --- .../gpu/gpu_burn/src/gpu_burn.cu | 75 +++++++++++++++---- 1 file changed, 62 insertions(+), 13 deletions(-) diff --git a/hpctestlib/microbenchmarks/gpu/gpu_burn/src/gpu_burn.cu b/hpctestlib/microbenchmarks/gpu/gpu_burn/src/gpu_burn.cu index 12e628cbf2..d106fe2ff9 100644 --- a/hpctestlib/microbenchmarks/gpu/gpu_burn/src/gpu_burn.cu +++ b/hpctestlib/microbenchmarks/gpu/gpu_burn/src/gpu_burn.cu @@ -38,9 +38,8 @@ #define SIZE 2048ul // Matrices are SIZE*SIZE.. 2048^2 should be efficiently implemented in CUBLAS #define USEMEM 0.9 // Try to allocate 90% of memory -// Used to report op/s, measured through Visual Profiler, CUBLAS from CUDA 7.5 -// (Seems that they indeed take the naive dim^3 approach) -#define OPS_PER_MUL 17188257792ul +// Operations per matrix multiply +#define OPS_PER_MUL (2*SIZE*SIZE*SIZE-SIZE*SIZE) #include #include @@ -227,12 +226,15 @@ class BurnTracker public: std::mutex mtx; size_t iters, reps, err; - std::chrono::system_clock::time_point start, end; + float devTemp; + std::chrono::high_resolution_clock::time_point start, end; + std::chrono::duration compare_time; BurnTracker() { std::lock_guard lg(mtx); err = 0; iters = 0; reps = 0; + devTemp = 0.0; }; void set_iters(size_t it) @@ -241,20 +243,43 @@ public: iters = it; } + void set_compare_time(std::chrono::duration t) + { + std::lock_guard lg(mtx); + compare_time = t; + } + void start_timer() { std::lock_guard lg(mtx); - start = std::chrono::system_clock::now(); + start = std::chrono::high_resolution_clock::now(); } - void 
log(size_t e) + void log(size_t e, Smi *smi_handle, int devId) { + float temp; + std::lock_guard lg(mtx); - end = std::chrono::system_clock::now(); + + end = std::chrono::high_resolution_clock::now(); + + smi_handle->getGpuTemp(devId, &temp); + if (temp > devTemp) { + devTemp = temp; + } + reps++; err += e; } + float getTemp() + { + std::lock_guard lg(mtx); + float temp = devTemp; + devTemp = 0.0; + return temp; + } + double read() { std::lock_guard lg(mtx); @@ -263,9 +288,14 @@ public: if (err) return -1; + if (reps == 0) { + printf("Warning: duration is to short, didn't finish a single repetition\n"); + return -1; + } + // Get the time difference and return the flops std::chrono::duration diff = end-start; - double Gflops = 1e-9 * iters * reps * OPS_PER_MUL / diff.count(); + double Gflops = 1e-9 * iters * reps * OPS_PER_MUL / (diff - compare_time*reps).count(); // Reset the counters err = 0; reps = 0; @@ -287,9 +317,13 @@ int devCount; template void startBurn(int devId, Smi * smi_handle, T *A, T *B, - BurnTracker * bt + BurnTracker * bt, + char *hostname ) { + std::chrono::high_resolution_clock::time_point warmup_start, warmup_end; + std::chrono::duration warmup_diff; + GemmTest test(devId, smi_handle); test.initBuffers(A, B); @@ -297,7 +331,22 @@ void startBurn(int devId, bt->set_iters(test.getIters()); // Warmup burn + warmup_start = std::chrono::high_resolution_clock::now(); test.compute(); + XDeviceSynchronize(); + warmup_end = std::chrono::high_resolution_clock::now(); + warmup_diff = warmup_end-warmup_start; + printf("[%s] GPU %2d: Warmup computation takes %g seconds, duration must be larger than that to get any results\n", hostname, devId, warmup_diff.count()); + fflush(stdout); + warmup_start = std::chrono::high_resolution_clock::now(); + for (int i=0; i < 100; i++) { + test.compare(); + test.getErrors(); + } + warmup_end = std::chrono::high_resolution_clock::now(); + warmup_diff = (warmup_end-warmup_start)/100; + bt->set_compare_time(warmup_diff); + 
XDeviceSynchronize(); { // Flag that this thread is done with the warmup. @@ -319,7 +368,7 @@ void startBurn(int devId, test.compare(); // Update the results - bt->log(test.getErrors()); + bt->log(test.getErrors(), smi_handle, devId); } } @@ -361,7 +410,8 @@ template void launch(int duration) threads.push_back(std::thread(startBurn, i, &smi_handle, A, B, - trackThreads[i] + trackThreads[i], + hostname ) ); } @@ -384,8 +434,7 @@ template void launch(int duration) for (int i = 0; i < devCount; i++) { double flops = trackThreads[i]->read(); - float devTemp; - smi_handle.getGpuTemp(i, &devTemp); + float devTemp = trackThreads[i]->getTemp(); printf("[%s] GPU %2d(%s): %4.0f GF/s %d Celsius\n", hostname, i, flops < 0.0 ? "FAULTY" : "OK", flops, (int)devTemp); }