From 5c3a054b12f7674c421de583ae78e960c2af52c0 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 5 Feb 2021 15:01:16 -0800 Subject: [PATCH] Add FLOPS support to the new profiler API. (#51734) Summary: The new profiler API was added in PR#48280. This PR is to add FLOPS support to the new profiler API. Pull Request resolved: https://github.com/pytorch/pytorch/pull/51734 Test Plan: ```python python test/test_profiler.py -k test_flops ``` Reviewed By: xuzhao9 Differential Revision: D26261851 Pulled By: ilia-cher fbshipit-source-id: dbeba4c197e6f51a9a8e640e8bb60ec38df87f73 --- test/test_profiler.py | 14 ++++++++++++++ torch/autograd/profiler.py | 4 ++-- torch/csrc/autograd/profiler_kineto.h | 2 +- torch/csrc/autograd/profiler_legacy.h | 2 +- torch/profiler/profiler.py | 6 +++++- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 2d0ca05cd4a6..585f569aa7f6 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -385,6 +385,20 @@ def test_flops(self): profiler_output = prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10) self.assertIn("FLOPS", profiler_output) + if not (kineto_available() and torch.cuda.is_available()): + return + + with profile(activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + record_shapes=True, + with_flops=True, + ) as kineto_profiler: + model(inputs) + profiler_output = kineto_profiler.key_averages().table( + sort_by="self_cuda_time_total", row_limit=-1) + self.assertIn("FLOPS", profiler_output) + @unittest.skipIf(not kineto_available(), "Kineto is required") @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required") def test_kineto_profiler_api(self): diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 475caadcfb73..ad165299514c 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -365,8 +365,8 @@ class profile(object): with_flops (bool, 
optional): If with_flops is set, the profiler will estimate the FLOPS (floating pointer operations per second) value using the operator's input shape - and total CPU time. This allows one to estimate the hardware performance. Currently, - this option only works for the matrix multiplication and convolution functions. + and total time. This allows one to estimate the hardware performance. Currently, + this option only works for the matrix multiplication and 2D convolution operators. profile_memory (bool, optional): track tensor memory allocation/deallocation. diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index ab1191bc8c7e..9533721e9e40 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -170,7 +170,7 @@ struct TORCH_API KinetoEvent { uint8_t activity_type_; c10::optional>> shapes_; c10::optional> stack_; - uint64_t flops_; + uint64_t flops_ = 0; std::string name_; uint64_t device_index_ = 0; diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index 23169cd33450..e71db8879f4b 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -331,7 +331,7 @@ struct TORCH_API LegacyEvent { uint64_t correlation_id_; // Extra arguments for computing op flops std::unordered_map extra_args_; - uint64_t flops_; + uint64_t flops_ = 0; }; // a linked-list of fixed sized vectors, to avoid diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index a942ce2d5f6d..b961d76fa2a9 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -92,7 +92,8 @@ class profile(object): during the profiling; - ``record_shapes`` - save information about operator's input shapes; - ``profile_memory`` - track tensor memory allocation/deallocation; - - ``with_stack`` - record source information (file and line number) for the ops. 
+ - ``with_stack`` - record source information (file and line number) for the ops; + - ``with_flops`` - use a formula to estimate the FLOPS of specific operators (matrix multiplication and 2D convolution); - ``use_cuda`` - (deprecated, use ``activities``). .. note:: @@ -178,6 +179,7 @@ def __init__( record_shapes: bool = False, profile_memory: bool = False, with_stack: bool = False, + with_flops: bool = False, # deprecated: use_cuda: Optional[bool] = None): if activities: @@ -207,6 +209,7 @@ def __init__( self.record_steps = False self.on_trace_ready = on_trace_ready self.record_shapes = record_shapes + self.with_flops = with_flops self.profile_memory = profile_memory self.with_stack = with_stack self.step_num = 0 @@ -353,6 +356,7 @@ def _start_warmup(self): use_cuda=(ProfilerActivity.CUDA in self.activities), use_cpu=(ProfilerActivity.CPU in self.activities), record_shapes=self.record_shapes, + with_flops=self.with_flops, profile_memory=self.profile_memory, with_stack=self.with_stack, use_kineto=True,