diff --git a/test/test_profiler.py b/test/test_profiler.py index f1cdff7e43c6..ac215ab10aa7 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -385,6 +385,20 @@ def test_flops(self): profiler_output = prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10) self.assertIn("FLOPS", profiler_output) + if not (kineto_available() and torch.cuda.is_available()): + return + + with profile(activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + record_shapes=True, + with_flops=True, + ) as kineto_profiler: + model(inputs) + profiler_output = kineto_profiler.key_averages().table( + sort_by="self_cuda_time_total", row_limit=-1) + self.assertIn("FLOPS", profiler_output) + @unittest.skipIf(not kineto_available(), "Kineto is required") @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required") def test_kineto_profiler_api(self): diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 475caadcfb73..ad165299514c 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -365,8 +365,8 @@ class profile(object): with_flops (bool, optional): If with_flops is set, the profiler will estimate the FLOPS (floating pointer operations per second) value using the operator's input shape - and total CPU time. This allows one to estimate the hardware performance. Currently, - this option only works for the matrix multiplication and convolution functions. + and total time. This allows one to estimate the hardware performance. Currently, + this option only works for the matrix multiplication and 2D convolution operators. profile_memory (bool, optional): track tensor memory allocation/deallocation. diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 9290bf0204f0..bd4e1b8d6198 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -170,7 +170,7 @@ struct TORCH_API KinetoEvent { uint8_t activity_type_; c10::optional>> shapes_; c10::optional> stack_; - uint64_t flops_; + uint64_t flops_ = 0; std::string name_; uint64_t device_index_ = 0; diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index 23169cd33450..e71db8879f4b 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -331,7 +331,7 @@ struct TORCH_API LegacyEvent { uint64_t correlation_id_; // Extra arguments for computing op flops std::unordered_map extra_args_; - uint64_t flops_; + uint64_t flops_ = 0; }; // a linked-list of fixed sized vectors, to avoid diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 333e175cd618..a6c1ef81cb7a 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -89,7 +89,8 @@ class profile(object): during the profiling; - ``record_shapes`` - save information about operator's input shapes; - ``profile_memory`` - track tensor memory allocation/deallocation; - - ``with_stack`` - record source information (file and line number) for the ops. + - ``with_stack`` - record source information (file and line number) for the ops; + - ``with_flops`` - use formula to estimate the FLOPS of specific operators (matrix multiplication and 2D convolution); - ``use_cuda`` - (deprecated, use ``activities``). .. note:: @@ -162,6 +163,7 @@ def __init__( record_shapes: bool = False, profile_memory: bool = False, with_stack: bool = False, + with_flops: bool = False, # deprecated: use_cuda: Optional[bool] = None): if activities: @@ -191,6 +193,7 @@ def __init__( self.record_steps = False self.on_trace_ready = on_trace_ready self.record_shapes = record_shapes + self.with_flops = with_flops self.profile_memory = profile_memory self.with_stack = with_stack self.step_num = 0 @@ -337,6 +340,7 @@ def _start_warmup(self): use_cuda=(ProfilerActivity.CUDA in self.activities), use_cpu=(ProfilerActivity.CPU in self.activities), record_shapes=self.record_shapes, + with_flops=self.with_flops, profile_memory=self.profile_memory, with_stack=self.with_stack, use_kineto=True,