diff --git a/test/test_profiler.py b/test/test_profiler.py
index f1cdff7e43c6..ac215ab10aa7 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -385,6 +385,20 @@ def test_flops(self):
         profiler_output = prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10)
         self.assertIn("FLOPS", profiler_output)
 
+        if not (kineto_available() and torch.cuda.is_available()):
+            return
+
+        with profile(activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA],
+                record_shapes=True,
+                with_flops=True,
+        ) as kineto_profiler:
+            model(inputs)
+        profiler_output = kineto_profiler.key_averages().table(
+            sort_by="self_cuda_time_total", row_limit=-1)
+        self.assertIn("FLOPS", profiler_output)
+
     @unittest.skipIf(not kineto_available(), "Kineto is required")
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     def test_kineto_profiler_api(self):
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 475caadcfb73..ad165299514c 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -365,8 +365,8 @@ class profile(object):
 
         with_flops (bool, optional): If with_flops is set, the profiler will estimate
-            the FLOPS (floating pointer operations per second) value using the operator's input shape
-            and total CPU time. This allows one to estimate the hardware performance. Currently,
-            this option only works for the matrix multiplication and convolution functions.
+            the FLOPS (floating point operations per second) value using the operator's input shape
+            and total time. This allows one to estimate the hardware performance. Currently,
+            this option only works for the matrix multiplication and 2D convolution operators.
 
         profile_memory (bool, optional): track tensor memory allocation/deallocation.
 
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 9290bf0204f0..bd4e1b8d6198 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -170,7 +170,7 @@ struct TORCH_API KinetoEvent {
   uint8_t activity_type_;
   c10::optional<std::vector<std::vector<int64_t>>> shapes_;
   c10::optional<std::vector<std::string>> stack_;
-  uint64_t flops_;
+  uint64_t flops_ = 0;
 
   std::string name_;
   uint64_t device_index_ = 0;
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index 23169cd33450..e71db8879f4b 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -331,7 +331,7 @@ struct TORCH_API LegacyEvent {
   uint64_t correlation_id_;
   // Extra arguments for computing op flops
   std::unordered_map<std::string, c10::IValue> extra_args_;
-  uint64_t flops_;
+  uint64_t flops_ = 0;
 };
 
 // a linked-list of fixed sized vectors, to avoid
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py
index 333e175cd618..a6c1ef81cb7a 100644
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -89,7 +89,8 @@ class profile(object):
       during the profiling;
     - ``record_shapes`` - save information about operator's input shapes;
     - ``profile_memory`` - track tensor memory allocation/deallocation;
-    - ``with_stack`` - record source information (file and line number) for the ops.
+    - ``with_stack`` - record source information (file and line number) for the ops;
+    - ``with_flops`` - use a formula to estimate the FLOPS of specific operators (matrix multiplication and 2D convolution);
     - ``use_cuda`` - (deprecated, use ``activities``).
 
     .. note::
@@ -162,6 +163,7 @@ def __init__(
             record_shapes: bool = False,
             profile_memory: bool = False,
             with_stack: bool = False,
+            with_flops: bool = False,
             # deprecated:
             use_cuda: Optional[bool] = None):
         if activities:
@@ -191,6 +193,7 @@ def __init__(
             self.record_steps = False
         self.on_trace_ready = on_trace_ready
         self.record_shapes = record_shapes
+        self.with_flops = with_flops
         self.profile_memory = profile_memory
         self.with_stack = with_stack
         self.step_num = 0
@@ -337,6 +340,7 @@ def _start_warmup(self):
             use_cuda=(ProfilerActivity.CUDA in self.activities),
             use_cpu=(ProfilerActivity.CPU in self.activities),
             record_shapes=self.record_shapes,
+            with_flops=self.with_flops,
             profile_memory=self.profile_memory,
             with_stack=self.with_stack,
             use_kineto=True,