pytorch · gaoteng-git · Jun 17, 2021 · Jun 17, 2021
diff --git a/tb_plugin/test/test_profiler.py b/tb_plugin/test/test_profiler.py
@@ -1086,6 +1086,95 @@ def test_gpu_utilization(self):
                 count += 1
         self.assertEqual(count, 2)
 
+    # Test that the three GPU metrics still work when every kernel falls outside the ProfilerStep.
+    def test_gpu_utilization_kernel_out_of_step(self):
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 10, "dur": 10,
+            "args": {"Input Dims": [], "External id": 1}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 70,
+            "args": {"Input Dims": [], "External id": 3}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 220, "dur": 20,
+            "args": {"Input Dims": [], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "ProfilerStep#2", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>", "pid": 1, "tid": "stream 7",
+            "ts": 60, "dur": 20,
+            "args": {"correlation": 334, "external id": 1, "device": 1,
+                     "blocks per SM": 0.5, "est. achieved occupancy %": 0.6}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 15, "dur": 5,
+            "args": {"correlation": 334, "external id": 1}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>", "pid": 1, "tid": "stream 7",
+            "ts": 240, "dur": 25,
+            "args": {"correlation": 337, "external id": 4, "device": 1,
+                     "blocks per SM": 10.5, "est. achieved occupancy %": 0.3}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 230, "dur": 10,
+            "args": {"correlation": 337, "external id": 4}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+
+        self.assertEqual(len(profile.gpu_ids), 1)
+        self.assertAlmostEqual(profile.gpu_utilization[1], 0.0)  # Both kernels lie outside ts 100-200.
+        self.assertTrue(profile.sm_efficency[1] is None)
+        self.assertTrue(profile.occupancy[1] is None)
+        self.assertTrue(profile.blocks_per_sm_count[1] > 0)
+        self.assertTrue(profile.occupancy_count[1] > 0)
+
+        count = 0
+        for agg_by_op in profile.kernel_list_groupby_name_op:
+            if agg_by_op.name == "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>" \
+                    and agg_by_op.op_name == "aten::mat_mul":
+                self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 0.5)
+                self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.6)
+                count += 1
+            if agg_by_op.name == "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>" and \
+                    agg_by_op.op_name == "aten::mm":
+                self.assertAlmostEqual(
+                    agg_by_op.avg_blocks_per_sm, 10.5)
+                self.assertAlmostEqual(
+                    agg_by_op.avg_occupancy, 0.3)
+                count += 1
+        self.assertEqual(count, 2)
+
+        count = 0
+        for _id, (name, row) in enumerate(profile.kernel_stat.iterrows()):
+            # Expected values are averages weighted by each kernel's "dur" (20 and 25).
+            if name == "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>":
+                self.assertAlmostEqual(row["blocks_per_sm"], (20 * 0.5 + 25 * 10.5) / (20 + 25))
+                self.assertAlmostEqual(row["occupancy"], (20 * 0.6 + 25 * 0.3) / (20 + 25))
+                count += 1
+        self.assertEqual(count, 1)
+
     def test_dump_gpu_metrics(self):
         profile = RunProfile("test_dump_gpu_metrics", None)
         # Faked data for easy to see in UI. Real data values are 1/100 of these.

diff --git a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
@@ -136,17 +136,20 @@ def calculate_avg(approximated_sm_efficency_ranges, total_dur):
         self.blocks_per_sm_per_device = None  # Release memory.
 
     # Weighted average. Weighted by kernel's time duration.
-    def calculate_occupancy(self):
+    def calculate_occupancy(self, steps_start_time, steps_end_time):
         for gpu_id in self.gpu_ids:
             occupancys_on_a_device = self.occupancy_per_device[gpu_id]
             total_time = 0
             total_occupancy = 0.0
             for r in occupancys_on_a_device:
-                dur = r[1] - r[0]
-                total_occupancy += r[2] * dur
-                total_time += dur
-            avg_occupancy = total_occupancy / total_time
-            self.avg_occupancy_per_device[gpu_id] = avg_occupancy
+                min_time = max(r[0], steps_start_time)  # Clip r[0]..r[1] to the steps range.
+                max_time = min(r[1], steps_end_time)
+                if min_time < max_time:  # Skip entries with no overlap with the steps range.
+                    dur = max_time - min_time
+                    total_occupancy += r[2] * dur
+                    total_time += dur
+            if total_time > 0:  # Nothing overlapped: avoid dividing by zero, leave the entry as is.
+                self.avg_occupancy_per_device[gpu_id] = total_occupancy / total_time
 
     def parse_events(self, events, global_start_time, global_end_time, steps_start_time, steps_end_time):
         logger.debug("GPU Metrics, parse events")
@@ -156,7 +159,7 @@ def parse_events(self, events, global_start_time, global_end_time, steps_start_t
 
         self.calculate_gpu_utilization(global_start_time, global_end_time, steps_start_time, steps_end_time)
         self.calculate_approximated_sm_efficency(steps_start_time, steps_end_time)
-        self.calculate_occupancy()
+        self.calculate_occupancy(steps_start_time, steps_end_time)
 
     def parse_event(self, event):
         ts = event.ts

diff --git a/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
@@ -49,8 +49,6 @@ def generate_run_profile(self):
         profile_run.gpu_utilization = self.profile_data.gpu_utilization
         profile_run.sm_efficency = self.profile_data.sm_efficency
         profile_run.occupancy = self.profile_data.occupancy
-        profile_run.blocks_per_sm_count = self.profile_data.blocks_per_sm_count
-        profile_run.occupancy_count = self.profile_data.occupancy_count
 
         # add memory stats
         if self.profile_data.has_memory_data:

diff --git a/tb_plugin/torch_tb_profiler/run.py b/tb_plugin/torch_tb_profiler/run.py
@@ -193,12 +193,12 @@ def get_gpu_metrics_data(profile):
                 gpu_metrics_data.append({"title": "GPU Utilization",
                                          "value": "{} %".format(
                                              round(profile.gpu_utilization[gpu_id] * 100, 2))})
-                if profile.blocks_per_sm_count[gpu_id] > 0:
+                if profile.sm_efficency[gpu_id] is not None:
                     gpu_metrics_data.append({"title": "Est. SM Efficiency",
                                              "value": "{} %".format(
                                                  round(profile.sm_efficency[gpu_id] * 100, 2))})
                     has_sm_efficiency = True
-                if profile.occupancy_count[gpu_id] > 0:
+                if profile.occupancy[gpu_id] is not None:
                     gpu_metrics_data.append({"title": "Est. Achieved Occupancy",
                                              "value": "{} %".format(round(profile.occupancy[gpu_id], 2))})
                     has_occupancy = True