diff --git a/tb_plugin/test/test_profiler.py b/tb_plugin/test/test_profiler.py
index f816e7d99..aa1f4459c 100644
--- a/tb_plugin/test/test_profiler.py
+++ b/tb_plugin/test/test_profiler.py
@@ -1086,6 +1086,95 @@ def test_gpu_utilization(self):
                 count += 1
         self.assertEqual(count, 2)
 
+    # Test that the 3 GPU metrics still work when all kernels run outside of ProfilerStep.
+    def test_gpu_utilization_kernel_out_of_step(self):
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 10, "dur": 10,
+            "args": {"Input Dims": [], "External id": 1}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 70,
+            "args": {"Input Dims": [], "External id": 3}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 220, "dur": 20,
+            "args": {"Input Dims": [], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "ProfilerStep#2", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7",
+            "ts": 60, "dur": 20,
+            "args": {"correlation": 334, "external id": 1, "device": 1,
+                     "blocks per SM": 0.5, "est. achieved occupancy %": 0.6}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 15, "dur": 5,
+            "args": {"correlation": 334, "external id": 1}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7",
+            "ts": 240, "dur": 25,
+            "args": {"correlation": 337, "external id": 4, "device": 1,
+                     "blocks per SM": 10.5, "est. achieved occupancy %": 0.3}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 230, "dur": 10,
+            "args": {"correlation": 337, "external id": 4}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+
+        self.assertEqual(len(profile.gpu_ids), 1)
+        self.assertAlmostEqual(profile.gpu_utilization[1], 0.0)  # Both kernels lie outside ProfilerStep#2.
+        self.assertIsNone(profile.sm_efficency[1])
+        self.assertIsNone(profile.occupancy[1])
+        self.assertGreater(profile.blocks_per_sm_count[1], 0)
+        self.assertGreater(profile.occupancy_count[1], 0)
+
+        count = 0
+        for agg_by_op in profile.kernel_list_groupby_name_op:
+            if agg_by_op.name == "void cunn_ClassNLLCriterion_updateGradInput_kernel" \
+                    and agg_by_op.op_name == "aten::mat_mul":
+                self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 0.5)
+                self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.6)
+                count += 1
+            if agg_by_op.name == "void cunn_ClassNLLCriterion_updateGradInput_kernel" and \
+                    agg_by_op.op_name == "aten::mm":
+                self.assertAlmostEqual(
+                    agg_by_op.avg_blocks_per_sm, 10.5)
+                self.assertAlmostEqual(
+                    agg_by_op.avg_occupancy, 0.3)
+                count += 1
+        self.assertEqual(count, 2)
+
+        count = 0
+        for name, row in profile.kernel_stat.iterrows():
+            # Kernel stats are duration-weighted over both kernels, although both ran outside the step.
+            if name == "void cunn_ClassNLLCriterion_updateGradInput_kernel":
+                self.assertAlmostEqual(row["blocks_per_sm"], (20 * 0.5 + 25 * 10.5) / (20 + 25))
+                self.assertAlmostEqual(row["occupancy"], (20 * 0.6 + 25 * 0.3) / (20 + 25))
+                count += 1
+        self.assertEqual(count, 1)
+
     def test_dump_gpu_metrics(self):
         profile = RunProfile("test_dump_gpu_metrics", None)
         # Faked data for easy to see in UI. Real data values are 1/100 of these.
diff --git a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
index 7cf699727..44542491e 100644
--- a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
+++ b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
@@ -136,17 +136,20 @@ def calculate_avg(approximated_sm_efficency_ranges, total_dur):
         self.blocks_per_sm_per_device = None  # Release memory.
 
     # Weighted average. Weighted by kernel's time duration.
-    def calculate_occupancy(self):
+    def calculate_occupancy(self, steps_start_time, steps_end_time):  # Only time within the steps window counts.
         for gpu_id in self.gpu_ids:
             occupancys_on_a_device = self.occupancy_per_device[gpu_id]
             total_time = 0
             total_occupancy = 0.0
             for r in occupancys_on_a_device:
-                dur = r[1] - r[0]
-                total_occupancy += r[2] * dur
-                total_time += dur
-            avg_occupancy = total_occupancy / total_time
-            self.avg_occupancy_per_device[gpu_id] = avg_occupancy
+                min_time = max(r[0], steps_start_time)  # Clip the range start to the steps window.
+                max_time = min(r[1], steps_end_time)  # Clip the range end to the steps window.
+                if min_time < max_time:  # Skip ranges that do not overlap the steps window.
+                    dur = max_time - min_time
+                    total_occupancy += r[2] * dur
+                    total_time += dur
+            if total_time > 0:  # Otherwise keep the existing value and avoid dividing by zero.
+                self.avg_occupancy_per_device[gpu_id] = total_occupancy / total_time
 
     def parse_events(self, events, global_start_time, global_end_time, steps_start_time, steps_end_time):
         logger.debug("GPU Metrics, parse events")
@@ -156,7 +159,7 @@ def parse_events(self, events, global_start_time, global_end_time, steps_start_t
 
         self.calculate_gpu_utilization(global_start_time, global_end_time, steps_start_time, steps_end_time)
         self.calculate_approximated_sm_efficency(steps_start_time, steps_end_time)
-        self.calculate_occupancy()
+        self.calculate_occupancy(steps_start_time, steps_end_time)
 
     def parse_event(self, event):
         ts = event.ts
diff --git a/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
index 44923bcac..965dad6ff 100644
--- a/tb_plugin/torch_tb_profiler/profiler/run_generator.py
+++ b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
@@ -49,8 +49,6 @@ def generate_run_profile(self):
             profile_run.gpu_utilization = self.profile_data.gpu_utilization
             profile_run.sm_efficency = self.profile_data.sm_efficency
             profile_run.occupancy = self.profile_data.occupancy
-            profile_run.blocks_per_sm_count = self.profile_data.blocks_per_sm_count
-            profile_run.occupancy_count = self.profile_data.occupancy_count
 
         # add memory stats
         if self.profile_data.has_memory_data:
diff --git a/tb_plugin/torch_tb_profiler/run.py b/tb_plugin/torch_tb_profiler/run.py
index ee4e9d961..9376521ec 100644
--- a/tb_plugin/torch_tb_profiler/run.py
+++ b/tb_plugin/torch_tb_profiler/run.py
@@ -193,12 +193,12 @@ def get_gpu_metrics_data(profile):
 
             gpu_metrics_data.append({"title": "GPU Utilization", "value": "{} %".format(
                 round(profile.gpu_utilization[gpu_id] * 100, 2))})
-            if profile.blocks_per_sm_count[gpu_id] > 0:
+            if profile.sm_efficency[gpu_id] is not None:
                 gpu_metrics_data.append({"title": "Est. SM Efficiency", "value": "{} %".format(
                     round(profile.sm_efficency[gpu_id] * 100, 2))})
                 has_sm_efficiency = True
 
-            if profile.occupancy_count[gpu_id] > 0:
+            if profile.occupancy[gpu_id] is not None:
                 gpu_metrics_data.append({"title": "Est. Achieved Occupancy",
                                          "value": "{} %".format(round(profile.occupancy[gpu_id], 2))})
                 has_occupancy = True