From 1bf6ab266e0ed1b2db64bc2db4d13ca20040e062 Mon Sep 17 00:00:00 2001 From: Teng Gao Date: Tue, 29 Jun 2021 14:45:39 +0800 Subject: [PATCH 1/2] workaround for negative gpu metrics from input json file --- .../profiler/gpu_metrics_parser.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py index 613babbf0..0d7bb4d8e 100644 --- a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py +++ b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py @@ -172,10 +172,20 @@ def parse_event(self, event): if gpu_id not in self.gpu_ids: self.gpu_ids.add(gpu_id) self.kernel_ranges_per_device[gpu_id].append((ts, ts + dur)) - self.blocks_per_sm_per_device[gpu_id].append((ts, ts + dur, event.args.get("blocks per SM", 0.0))) - self.occupancy_per_device[gpu_id].append((ts, ts + dur, - event.args.get("est. achieved occupancy %", 0.0))) if "blocks per SM" in event.args: - self.blocks_per_sm_count[gpu_id] += 1 + blocks_per_sm = event.args.get("blocks per SM") + if blocks_per_sm >= 0.0: + self.blocks_per_sm_per_device[gpu_id].append((ts, ts + dur, blocks_per_sm)) + self.blocks_per_sm_count[gpu_id] += 1 + else: + # Workaround for negative value input. + logger.warning("blocks per SM {} is negative!".format(blocks_per_sm)) + if "est. achieved occupancy %" in event.args: - self.occupancy_count[gpu_id] += 1 + occupancy = event.args.get("est. achieved occupancy %") + if occupancy >= 0.0: + self.occupancy_per_device[gpu_id].append((ts, ts + dur, occupancy)) + self.occupancy_count[gpu_id] += 1 + else: + # Workaround for negative value input. + logger.warning("est. achieved occupancy % {} is negative!".format(occupancy)) From 8b6b3e444d767fb113e6b16bc36b3a115b8b2d6f Mon Sep 17 00:00:00 2001 From: Teng Gao Date: Tue, 29 Jun 2021 16:38:09 +0800 Subject: [PATCH 2/2] refine --- tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py index 0d7bb4d8e..88efe3da8 100644 --- a/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py +++ b/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py @@ -174,12 +174,12 @@ def parse_event(self, event): self.kernel_ranges_per_device[gpu_id].append((ts, ts + dur)) if "blocks per SM" in event.args: blocks_per_sm = event.args.get("blocks per SM") - if blocks_per_sm >= 0.0: + if blocks_per_sm > 0.0: self.blocks_per_sm_per_device[gpu_id].append((ts, ts + dur, blocks_per_sm)) self.blocks_per_sm_count[gpu_id] += 1 else: # Workaround for negative value input. - logger.warning("blocks per SM {} is negative!".format(blocks_per_sm)) + logger.warning("blocks per SM {} with ts {} is not positive!".format(blocks_per_sm, ts)) if "est. achieved occupancy %" in event.args: occupancy = event.args.get("est. achieved occupancy %") @@ -188,4 +188,4 @@ def parse_event(self, event): self.occupancy_count[gpu_id] += 1 else: # Workaround for negative value input. - logger.warning("est. achieved occupancy % {} is negative!".format(occupancy)) + logger.warning("est. achieved occupancy % {} with ts {} is negative!".format(occupancy, ts))