Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions tb_plugin/test/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,95 @@ def test_gpu_utilization(self):
count += 1
self.assertEqual(count, 2)

# Test that the three GPU metrics (utilization, est. SM efficiency, est. achieved
# occupancy) are computed without error when every kernel runs outside the
# ProfilerStep window.
#
# In the trace below ProfilerStep#2 covers ts 100-200, while the two kernels run
# at ts 60-80 and ts 240-265, so no kernel time overlaps the step.
def test_gpu_utilization_kernel_out_of_step(self):
    json_content = """
[{
"ph": "X", "cat": "Operator",
"name": "aten::mat_mul", "pid": 13721, "tid": "456",
"ts": 10, "dur": 10,
"args": {"Input Dims": [], "External id": 1}
},
{
"ph": "X", "cat": "Operator",
"name": "aten::mm", "pid": 13721, "tid": "456",
"ts": 120, "dur": 70,
"args": {"Input Dims": [], "External id": 3}
},
{
"ph": "X", "cat": "Operator",
"name": "aten::mm", "pid": 13721, "tid": "456",
"ts": 220, "dur": 20,
"args": {"Input Dims": [], "External id": 4}
},
{
"ph": "X", "cat": "Operator",
"name": "ProfilerStep#2", "pid": 13721, "tid": "456",
"ts": 100, "dur": 100,
"args": {"Input Dims": [], "External id": 2}
},
{
"ph": "X", "cat": "Kernel",
"name": "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>", "pid": 1, "tid": "stream 7",
"ts": 60, "dur": 20,
"args": {"correlation": 334, "external id": 1, "device": 1,
"blocks per SM": 0.5, "est. achieved occupancy %": 0.6}
},
{
"ph": "X", "cat": "Runtime",
"name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
"ts": 15, "dur": 5,
"args": {"correlation": 334, "external id": 1}
},
{
"ph": "X", "cat": "Kernel",
"name": "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>", "pid": 1, "tid": "stream 7",
"ts": 240, "dur": 25,
"args": {"correlation": 337, "external id": 4, "device": 1,
"blocks per SM": 10.5, "est. achieved occupancy %": 0.3}
},
{
"ph": "X", "cat": "Runtime",
"name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
"ts": 230, "dur": 10,
"args": {"correlation": 337, "external id": 4}
}]
"""
    profile = parse_json_trace(json_content)
    profile.process()

    # Device-level metrics: nothing ran inside the step, so utilization is zero
    # and the two averaged metrics are expected to stay unset.
    self.assertEqual(len(profile.gpu_ids), 1)
    self.assertAlmostEqual(profile.gpu_utilization[1], 0.0)
    self.assertIsNone(profile.sm_efficency[1])
    self.assertIsNone(profile.occupancy[1])
    # The raw per-kernel samples must still have been counted.
    self.assertGreater(profile.blocks_per_sm_count[1], 0)
    self.assertGreater(profile.occupancy_count[1], 0)

    kernel_name = "void cunn_ClassNLLCriterion_updateGradInput_kernel<float>"

    # Per (kernel, operator) aggregation: each launch keeps its own values.
    count = 0
    for agg_by_op in profile.kernel_list_groupby_name_op:
        if agg_by_op.name == kernel_name and agg_by_op.op_name == "aten::mat_mul":
            self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 0.5)
            self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.6)
            count += 1
        if agg_by_op.name == kernel_name and agg_by_op.op_name == "aten::mm":
            self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 10.5)
            self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.3)
            count += 1
    self.assertEqual(count, 2)

    # Per-kernel statistics: both launches (durations 20 and 25) are merged
    # into one row, averaged with each launch weighted by its duration.
    count = 0
    for name, row in profile.kernel_stat.iterrows():
        if name == kernel_name:
            self.assertAlmostEqual(row["blocks_per_sm"], (20 * 0.5 + 25 * 10.5) / (20 + 25))
            self.assertAlmostEqual(row["occupancy"], (20 * 0.6 + 25 * 0.3) / (20 + 25))
            count += 1
    self.assertEqual(count, 1)

def test_dump_gpu_metrics(self):
profile = RunProfile("test_dump_gpu_metrics", None)
# Fake data, chosen so it is easy to see in the UI. Real data values are 1/100 of these.
Expand Down
17 changes: 10 additions & 7 deletions tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,20 @@ def calculate_avg(approximated_sm_efficency_ranges, total_dur):
self.blocks_per_sm_per_device = None # Release memory.

# Weighted average. Weighted by kernel's time duration.
def calculate_occupancy(self, steps_start_time=float("-inf"), steps_end_time=float("inf")):
    """Compute each GPU's average achieved occupancy inside the step window.

    Every entry of ``self.occupancy_per_device[gpu_id]`` is read as
    ``(start, end, occupancy)``. Each entry is clipped to
    ``[steps_start_time, steps_end_time]`` and contributes its occupancy
    weighted by the clipped duration; entries that do not overlap the
    window contribute nothing.

    Args:
        steps_start_time: Lower bound of the window. The default is
            unbounded, which keeps a zero-argument call working and
            averages over every recorded range.
        steps_end_time: Upper bound of the window. Unbounded by default.

    The result is written to ``self.avg_occupancy_per_device[gpu_id]``.
    When a device has no kernel time inside the window, its entry is left
    untouched instead of dividing by zero.
    NOTE(review): presumably that entry is pre-initialised to None
    elsewhere in the class -- confirm.
    """
    for gpu_id in self.gpu_ids:
        total_time = 0
        total_occupancy = 0.0
        for r in self.occupancy_per_device[gpu_id]:
            # Keep only the part of the kernel that lies inside the window.
            clipped_start = max(r[0], steps_start_time)
            clipped_end = min(r[1], steps_end_time)
            if clipped_start < clipped_end:
                dur = clipped_end - clipped_start
                total_occupancy += r[2] * dur
                total_time += dur
        # Guard the division: a device may have no kernel time in the window.
        if total_time > 0:
            self.avg_occupancy_per_device[gpu_id] = total_occupancy / total_time

def parse_events(self, events, global_start_time, global_end_time, steps_start_time, steps_end_time):
logger.debug("GPU Metrics, parse events")
Expand All @@ -156,7 +159,7 @@ def parse_events(self, events, global_start_time, global_end_time, steps_start_t

self.calculate_gpu_utilization(global_start_time, global_end_time, steps_start_time, steps_end_time)
self.calculate_approximated_sm_efficency(steps_start_time, steps_end_time)
self.calculate_occupancy()
self.calculate_occupancy(steps_start_time, steps_end_time)

def parse_event(self, event):
ts = event.ts
Expand Down
2 changes: 0 additions & 2 deletions tb_plugin/torch_tb_profiler/profiler/run_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ def generate_run_profile(self):
profile_run.gpu_utilization = self.profile_data.gpu_utilization
profile_run.sm_efficency = self.profile_data.sm_efficency
profile_run.occupancy = self.profile_data.occupancy
profile_run.blocks_per_sm_count = self.profile_data.blocks_per_sm_count
profile_run.occupancy_count = self.profile_data.occupancy_count

# add memory stats
if self.profile_data.has_memory_data:
Expand Down
4 changes: 2 additions & 2 deletions tb_plugin/torch_tb_profiler/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,12 @@ def get_gpu_metrics_data(profile):
gpu_metrics_data.append({"title": "GPU Utilization",
"value": "{} %".format(
round(profile.gpu_utilization[gpu_id] * 100, 2))})
if profile.blocks_per_sm_count[gpu_id] > 0:
if profile.sm_efficency[gpu_id] is not None:
gpu_metrics_data.append({"title": "Est. SM Efficiency",
"value": "{} %".format(
round(profile.sm_efficency[gpu_id] * 100, 2))})
has_sm_efficiency = True
if profile.occupancy_count[gpu_id] > 0:
if profile.occupancy[gpu_id] is not None:
gpu_metrics_data.append({"title": "Est. Achieved Occupancy",
"value": "{} %".format(round(profile.occupancy[gpu_id], 2))})
has_occupancy = True
Expand Down