Align run.py profile_one_step and run_one_step (#1296)
Summary:
Pull Request resolved: #1296

Align the printed summary for profile_one_step and run_one_step, and fix the repeat arg in the profiler schedule. repeat=1 avoids repeated saves of profiling traces, and also fixes the table generated by prof.key_averages().
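
For context, here is a minimal, self-contained sketch (not this repo's code; the one_step workload and the ./traces directory are placeholders) of how torch.profiler's schedule drives trace saving: with the default repeat=0 the wait/warmup/active cycle restarts indefinitely and on_trace_ready fires after every active phase, writing a new trace each cycle, while repeat=1 stops the schedule after a single cycle, so exactly one trace is saved and key_averages() aggregates only that cycle's recorded step.

import torch
from torch import profiler

def one_step():
    # Placeholder workload standing in for a benchmarked model's step.
    torch.randn(512, 512).mm(torch.randn(512, 512))

nwarmup = 3
with profiler.profile(
    # One cycle: wait 0 steps, warm up for nwarmup steps, record 1 active step.
    # repeat=1 ends the schedule there; repeat=0 would restart the cycle and
    # save another trace every time it completes.
    schedule=profiler.schedule(wait=0, warmup=nwarmup, active=1, repeat=1),
    activities=[profiler.ProfilerActivity.CPU],
    on_trace_ready=profiler.tensorboard_trace_handler("./traces"),
) as prof:
    for _ in range(nwarmup + 1):
        one_step()
        prof.step()  # advance the schedule; the final step is the active one

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))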

Test Plan: CI tests

Reviewed By: xuzhao9, FindHao

Differential Revision: D41170045

Pulled By: aaronenyeshi

fbshipit-source-id: 3f115670b38ccf27e9619096bcc022c32582d26b
aaronenyeshi authored and facebook-github-bot committed Nov 10, 2022
1 parent f28c72d commit 022dfe3
Showing 1 changed file with 41 additions and 25 deletions.
run.py: 66 changes (41 additions, 25 deletions)
@@ -56,6 +56,32 @@ def run_one_step_with_cudastreams(func, streamcount):
     print('{:<20} {:>20}'.format("GPU Time:", "%.3f milliseconds" % start_event.elapsed_time(end_event)), sep='')
 
 
+def printResultSummaryTime(result_summary, model_flops=None, model=None, analyzer_enabled=False, model_analyzer=None):
+    if args.device == "cuda":
+        gpu_time = np.median(list(map(lambda x: x[0], result_summary)))
+        cpu_walltime = np.median(list(map(lambda x: x[1], result_summary)))
+        if hasattr(model, "NUM_BATCHES"):
+            print('{:<20} {:>20}'.format("GPU Time per batch:", "%.3f milliseconds" %
+                (gpu_time / model.NUM_BATCHES), sep=''))
+            print('{:<20} {:>20}'.format("CPU Wall Time per batch:", "%.3f milliseconds" %
+                (cpu_walltime / model.NUM_BATCHES), sep=''))
+        else:
+            print('{:<20} {:>20}'.format("GPU Time:", "%.3f milliseconds" % gpu_time, sep=''))
+            print('{:<20} {:>20}'.format("CPU Total Wall Time:", "%.3f milliseconds" % cpu_walltime, sep=''))
+    else:
+        cpu_walltime = np.median(list(map(lambda x: x[0], result_summary)))
+        print('{:<20} {:>20}'.format("CPU Total Wall Time:", "%.3f milliseconds" % cpu_walltime, sep=''))
+
+    # if model_flops is not None, output the TFLOPs per sec
+    if model_flops:
+        if analyzer_enabled:
+            tflops = model_analyzer.calculate_flops()
+        else:
+            flops, batch_size = model_flops
+            tflops = flops * batch_size / (cpu_walltime / 1.0e3) / 1.0e12
+        print('{:<20} {:>20}'.format("FLOPS:", "%.4f TFLOPs per second" % tflops, sep=''))
+
+
 def run_one_step(func, nwarmup=WARMUP_ROUNDS, model_flops=None, num_iter=10, model=None, export_dcgm_metrics_file=False, stress=0, metrics_needed=[]):
     # Warm-up `nwarmup` rounds
     for _i in range(nwarmup):
@@ -134,44 +160,25 @@ def run_one_step(func, nwarmup=WARMUP_ROUNDS, model_flops=None, num_iter=10, model=None, export_dcgm_metrics_file=False, stress=0, metrics_needed=[]):
     if analyzer_enabled:
         model_analyzer.stop_monitor()
 
-    if args.device == "cuda":
-        gpu_time = np.median(list(map(lambda x: x[0], result_summary)))
-        cpu_walltime = np.median(list(map(lambda x: x[1], result_summary)))
-        if hasattr(model, "NUM_BATCHES"):
-            print('{:<20} {:>20}'.format("GPU Time per batch:", "%.3f milliseconds" %
-                (gpu_time / model.NUM_BATCHES), sep=''))
-            print('{:<20} {:>20}'.format("CPU Wall Time per batch:", "%.3f milliseconds" %
-                (cpu_walltime / model.NUM_BATCHES), sep=''))
-        else:
-            print('{:<20} {:>20}'.format("GPU Time:", "%.3f milliseconds" % gpu_time, sep=''))
-            print('{:<20} {:>20}'.format("CPU Total Wall Time:", "%.3f milliseconds" % cpu_walltime, sep=''))
-    else:
-        cpu_walltime = np.median(list(map(lambda x: x[0], result_summary)))
-        print('{:<20} {:>20}'.format("CPU Total Wall Time:", "%.3f milliseconds" % cpu_walltime, sep=''))
 
     if analyzer_enabled:
         model_analyzer.aggregate()
 
-    # if model_flops is not None, output the TFLOPs per sec
-    if model_flops:
-        if analyzer_enabled:
-            tflops = model_analyzer.calculate_flops()
-        else:
-            flops, batch_size = model_flops
-            tflops = flops * batch_size / (cpu_walltime / 1.0e3) / 1.0e12
-        print('{:<20} {:>20}'.format("FLOPS:", "%.4f TFLOPs per second" % tflops, sep=''))
+    printResultSummaryTime(result_summary, model_flops, model, analyzer_enabled, model_analyzer)
 
     if gpu_peak_mem_enabled:
         gpu_peak_mem = model_analyzer.calculate_gpu_peak_mem()
         print('{:<20} {:>20}'.format("GPU Peak Memory:", "%.4f GB" % gpu_peak_mem, sep=''))
     if cpu_peak_mem_enabled:
         cpu_peak_mem = model_analyzer.calculate_cpu_peak_mem()
         print('{:<20} {:>20}'.format("CPU Peak Memory:", "%.4f GB" % cpu_peak_mem, sep=''))
 
     if export_dcgm_metrics_file:
         model_analyzer.export_all_records_to_csv()
 
 
 def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
     activity_groups = []
     result_summary = []
     device_to_activity = {'cuda': profiler.ProfilerActivity.CUDA, 'cpu': profiler.ProfilerActivity.CPU}
     if args.profile_devices:
         activity_groups = [
@@ -200,17 +207,25 @@ def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
         nwarmup = 0
         eg.start()
     with profiler.profile(
-        schedule=profiler.schedule(wait=0, warmup=nwarmup, active=1),
+        schedule=profiler.schedule(wait=0, warmup=nwarmup, active=1, repeat=1),
         activities=activity_groups,
         record_shapes=args.profile_detailed,
         profile_memory=args.profile_detailed,
         with_stack=args.profile_detailed,
         with_flops=args.profile_detailed,
         on_trace_ready=profiler.tensorboard_trace_handler(args.profile_folder)
     ) as prof:
-        for _i in range(nwarmup + 1):
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        for i in range(nwarmup + 1):
+            t0 = time.time_ns()
+            start_event.record()
             func()
             torch.cuda.synchronize() # Need to sync here to match run_one_step()'s timed run.
+            end_event.record()
+            t1 = time.time_ns()
+            if i >= nwarmup:
+                result_summary.append((start_event.elapsed_time(end_event), (t1 - t0) / 1_000_000))
             prof.step()
     if args.profile_eg and eg:
         eg.stop()
@@ -219,6 +234,7 @@ def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
     print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=30))
     print(f"Saved TensorBoard Profiler traces to {args.profile_folder}.")
 
+    printResultSummaryTime(result_summary)
 
 def _validate_devices(devices: str):
     devices_list = devices.split(",")
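
To make the alignment concrete, the following is a standalone sketch of the measurement pattern the two functions now share (assumptions: a CUDA device is available; timed_runs and its workload are illustrative names, not this repo's API). GPU time comes from a pair of CUDA events, CPU wall time from time.time_ns(), and each timed iteration appends a (gpu_ms, cpu_ms) tuple to result_summary so a single helper, like the new printResultSummaryTime, can report medians for both.

import time
import numpy as np
import torch

def timed_runs(func, nwarmup=3, num_iter=10):
    result_summary = []
    for _ in range(nwarmup):  # warm-up rounds are not recorded
        func()
    torch.cuda.synchronize()
    for _ in range(num_iter):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        t0 = time.time_ns()
        start_event.record()
        func()
        end_event.record()
        torch.cuda.synchronize()  # wait for the GPU so both timings are complete
        t1 = time.time_ns()
        # (GPU milliseconds, CPU wall-clock milliseconds), as in result_summary
        result_summary.append((start_event.elapsed_time(end_event),
                               (t1 - t0) / 1_000_000))
    # Median of each column, printed in the same aligned two-column format.
    gpu_time = np.median(list(map(lambda x: x[0], result_summary)))
    cpu_walltime = np.median(list(map(lambda x: x[1], result_summary)))
    print('{:<20} {:>20}'.format("GPU Time:", "%.3f milliseconds" % gpu_time))
    print('{:<20} {:>20}'.format("CPU Total Wall Time:", "%.3f milliseconds" % cpu_walltime))

timed_runs(lambda: torch.mm(torch.randn(1024, 1024, device="cuda"),
                            torch.randn(1024, 1024, device="cuda")))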
