In [None]:
import torch

In [None]:
# Turn on the `output_code` logging artifact so that the code generated for
# compiled functions is logged (see the torch._logging.set_logs documentation).
torch._logging.set_logs(output_code=True)
# Wrap torch.softmax with torch.compile; this must come after set_logs so the
# logging setting is already active when the function is first run.
f = torch.compile(torch.softmax)
# 10-element random tensor, sampled with torch.randn and then moved to the GPU.
x = torch.randn(10).cuda()
# Call the compiled softmax along dim 0; as the last expression of the cell,
# its result is what the notebook displays.
f(x, dim=0)

In [None]:
def time_pytorch_function(func, input, warmup=5):
    """Time one call of ``func(input)`` using CUDA events.

    Args:
        func: Callable invoked as ``func(input)``.
        input: The single argument forwarded to ``func``.
        warmup: Number of untimed calls to ``func(input)`` made before the
            measured call. Defaults to 5, the value that used to be
            hard-coded, so existing two-argument callers behave as before.

    Returns:
        The result of ``start.elapsed_time(end)`` for the single timed call
        (milliseconds, per the torch.cuda.Event documentation).
    """
    # CUDA is asynchronous, so Python's time module can't be used here;
    # CUDA events recorded around the call are used instead.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Warmup: untimed calls made before the measurement.
    for _ in range(warmup):
        func(input)

    start.record()
    func(input)
    end.record()
    # Block until the recorded work has finished before reading the timing.
    torch.cuda.synchronize()
    return start.elapsed_time(end)

In [None]:
def square_1(x):
    """Return the square of ``x``, computed by multiplying ``x`` with itself."""
    product = x * x
    return product
def square_2(x):
    """Return the square of ``x``, computed with the power operator."""
    squared = x ** 2
    return squared
# 10000 x 10000 random tensor, sampled with torch.randn and then moved to the GPU.
# NOTE(review): presumably the shared input for comparing square_1 / square_2 —
# confirm against the cells that use it.
b = torch.randn(10000, 10000).cuda()

In [None]:
def trace_handler(prof):
    """Print a profiler summary table and write a Chrome trace file.

    Args:
        prof: Profiler object providing ``key_averages()``, ``step_num`` and
            ``export_chrome_trace(path)``.

    The table is sorted by ``self_cuda_time_total`` with no row limit
    (``row_limit=-1``). The trace is written to
    ``/tmp/test_trace_<step_num>.json``.
    """
    averages = prof.key_averages()
    summary = averages.table(sort_by="self_cuda_time_total", row_limit=-1)
    print(summary)

    # One trace file per invocation, keyed by the profiler's current step number.
    trace_path = "/tmp/test_trace_" + str(prof.step_num) + ".json"
    prof.export_chrome_trace(trace_path)

# Profile CPU and CUDA activity while squaring a freshly generated
# 10000 x 10000 tensor on each of 10 iterations.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    # In this example with wait=1, warmup=1, active=2, repeat=1,
    # profiler will skip the first step/iteration,
    # start warming up on the second, record
    # the third and the fourth iterations,
    # after which the trace will become available
    # and on_trace_ready (when set) is called.
    # Because repeat=1, only this single cycle runs: the remaining
    # iterations of the loop below execute without being profiled.
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2,
        repeat=1),
    on_trace_ready=trace_handler
    # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
    # used when outputting for tensorboard
) as p:
    # The loop variable is deliberately not called `iter`: at notebook top
    # level that would rebind the builtin `iter` for every later cell.
    for step_idx in range(10):
        torch.square(torch.randn(10000, 10000).cuda())
        # send a signal to the profiler that the next iteration has started
        p.step()

STAGE:2024-04-07 15:58:27 62509:62509 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
            ProfilerStep*         0.16%       1.880ms        99.76%        1.162s     581.157ms             2  
              aten::randn         0.00%      51.000us        86.25%        1.005s     502.421ms             2  
              aten::empty         0.00%      50.000us         0.00%      50.000us      25.000us             2  
            aten::normal_        86.24%        1.005s        86.24%        1.005s     502.371ms             2  
                 aten::to         0.01%      65.000us        13.33%     155.310ms      38.828ms             4  
           aten::_to_copy         0.00%      53.000us        13.32%     155.245ms      77.623ms         

STAGE:2024-04-07 15:58:28 62509:62509 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-04-07 15:58:28 62509:62509 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
