From 6a81dd29db397f84b2c097b8db732e5f357e1131 Mon Sep 17 00:00:00 2001
From: guyang3532
Date: Tue, 15 Jun 2021 19:20:39 +0800
Subject: [PATCH] Merge from branch tb_plugin

---
 tb_plugin/README.md                   | 30 +++++++++++------
 tb_plugin/test/result_check_file.txt  |  8 ++---
 tb_plugin/torch_tb_profiler/consts.py | 33 +++++++++++--------
 .../profiler/run_generator.py         |  2 +-
 tb_plugin/torch_tb_profiler/run.py    |  2 +-
 5 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/tb_plugin/README.md b/tb_plugin/README.md
index e0fbdb2b0..f518d3a5c 100644
--- a/tb_plugin/README.md
+++ b/tb_plugin/README.md
@@ -24,7 +24,7 @@ and give optimization recommendations.
 * Build the wheel
   - `python setup.py build_fe sdist bdist_wheel` \
-    **_Note_**: the build_fe step need setup yarn and nodejs
+    **_Note_**: the build_fe step requires yarn and Node.js to be set up first
   - `python setup.py sdist bdist_wheel`

 ### Quick Start Instructions

@@ -37,12 +37,12 @@ and give optimization recommendations.
   [kineto/tb_plugin/examples/resnet50_profiler_api.py](https://github.com/pytorch/kineto/blob/master/tb_plugin/examples/resnet50_profiler_api.py).
   You can also learn how to profile your model and generate profiling data from
   [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html?highlight=tensorboard).
-  Note: The recommended way to produce profiling data is assigning "torch.profiler.tensorboard_trace_handler"
-  to "on_trace_ready" on creation of "torch.profiler.schedule".
+  Note: The recommended way to produce profiling data is to assign `torch.profiler.tensorboard_trace_handler`
+  to `on_trace_ready` when creating `torch.profiler.profile`, as in the sketch below.
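  For reference, here is a minimal sketch of that pattern, adapted from the linked PyTorch Profiler tutorial (it is not part of this patch; the ResNet-50 model, the step counts, and the `./samples/resnet50` output folder are illustrative choices):

  ```python
  # Minimal sketch: produce TensorBoard profiling data with torch.profiler.
  # Assumes PyTorch >= 1.8.1 and torchvision; falls back to CPU when no GPU is present.
  import torch
  import torch.profiler
  import torchvision.models as models

  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = models.resnet50().to(device)
  inputs = torch.randn(8, 3, 224, 224, device=device)

  with torch.profiler.profile(
          # Skip step 0, warm up on step 1, record steps 2-4.
          schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
          # Write one trace file per recorded span under ./samples/resnet50.
          on_trace_ready=torch.profiler.tensorboard_trace_handler('./samples/resnet50'),
          record_shapes=True,
  ) as prof:
      for _ in range(8):
          model(inputs)
          prof.step()  # mark a step boundary so the schedule advances

  ```

  The trace files this writes are what the `tensorboard --logdir=./samples` command in the next step picks up.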
 * Start TensorBoard

-  Specify the profiling data folder to "logdir" in Tensorboard. If you use the above samples data, start TensorBoard with:
+  Specify the profiling data folder as `logdir` to TensorBoard. If you use the above sample data, start TensorBoard with:

   `tensorboard --logdir=./samples`

@@ -56,14 +56,17 @@ and give optimization recommendations.
 * Open TensorBoard in Chrome browser

   Open URL `http://localhost:6006` in the browser.
-  If you use '--bind_all' in tensorboard start cmd, the hostname may not be 'localhost'. You may find it in the log printed after the cmd.
+  If you use `--bind_all` in the TensorBoard start command, the hostname may not be 'localhost'; you can find it in the log printed after the command.

 * Navigate to PYTORCH_PROFILER tab

   If the files under `--logdir` are too big or too many, please wait a while and refresh the browser to check the latest loaded result.

-* Also support loading profiling data stored in AWS(S3://), Azure blob(https://<account>.blob.core.windows.net) and Google Cloud(GS://)
-  * S3: install boto3. set environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\
+
+* Loading profiling data from cloud
+  * S3 (S3://)
+
+    Install `boto3` and set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\
     For minio, the S3 url should start with the bucket name `s3://<bucket>/<folder>/` instead of the minio prefix `s3://minio/<bucket>/<folder>/`; in that case `S3_ENDPOINT` is needed as well. \
     For example, after following these guides, the following commands can be used to set up a minio storage:
     * Server: https://docs.min.io/docs/minio-quickstart-guide.html
     * Client: https://docs.min.io/docs/minio-client-quickstart-guide.html

     ```
     export AWS_ACCESS_KEY_ID=minioadmin
     export AWS_SECRET_ACCESS_KEY=minioadmin
     export AWS_REGION=us-east-1
     export S3_USE_HTTPS=0
     export S3_VERIFY_SSL=0
     export S3_ENDPOINT=http://localhost:9000
     tensorboard --logdir=s3://profiler/version_2/ --bind_all
     ```
-  * Azure Blob: install azure-storage-blob. Optionally, set environment variable `AZURE_STORAGE_CONNECTION_STRING`
-  * Google Cloud: install google-cloud-storage.
+
+  * Azure blob (https://<account>.blob.core.windows.net)
+
+    Install `azure-storage-blob`. Optionally, set the environment variable `AZURE_STORAGE_CONNECTION_STRING`.
+
+  * Google Cloud (GS://)
+
+    Install `google-cloud-storage`.
+
 ---
 > **_NOTES:_** For AWS, Google Cloud and Azure Blob, the trace files need to be put in a top-level folder under the bucket/container.
 ---

@@ -93,7 +103,7 @@ and give optimization recommendations.
   and open tensorboard in browser to see all the views described below.

-  Note: for accessing data in azure blob, you need to install torch-tb-profiler with cmd: `pip install torch-tb-profiler[blob]`
+  Note: to access data in Azure Blob, install torch-tb-profiler with `pip install torch-tb-profiler[blob]`

 ### Quick Usage Instructions

diff --git a/tb_plugin/test/result_check_file.txt b/tb_plugin/test/result_check_file.txt
index c59f1ad50..2f041f283 100644
--- a/tb_plugin/test/result_check_file.txt
+++ b/tb_plugin/test/result_check_file.txt
@@ -1,10 +1,10 @@
-{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "
Step 5
Total: 187948us
Kernel: 98598us
Percentage: 52.46%
", 1941, "
Step 5
Total: 187948us
Memcpy: 1941us
Percentage: 1.03%
", 90, "
Step 5
Total: 187948us
Memset: 90us
Percentage: 0.05%
", 2796, "
Step 5
Total: 187948us
Runtime: 2796us
Percentage: 1.49%
", 69317, "
Step 5
Total: 187948us
DataLoader: 69317us
Percentage: 36.88%
", 14091, "
Step 5
Total: 187948us
CPU Exec: 14091us
Percentage: 7.5%
", 1115, "
Step 5
Total: 187948us
Other: 1115us
Percentage: 0.59%
"], ["6", 98570, "
Step 6
Total: 175153us
Kernel: 98570us
Percentage: 56.28%
", 1947, "
Step 6
Total: 175153us
Memcpy: 1947us
Percentage: 1.11%
", 89, "
Step 6
Total: 175153us
Memset: 89us
Percentage: 0.05%
", 2762, "
Step 6
Total: 175153us
Runtime: 2762us
Percentage: 1.58%
", 57669, "
Step 6
Total: 175153us
DataLoader: 57669us
Percentage: 32.92%
", 12968, "
Step 6
Total: 175153us
CPU Exec: 12968us
Percentage: 7.4%
", 1148, "
Step 6
Total: 175153us
Other: 1148us
Percentage: 0.66%
"], ["7", 98596, "
Step 7
Total: 179733us
Kernel: 98596us
Percentage: 54.86%
", 1931, "
Step 7
Total: 179733us
Memcpy: 1931us
Percentage: 1.07%
", 91, "
Step 7
Total: 179733us
Memset: 91us
Percentage: 0.05%
", 2877, "
Step 7
Total: 179733us
Runtime: 2877us
Percentage: 1.6%
", 61257, "
Step 7
Total: 179733us
DataLoader: 61257us
Percentage: 34.08%
", 13768, "
Step 7
Total: 179733us
CPU Exec: 13768us
Percentage: 7.66%
", 1213, "
Step 7
Total: 179733us
Other: 1213us
Percentage: 0.67%
"], ["8", 98623, "
Step 8
Total: 174564us
Kernel: 98623us
Percentage: 56.5%
", 1938, "
Step 8
Total: 174564us
Memcpy: 1938us
Percentage: 1.11%
", 89, "
Step 8
Total: 174564us
Memset: 89us
Percentage: 0.05%
", 2841, "
Step 8
Total: 174564us
Runtime: 2841us
Percentage: 1.63%
", 56453, "
Step 8
Total: 174564us
DataLoader: 56453us
Percentage: 32.34%
", 13420, "
Step 8
Total: 174564us
CPU Exec: 13420us
Percentage: 7.69%
", 1200, "
Step 8
Total: 174564us
Other: 1200us
Percentage: 0.69%
"], ["9", 98504, "
Step 9
Total: 182172us
Kernel: 98504us
Percentage: 54.07%
", 1937, "
Step 9
Total: 182172us
Memcpy: 1937us
Percentage: 1.06%
", 87, "
Step 9
Total: 182172us
Memset: 87us
Percentage: 0.05%
", 2788, "
Step 9
Total: 182172us
Runtime: 2788us
Percentage: 1.53%
", 62690, "
Step 9
Total: 182172us
DataLoader: 62690us
Percentage: 34.41%
", 15025, "
Step 9
Total: 182172us
CPU Exec: 15025us
Percentage: 8.25%
", 1141, "
Step 9
Total: 182172us
Other: 1141us
Percentage: 0.63%
"], ["10", 98641, "
Step 10
Total: 165983us
Kernel: 98641us
Percentage: 59.43%
", 1798, "
Step 10
Total: 165983us
Memcpy: 1798us
Percentage: 1.08%
", 88, "
Step 10
Total: 165983us
Memset: 88us
Percentage: 0.05%
", 3381, "
Step 10
Total: 165983us
Runtime: 3381us
Percentage: 2.04%
", 48185, "
Step 10
Total: 165983us
DataLoader: 48185us
Percentage: 29.03%
", 12773, "
Step 10
Total: 165983us
CPU Exec: 12773us
Percentage: 7.7%
", 1117, "
Step 10
Total: 165983us
Other: 1117us
Percentage: 0.67%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "
  • This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and enable multi-processes on data loading. Reference: Single- and Multi-process Data Loading
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}} +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "
Step 5
Total: 187948us
Kernel: 98598us
Percentage: 52.46%
", 1941, "
Step 5
Total: 187948us
Memcpy: 1941us
Percentage: 1.03%
", 90, "
Step 5
Total: 187948us
Memset: 90us
Percentage: 0.05%
", 2796, "
Step 5
Total: 187948us
Runtime: 2796us
Percentage: 1.49%
", 69317, "
Step 5
Total: 187948us
DataLoader: 69317us
Percentage: 36.88%
", 14091, "
Step 5
Total: 187948us
CPU Exec: 14091us
Percentage: 7.5%
", 1115, "
Step 5
Total: 187948us
Other: 1115us
Percentage: 0.59%
"], ["6", 98570, "
Step 6
Total: 175153us
Kernel: 98570us
Percentage: 56.28%
", 1947, "
Step 6
Total: 175153us
Memcpy: 1947us
Percentage: 1.11%
", 89, "
Step 6
Total: 175153us
Memset: 89us
Percentage: 0.05%
", 2762, "
Step 6
Total: 175153us
Runtime: 2762us
Percentage: 1.58%
", 57669, "
Step 6
Total: 175153us
DataLoader: 57669us
Percentage: 32.92%
", 12968, "
Step 6
Total: 175153us
CPU Exec: 12968us
Percentage: 7.4%
", 1148, "
Step 6
Total: 175153us
Other: 1148us
Percentage: 0.66%
"], ["7", 98596, "
Step 7
Total: 179733us
Kernel: 98596us
Percentage: 54.86%
", 1931, "
Step 7
Total: 179733us
Memcpy: 1931us
Percentage: 1.07%
", 91, "
Step 7
Total: 179733us
Memset: 91us
Percentage: 0.05%
", 2877, "
Step 7
Total: 179733us
Runtime: 2877us
Percentage: 1.6%
", 61257, "
Step 7
Total: 179733us
DataLoader: 61257us
Percentage: 34.08%
", 13768, "
Step 7
Total: 179733us
CPU Exec: 13768us
Percentage: 7.66%
", 1213, "
Step 7
Total: 179733us
Other: 1213us
Percentage: 0.67%
"], ["8", 98623, "
Step 8
Total: 174564us
Kernel: 98623us
Percentage: 56.5%
", 1938, "
Step 8
Total: 174564us
Memcpy: 1938us
Percentage: 1.11%
", 89, "
Step 8
Total: 174564us
Memset: 89us
Percentage: 0.05%
", 2841, "
Step 8
Total: 174564us
Runtime: 2841us
Percentage: 1.63%
", 56453, "
Step 8
Total: 174564us
DataLoader: 56453us
Percentage: 32.34%
", 13420, "
Step 8
Total: 174564us
CPU Exec: 13420us
Percentage: 7.69%
", 1200, "
Step 8
Total: 174564us
Other: 1200us
Percentage: 0.69%
"], ["9", 98504, "
Step 9
Total: 182172us
Kernel: 98504us
Percentage: 54.07%
", 1937, "
Step 9
Total: 182172us
Memcpy: 1937us
Percentage: 1.06%
", 87, "
Step 9
Total: 182172us
Memset: 87us
Percentage: 0.05%
", 2788, "
Step 9
Total: 182172us
Runtime: 2788us
Percentage: 1.53%
", 62690, "
Step 9
Total: 182172us
DataLoader: 62690us
Percentage: 34.41%
", 15025, "
Step 9
Total: 182172us
CPU Exec: 15025us
Percentage: 8.25%
", 1141, "
Step 9
Total: 182172us
Other: 1141us
Percentage: 0.63%
"], ["10", 98641, "
Step 10
Total: 165983us
Kernel: 98641us
Percentage: 59.43%
", 1798, "
Step 10
Total: 165983us
Memcpy: 1798us
Percentage: 1.08%
", 88, "
Step 10
Total: 165983us
Memset: 88us
Percentage: 0.05%
", 3381, "
Step 10
Total: 165983us
Runtime: 3381us
Percentage: 2.04%
", 48185, "
Step 10
Total: 165983us
DataLoader: 48185us
Percentage: 29.03%
", 12773, "
Step 10
Total: 165983us
CPU Exec: 12773us
Percentage: 7.7%
", 1117, "
Step 10
Total: 165983us
Other: 1117us
Percentage: 0.67%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "
  • This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and enable multi-processes on data loading. Reference: Single- and Multi-process Data Loading
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The bigger, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The bigger, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization."}} {"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 273428], ["CudnnConvolutionBackward", 273428], ["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::_convolution", 126619], ["aten::convolution", 126619], ["aten::conv2d", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["CudnnBatchNormBackward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::_batch_norm_impl_index", 34245], ["aten::batch_norm", 34245], ["aten::threshold_backward", 27298], ["ReluBackward1", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::clamp_min_", 17860], ["aten::relu_", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::to", 11492], ["aten::max_pool2d_with_indices_backward", 4677], ["MaxPool2DWithIndicesBackward", 4677], ["torch::autograd::AccumulateGrad", 3030], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::zero_", 1881], ["aten::max_pool2d_with_indices", 1420], ["aten::max_pool2d", 1420], ["aten::mm", 275], ["AddmmBackward", 275], ["aten::mean", 212], ["aten::adaptive_avg_pool2d", 212], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 144], ["MeanBackward1", 144], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 53], ["LogSoftmaxBackward", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", 
"name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::threshold_backward", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::max_pool2d_with_indices_backward", 3822], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::max_pool2d_with_indices", 1420], ["aten::mm", 275], ["aten::mean", 212], ["aten::addmm", 197], ["aten::div", 144], ["aten::_log_softmax_backward_data", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 90989], ["aten::batch_norm", 87977], ["aten::cudnn_convolution_backward", 87772], ["aten::add_", 78125], ["aten::_batch_norm_impl_index", 78071], ["aten::conv2d", 77781], ["aten::cudnn_batch_norm", 71527], ["aten::convolution", 70394], ["aten::empty", 68147], ["aten::to", 64332], ["aten::_convolution", 64243], ["aten::cudnn_convolution", 56998], ["aten::copy_", 52853], ["aten::cudnn_convolution_backward_input", 41445], ["aten::cudnn_convolution_backward_weight", 40246], ["aten::div", 35158], ["CudnnBatchNormBackward", 34608], ["aten::contiguous", 31137], ["aten::cudnn_batch_norm_backward", 30460], ["aten::mul_", 29081], ["torch::autograd::AccumulateGrad", 28494], ["aten::zero_", 27597], ["aten::empty_like", 26064], ["aten::stack", 24346], ["aten::relu_", 24181], ["aten::add", 19289], ["aten::cat", 17085], ["aten::fill_", 17059], ["aten::_cat", 16933], ["aten::clamp_min_", 15665], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::empty_strided", 11829], ["ReluBackward1", 11656], ["aten::clamp_min", 10311], ["aten::permute", 9775], ["aten::threshold_backward", 9482], ["aten::as_strided", 7600], ["aten::unsqueeze", 6603], ["aten::linear", 1408], ["AddmmBackward", 1303], ["aten::cross_entropy_loss", 1180], ["aten::zeros", 1105], ["aten::addmm", 1034], ["MeanBackward1", 987], ["aten::mm", 860], ["NllLossBackward", 716], ["aten::max_pool2d", 687], ["aten::nll_loss_backward", 614], ["aten::t", 584], ["aten::log_softmax", 567], ["aten::max_pool2d_with_indices", 562], ["aten::adaptive_avg_pool2d", 561], ["aten::nll_loss_nd", 495], ["MaxPool2DWithIndicesBackward", 484], ["aten::ones_like", 452], ["aten::mean", 445], ["aten::_log_softmax", 433], ["aten::nll_loss", 414], ["aten::max_pool2d_with_indices_backward", 411], ["LogSoftmaxBackward", 359], ["aten::narrow", 350], ["aten::nll_loss_forward", 346], ["aten::transpose", 329], ["aten::sum", 327], ["aten::_log_softmax_backward_data", 306], ["aten::expand", 229], ["aten::slice", 223], ["aten::detach_", 208], ["AddBackward0", 175], ["aten::flatten", 164], ["TBackward", 103], ["detach_", 100], ["ViewBackward", 80], ["aten::reshape", 55], ["aten::conj", 12]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 68147], ["aten::add_", 51013], ["aten::copy_", 40255], ["aten::cudnn_convolution", 33121], ["aten::cudnn_convolution_backward_input", 29324], ["aten::cudnn_convolution_backward_weight", 22804], ["aten::mul_", 20515], ["aten::div", 20135], ["aten::cudnn_batch_norm", 19843], ["aten::_cat", 16282], ["aten::to", 14834], 
["aten::add", 14329], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::cudnn_batch_norm_backward", 12238], ["aten::empty_strided", 11829], ["aten::empty_like", 11742], ["aten::zero_", 10693], ["aten::batch_norm", 9906], ["aten::fill_", 9879], ["aten::relu_", 8516], ["aten::as_strided", 7600], ["aten::conv2d", 7387], ["aten::_convolution", 7245], ["aten::clamp_min", 7106], ["aten::_batch_norm_impl_index", 6544], ["aten::convolution", 6151], ["aten::threshold_backward", 6090], ["aten::cudnn_convolution_backward", 6081], ["aten::permute", 5515], ["aten::contiguous", 5510], ["torch::autograd::AccumulateGrad", 5457], ["aten::clamp_min_", 5354], ["CudnnBatchNormBackward", 4148], ["aten::unsqueeze", 3574], ["CudnnConvolutionBackward", 3217], ["ReluBackward1", 2174], ["aten::zeros", 659], ["aten::stack", 658], ["aten::addmm", 639], ["aten::mm", 575], ["MeanBackward1", 541], ["aten::max_pool2d_with_indices", 477], ["aten::nll_loss_backward", 388], ["aten::nll_loss_forward", 266], ["aten::t", 255], ["aten::mean", 234], ["aten::transpose", 197], ["AddmmBackward", 182], ["aten::max_pool2d_with_indices_backward", 176], ["AddBackward0", 175], ["aten::_log_softmax", 170], ["aten::sum", 153], ["aten::cat", 152], ["aten::expand", 150], ["aten::narrow", 127], ["aten::max_pool2d", 125], ["aten::linear", 124], ["aten::slice", 123], ["aten::cross_entropy_loss", 118], ["aten::adaptive_avg_pool2d", 116], ["aten::detach_", 108], ["aten::_log_softmax_backward_data", 108], ["NllLossBackward", 102], ["detach_", 100], ["aten::ones_like", 95], ["aten::log_softmax", 90], ["aten::flatten", 84], ["aten::nll_loss_nd", 81], ["MaxPool2DWithIndicesBackward", 73], ["aten::nll_loss", 68], ["LogSoftmaxBackward", 53], ["aten::reshape", 29], ["ViewBackward", 25], ["TBackward", 18], ["aten::conj", 12]]}} [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 142461, "device_total_duration": 142461, "host_self_duration": 22804, "host_total_duration": 40246, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 130967, "device_total_duration": 130967, "host_self_duration": 29324, "host_total_duration": 41445, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 126619, "device_total_duration": 126619, "host_self_duration": 33121, "host_total_duration": 56998, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61939, "device_total_duration": 61939, "host_self_duration": 12238, "host_total_duration": 30460, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34245, "device_total_duration": 34245, "host_self_duration": 19843, "host_total_duration": 71527, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27298, "device_total_duration": 27298, "host_self_duration": 6090, "host_total_duration": 9482, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24098, "device_total_duration": 24098, "host_self_duration": 51013, "host_total_duration": 78125, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17860, "device_total_duration": 17860, "host_self_duration": 7106, "host_total_duration": 10311, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16038, "device_total_duration": 16038, "host_self_duration": 14329, 
"host_total_duration": 19289, "has_call_stack": true}, {"name": "aten::copy_", "calls": 588, "device_self_duration": 11492, "device_total_duration": 11492, "host_self_duration": 40255, "host_total_duration": 52853, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3822, "device_total_duration": 4677, "host_self_duration": 176, "host_total_duration": 411, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2409, "device_total_duration": 2409, "host_self_duration": 20515, "host_total_duration": 29081, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1887, "device_total_duration": 1887, "host_self_duration": 9879, "host_total_duration": 17059, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1420, "device_total_duration": 1420, "host_self_duration": 477, "host_total_duration": 562, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 275, "device_total_duration": 275, "host_self_duration": 575, "host_total_duration": 860, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 212, "device_total_duration": 212, "host_self_duration": 234, "host_total_duration": 445, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 639, "host_total_duration": 1034, "has_call_stack": true}, {"name": "aten::div", "calls": 198, "device_self_duration": 144, "device_total_duration": 144, "host_self_duration": 20135, "host_total_duration": 35158, "has_call_stack": true}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 53, "device_total_duration": 53, "host_self_duration": 108, "host_total_duration": 306, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 44, "device_total_duration": 44, "host_self_duration": 153, "host_total_duration": 327, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 170, "host_total_duration": 433, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 266, "host_total_duration": 346, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 388, "host_total_duration": 614, "has_call_stack": false}, {"name": "aten::empty", "calls": 4404, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 68147, "host_total_duration": 68147, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1881, "host_self_duration": 10693, "host_total_duration": 27597, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 659, "host_total_duration": 1105, "has_call_stack": true}, {"name": "aten::view", "calls": 846, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 14027, "host_total_duration": 14027, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 432, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 7600, "host_total_duration": 7600, "has_call_stack": true}, {"name": "aten::permute", "calls": 192, 
"device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5515, "host_total_duration": 9775, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 528, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11742, "host_total_duration": 26064, "has_call_stack": true}, {"name": "aten::contiguous", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5510, "host_total_duration": 31137, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 402, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11829, "host_total_duration": 11829, "has_call_stack": true}, {"name": "aten::to", "calls": 414, "device_self_duration": 0, "device_total_duration": 11492, "host_self_duration": 14834, "host_total_duration": 64332, "has_call_stack": true}, {"name": "aten::unsqueeze", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 3574, "host_total_duration": 6603, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1902, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12406, "host_total_duration": 12406, "has_call_stack": true}, {"name": "aten::slice", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 123, "host_total_duration": 223, "has_call_stack": true}, {"name": "aten::narrow", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 127, "host_total_duration": 350, "has_call_stack": true}, {"name": "aten::_cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 16282, "host_total_duration": 16933, "has_call_stack": true}, {"name": "aten::cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 17085, "has_call_stack": true}, {"name": "aten::stack", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 658, "host_total_duration": 24346, "has_call_stack": true}, {"name": "detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 100, "host_total_duration": 100, "has_call_stack": true}, {"name": "aten::detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 108, "host_total_duration": 208, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7245, "host_total_duration": 64243, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 6151, "host_total_duration": 70394, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7387, "host_total_duration": 77781, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 6544, "host_total_duration": 78071, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 9906, "host_total_duration": 87977, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 5354, "host_total_duration": 15665, "has_call_stack": true}, 
{"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 8516, "host_total_duration": 24181, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1420, "host_self_duration": 125, "host_total_duration": 687, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 212, "host_self_duration": 116, "host_total_duration": 561, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 84, "host_total_duration": 164, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 197, "host_total_duration": 329, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 255, "host_total_duration": 584, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 150, "host_total_duration": 229, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 124, "host_total_duration": 1408, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 90, "host_total_duration": 567, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 68, "host_total_duration": 414, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 81, "host_total_duration": 495, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 118, "host_total_duration": 1180, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 95, "host_total_duration": 452, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 102, "host_total_duration": 716, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 53, "host_self_duration": 53, "host_total_duration": 359, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12, "host_total_duration": 12, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 275, "host_self_duration": 182, "host_total_duration": 1303, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3030, "host_self_duration": 5457, "host_total_duration": 28494, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 18, "host_total_duration": 103, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 29, "host_total_duration": 55, "has_call_stack": false}, {"name": 
"ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 25, "host_total_duration": 80, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 144, "host_self_duration": 541, "host_total_duration": 987, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27298, "host_self_duration": 2174, "host_total_duration": 11656, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 175, "host_total_duration": 175, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61939, "host_self_duration": 4148, "host_total_duration": 34608, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 6081, "host_total_duration": 87772, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 3217, "host_total_duration": 90989, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4677, "host_self_duration": 73, "host_total_duration": 484, "has_call_stack": false}] -{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM:\nmin(blocks of this kernel / SM number of this GPU). If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). 
This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 72, 73, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 138, 342, 2, 4, 1, 0.13, 1.73], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 66, 81, 1, 2, 1, 0.15, 1.68], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 53, 9, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 44, 7, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 212, 35, 36, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 144, 24, 24, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17860, 61, 252, 5, 666.65, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, 
at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 6, 614, 102, 103, 101, 0.95, 24.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 134, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4710, 39, 66, 17, 10.11, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void 
cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 60, 25782, 430, 729, 352, 3.9, 42.09], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 453, 38, 68, 9, 73.28, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 105, 18, 18, 17, 22.4, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 30, 2, 3, 2, 4.44, 28.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 18, 4693, 261, 281, 252, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20448, 426, 676, 307, 6.83, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 584, 97, 100, 93, 9.8, 19.0], 
["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7817, 651, 671, 635, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2843, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 149, 25, 26, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_sgemm_64x64_nt", 102, 21084, 207, 279, 184, 10.24, 19.38]]}} +{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. 
It shows fine-grained low-level GPU utilization."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 72, 73, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 138, 342, 2, 4, 1, 0.13, 1.73], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 66, 81, 1, 2, 1, 0.15, 1.68], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 53, 9, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 44, 7, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 212, 35, 36, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 144, 24, 24, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17860, 61, 252, 5, 666.65, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, 
at::native::FillFunctor, at::detail::Array)", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 6, 614, 102, 103, 101, 0.95, 24.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 134, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4710, 39, 66, 17, 10.11, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void 
cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 60, 25782, 430, 729, 352, 3.9, 42.09], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 453, 38, 68, 9, 73.28, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 105, 18, 18, 17, 22.4, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 30, 2, 3, 2, 4.44, 28.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 18, 4693, 261, 281, 252, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20448, 426, 676, 307, 6.83, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 584, 97, 100, 93, 9.8, 19.0], 
["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7817, 651, 671, 635, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2843, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 149, 25, 26, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_sgemm_64x64_nt", 102, 21084, 207, 279, 184, 10.24, 19.38]]}} {"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 73.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 342.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 81.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 81.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 53.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3822.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1420.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 44.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 212.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 144.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17860.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array 
>(int, at::native::AddFunctor, at::detail::Array)", 39814.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 322.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1887.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2409.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27298.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59642.0], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2297.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27060.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7185.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 614.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 66472.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4065.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 80756.0], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 4631.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 134.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4710.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2662.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5369.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 
4692.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4573.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1504.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 4759.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 36957.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5219.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 25782.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 3917.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 453.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 105.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 30.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 3010.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 4693.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3501.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 2995.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 3720.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20448.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 3270.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 8022.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 3627.0], ["volta_scudnn_128x64_relu_small_nn_v1", 3265.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 3200.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9597.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 584.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7817.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12704.0], ["volta_sgemm_128x32_nt", 8629.0], ["volta_sgemm_32x128_nn", 3053.0], ["volta_sgemm_32x128_nt", 2843.0], ["volta_sgemm_64x32_sliced1x4_nn", 150.0], ["volta_sgemm_64x32_sliced1x4_tn", 149.0], ["volta_sgemm_64x64_nn", 8551.0], ["volta_sgemm_64x64_nt", 21084.0]]}} -{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": 
"true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "
Step 5<br>Total: 182306us<br>Kernel: 99778us<br>Percentage: 54.73%", 3606, "Step 5<br>Total: 182306us<br>Memcpy: 3606us<br>Percentage: 1.98%", 98, "Step 5<br>Total: 182306us<br>Memset: 98us<br>Percentage: 0.05%", 41028, "Step 5<br>Total: 182306us<br>Runtime: 41028us<br>Percentage: 22.51%", 4341, "Step 5<br>Total: 182306us<br>DataLoader: 4341us<br>Percentage: 2.38%", 27460, "Step 5<br>Total: 182306us<br>CPU Exec: 27460us<br>Percentage: 15.06%", 5995, "Step 5<br>Total: 182306us<br>Other: 5995us<br>Percentage: 3.29%"], ["6", 99208, "Step 6<br>Total: 126183us<br>Kernel: 99208us<br>Percentage: 78.62%", 2948, "Step 6<br>Total: 126183us<br>Memcpy: 2948us<br>Percentage: 2.34%", 98, "Step 6<br>Total: 126183us<br>Memset: 98us<br>Percentage: 0.08%", 3406, "Step 6<br>Total: 126183us<br>Runtime: 3406us<br>Percentage: 2.7%", 0, "Step 6<br>Total: 126183us<br>DataLoader: 0us<br>Percentage: 0.0%", 16404, "Step 6<br>Total: 126183us<br>CPU Exec: 16404us<br>Percentage: 13.0%", 4119, "Step 6<br>Total: 126183us<br>Other: 4119us<br>Percentage: 3.26%"], ["7", 99114, "Step 7<br>Total: 127181us<br>Kernel: 99114us<br>Percentage: 77.93%", 2949, "Step 7<br>Total: 127181us<br>Memcpy: 2949us<br>Percentage: 2.32%", 98, "Step 7<br>Total: 127181us<br>Memset: 98us<br>Percentage: 0.08%", 3417, "Step 7<br>Total: 127181us<br>Runtime: 3417us<br>Percentage: 2.69%", 6, "Step 7<br>Total: 127181us<br>DataLoader: 6us<br>Percentage: 0.0%", 19521, "Step 7<br>Total: 127181us<br>CPU Exec: 19521us<br>Percentage: 15.35%", 2076, "Step 7<br>Total: 127181us<br>Other: 2076us<br>Percentage: 1.63%"], ["8", 99021, "Step 8<br>Total: 123079us<br>Kernel: 99021us<br>Percentage: 80.45%", 2975, "Step 8<br>Total: 123079us<br>Memcpy: 2975us<br>Percentage: 2.42%", 97, "Step 8<br>Total: 123079us<br>Memset: 97us<br>Percentage: 0.08%", 3544, "Step 8<br>Total: 123079us<br>Runtime: 3544us<br>Percentage: 2.88%", 0, "Step 8<br>Total: 123079us<br>DataLoader: 0us<br>Percentage: 0.0%", 15464, "Step 8<br>Total: 123079us<br>CPU Exec: 15464us<br>Percentage: 12.56%", 1978, "Step 8<br>Total: 123079us<br>Other: 1978us<br>Percentage: 1.61%"], ["9", 98791, "Step 9<br>Total: 163461us<br>Kernel: 98791us<br>Percentage: 60.44%", 3596, "Step 9<br>Total: 163461us<br>Memcpy: 3596us<br>Percentage: 2.2%", 97, "Step 9<br>Total: 163461us<br>Memset: 97us<br>Percentage: 0.06%", 8275, "Step 9<br>Total: 163461us<br>Runtime: 8275us<br>Percentage: 5.06%", 1370, "Step 9<br>Total: 163461us<br>DataLoader: 1370us<br>Percentage: 0.84%", 43905, "Step 9<br>Total: 163461us<br>CPU Exec: 43905us<br>Percentage: 26.86%", 7427, "Step 9<br>Total: 163461us<br>Other: 7427us<br>Percentage: 4.54%"], ["10", 98956, "Step 10<br>Total: 124198us<br>Kernel: 98956us<br>Percentage: 79.68%", 2885, "Step 10<br>Total: 124198us<br>Memcpy: 2885us<br>Percentage: 2.32%", 98, "Step 10<br>Total: 124198us<br>Memset: 98us<br>Percentage: 0.08%", 3714, "Step 10<br>Total: 124198us<br>Runtime: 3714us<br>Percentage: 2.99%", 1400, "Step 10<br>Total: 124198us<br>DataLoader: 1400us<br>Percentage: 1.13%", 13235, "Step 10<br>Total: 124198us<br>CPU Exec: 13235us<br>Percentage: 10.66%", 3910, "Step 10<br>Total: 124198us<br>Other: 3910us<br>Percentage: 3.15%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "
<ul><li>N/A</li></ul>
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}} +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "
Step 5<br>Total: 182306us<br>Kernel: 99778us<br>Percentage: 54.73%", 3606, "Step 5<br>Total: 182306us<br>Memcpy: 3606us<br>Percentage: 1.98%", 98, "Step 5<br>Total: 182306us<br>Memset: 98us<br>Percentage: 0.05%", 41028, "Step 5<br>Total: 182306us<br>Runtime: 41028us<br>Percentage: 22.51%", 4341, "Step 5<br>Total: 182306us<br>DataLoader: 4341us<br>Percentage: 2.38%", 27460, "Step 5<br>Total: 182306us<br>CPU Exec: 27460us<br>Percentage: 15.06%", 5995, "Step 5<br>Total: 182306us<br>Other: 5995us<br>Percentage: 3.29%"], ["6", 99208, "Step 6<br>Total: 126183us<br>Kernel: 99208us<br>Percentage: 78.62%", 2948, "Step 6<br>Total: 126183us<br>Memcpy: 2948us<br>Percentage: 2.34%", 98, "Step 6<br>Total: 126183us<br>Memset: 98us<br>Percentage: 0.08%", 3406, "Step 6<br>Total: 126183us<br>Runtime: 3406us<br>Percentage: 2.7%", 0, "Step 6<br>Total: 126183us<br>DataLoader: 0us<br>Percentage: 0.0%", 16404, "Step 6<br>Total: 126183us<br>CPU Exec: 16404us<br>Percentage: 13.0%", 4119, "Step 6<br>Total: 126183us<br>Other: 4119us<br>Percentage: 3.26%"], ["7", 99114, "Step 7<br>Total: 127181us<br>Kernel: 99114us<br>Percentage: 77.93%", 2949, "Step 7<br>Total: 127181us<br>Memcpy: 2949us<br>Percentage: 2.32%", 98, "Step 7<br>Total: 127181us<br>Memset: 98us<br>Percentage: 0.08%", 3417, "Step 7<br>Total: 127181us<br>Runtime: 3417us<br>Percentage: 2.69%", 6, "Step 7<br>Total: 127181us<br>DataLoader: 6us<br>Percentage: 0.0%", 19521, "Step 7<br>Total: 127181us<br>CPU Exec: 19521us<br>Percentage: 15.35%", 2076, "Step 7<br>Total: 127181us<br>Other: 2076us<br>Percentage: 1.63%"], ["8", 99021, "Step 8<br>Total: 123079us<br>Kernel: 99021us<br>Percentage: 80.45%", 2975, "Step 8<br>Total: 123079us<br>Memcpy: 2975us<br>Percentage: 2.42%", 97, "Step 8<br>Total: 123079us<br>Memset: 97us<br>Percentage: 0.08%", 3544, "Step 8<br>Total: 123079us<br>Runtime: 3544us<br>Percentage: 2.88%", 0, "Step 8<br>Total: 123079us<br>DataLoader: 0us<br>Percentage: 0.0%", 15464, "Step 8<br>Total: 123079us<br>CPU Exec: 15464us<br>Percentage: 12.56%", 1978, "Step 8<br>Total: 123079us<br>Other: 1978us<br>Percentage: 1.61%"], ["9", 98791, "Step 9<br>Total: 163461us<br>Kernel: 98791us<br>Percentage: 60.44%", 3596, "Step 9<br>Total: 163461us<br>Memcpy: 3596us<br>Percentage: 2.2%", 97, "Step 9<br>Total: 163461us<br>Memset: 97us<br>Percentage: 0.06%", 8275, "Step 9<br>Total: 163461us<br>Runtime: 8275us<br>Percentage: 5.06%", 1370, "Step 9<br>Total: 163461us<br>DataLoader: 1370us<br>Percentage: 0.84%", 43905, "Step 9<br>Total: 163461us<br>CPU Exec: 43905us<br>Percentage: 26.86%", 7427, "Step 9<br>Total: 163461us<br>Other: 7427us<br>Percentage: 4.54%"], ["10", 98956, "Step 10<br>Total: 124198us<br>Kernel: 98956us<br>Percentage: 79.68%", 2885, "Step 10<br>Total: 124198us<br>Memcpy: 2885us<br>Percentage: 2.32%", 98, "Step 10<br>Total: 124198us<br>Memset: 98us<br>Percentage: 0.08%", 3714, "Step 10<br>Total: 124198us<br>Runtime: 3714us<br>Percentage: 2.99%", 1400, "Step 10<br>Total: 124198us<br>DataLoader: 1400us<br>Percentage: 1.13%", 13235, "Step 10<br>Total: 124198us<br>CPU Exec: 13235us<br>Percentage: 10.66%", 3910, "Step 10<br>Total: 124198us<br>Other: 3910us<br>Percentage: 3.15%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "
<ul><li>N/A</li></ul>
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The bigger, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The bigger, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization."}} {"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 274794], ["CudnnConvolutionBackward", 274794], ["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::_convolution", 128683], ["aten::convolution", 128683], ["aten::conv2d", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["CudnnBatchNormBackward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::_batch_norm_impl_index", 34315], ["aten::batch_norm", 34315], ["aten::threshold_backward", 27280], ["ReluBackward1", 27280], ["aten::add_", 24052], ["aten::to", 18959], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::clamp_min_", 17862], ["aten::relu_", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 4695], ["MaxPool2DWithIndicesBackward", 4695], ["torch::autograd::AccumulateGrad", 3012], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::zero_", 1882], ["aten::max_pool2d_with_indices", 1422], ["aten::max_pool2d", 1422], ["aten::mm", 274], ["AddmmBackward", 274], ["aten::mean", 210], ["aten::adaptive_avg_pool2d", 210], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 145], ["MeanBackward1", 145], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 51], ["LogSoftmaxBackward", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", 
"name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::threshold_backward", 27280], ["aten::add_", 24052], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 3838], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::max_pool2d_with_indices", 1422], ["aten::mm", 274], ["aten::mean", 210], ["aten::addmm", 197], ["aten::div", 145], ["aten::_log_softmax_backward_data", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 119890], ["aten::cudnn_convolution_backward", 115797], ["aten::batch_norm", 105589], ["aten::add_", 97540], ["aten::_batch_norm_impl_index", 95925], ["aten::conv2d", 91000], ["aten::cudnn_batch_norm", 87823], ["aten::empty", 82024], ["aten::convolution", 81781], ["aten::_convolution", 74086], ["aten::cudnn_convolution", 64167], ["aten::cudnn_convolution_backward_weight", 60712], ["aten::to", 57776], ["aten::copy_", 56915], ["aten::cudnn_convolution_backward_input", 47359], ["CudnnBatchNormBackward", 41825], ["torch::autograd::AccumulateGrad", 37189], ["aten::cudnn_batch_norm_backward", 36641], ["aten::mul_", 35389], ["aten::relu_", 29432], ["aten::zero_", 28309], ["aten::add", 23831], ["aten::clamp_min_", 19059], ["aten::empty_like", 18591], ["aten::fill_", 17657], ["aten::resize_", 15019], ["ReluBackward1", 14944], ["aten::clamp_min", 12503], ["aten::threshold_backward", 12062], ["aten::view", 9046], ["AddmmBackward", 2026], ["aten::linear", 1463], ["aten::mm", 1424], ["aten::zeros", 1319], ["aten::cross_entropy_loss", 1225], ["aten::addmm", 1060], ["NllLossBackward", 889], ["aten::nll_loss_backward", 747], ["aten::t", 725], ["MeanBackward1", 663], ["aten::max_pool2d", 599], ["MaxPool2DWithIndicesBackward", 590], ["aten::adaptive_avg_pool2d", 581], ["aten::log_softmax", 580], ["aten::nll_loss_nd", 507], ["LogSoftmaxBackward", 500], ["aten::max_pool2d_with_indices_backward", 493], ["aten::ones_like", 470], ["aten::div", 469], ["aten::mean", 454], ["aten::empty_strided", 453], ["aten::_log_softmax_backward_data", 424], ["aten::max_pool2d_with_indices", 422], ["aten::_log_softmax", 420], ["aten::nll_loss", 418], ["aten::transpose", 413], ["aten::sum", 411], ["aten::nll_loss_forward", 343], ["aten::detach_", 323], ["aten::as_strided", 244], ["aten::expand", 237], ["aten::set_", 221], ["AddBackward0", 200], ["aten::flatten", 163], ["detach_", 156], ["TBackward", 151], ["ViewBackward", 132], ["aten::reshape", 88], ["aten::conj", 15]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 82024], ["aten::add_", 62385], ["aten::cudnn_convolution", 35632], ["aten::cudnn_convolution_backward_input", 31902], ["aten::cudnn_convolution_backward_weight", 30672], ["aten::mul_", 24617], ["aten::cudnn_batch_norm", 23800], ["aten::add", 17808], ["aten::cudnn_batch_norm_backward", 15118], ["aten::resize_", 15019], ["aten::zero_", 10815], ["aten::relu_", 10373], ["aten::_convolution", 9919], ["aten::batch_norm", 9664], ["aten::fill_", 9660], ["aten::conv2d", 9219], ["aten::view", 
9046], ["aten::clamp_min", 8409], ["aten::empty_like", 8385], ["aten::_batch_norm_impl_index", 8102], ["aten::threshold_backward", 7820], ["aten::cudnn_convolution_backward", 7726], ["aten::convolution", 7695], ["torch::autograd::AccumulateGrad", 7181], ["aten::clamp_min_", 6556], ["CudnnBatchNormBackward", 5184], ["CudnnConvolutionBackward", 4093], ["ReluBackward1", 2882], ["aten::mm", 1032], ["aten::zeros", 877], ["aten::addmm", 652], ["aten::to", 547], ["aten::nll_loss_backward", 463], ["aten::empty_strided", 453], ["aten::div", 343], ["aten::max_pool2d_with_indices", 325], ["aten::t", 312], ["aten::nll_loss_forward", 264], ["aten::transpose", 254], ["aten::as_strided", 244], ["AddmmBackward", 244], ["aten::mean", 233], ["aten::copy_", 230], ["aten::set_", 221], ["aten::max_pool2d_with_indices_backward", 213], ["aten::sum", 201], ["AddBackward0", 200], ["aten::max_pool2d", 177], ["aten::_log_softmax", 168], ["aten::detach_", 167], ["detach_", 156], ["aten::expand", 152], ["NllLossBackward", 142], ["aten::_log_softmax_backward_data", 142], ["aten::linear", 139], ["aten::cross_entropy_loss", 138], ["aten::adaptive_avg_pool2d", 127], ["aten::log_softmax", 106], ["MaxPool2DWithIndicesBackward", 97], ["aten::ones_like", 96], ["MeanBackward1", 95], ["aten::nll_loss_nd", 89], ["aten::flatten", 88], ["LogSoftmaxBackward", 76], ["aten::nll_loss", 75], ["ViewBackward", 44], ["aten::reshape", 43], ["TBackward", 33], ["aten::conj", 15]]}} [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 141300, "device_total_duration": 141300, "host_self_duration": 30672, "host_total_duration": 60712, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 133494, "device_total_duration": 133494, "host_self_duration": 31902, "host_total_duration": 47359, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 128683, "device_total_duration": 128683, "host_self_duration": 35632, "host_total_duration": 64167, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61899, "device_total_duration": 61899, "host_self_duration": 15118, "host_total_duration": 36641, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34315, "device_total_duration": 34315, "host_self_duration": 23800, "host_total_duration": 87823, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27280, "device_total_duration": 27280, "host_self_duration": 7820, "host_total_duration": 12062, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24052, "device_total_duration": 24052, "host_self_duration": 62385, "host_total_duration": 97540, "has_call_stack": true}, {"name": "aten::copy_", "calls": 12, "device_self_duration": 18959, "device_total_duration": 18959, "host_self_duration": 230, "host_total_duration": 56915, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17862, "device_total_duration": 17862, "host_self_duration": 8409, "host_total_duration": 12503, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16026, "device_total_duration": 16026, "host_self_duration": 17808, "host_total_duration": 23831, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3838, "device_total_duration": 4695, 
"host_self_duration": 213, "host_total_duration": 493, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2395, "device_total_duration": 2395, "host_self_duration": 24617, "host_total_duration": 35389, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1888, "device_total_duration": 1888, "host_self_duration": 9660, "host_total_duration": 17657, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1422, "device_total_duration": 1422, "host_self_duration": 325, "host_total_duration": 422, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 274, "device_total_duration": 274, "host_self_duration": 1032, "host_total_duration": 1424, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 210, "device_total_duration": 210, "host_self_duration": 233, "host_total_duration": 454, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 652, "host_total_duration": 1060, "has_call_stack": true}, {"name": "aten::div", "calls": 6, "device_self_duration": 145, "device_total_duration": 145, "host_self_duration": 343, "host_total_duration": 469, "has_call_stack": false}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 51, "device_total_duration": 51, "host_self_duration": 142, "host_total_duration": 424, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 45, "device_total_duration": 45, "host_self_duration": 201, "host_total_duration": 411, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 168, "host_total_duration": 420, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 264, "host_total_duration": 343, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 463, "host_total_duration": 747, "has_call_stack": false}, {"name": "aten::empty", "calls": 4212, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 82024, "host_total_duration": 82024, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1882, "host_self_duration": 10815, "host_total_duration": 28309, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 877, "host_total_duration": 1319, "has_call_stack": true}, {"name": "aten::to", "calls": 36, "device_self_duration": 0, "device_total_duration": 18959, "host_self_duration": 547, "host_total_duration": 57776, "has_call_stack": true}, {"name": "detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 156, "host_total_duration": 156, "has_call_stack": true}, {"name": "aten::detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 167, "host_total_duration": 323, "has_call_stack": true}, {"name": "aten::set_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 221, "host_total_duration": 221, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 18, "device_self_duration": 0, 
"device_total_duration": 0, "host_self_duration": 453, "host_total_duration": 453, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1896, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15019, "host_total_duration": 15019, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9919, "host_total_duration": 74086, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 7695, "host_total_duration": 81781, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9219, "host_total_duration": 91000, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 336, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 8385, "host_total_duration": 18591, "has_call_stack": true}, {"name": "aten::view", "calls": 654, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 9046, "host_total_duration": 9046, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 8102, "host_total_duration": 95925, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 9664, "host_total_duration": 105589, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 6556, "host_total_duration": 19059, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 10373, "host_total_duration": 29432, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1422, "host_self_duration": 177, "host_total_duration": 599, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 210, "host_self_duration": 127, "host_total_duration": 581, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 88, "host_total_duration": 163, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 42, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 244, "host_total_duration": 244, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 254, "host_total_duration": 413, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 312, "host_total_duration": 725, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 237, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 139, "host_total_duration": 1463, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 106, "host_total_duration": 580, "has_call_stack": true}, {"name": 
"aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 75, "host_total_duration": 418, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 89, "host_total_duration": 507, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 138, "host_total_duration": 1225, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 96, "host_total_duration": 470, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 142, "host_total_duration": 889, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 51, "host_self_duration": 76, "host_total_duration": 500, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15, "host_total_duration": 15, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 274, "host_self_duration": 244, "host_total_duration": 2026, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3012, "host_self_duration": 7181, "host_total_duration": 37189, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 33, "host_total_duration": 151, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 43, "host_total_duration": 88, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 44, "host_total_duration": 132, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 145, "host_self_duration": 95, "host_total_duration": 663, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27280, "host_self_duration": 2882, "host_total_duration": 14944, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 200, "host_total_duration": 200, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61899, "host_self_duration": 5184, "host_total_duration": 41825, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 7726, "host_total_duration": 115797, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 4093, "host_total_duration": 119890, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4695, "host_self_duration": 97, "host_total_duration": 590, "has_call_stack": false}] -{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", 
"name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM:\nmin(blocks of this kernel / SM number of this GPU). If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 54, 57, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 108, 216, 2, 5, 1, 0.16, 2.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 132, 150, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 132, 155, 1, 2, 1, 0.16, 1.83], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 51, 8, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1422, 237, 243, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 45, 8, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 210, 35, 35, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 145, 24, 25, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 
1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17862, 61, 252, 5, 666.77, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 325, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27084, 181, 454, 53, 3.12, 64.02], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, 
float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 126, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4681, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 7939, 662, 733, 584, 7.54, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 383, 32, 34, 29, 71.72, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 54, 9, 10, 8, 12.8, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 31, 3, 4, 2, 4.39, 27.74], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 3034, 506, 520, 491, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20473, 427, 681, 309, 6.82, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 7976, 266, 316, 92, 37.08, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3647, 608, 620, 602, 39.2, 25.0], 
["volta_scudnn_128x64_relu_small_nn_v1", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 582, 97, 99, 94, 9.8, 19.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7819, 652, 670, 634, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2837, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 148, 25, 25, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_sgemm_64x64_nt", 102, 21125, 207, 281, 184, 10.28, 19.38]]}} +{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. 
It shows fine-grained low-level GPU utilization."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 54, 57, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 108, 216, 2, 5, 1, 0.16, 2.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 132, 150, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 132, 155, 1, 2, 1, 0.16, 1.83], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 51, 8, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1422, 237, 243, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 45, 8, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 210, 35, 35, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 145, 24, 25, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17862, 61, 252, 5, 666.77, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 325, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, 
at::native::FillFunctor, at::detail::Array)", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27084, 181, 454, 53, 3.12, 64.02], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 126, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4681, 60, 126, 
20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 7939, 662, 733, 584, 7.54, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 383, 32, 34, 29, 71.72, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 54, 9, 10, 8, 12.8, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 31, 3, 4, 2, 4.39, 27.74], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 3034, 506, 520, 491, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20473, 427, 681, 309, 6.82, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 7976, 266, 316, 92, 37.08, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3647, 608, 620, 602, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 582, 97, 99, 94, 9.8, 19.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7819, 652, 670, 634, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2837, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 148, 25, 25, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_sgemm_64x64_nt", 102, 21125, 207, 281, 184, 10.28, 19.38]]}} {"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": 
[["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 57.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 216.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 150.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 155.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 51.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3838.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1422.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 45.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 210.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 145.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17862.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 39753.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 325.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1888.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2395.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, 
at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27280.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59568.0], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2331.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27084.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7231.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 43471.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4038.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 86855.0], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 5901.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 126.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4648.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2632.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5314.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4681.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4559.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1484.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 37016.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float 
const*, bool, int, int)", 5221.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 35106.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 7939.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 383.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 54.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 31.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3550.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 3034.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 25342.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20473.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 3697.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 7976.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 3647.0], ["volta_scudnn_128x64_relu_small_nn_v1", 3273.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9559.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 582.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7819.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12761.0], ["volta_sgemm_128x32_nt", 8658.0], ["volta_sgemm_32x128_nn", 3059.0], ["volta_sgemm_32x128_nt", 2837.0], ["volta_sgemm_64x32_sliced1x4_nn", 149.0], ["volta_sgemm_64x32_sliced1x4_tn", 148.0], ["volta_sgemm_64x64_nn", 8544.0], ["volta_sgemm_64x64_nt", 21125.0]]}}
diff --git a/tb_plugin/torch_tb_profiler/consts.py b/tb_plugin/torch_tb_profiler/consts.py
index 1dbfc9fcf..8e41a88f1 100644
--- a/tb_plugin/torch_tb_profiler/consts.py
+++ b/tb_plugin/torch_tb_profiler/consts.py
@@ -25,27 +25,34 @@
 TOOLTIP_GPU_UTIL = \
     "GPU Utilization:\n" \
-    "GPU busy time / All steps time. " \
+    "GPU busy time / All steps time. The bigger, the better. " \
     "GPU busy time is the time during which there is at least one GPU kernel running on it. " \
     "All steps time is the total time of all profiler steps(or called as iterations).\n"
 TOOLTIP_SM_EFFICIENCY = \
     "Est. SM Efficiency:\n" \
-    "Estimated Stream Multiprocessor Efficiency. " \
-    "Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). " \
+    "Estimated Stream Multiprocessor Efficiency. The bigger, the better. " \
+    "This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). " \
     "This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, " \
     "divided by all steps time.\n"
-TOOLTIP_OCCUPANCY = \
+TOOLTIP_OCCUPANCY_COMMON = \
     "Est. Achieved Occupancy:\n" \
-    "Occupancy is the ratio of active threads on an SM " \
-    "to the maximum number of active threads supported by the SM. " \
+    "The bigger, the better. Occupancy is the ratio of active warps on an SM " \
+    "to the maximum number of active warps supported by the SM. " \
     "The theoretical occupancy of a kernel is upper limit occupancy of this kernel, " \
     "limited by multiple factors such as kernel shape, kernel used resource, " \
-    "and the GPU compute capability." \
+    "and the GPU compute capability.\n" \
     "Est. Achieved Occupancy of a kernel, OCC_K = " \
-    "min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). " \
-    "This overall number is the weighted sum of all kernels OCC_K " \
-    "using kernel's execution duration as weight."
+    "min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). "
+TOOLTIP_OCCUPANCY_OVERVIEW = \
+    "This overall number is the weighted average of all kernels' OCC_K " \
+    "using kernel's execution duration as weight. " \
+    "It shows fine-grained low-level GPU utilization."
+TOOLTIP_OCCUPANCY_TABLE = \
+    "This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, " \
+    "using each call's execution duration as weight. " \
+    "It shows fine-grained low-level GPU utilization."
 TOOLTIP_BLOCKS_PER_SM = \
-    "Blocks Per SM:\n" \
-    "min(blocks of this kernel / SM number of this GPU). " \
-    "If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."
+    "Blocks Per SM = blocks of this kernel / SM number of this GPU.\n" \
+    "If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n" \
+    "\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, " \
+    "using each call's execution duration as weight."
diff --git a/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
index ecadcb30c..44923bcac 100644
--- a/tb_plugin/torch_tb_profiler/profiler/run_generator.py
+++ b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
@@ -294,7 +294,7 @@ def _get_gpu_metrics_columns(blocks_per_sm_count, occupancy_count):
                                "tooltip": consts.TOOLTIP_BLOCKS_PER_SM})
         if occupancy_count > 0:
             columns.append({"type": "number", "name": "Mean Est. Achieved Occupancy (%)",
-                            "tooltip": consts.TOOLTIP_OCCUPANCY})
+                            "tooltip": consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_TABLE})
         return columns
 
     def _generate_kernel_op_table(self):
diff --git a/tb_plugin/torch_tb_profiler/run.py b/tb_plugin/torch_tb_profiler/run.py
index 2760111df..ee4e9d961 100644
--- a/tb_plugin/torch_tb_profiler/run.py
+++ b/tb_plugin/torch_tb_profiler/run.py
@@ -211,7 +211,7 @@ def get_gpu_metrics_tooltip(has_sm_efficiency, has_occupancy):
             if has_sm_efficiency:
                 tooltip += "\n" + consts.TOOLTIP_SM_EFFICIENCY
             if has_occupancy:
-                tooltip += "\n" + consts.TOOLTIP_OCCUPANCY
+                tooltip += "\n" + consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_OVERVIEW
             return tooltip
 
         data, has_occupancy, has_sm_efficiency = get_gpu_metrics_data(self)
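For reference, the formulas the new tooltips describe, restated as a minimal Python sketch. This is illustrative only, not part of the patch: the input names (`kernel_blocks`, `kernel_threads`, `sm_count`, `max_threads_per_sm`, `theoretical_occupancy`) are hypothetical stand-ins for values the plugin derives from the trace in `run_generator.py`.

```python
def blocks_per_sm(kernel_blocks: int, sm_count: int) -> float:
    # Blocks Per SM = blocks of this kernel / SM number of this GPU.
    # A value below 1.0 means some multiprocessors sit idle for this kernel.
    return kernel_blocks / sm_count


def est_sm_efficiency(kernel_blocks: int, sm_count: int) -> float:
    # SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%).
    return min(kernel_blocks / sm_count, 1.0)


def est_achieved_occupancy(kernel_threads: int, sm_count: int,
                           max_threads_per_sm: int,
                           theoretical_occupancy: float) -> float:
    # OCC_K = min(threads of the kernel / SM number / max threads per SM,
    #             theoretical occupancy of the kernel).
    return min(kernel_threads / sm_count / max_threads_per_sm,
               theoretical_occupancy)


def duration_weighted_mean(values, durations):
    # Both the overview and the kernel table report duration-weighted means:
    # each value (one per call or per kernel) is weighted by its execution
    # duration.
    total = sum(durations)
    return sum(v * d for v, d in zip(values, durations)) / total if total else 0.0
```

This also shows why `TOOLTIP_OCCUPANCY` is split in this patch: the overview (`run.py`) and the kernel table (`run_generator.py`) share `TOOLTIP_OCCUPANCY_COMMON` and differ only in whether the duration-weighted mean is taken over all kernels (`TOOLTIP_OCCUPANCY_OVERVIEW`) or over all calls of one kernel (`TOOLTIP_OCCUPANCY_TABLE`). The occupancy tooltip embedded in the expected test output above is exactly `TOOLTIP_OCCUPANCY_COMMON + TOOLTIP_OCCUPANCY_TABLE`, which is why `result_check_file.txt` changes in the same commit.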