From 6a81dd29db397f84b2c097b8db732e5f357e1131 Mon Sep 17 00:00:00 2001
From: guyang3532
Date: Tue, 15 Jun 2021 19:20:39 +0800
Subject: [PATCH] Merge from branch tb_plugin

---
 tb_plugin/README.md                   | 30 +++++++++++------
 tb_plugin/test/result_check_file.txt  |  8 ++---
 tb_plugin/torch_tb_profiler/consts.py | 33 +++++++++++--------
 .../profiler/run_generator.py         |  2 +-
 tb_plugin/torch_tb_profiler/run.py    |  2 +-
 5 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/tb_plugin/README.md b/tb_plugin/README.md
index e0fbdb2b0..f518d3a5c 100644
--- a/tb_plugin/README.md
+++ b/tb_plugin/README.md
@@ -24,7 +24,7 @@ and give optimization recommendations.
 * Build the wheel
   - `python setup.py build_fe sdist bdist_wheel` \
-    **_Note_**: the build_fe step need setup yarn and nodejs
+    **_Note_**: the build_fe step requires yarn and Node.js to be set up first
   - `python setup.py sdist bdist_wheel`

 ### Quick Start Instructions

@@ -37,12 +37,12 @@ and give optimization recommendations.
   [kineto/tb_plugin/examples/resnet50_profiler_api.py](https://github.com/pytorch/kineto/blob/master/tb_plugin/examples/resnet50_profiler_api.py).
   You can also learn how to profile your model and generate profiling data from
   [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html?highlight=tensorboard).
-  Note: The recommended way to produce profiling data is assigning "torch.profiler.tensorboard_trace_handler"
-  to "on_trace_ready" on creation of "torch.profiler.schedule".
+  Note: The recommended way to produce profiling data is to assign `torch.profiler.tensorboard_trace_handler`
+  to `on_trace_ready` when creating `torch.profiler.profile`, as in the sketch below.
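  For reference, here is a minimal sketch of that pattern, adapted from the linked PyTorch Profiler tutorial (it is not part of this patch; the ResNet-50 model, the step counts, and the `./samples/resnet50` output folder are illustrative choices):

  ```python
  # Minimal sketch: produce TensorBoard profiling data with torch.profiler.
  # Assumes PyTorch >= 1.8.1 and torchvision; falls back to CPU when no GPU is present.
  import torch
  import torch.profiler
  import torchvision.models as models

  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = models.resnet50().to(device)
  inputs = torch.randn(8, 3, 224, 224, device=device)

  with torch.profiler.profile(
          # Skip step 0, warm up on step 1, record steps 2-4.
          schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
          # Write one trace file per recorded span under ./samples/resnet50.
          on_trace_ready=torch.profiler.tensorboard_trace_handler('./samples/resnet50'),
          record_shapes=True,
  ) as prof:
      for _ in range(8):
          model(inputs)
          prof.step()  # mark a step boundary so the schedule advances

  ```

  The trace files this writes are what the `tensorboard --logdir=./samples` command in the next step picks up.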
 * Start TensorBoard

-  Specify the profiling data folder to "logdir" in Tensorboard. If you use the above samples data, start TensorBoard with:
+  Specify the profiling data folder as `logdir` to TensorBoard. If you use the above sample data, start TensorBoard with:

   `tensorboard --logdir=./samples`

@@ -56,14 +56,17 @@ and give optimization recommendations.
 * Open TensorBoard in Chrome browser

   Open URL `http://localhost:6006` in the browser.
-  If you use '--bind_all' in tensorboard start cmd, the hostname may not be 'localhost'. You may find it in the log printed after the cmd.
+  If you use `--bind_all` in the TensorBoard start command, the hostname may not be 'localhost'; you can find it in the log printed after the command.

 * Navigate to PYTORCH_PROFILER tab

   If the files under `--logdir` are too big or too many, please wait a while and refresh the browser to check the latest loaded result.

-* Also support loading profiling data stored in AWS(S3://), Azure blob(https://<account>.blob.core.windows.net) and Google Cloud(GS://)
-  * S3: install boto3. set environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\
+
+* Loading profiling data from cloud
+  * S3 (S3://)
+
+    Install `boto3` and set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\
     For minio, the S3 url should start with the bucket name `s3://<bucket>/<folder>/` instead of the minio prefix `s3://minio/<bucket>/<folder>/`; in that case `S3_ENDPOINT` is needed as well. \
     For example, after following these guides, the following commands can be used to set up a minio storage:
     * Server: https://docs.min.io/docs/minio-quickstart-guide.html
     * Client: https://docs.min.io/docs/minio-client-quickstart-guide.html

     ```
     export AWS_ACCESS_KEY_ID=minioadmin
     export AWS_SECRET_ACCESS_KEY=minioadmin
     export AWS_REGION=us-east-1
     export S3_USE_HTTPS=0
     export S3_VERIFY_SSL=0
     export S3_ENDPOINT=http://localhost:9000
     tensorboard --logdir=s3://profiler/version_2/ --bind_all
     ```
-  * Azure Blob: install azure-storage-blob. Optionally, set environment variable `AZURE_STORAGE_CONNECTION_STRING`
-  * Google Cloud: install google-cloud-storage.
+
+  * Azure blob (https://<account>.blob.core.windows.net)
+
+    Install `azure-storage-blob`. Optionally, set the environment variable `AZURE_STORAGE_CONNECTION_STRING`.
+
+  * Google Cloud (GS://)
+
+    Install `google-cloud-storage`.
+
 ---
 > **_NOTES:_** For AWS, Google Cloud and Azure Blob, the trace files need to be put in a top-level folder under the bucket/container.
 ---

@@ -93,7 +103,7 @@ and give optimization recommendations.
   and open tensorboard in browser to see all the views described below.

-  Note: for accessing data in azure blob, you need to install torch-tb-profiler with cmd: `pip install torch-tb-profiler[blob]`
+  Note: to access data in Azure Blob, install torch-tb-profiler with `pip install torch-tb-profiler[blob]`

 ### Quick Usage Instructions

diff --git a/tb_plugin/test/result_check_file.txt b/tb_plugin/test/result_check_file.txt
index c59f1ad50..2f041f283 100644
--- a/tb_plugin/test/result_check_file.txt
+++ b/tb_plugin/test/result_check_file.txt
@@ -1,10 +1,10 @@
-{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "
Step 5
Total: 187948us
Kernel: 98598us
Percentage: 52.46%
", 1941, "
Step 5
Total: 187948us
Memcpy: 1941us
Percentage: 1.03%
", 90, "
Step 5
Total: 187948us
Memset: 90us
Percentage: 0.05%
", 2796, "
Step 5
Total: 187948us
Runtime: 2796us
Percentage: 1.49%
", 69317, "
Step 5
Total: 187948us
DataLoader: 69317us
Percentage: 36.88%
", 14091, "
Step 5
Total: 187948us
CPU Exec: 14091us
Percentage: 7.5%
", 1115, "
Step 5
Total: 187948us
Other: 1115us
Percentage: 0.59%
"], ["6", 98570, "
Step 6
Total: 175153us
Kernel: 98570us
Percentage: 56.28%
", 1947, "
Step 6
Total: 175153us
Memcpy: 1947us
Percentage: 1.11%
", 89, "
Step 6
Total: 175153us
Memset: 89us
Percentage: 0.05%
", 2762, "
Step 6
Total: 175153us
Runtime: 2762us
Percentage: 1.58%
", 57669, "
Step 6
Total: 175153us
DataLoader: 57669us
Percentage: 32.92%
", 12968, "
Step 6
Total: 175153us
CPU Exec: 12968us
Percentage: 7.4%
", 1148, "
Step 6
Total: 175153us
Other: 1148us
Percentage: 0.66%
"], ["7", 98596, "
Step 7
Total: 179733us
Kernel: 98596us
Percentage: 54.86%
", 1931, "
Step 7
Total: 179733us
Memcpy: 1931us
Percentage: 1.07%
", 91, "
Step 7
Total: 179733us
Memset: 91us
Percentage: 0.05%
", 2877, "
Step 7
Total: 179733us
Runtime: 2877us
Percentage: 1.6%
", 61257, "
Step 7
Total: 179733us
DataLoader: 61257us
Percentage: 34.08%
", 13768, "
Step 7
Total: 179733us
CPU Exec: 13768us
Percentage: 7.66%
", 1213, "
Step 7
Total: 179733us
Other: 1213us
Percentage: 0.67%
"], ["8", 98623, "
Step 8
Total: 174564us
Kernel: 98623us
Percentage: 56.5%
", 1938, "
Step 8
Total: 174564us
Memcpy: 1938us
Percentage: 1.11%
", 89, "
Step 8
Total: 174564us
Memset: 89us
Percentage: 0.05%
", 2841, "
Step 8
Total: 174564us
Runtime: 2841us
Percentage: 1.63%
", 56453, "
Step 8
Total: 174564us
DataLoader: 56453us
Percentage: 32.34%
", 13420, "
Step 8
Total: 174564us
CPU Exec: 13420us
Percentage: 7.69%
", 1200, "
Step 8
Total: 174564us
Other: 1200us
Percentage: 0.69%
"], ["9", 98504, "
Step 9
Total: 182172us
Kernel: 98504us
Percentage: 54.07%
", 1937, "
Step 9
Total: 182172us
Memcpy: 1937us
Percentage: 1.06%
", 87, "
Step 9
Total: 182172us
Memset: 87us
Percentage: 0.05%
", 2788, "
Step 9
Total: 182172us
Runtime: 2788us
Percentage: 1.53%
", 62690, "
Step 9
Total: 182172us
DataLoader: 62690us
Percentage: 34.41%
", 15025, "
Step 9
Total: 182172us
CPU Exec: 15025us
Percentage: 8.25%
", 1141, "
Step 9
Total: 182172us
Other: 1141us
Percentage: 0.63%
"], ["10", 98641, "
Step 10
Total: 165983us
Kernel: 98641us
Percentage: 59.43%
", 1798, "
Step 10
Total: 165983us
Memcpy: 1798us
Percentage: 1.08%
", 88, "
Step 10
Total: 165983us
Memset: 88us
Percentage: 0.05%
", 3381, "
Step 10
Total: 165983us
Runtime: 3381us
Percentage: 2.04%
", 48185, "
Step 10
Total: 165983us
DataLoader: 48185us
Percentage: 29.03%
", 12773, "
Step 10
Total: 165983us
CPU Exec: 12773us
Percentage: 7.7%
", 1117, "
Step 10
Total: 165983us
Other: 1117us
Percentage: 0.67%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "
  • This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and enable multi-processes on data loading. Reference: Single- and Multi-process Data Loading
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}} +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "
Step 5
Total: 187948us
Kernel: 98598us
Percentage: 52.46%
", 1941, "
Step 5
Total: 187948us
Memcpy: 1941us
Percentage: 1.03%
", 90, "
Step 5
Total: 187948us
Memset: 90us
Percentage: 0.05%
", 2796, "
Step 5
Total: 187948us
Runtime: 2796us
Percentage: 1.49%
", 69317, "
Step 5
Total: 187948us
DataLoader: 69317us
Percentage: 36.88%
", 14091, "
Step 5
Total: 187948us
CPU Exec: 14091us
Percentage: 7.5%
", 1115, "
Step 5
Total: 187948us
Other: 1115us
Percentage: 0.59%
"], ["6", 98570, "
Step 6
Total: 175153us
Kernel: 98570us
Percentage: 56.28%
", 1947, "
Step 6
Total: 175153us
Memcpy: 1947us
Percentage: 1.11%
", 89, "
Step 6
Total: 175153us
Memset: 89us
Percentage: 0.05%
", 2762, "
Step 6
Total: 175153us
Runtime: 2762us
Percentage: 1.58%
", 57669, "
Step 6
Total: 175153us
DataLoader: 57669us
Percentage: 32.92%
", 12968, "
Step 6
Total: 175153us
CPU Exec: 12968us
Percentage: 7.4%
", 1148, "
Step 6
Total: 175153us
Other: 1148us
Percentage: 0.66%
"], ["7", 98596, "
Step 7
Total: 179733us
Kernel: 98596us
Percentage: 54.86%
", 1931, "
Step 7
Total: 179733us
Memcpy: 1931us
Percentage: 1.07%
", 91, "
Step 7
Total: 179733us
Memset: 91us
Percentage: 0.05%
", 2877, "
Step 7
Total: 179733us
Runtime: 2877us
Percentage: 1.6%
", 61257, "
Step 7
Total: 179733us
DataLoader: 61257us
Percentage: 34.08%
", 13768, "
Step 7
Total: 179733us
CPU Exec: 13768us
Percentage: 7.66%
", 1213, "
Step 7
Total: 179733us
Other: 1213us
Percentage: 0.67%
"], ["8", 98623, "
Step 8
Total: 174564us
Kernel: 98623us
Percentage: 56.5%
", 1938, "
Step 8
Total: 174564us
Memcpy: 1938us
Percentage: 1.11%
", 89, "
Step 8
Total: 174564us
Memset: 89us
Percentage: 0.05%
", 2841, "
Step 8
Total: 174564us
Runtime: 2841us
Percentage: 1.63%
", 56453, "
Step 8
Total: 174564us
DataLoader: 56453us
Percentage: 32.34%
", 13420, "
Step 8
Total: 174564us
CPU Exec: 13420us
Percentage: 7.69%
", 1200, "
Step 8
Total: 174564us
Other: 1200us
Percentage: 0.69%
"], ["9", 98504, "
Step 9
Total: 182172us
Kernel: 98504us
Percentage: 54.07%
", 1937, "
Step 9
Total: 182172us
Memcpy: 1937us
Percentage: 1.06%
", 87, "
Step 9
Total: 182172us
Memset: 87us
Percentage: 0.05%
", 2788, "
Step 9
Total: 182172us
Runtime: 2788us
Percentage: 1.53%
", 62690, "
Step 9
Total: 182172us
DataLoader: 62690us
Percentage: 34.41%
", 15025, "
Step 9
Total: 182172us
CPU Exec: 15025us
Percentage: 8.25%
", 1141, "
Step 9
Total: 182172us
Other: 1141us
Percentage: 0.63%
"], ["10", 98641, "
Step 10
Total: 165983us
Kernel: 98641us
Percentage: 59.43%
", 1798, "
Step 10
Total: 165983us
Memcpy: 1798us
Percentage: 1.08%
", 88, "
Step 10
Total: 165983us
Memset: 88us
Percentage: 0.05%
", 3381, "
Step 10
Total: 165983us
Runtime: 3381us
Percentage: 2.04%
", 48185, "
Step 10
Total: 165983us
DataLoader: 48185us
Percentage: 29.03%
", 12773, "
Step 10
Total: 165983us
CPU Exec: 12773us
Percentage: 7.7%
", 1117, "
Step 10
Total: 165983us
Other: 1117us
Percentage: 0.67%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "
  • This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and enable multi-processes on data loading. Reference: Single- and Multi-process Data Loading
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The bigger, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The bigger, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization."}} {"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 273428], ["CudnnConvolutionBackward", 273428], ["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::_convolution", 126619], ["aten::convolution", 126619], ["aten::conv2d", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["CudnnBatchNormBackward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::_batch_norm_impl_index", 34245], ["aten::batch_norm", 34245], ["aten::threshold_backward", 27298], ["ReluBackward1", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::clamp_min_", 17860], ["aten::relu_", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::to", 11492], ["aten::max_pool2d_with_indices_backward", 4677], ["MaxPool2DWithIndicesBackward", 4677], ["torch::autograd::AccumulateGrad", 3030], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::zero_", 1881], ["aten::max_pool2d_with_indices", 1420], ["aten::max_pool2d", 1420], ["aten::mm", 275], ["AddmmBackward", 275], ["aten::mean", 212], ["aten::adaptive_avg_pool2d", 212], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 144], ["MeanBackward1", 144], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 53], ["LogSoftmaxBackward", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", 
"name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::threshold_backward", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::max_pool2d_with_indices_backward", 3822], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::max_pool2d_with_indices", 1420], ["aten::mm", 275], ["aten::mean", 212], ["aten::addmm", 197], ["aten::div", 144], ["aten::_log_softmax_backward_data", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 90989], ["aten::batch_norm", 87977], ["aten::cudnn_convolution_backward", 87772], ["aten::add_", 78125], ["aten::_batch_norm_impl_index", 78071], ["aten::conv2d", 77781], ["aten::cudnn_batch_norm", 71527], ["aten::convolution", 70394], ["aten::empty", 68147], ["aten::to", 64332], ["aten::_convolution", 64243], ["aten::cudnn_convolution", 56998], ["aten::copy_", 52853], ["aten::cudnn_convolution_backward_input", 41445], ["aten::cudnn_convolution_backward_weight", 40246], ["aten::div", 35158], ["CudnnBatchNormBackward", 34608], ["aten::contiguous", 31137], ["aten::cudnn_batch_norm_backward", 30460], ["aten::mul_", 29081], ["torch::autograd::AccumulateGrad", 28494], ["aten::zero_", 27597], ["aten::empty_like", 26064], ["aten::stack", 24346], ["aten::relu_", 24181], ["aten::add", 19289], ["aten::cat", 17085], ["aten::fill_", 17059], ["aten::_cat", 16933], ["aten::clamp_min_", 15665], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::empty_strided", 11829], ["ReluBackward1", 11656], ["aten::clamp_min", 10311], ["aten::permute", 9775], ["aten::threshold_backward", 9482], ["aten::as_strided", 7600], ["aten::unsqueeze", 6603], ["aten::linear", 1408], ["AddmmBackward", 1303], ["aten::cross_entropy_loss", 1180], ["aten::zeros", 1105], ["aten::addmm", 1034], ["MeanBackward1", 987], ["aten::mm", 860], ["NllLossBackward", 716], ["aten::max_pool2d", 687], ["aten::nll_loss_backward", 614], ["aten::t", 584], ["aten::log_softmax", 567], ["aten::max_pool2d_with_indices", 562], ["aten::adaptive_avg_pool2d", 561], ["aten::nll_loss_nd", 495], ["MaxPool2DWithIndicesBackward", 484], ["aten::ones_like", 452], ["aten::mean", 445], ["aten::_log_softmax", 433], ["aten::nll_loss", 414], ["aten::max_pool2d_with_indices_backward", 411], ["LogSoftmaxBackward", 359], ["aten::narrow", 350], ["aten::nll_loss_forward", 346], ["aten::transpose", 329], ["aten::sum", 327], ["aten::_log_softmax_backward_data", 306], ["aten::expand", 229], ["aten::slice", 223], ["aten::detach_", 208], ["AddBackward0", 175], ["aten::flatten", 164], ["TBackward", 103], ["detach_", 100], ["ViewBackward", 80], ["aten::reshape", 55], ["aten::conj", 12]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 68147], ["aten::add_", 51013], ["aten::copy_", 40255], ["aten::cudnn_convolution", 33121], ["aten::cudnn_convolution_backward_input", 29324], ["aten::cudnn_convolution_backward_weight", 22804], ["aten::mul_", 20515], ["aten::div", 20135], ["aten::cudnn_batch_norm", 19843], ["aten::_cat", 16282], ["aten::to", 14834], 
["aten::add", 14329], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::cudnn_batch_norm_backward", 12238], ["aten::empty_strided", 11829], ["aten::empty_like", 11742], ["aten::zero_", 10693], ["aten::batch_norm", 9906], ["aten::fill_", 9879], ["aten::relu_", 8516], ["aten::as_strided", 7600], ["aten::conv2d", 7387], ["aten::_convolution", 7245], ["aten::clamp_min", 7106], ["aten::_batch_norm_impl_index", 6544], ["aten::convolution", 6151], ["aten::threshold_backward", 6090], ["aten::cudnn_convolution_backward", 6081], ["aten::permute", 5515], ["aten::contiguous", 5510], ["torch::autograd::AccumulateGrad", 5457], ["aten::clamp_min_", 5354], ["CudnnBatchNormBackward", 4148], ["aten::unsqueeze", 3574], ["CudnnConvolutionBackward", 3217], ["ReluBackward1", 2174], ["aten::zeros", 659], ["aten::stack", 658], ["aten::addmm", 639], ["aten::mm", 575], ["MeanBackward1", 541], ["aten::max_pool2d_with_indices", 477], ["aten::nll_loss_backward", 388], ["aten::nll_loss_forward", 266], ["aten::t", 255], ["aten::mean", 234], ["aten::transpose", 197], ["AddmmBackward", 182], ["aten::max_pool2d_with_indices_backward", 176], ["AddBackward0", 175], ["aten::_log_softmax", 170], ["aten::sum", 153], ["aten::cat", 152], ["aten::expand", 150], ["aten::narrow", 127], ["aten::max_pool2d", 125], ["aten::linear", 124], ["aten::slice", 123], ["aten::cross_entropy_loss", 118], ["aten::adaptive_avg_pool2d", 116], ["aten::detach_", 108], ["aten::_log_softmax_backward_data", 108], ["NllLossBackward", 102], ["detach_", 100], ["aten::ones_like", 95], ["aten::log_softmax", 90], ["aten::flatten", 84], ["aten::nll_loss_nd", 81], ["MaxPool2DWithIndicesBackward", 73], ["aten::nll_loss", 68], ["LogSoftmaxBackward", 53], ["aten::reshape", 29], ["ViewBackward", 25], ["TBackward", 18], ["aten::conj", 12]]}} [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 142461, "device_total_duration": 142461, "host_self_duration": 22804, "host_total_duration": 40246, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 130967, "device_total_duration": 130967, "host_self_duration": 29324, "host_total_duration": 41445, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 126619, "device_total_duration": 126619, "host_self_duration": 33121, "host_total_duration": 56998, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61939, "device_total_duration": 61939, "host_self_duration": 12238, "host_total_duration": 30460, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34245, "device_total_duration": 34245, "host_self_duration": 19843, "host_total_duration": 71527, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27298, "device_total_duration": 27298, "host_self_duration": 6090, "host_total_duration": 9482, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24098, "device_total_duration": 24098, "host_self_duration": 51013, "host_total_duration": 78125, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17860, "device_total_duration": 17860, "host_self_duration": 7106, "host_total_duration": 10311, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16038, "device_total_duration": 16038, "host_self_duration": 14329, 
"host_total_duration": 19289, "has_call_stack": true}, {"name": "aten::copy_", "calls": 588, "device_self_duration": 11492, "device_total_duration": 11492, "host_self_duration": 40255, "host_total_duration": 52853, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3822, "device_total_duration": 4677, "host_self_duration": 176, "host_total_duration": 411, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2409, "device_total_duration": 2409, "host_self_duration": 20515, "host_total_duration": 29081, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1887, "device_total_duration": 1887, "host_self_duration": 9879, "host_total_duration": 17059, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1420, "device_total_duration": 1420, "host_self_duration": 477, "host_total_duration": 562, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 275, "device_total_duration": 275, "host_self_duration": 575, "host_total_duration": 860, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 212, "device_total_duration": 212, "host_self_duration": 234, "host_total_duration": 445, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 639, "host_total_duration": 1034, "has_call_stack": true}, {"name": "aten::div", "calls": 198, "device_self_duration": 144, "device_total_duration": 144, "host_self_duration": 20135, "host_total_duration": 35158, "has_call_stack": true}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 53, "device_total_duration": 53, "host_self_duration": 108, "host_total_duration": 306, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 44, "device_total_duration": 44, "host_self_duration": 153, "host_total_duration": 327, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 170, "host_total_duration": 433, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 266, "host_total_duration": 346, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 388, "host_total_duration": 614, "has_call_stack": false}, {"name": "aten::empty", "calls": 4404, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 68147, "host_total_duration": 68147, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1881, "host_self_duration": 10693, "host_total_duration": 27597, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 659, "host_total_duration": 1105, "has_call_stack": true}, {"name": "aten::view", "calls": 846, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 14027, "host_total_duration": 14027, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 432, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 7600, "host_total_duration": 7600, "has_call_stack": true}, {"name": "aten::permute", "calls": 192, 
"device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5515, "host_total_duration": 9775, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 528, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11742, "host_total_duration": 26064, "has_call_stack": true}, {"name": "aten::contiguous", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5510, "host_total_duration": 31137, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 402, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11829, "host_total_duration": 11829, "has_call_stack": true}, {"name": "aten::to", "calls": 414, "device_self_duration": 0, "device_total_duration": 11492, "host_self_duration": 14834, "host_total_duration": 64332, "has_call_stack": true}, {"name": "aten::unsqueeze", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 3574, "host_total_duration": 6603, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1902, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12406, "host_total_duration": 12406, "has_call_stack": true}, {"name": "aten::slice", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 123, "host_total_duration": 223, "has_call_stack": true}, {"name": "aten::narrow", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 127, "host_total_duration": 350, "has_call_stack": true}, {"name": "aten::_cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 16282, "host_total_duration": 16933, "has_call_stack": true}, {"name": "aten::cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 17085, "has_call_stack": true}, {"name": "aten::stack", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 658, "host_total_duration": 24346, "has_call_stack": true}, {"name": "detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 100, "host_total_duration": 100, "has_call_stack": true}, {"name": "aten::detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 108, "host_total_duration": 208, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7245, "host_total_duration": 64243, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 6151, "host_total_duration": 70394, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7387, "host_total_duration": 77781, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 6544, "host_total_duration": 78071, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 9906, "host_total_duration": 87977, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 5354, "host_total_duration": 15665, "has_call_stack": true}, 
{"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 8516, "host_total_duration": 24181, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1420, "host_self_duration": 125, "host_total_duration": 687, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 212, "host_self_duration": 116, "host_total_duration": 561, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 84, "host_total_duration": 164, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 197, "host_total_duration": 329, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 255, "host_total_duration": 584, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 150, "host_total_duration": 229, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 124, "host_total_duration": 1408, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 90, "host_total_duration": 567, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 68, "host_total_duration": 414, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 81, "host_total_duration": 495, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 118, "host_total_duration": 1180, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 95, "host_total_duration": 452, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 102, "host_total_duration": 716, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 53, "host_self_duration": 53, "host_total_duration": 359, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12, "host_total_duration": 12, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 275, "host_self_duration": 182, "host_total_duration": 1303, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3030, "host_self_duration": 5457, "host_total_duration": 28494, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 18, "host_total_duration": 103, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 29, "host_total_duration": 55, "has_call_stack": false}, {"name": 
"ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 25, "host_total_duration": 80, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 144, "host_self_duration": 541, "host_total_duration": 987, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27298, "host_self_duration": 2174, "host_total_duration": 11656, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 175, "host_total_duration": 175, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61939, "host_self_duration": 4148, "host_total_duration": 34608, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 6081, "host_total_duration": 87772, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 3217, "host_total_duration": 90989, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4677, "host_self_duration": 73, "host_total_duration": 484, "has_call_stack": false}] -{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM:\nmin(blocks of this kernel / SM number of this GPU). If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). 
This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 72, 73, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 138, 342, 2, 4, 1, 0.13, 1.73], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 66, 81, 1, 2, 1, 0.15, 1.68], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 53, 9, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 44, 7, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 212, 35, 36, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 144, 24, 24, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17860, 61, 252, 5, 666.65, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, 
at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 6, 614, 102, 103, 101, 0.95, 24.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 134, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4710, 39, 66, 17, 10.11, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void 
cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 60, 25782, 430, 729, 352, 3.9, 42.09], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 453, 38, 68, 9, 73.28, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 105, 18, 18, 17, 22.4, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 30, 2, 3, 2, 4.44, 28.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 18, 4693, 261, 281, 252, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20448, 426, 676, 307, 6.83, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 584, 97, 100, 93, 9.8, 19.0], 
["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7817, 651, 671, 635, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2843, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 149, 25, 26, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_sgemm_64x64_nt", 102, 21084, 207, 279, 184, 10.24, 19.38]]}} +{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. 
It shows fine-grained low-level GPU utilization."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 72, 73, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 138, 342, 2, 4, 1, 0.13, 1.73], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 66, 81, 1, 2, 1, 0.15, 1.68], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 53, 9, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 44, 7, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 212, 35, 36, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 144, 24, 24, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17860, 61, 252, 5, 666.65, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, 
at::native::FillFunctor, at::detail::Array)", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 6, 614, 102, 103, 101, 0.95, 24.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 134, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4710, 39, 66, 17, 10.11, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void 
cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 60, 25782, 430, 729, 352, 3.9, 42.09], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 453, 38, 68, 9, 73.28, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 105, 18, 18, 17, 22.4, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 30, 2, 3, 2, 4.44, 28.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 18, 4693, 261, 281, 252, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20448, 426, 676, 307, 6.83, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 584, 97, 100, 93, 9.8, 19.0], 
["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7817, 651, 671, 635, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2843, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 149, 25, 26, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_sgemm_64x64_nt", 102, 21084, 207, 279, 184, 10.24, 19.38]]}} {"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 73.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 342.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 81.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 81.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 53.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3822.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1420.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 44.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 212.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 144.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17860.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array 
>(int, at::native::AddFunctor, at::detail::Array)", 39814.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 322.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1887.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2409.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27298.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59642.0], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2297.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27060.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7185.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 614.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 66472.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4065.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 80756.0], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 4631.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 134.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4710.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2662.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5369.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 
4692.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4573.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1504.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 4759.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 36957.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5219.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 25782.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 3917.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 453.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 105.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 30.0], ["volta_scudnn_128x128_relu_interior_nn_v1", 3010.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 4693.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3501.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 2995.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 3720.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20448.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 3270.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 8022.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 3627.0], ["volta_scudnn_128x64_relu_small_nn_v1", 3265.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 3200.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9597.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 584.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7817.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12704.0], ["volta_sgemm_128x32_nt", 8629.0], ["volta_sgemm_32x128_nn", 3053.0], ["volta_sgemm_32x128_nt", 2843.0], ["volta_sgemm_64x32_sliced1x4_nn", 150.0], ["volta_sgemm_64x32_sliced1x4_tn", 149.0], ["volta_sgemm_64x64_nn", 8551.0], ["volta_sgemm_64x64_nt", 21084.0]]}} -{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": 
"true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "
Step 5<br>Total: 182306us<br>Kernel: 99778us<br>Percentage: 54.73%", 3606, "Step 5<br>Total: 182306us<br>Memcpy: 3606us<br>Percentage: 1.98%", 98, "Step 5<br>Total: 182306us<br>Memset: 98us<br>Percentage: 0.05%", 41028, "Step 5<br>Total: 182306us<br>Runtime: 41028us<br>Percentage: 22.51%", 4341, "Step 5<br>Total: 182306us<br>DataLoader: 4341us<br>Percentage: 2.38%", 27460, "Step 5<br>Total: 182306us<br>CPU Exec: 27460us<br>Percentage: 15.06%", 5995, "Step 5<br>Total: 182306us<br>Other: 5995us<br>Percentage: 3.29%"], ["6", 99208, "Step 6<br>Total: 126183us<br>Kernel: 99208us<br>Percentage: 78.62%", 2948, "Step 6<br>Total: 126183us<br>Memcpy: 2948us<br>Percentage: 2.34%", 98, "Step 6<br>Total: 126183us<br>Memset: 98us<br>Percentage: 0.08%", 3406, "Step 6<br>Total: 126183us<br>Runtime: 3406us<br>Percentage: 2.7%", 0, "Step 6<br>Total: 126183us<br>DataLoader: 0us<br>Percentage: 0.0%", 16404, "Step 6<br>Total: 126183us<br>CPU Exec: 16404us<br>Percentage: 13.0%", 4119, "Step 6<br>Total: 126183us<br>Other: 4119us<br>Percentage: 3.26%"], ["7", 99114, "Step 7<br>Total: 127181us<br>Kernel: 99114us<br>Percentage: 77.93%", 2949, "Step 7<br>Total: 127181us<br>Memcpy: 2949us<br>Percentage: 2.32%", 98, "Step 7<br>Total: 127181us<br>Memset: 98us<br>Percentage: 0.08%", 3417, "Step 7<br>Total: 127181us<br>Runtime: 3417us<br>Percentage: 2.69%", 6, "Step 7<br>Total: 127181us<br>DataLoader: 6us<br>Percentage: 0.0%", 19521, "Step 7<br>Total: 127181us<br>CPU Exec: 19521us<br>Percentage: 15.35%", 2076, "Step 7<br>Total: 127181us<br>Other: 2076us<br>Percentage: 1.63%"], ["8", 99021, "Step 8<br>Total: 123079us<br>Kernel: 99021us<br>Percentage: 80.45%", 2975, "Step 8<br>Total: 123079us<br>Memcpy: 2975us<br>Percentage: 2.42%", 97, "Step 8<br>Total: 123079us<br>Memset: 97us<br>Percentage: 0.08%", 3544, "Step 8<br>Total: 123079us<br>Runtime: 3544us<br>Percentage: 2.88%", 0, "Step 8<br>Total: 123079us<br>DataLoader: 0us<br>Percentage: 0.0%", 15464, "Step 8<br>Total: 123079us<br>CPU Exec: 15464us<br>Percentage: 12.56%", 1978, "Step 8<br>Total: 123079us<br>Other: 1978us<br>Percentage: 1.61%"], ["9", 98791, "Step 9<br>Total: 163461us<br>Kernel: 98791us<br>Percentage: 60.44%", 3596, "Step 9<br>Total: 163461us<br>Memcpy: 3596us<br>Percentage: 2.2%", 97, "Step 9<br>Total: 163461us<br>Memset: 97us<br>Percentage: 0.06%", 8275, "Step 9<br>Total: 163461us<br>Runtime: 8275us<br>Percentage: 5.06%", 1370, "Step 9<br>Total: 163461us<br>DataLoader: 1370us<br>Percentage: 0.84%", 43905, "Step 9<br>Total: 163461us<br>CPU Exec: 43905us<br>Percentage: 26.86%", 7427, "Step 9<br>Total: 163461us<br>Other: 7427us<br>Percentage: 4.54%"], ["10", 98956, "Step 10<br>Total: 124198us<br>Kernel: 98956us<br>Percentage: 79.68%", 2885, "Step 10<br>Total: 124198us<br>Memcpy: 2885us<br>Percentage: 2.32%", 98, "Step 10<br>Total: 124198us<br>Memset: 98us<br>Percentage: 0.08%", 3714, "Step 10<br>Total: 124198us<br>Runtime: 3714us<br>Percentage: 2.99%", 1400, "Step 10<br>Total: 124198us<br>DataLoader: 1400us<br>Percentage: 1.13%", 13235, "Step 10<br>Total: 124198us<br>CPU Exec: 13235us<br>Percentage: 10.66%", 3910, "Step 10<br>Total: 124198us<br>Other: 3910us<br>Percentage: 3.15%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "
<ul><li>N/A</li></ul>
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}} +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "
Step 5<br>Total: 182306us<br>Kernel: 99778us<br>Percentage: 54.73%", 3606, "Step 5<br>Total: 182306us<br>Memcpy: 3606us<br>Percentage: 1.98%", 98, "Step 5<br>Total: 182306us<br>Memset: 98us<br>Percentage: 0.05%", 41028, "Step 5<br>Total: 182306us<br>Runtime: 41028us<br>Percentage: 22.51%", 4341, "Step 5<br>Total: 182306us<br>DataLoader: 4341us<br>Percentage: 2.38%", 27460, "Step 5<br>Total: 182306us<br>CPU Exec: 27460us<br>Percentage: 15.06%", 5995, "Step 5<br>Total: 182306us<br>Other: 5995us<br>Percentage: 3.29%"], ["6", 99208, "Step 6<br>Total: 126183us<br>Kernel: 99208us<br>Percentage: 78.62%", 2948, "Step 6<br>Total: 126183us<br>Memcpy: 2948us<br>Percentage: 2.34%", 98, "Step 6<br>Total: 126183us<br>Memset: 98us<br>Percentage: 0.08%", 3406, "Step 6<br>Total: 126183us<br>Runtime: 3406us<br>Percentage: 2.7%", 0, "Step 6<br>Total: 126183us<br>DataLoader: 0us<br>Percentage: 0.0%", 16404, "Step 6<br>Total: 126183us<br>CPU Exec: 16404us<br>Percentage: 13.0%", 4119, "Step 6<br>Total: 126183us<br>Other: 4119us<br>Percentage: 3.26%"], ["7", 99114, "Step 7<br>Total: 127181us<br>Kernel: 99114us<br>Percentage: 77.93%", 2949, "Step 7<br>Total: 127181us<br>Memcpy: 2949us<br>Percentage: 2.32%", 98, "Step 7<br>Total: 127181us<br>Memset: 98us<br>Percentage: 0.08%", 3417, "Step 7<br>Total: 127181us<br>Runtime: 3417us<br>Percentage: 2.69%", 6, "Step 7<br>Total: 127181us<br>DataLoader: 6us<br>Percentage: 0.0%", 19521, "Step 7<br>Total: 127181us<br>CPU Exec: 19521us<br>Percentage: 15.35%", 2076, "Step 7<br>Total: 127181us<br>Other: 2076us<br>Percentage: 1.63%"], ["8", 99021, "Step 8<br>Total: 123079us<br>Kernel: 99021us<br>Percentage: 80.45%", 2975, "Step 8<br>Total: 123079us<br>Memcpy: 2975us<br>Percentage: 2.42%", 97, "Step 8<br>Total: 123079us<br>Memset: 97us<br>Percentage: 0.08%", 3544, "Step 8<br>Total: 123079us<br>Runtime: 3544us<br>Percentage: 2.88%", 0, "Step 8<br>Total: 123079us<br>DataLoader: 0us<br>Percentage: 0.0%", 15464, "Step 8<br>Total: 123079us<br>CPU Exec: 15464us<br>Percentage: 12.56%", 1978, "Step 8<br>Total: 123079us<br>Other: 1978us<br>Percentage: 1.61%"], ["9", 98791, "Step 9<br>Total: 163461us<br>Kernel: 98791us<br>Percentage: 60.44%", 3596, "Step 9<br>Total: 163461us<br>Memcpy: 3596us<br>Percentage: 2.2%", 97, "Step 9<br>Total: 163461us<br>Memset: 97us<br>Percentage: 0.06%", 8275, "Step 9<br>Total: 163461us<br>Runtime: 8275us<br>Percentage: 5.06%", 1370, "Step 9<br>Total: 163461us<br>DataLoader: 1370us<br>Percentage: 0.84%", 43905, "Step 9<br>Total: 163461us<br>CPU Exec: 43905us<br>Percentage: 26.86%", 7427, "Step 9<br>Total: 163461us<br>Other: 7427us<br>Percentage: 4.54%"], ["10", 98956, "Step 10<br>Total: 124198us<br>Kernel: 98956us<br>Percentage: 79.68%", 2885, "Step 10<br>Total: 124198us<br>Memcpy: 2885us<br>Percentage: 2.32%", 98, "Step 10<br>Total: 124198us<br>Memset: 98us<br>Percentage: 0.08%", 3714, "Step 10<br>Total: 124198us<br>Runtime: 3714us<br>Percentage: 2.99%", 1400, "Step 10<br>Total: 124198us<br>DataLoader: 1400us<br>Percentage: 1.13%", 13235, "Step 10<br>Total: 124198us<br>CPU Exec: 13235us<br>Percentage: 10.66%", 3910, "Step 10<br>Total: 124198us<br>Other: 3910us<br>Percentage: 3.15%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "
<ul><li>N/A</li></ul>
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The bigger, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The bigger, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization."}} {"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 274794], ["CudnnConvolutionBackward", 274794], ["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::_convolution", 128683], ["aten::convolution", 128683], ["aten::conv2d", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["CudnnBatchNormBackward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::_batch_norm_impl_index", 34315], ["aten::batch_norm", 34315], ["aten::threshold_backward", 27280], ["ReluBackward1", 27280], ["aten::add_", 24052], ["aten::to", 18959], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::clamp_min_", 17862], ["aten::relu_", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 4695], ["MaxPool2DWithIndicesBackward", 4695], ["torch::autograd::AccumulateGrad", 3012], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::zero_", 1882], ["aten::max_pool2d_with_indices", 1422], ["aten::max_pool2d", 1422], ["aten::mm", 274], ["AddmmBackward", 274], ["aten::mean", 210], ["aten::adaptive_avg_pool2d", 210], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 145], ["MeanBackward1", 145], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 51], ["LogSoftmaxBackward", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", 
"name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::threshold_backward", 27280], ["aten::add_", 24052], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 3838], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::max_pool2d_with_indices", 1422], ["aten::mm", 274], ["aten::mean", 210], ["aten::addmm", 197], ["aten::div", 145], ["aten::_log_softmax_backward_data", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 119890], ["aten::cudnn_convolution_backward", 115797], ["aten::batch_norm", 105589], ["aten::add_", 97540], ["aten::_batch_norm_impl_index", 95925], ["aten::conv2d", 91000], ["aten::cudnn_batch_norm", 87823], ["aten::empty", 82024], ["aten::convolution", 81781], ["aten::_convolution", 74086], ["aten::cudnn_convolution", 64167], ["aten::cudnn_convolution_backward_weight", 60712], ["aten::to", 57776], ["aten::copy_", 56915], ["aten::cudnn_convolution_backward_input", 47359], ["CudnnBatchNormBackward", 41825], ["torch::autograd::AccumulateGrad", 37189], ["aten::cudnn_batch_norm_backward", 36641], ["aten::mul_", 35389], ["aten::relu_", 29432], ["aten::zero_", 28309], ["aten::add", 23831], ["aten::clamp_min_", 19059], ["aten::empty_like", 18591], ["aten::fill_", 17657], ["aten::resize_", 15019], ["ReluBackward1", 14944], ["aten::clamp_min", 12503], ["aten::threshold_backward", 12062], ["aten::view", 9046], ["AddmmBackward", 2026], ["aten::linear", 1463], ["aten::mm", 1424], ["aten::zeros", 1319], ["aten::cross_entropy_loss", 1225], ["aten::addmm", 1060], ["NllLossBackward", 889], ["aten::nll_loss_backward", 747], ["aten::t", 725], ["MeanBackward1", 663], ["aten::max_pool2d", 599], ["MaxPool2DWithIndicesBackward", 590], ["aten::adaptive_avg_pool2d", 581], ["aten::log_softmax", 580], ["aten::nll_loss_nd", 507], ["LogSoftmaxBackward", 500], ["aten::max_pool2d_with_indices_backward", 493], ["aten::ones_like", 470], ["aten::div", 469], ["aten::mean", 454], ["aten::empty_strided", 453], ["aten::_log_softmax_backward_data", 424], ["aten::max_pool2d_with_indices", 422], ["aten::_log_softmax", 420], ["aten::nll_loss", 418], ["aten::transpose", 413], ["aten::sum", 411], ["aten::nll_loss_forward", 343], ["aten::detach_", 323], ["aten::as_strided", 244], ["aten::expand", 237], ["aten::set_", 221], ["AddBackward0", 200], ["aten::flatten", 163], ["detach_", 156], ["TBackward", 151], ["ViewBackward", 132], ["aten::reshape", 88], ["aten::conj", 15]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 82024], ["aten::add_", 62385], ["aten::cudnn_convolution", 35632], ["aten::cudnn_convolution_backward_input", 31902], ["aten::cudnn_convolution_backward_weight", 30672], ["aten::mul_", 24617], ["aten::cudnn_batch_norm", 23800], ["aten::add", 17808], ["aten::cudnn_batch_norm_backward", 15118], ["aten::resize_", 15019], ["aten::zero_", 10815], ["aten::relu_", 10373], ["aten::_convolution", 9919], ["aten::batch_norm", 9664], ["aten::fill_", 9660], ["aten::conv2d", 9219], ["aten::view", 
9046], ["aten::clamp_min", 8409], ["aten::empty_like", 8385], ["aten::_batch_norm_impl_index", 8102], ["aten::threshold_backward", 7820], ["aten::cudnn_convolution_backward", 7726], ["aten::convolution", 7695], ["torch::autograd::AccumulateGrad", 7181], ["aten::clamp_min_", 6556], ["CudnnBatchNormBackward", 5184], ["CudnnConvolutionBackward", 4093], ["ReluBackward1", 2882], ["aten::mm", 1032], ["aten::zeros", 877], ["aten::addmm", 652], ["aten::to", 547], ["aten::nll_loss_backward", 463], ["aten::empty_strided", 453], ["aten::div", 343], ["aten::max_pool2d_with_indices", 325], ["aten::t", 312], ["aten::nll_loss_forward", 264], ["aten::transpose", 254], ["aten::as_strided", 244], ["AddmmBackward", 244], ["aten::mean", 233], ["aten::copy_", 230], ["aten::set_", 221], ["aten::max_pool2d_with_indices_backward", 213], ["aten::sum", 201], ["AddBackward0", 200], ["aten::max_pool2d", 177], ["aten::_log_softmax", 168], ["aten::detach_", 167], ["detach_", 156], ["aten::expand", 152], ["NllLossBackward", 142], ["aten::_log_softmax_backward_data", 142], ["aten::linear", 139], ["aten::cross_entropy_loss", 138], ["aten::adaptive_avg_pool2d", 127], ["aten::log_softmax", 106], ["MaxPool2DWithIndicesBackward", 97], ["aten::ones_like", 96], ["MeanBackward1", 95], ["aten::nll_loss_nd", 89], ["aten::flatten", 88], ["LogSoftmaxBackward", 76], ["aten::nll_loss", 75], ["ViewBackward", 44], ["aten::reshape", 43], ["TBackward", 33], ["aten::conj", 15]]}} [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 141300, "device_total_duration": 141300, "host_self_duration": 30672, "host_total_duration": 60712, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 133494, "device_total_duration": 133494, "host_self_duration": 31902, "host_total_duration": 47359, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 128683, "device_total_duration": 128683, "host_self_duration": 35632, "host_total_duration": 64167, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61899, "device_total_duration": 61899, "host_self_duration": 15118, "host_total_duration": 36641, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34315, "device_total_duration": 34315, "host_self_duration": 23800, "host_total_duration": 87823, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27280, "device_total_duration": 27280, "host_self_duration": 7820, "host_total_duration": 12062, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24052, "device_total_duration": 24052, "host_self_duration": 62385, "host_total_duration": 97540, "has_call_stack": true}, {"name": "aten::copy_", "calls": 12, "device_self_duration": 18959, "device_total_duration": 18959, "host_self_duration": 230, "host_total_duration": 56915, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17862, "device_total_duration": 17862, "host_self_duration": 8409, "host_total_duration": 12503, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16026, "device_total_duration": 16026, "host_self_duration": 17808, "host_total_duration": 23831, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3838, "device_total_duration": 4695, 
"host_self_duration": 213, "host_total_duration": 493, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2395, "device_total_duration": 2395, "host_self_duration": 24617, "host_total_duration": 35389, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1888, "device_total_duration": 1888, "host_self_duration": 9660, "host_total_duration": 17657, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1422, "device_total_duration": 1422, "host_self_duration": 325, "host_total_duration": 422, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 274, "device_total_duration": 274, "host_self_duration": 1032, "host_total_duration": 1424, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 210, "device_total_duration": 210, "host_self_duration": 233, "host_total_duration": 454, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 652, "host_total_duration": 1060, "has_call_stack": true}, {"name": "aten::div", "calls": 6, "device_self_duration": 145, "device_total_duration": 145, "host_self_duration": 343, "host_total_duration": 469, "has_call_stack": false}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 51, "device_total_duration": 51, "host_self_duration": 142, "host_total_duration": 424, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 45, "device_total_duration": 45, "host_self_duration": 201, "host_total_duration": 411, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 168, "host_total_duration": 420, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 264, "host_total_duration": 343, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 463, "host_total_duration": 747, "has_call_stack": false}, {"name": "aten::empty", "calls": 4212, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 82024, "host_total_duration": 82024, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1882, "host_self_duration": 10815, "host_total_duration": 28309, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 877, "host_total_duration": 1319, "has_call_stack": true}, {"name": "aten::to", "calls": 36, "device_self_duration": 0, "device_total_duration": 18959, "host_self_duration": 547, "host_total_duration": 57776, "has_call_stack": true}, {"name": "detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 156, "host_total_duration": 156, "has_call_stack": true}, {"name": "aten::detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 167, "host_total_duration": 323, "has_call_stack": true}, {"name": "aten::set_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 221, "host_total_duration": 221, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 18, "device_self_duration": 0, 
"device_total_duration": 0, "host_self_duration": 453, "host_total_duration": 453, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1896, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15019, "host_total_duration": 15019, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9919, "host_total_duration": 74086, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 7695, "host_total_duration": 81781, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9219, "host_total_duration": 91000, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 336, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 8385, "host_total_duration": 18591, "has_call_stack": true}, {"name": "aten::view", "calls": 654, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 9046, "host_total_duration": 9046, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 8102, "host_total_duration": 95925, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 9664, "host_total_duration": 105589, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 6556, "host_total_duration": 19059, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 10373, "host_total_duration": 29432, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1422, "host_self_duration": 177, "host_total_duration": 599, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 210, "host_self_duration": 127, "host_total_duration": 581, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 88, "host_total_duration": 163, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 42, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 244, "host_total_duration": 244, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 254, "host_total_duration": 413, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 312, "host_total_duration": 725, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 237, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 139, "host_total_duration": 1463, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 106, "host_total_duration": 580, "has_call_stack": true}, {"name": 
"aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 75, "host_total_duration": 418, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 89, "host_total_duration": 507, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 138, "host_total_duration": 1225, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 96, "host_total_duration": 470, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 142, "host_total_duration": 889, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 51, "host_self_duration": 76, "host_total_duration": 500, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15, "host_total_duration": 15, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 274, "host_self_duration": 244, "host_total_duration": 2026, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3012, "host_self_duration": 7181, "host_total_duration": 37189, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 33, "host_total_duration": 151, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 43, "host_total_duration": 88, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 44, "host_total_duration": 132, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 145, "host_self_duration": 95, "host_total_duration": 663, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27280, "host_self_duration": 2882, "host_total_duration": 14944, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 200, "host_total_duration": 200, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61899, "host_self_duration": 5184, "host_total_duration": 41825, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 7726, "host_total_duration": 115797, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 4093, "host_total_duration": 119890, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4695, "host_self_duration": 97, "host_total_duration": 590, "has_call_stack": false}] -{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", 
"name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM:\nmin(blocks of this kernel / SM number of this GPU). If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nOccupancy is the ratio of active threads on an SM to the maximum number of active threads supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted sum of all kernels OCC_K using kernel's execution duration as weight."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 54, 57, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 108, 216, 2, 5, 1, 0.16, 2.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 132, 150, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 132, 155, 1, 2, 1, 0.16, 1.83], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 51, 8, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1422, 237, 243, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 45, 8, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 210, 35, 35, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 145, 24, 25, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 
1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17862, 61, 252, 5, 666.77, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 325, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27084, 181, 454, 53, 3.12, 64.02], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, 
float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 126, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4681, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 7939, 662, 733, 584, 7.54, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 383, 32, 34, 29, 71.72, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 54, 9, 10, 8, 12.8, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 31, 3, 4, 2, 4.39, 27.74], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 3034, 506, 520, 491, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20473, 427, 681, 309, 6.82, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 7976, 266, 316, 92, 37.08, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3647, 608, 620, 602, 39.2, 25.0], 
["volta_scudnn_128x64_relu_small_nn_v1", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 582, 97, 99, 94, 9.8, 19.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7819, 652, 670, 634, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2837, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 148, 25, 25, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_sgemm_64x64_nt", 102, 21125, 207, 281, 184, 10.28, 19.38]]}} +{"data": {"columns": [{"type": "string", "name": "Name"}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nThe bigger, the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. 
It shows fine-grained low-level GPU utilization."}], "rows": [["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 54, 57, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 108, 216, 2, 5, 1, 0.16, 2.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 132, 150, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 132, 155, 1, 2, 1, 0.16, 1.83], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 6, 51, 8, 9, 8, 0.1, 1.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 6, 42, 7, 7, 7, 0.1, 1.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 6, 1422, 237, 243, 234, 313.6, 100.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 6, 45, 8, 8, 7, 0.02, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 6, 210, 35, 35, 35, 51.2, 100.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 6, 145, 24, 25, 24, 156.8, 100.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 6, 30, 5, 5, 5, 1.56, 5.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 294, 17862, 61, 252, 5, 666.77, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 318, 325, 1, 2, 1, 0.01, 0.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, 
at::native::FillFunctor, at::detail::Array)", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 150, 27084, 181, 454, 53, 3.12, 64.02], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 36, 126, 4, 5, 2, 0.4, 3.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 78, 4681, 60, 126, 
20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 6, 12, 2, 2, 2, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 6, 18, 3, 3, 3, 0.01, 0.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 12, 7939, 662, 733, 584, 7.54, 25.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 12, 383, 32, 34, 29, 71.72, 100.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 6, 54, 9, 10, 8, 12.8, 100.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 12, 31, 3, 4, 2, 4.39, 27.74], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 6, 3034, 506, 520, 491, 19.6, 25.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 48, 20473, 427, 681, 309, 6.82, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 30, 7976, 266, 316, 92, 37.08, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 6, 3647, 608, 620, 602, 39.2, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 6, 582, 97, 99, 94, 9.8, 19.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 12, 7819, 652, 670, 634, 15.96, 19.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_sgemm_128x32_nt", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_32x128_nn", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_sgemm_32x128_nt", 18, 2837, 158, 159, 156, 22.05, 50.0], ["volta_sgemm_64x32_sliced1x4_nn", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", 6, 148, 25, 25, 24, 1.0, 13.0], ["volta_sgemm_64x64_nn", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_sgemm_64x64_nt", 102, 21125, 207, 281, 184, 10.28, 19.38]]}} {"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": 
[["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 57.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 216.0], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 150.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 155.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 51.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3838.0], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1422.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 45.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 210.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 145.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17862.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 39753.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 325.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1888.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2395.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, 
at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27280.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59568.0], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2331.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27084.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7231.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 43471.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4038.0], ["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 86855.0], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 5901.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 126.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4648.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2632.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5314.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4681.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4559.0], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1484.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 37016.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float 
const*, bool, int, int)", 5221.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 35106.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 7939.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 383.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 54.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 31.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3550.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", 3034.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 25342.0], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20473.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 3697.0], ["volta_scudnn_128x64_relu_interior_nn_v1", 7976.0], ["volta_scudnn_128x64_relu_medium_nn_v1", 3647.0], ["volta_scudnn_128x64_relu_small_nn_v1", 3273.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9559.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", 582.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7819.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12761.0], ["volta_sgemm_128x32_nt", 8658.0], ["volta_sgemm_32x128_nn", 3059.0], ["volta_sgemm_32x128_nt", 2837.0], ["volta_sgemm_64x32_sliced1x4_nn", 149.0], ["volta_sgemm_64x32_sliced1x4_tn", 148.0], ["volta_sgemm_64x64_nn", 8544.0], ["volta_sgemm_64x64_nt", 21125.0]]}}
diff --git a/tb_plugin/torch_tb_profiler/consts.py b/tb_plugin/torch_tb_profiler/consts.py
index 1dbfc9fcf..8e41a88f1 100644
--- a/tb_plugin/torch_tb_profiler/consts.py
+++ b/tb_plugin/torch_tb_profiler/consts.py
@@ -25,27 +25,34 @@
 TOOLTIP_GPU_UTIL = \
     "GPU Utilization:\n" \
-    "GPU busy time / All steps time. " \
+    "GPU busy time / All steps time. The bigger, the better. " \
     "GPU busy time is the time during which there is at least one GPU kernel running on it. " \
     "All steps time is the total time of all profiler steps(or called as iterations).\n"
 TOOLTIP_SM_EFFICIENCY = \
     "Est. SM Efficiency:\n" \
-    "Estimated Stream Multiprocessor Efficiency. " \
-    "Est. SM Efficiency of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). " \
+    "Estimated Stream Multiprocessor Efficiency. The bigger, the better. " \
+    "This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). " \
     "This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, " \
     "divided by all steps time.\n"
-TOOLTIP_OCCUPANCY = \
+TOOLTIP_OCCUPANCY_COMMON = \
     "Est. Achieved Occupancy:\n" \
-    "Occupancy is the ratio of active threads on an SM " \
-    "to the maximum number of active threads supported by the SM. " \
+    "The bigger, the better. Occupancy is the ratio of active warps on an SM " \
+    "to the maximum number of active warps supported by the SM. " \
     "The theoretical occupancy of a kernel is upper limit occupancy of this kernel, " \
     "limited by multiple factors such as kernel shape, kernel used resource, " \
-    "and the GPU compute capability." \
+    "and the GPU compute capability.\n" \
     "Est. Achieved Occupancy of a kernel, OCC_K = " \
-    "min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). " \
-    "This overall number is the weighted sum of all kernels OCC_K " \
-    "using kernel's execution duration as weight."
+    "min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). "
+TOOLTIP_OCCUPANCY_OVERVIEW = \
+    "This overall number is the weighted average of all kernels' OCC_K " \
+    "using kernel's execution duration as weight. " \
+    "It shows fine-grained low-level GPU utilization."
+TOOLTIP_OCCUPANCY_TABLE = \
+    "This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, " \
+    "using each call's execution duration as weight. " \
+    "It shows fine-grained low-level GPU utilization."
 TOOLTIP_BLOCKS_PER_SM = \
-    "Blocks Per SM:\n" \
-    "min(blocks of this kernel / SM number of this GPU). " \
-    "If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized."
+    "Blocks Per SM = blocks of this kernel / SM number of this GPU.\n" \
+    "If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n" \
+    "\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, " \
+    "using each call's execution duration as weight."
diff --git a/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
index ecadcb30c..44923bcac 100644
--- a/tb_plugin/torch_tb_profiler/profiler/run_generator.py
+++ b/tb_plugin/torch_tb_profiler/profiler/run_generator.py
@@ -294,7 +294,7 @@ def _get_gpu_metrics_columns(blocks_per_sm_count, occupancy_count):
                                "tooltip": consts.TOOLTIP_BLOCKS_PER_SM})
         if occupancy_count > 0:
             columns.append({"type": "number", "name": "Mean Est. Achieved Occupancy (%)",
-                            "tooltip": consts.TOOLTIP_OCCUPANCY})
+                            "tooltip": consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_TABLE})
         return columns
 
     def _generate_kernel_op_table(self):
diff --git a/tb_plugin/torch_tb_profiler/run.py b/tb_plugin/torch_tb_profiler/run.py
index 2760111df..ee4e9d961 100644
--- a/tb_plugin/torch_tb_profiler/run.py
+++ b/tb_plugin/torch_tb_profiler/run.py
@@ -211,7 +211,7 @@ def get_gpu_metrics_tooltip(has_sm_efficiency, has_occupancy):
             if has_sm_efficiency:
                 tooltip += "\n" + consts.TOOLTIP_SM_EFFICIENCY
             if has_occupancy:
-                tooltip += "\n" + consts.TOOLTIP_OCCUPANCY
+                tooltip += "\n" + consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_OVERVIEW
             return tooltip
 
         data, has_occupancy, has_sm_efficiency = get_gpu_metrics_data(self)
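For reference, the formulas the new tooltips describe, restated as a minimal Python sketch. This is illustrative only, not part of the patch: the input names (`kernel_blocks`, `kernel_threads`, `sm_count`, `max_threads_per_sm`, `theoretical_occupancy`) are hypothetical stand-ins for values the plugin derives from the trace in `run_generator.py`.

```python
def blocks_per_sm(kernel_blocks: int, sm_count: int) -> float:
    # Blocks Per SM = blocks of this kernel / SM number of this GPU.
    # A value below 1.0 means some multiprocessors sit idle for this kernel.
    return kernel_blocks / sm_count


def est_sm_efficiency(kernel_blocks: int, sm_count: int) -> float:
    # SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%).
    return min(kernel_blocks / sm_count, 1.0)


def est_achieved_occupancy(kernel_threads: int, sm_count: int,
                           max_threads_per_sm: int,
                           theoretical_occupancy: float) -> float:
    # OCC_K = min(threads of the kernel / SM number / max threads per SM,
    #             theoretical occupancy of the kernel).
    return min(kernel_threads / sm_count / max_threads_per_sm,
               theoretical_occupancy)


def duration_weighted_mean(values, durations):
    # Both the overview and the kernel table report duration-weighted means:
    # each value (one per call or per kernel) is weighted by its execution
    # duration.
    total = sum(durations)
    return sum(v * d for v, d in zip(values, durations)) / total if total else 0.0
```

This also shows why `TOOLTIP_OCCUPANCY` is split in this patch: the overview (`run.py`) and the kernel table (`run_generator.py`) share `TOOLTIP_OCCUPANCY_COMMON` and differ only in whether the duration-weighted mean is taken over all kernels (`TOOLTIP_OCCUPANCY_OVERVIEW`) or over all calls of one kernel (`TOOLTIP_OCCUPANCY_TABLE`). The occupancy tooltip embedded in the expected test output above is exactly `TOOLTIP_OCCUPANCY_COMMON + TOOLTIP_OCCUPANCY_TABLE`, which is why `result_check_file.txt` changes in the same commit.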