35 changes: 25 additions & 10 deletions tb_plugin/README.md
@@ -24,7 +24,7 @@ and give optimization recommendations.

* Build the wheel
- `python setup.py build_fe sdist bdist_wheel` \
**_Note_**: the build_fe step need setup yarn and nodejs
**_Note_**: the build_fe step requires yarn and Node.js to be set up
- `python setup.py sdist bdist_wheel`

### Quick Start Instructions
@@ -37,12 +37,12 @@ and give optimization recommendations.
[kineto/tb_plugin/examples/resnet50_profiler_api.py](https://github.com/pytorch/kineto/blob/master/tb_plugin/examples/resnet50_profiler_api.py).
You can also learn how to profile your model and generate profiling data from the [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html?highlight=tensorboard) tutorial.

Note: The recommended way to produce profiling data is assigning "torch.profiler.tensorboard_trace_handler"
to "on_trace_ready" on creation of "torch.profiler.schedule".
Note: The recommended way to produce profiling data is to assign `torch.profiler.tensorboard_trace_handler`
to `on_trace_ready` when creating `torch.profiler.profile`.
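For example, a minimal, hedged sketch of that wiring (the toy model and the `./samples` output folder are illustrative, not part of the plugin):

```python
import torch
import torch.profiler

model = torch.nn.Linear(32, 2)   # toy model, purely illustrative
data = torch.randn(8, 32)

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./samples')
) as p:
    for step in range(4):              # wait + warmup + active = 4 steps
        model(data).sum().backward()   # stand-in for a real training step
        p.step()                       # tell the profiler one step finished
```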

* Start TensorBoard

Specify the profiling data folder to "logdir" in Tensorboard. If you use the above samples data, start TensorBoard with:
Specify the profiling data folder via `--logdir` when starting TensorBoard. If you use the sample data above, start TensorBoard with:

`tensorboard --logdir=./samples`

@@ -56,14 +56,17 @@ and give optimization recommendations.
* Open TensorBoard in Chrome browser

Open URL `http://localhost:6006` in the browser.
If you use '--bind_all' in tensorboard start cmd, the hostname may not be 'localhost'. You may find it in the log printed after the cmd.
If you use `--bind_all` in the TensorBoard start command, the hostname may not be 'localhost'; you can find it in the log printed after the command starts.

* Navigate to PYTORCH_PROFILER tab

If the files under `--logdir` are too big or too many,
please wait a while and refresh the browser to see the latest loaded results.
* Also support loading profiling data stored in AWS(S3://), Azure blob(https://\<account\>.blob.core.windows.net) and Google Cloud(GS://)
* S3: install boto3. set environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\

* Loading profiling data from cloud
* S3 (S3://)

Install `boto3` and set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well (see the Python sketch after this list).\
For MinIO, the S3 URL should start with the bucket name, `s3://<bucket>/<folder>/`, rather than the MinIO prefix `s3://minio/<bucket>/<folder>`; in this case `S3_ENDPOINT` is needed as well. \
For example, the following commands can be used to create MinIO storage after following these guides:
* Server: https://docs.min.io/docs/minio-quickstart-guide.html
@@ -81,8 +84,15 @@ and give optimization recommendations.
export S3_ENDPOINT=http://localhost:9000
tensorboard --logdir=s3://profiler/version_2/ --bind_all
```
* Azure Blob: install azure-storage-blob. Optionally, set environment variable `AZURE_STORAGE_CONNECTION_STRING`
* Google Cloud: install google-cloud-storage.

* Azure Blob (https://\<account\>.blob.core.windows.net)

Install `azure-storage-blob`. Optionally, set the environment variable `AZURE_STORAGE_CONNECTION_STRING`.

* Google Cloud (GS://)

Install `google-cloud-storage`.
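As referenced in the S3 item above, here is a hedged Python sketch of the same credential setup for cases where TensorBoard is launched from a script; every value is a placeholder:

```python
import os

# Placeholder credentials -- substitute your own before starting TensorBoard.
os.environ["AWS_ACCESS_KEY_ID"] = "<access-key-id>"
os.environ["AWS_SECRET_ACCESS_KEY"] = "<secret-access-key>"
# Only needed for S3-compatible stores such as MinIO:
os.environ["S3_ENDPOINT"] = "http://localhost:9000"
# For Azure Blob, the optional connection string works the same way:
os.environ["AZURE_STORAGE_CONNECTION_STRING"] = "<connection-string>"
```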

---
> **_NOTES:_** For AWS, Google Cloud and Azure Blob, the trace files need to be put in a top-level folder under the bucket/container.
---
@@ -93,7 +103,7 @@ and give optimization recommendations.

and open TensorBoard in the browser to see all the views described below.

Note: for accessing data in azure blob, you need to install torch-tb-profiler with cmd: `pip install torch-tb-profiler[blob]`
Note: to access data in Azure Blob, you need to install torch-tb-profiler with `pip install torch-tb-profiler[blob]`

### Quick Usage Instructions

@@ -358,3 +368,8 @@ one worker is much larger than others, there may be a problem of load balancing
* Data Transfer Time (us): Total time actually used for data transfer by operators of this type.
* Ave Data Transfer Time (us): Average time actually used for data transfer by each operator of this type.
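To make the relationship between these two columns concrete, a small illustrative computation (the numbers are made up):

```python
# Illustrative per-call data-transfer times (us) for one operator type.
transfer_times_us = [120.0, 95.0, 130.0]

total_us = sum(transfer_times_us)            # "Data Transfer Time (us)"
avg_us = total_us / len(transfer_times_us)   # "Ave Data Transfer Time (us)"
print(total_us, avg_us)                      # 345.0 115.0
```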

### PyTorch Profiler TensorBoard Plugin 0.2 Release Notes

Known issue: this software does not support Python 3.9.0, 3.9.1, or 3.9.2.
If launching TensorBoard reports an "ImportError" together with a "circular import" message,
please update your Python to a newer version.
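A hedged sketch of a guard against the affected versions, for anyone who wants to fail fast before launching TensorBoard:

```python
import sys

# The release notes above list 3.9.0-3.9.2 as unsupported.
if sys.version_info[:3] in {(3, 9, 0), (3, 9, 1), (3, 9, 2)}:
    raise RuntimeError(
        "torch-tb-profiler 0.2 does not support Python 3.9.0-3.9.2; "
        "please upgrade Python."
    )
```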
6 changes: 3 additions & 3 deletions tb_plugin/docs/gpu_utilization.md
@@ -1,15 +1,15 @@
* GPU Utilization: GPU busy time / all steps time. The bigger, the better. All steps time is the total time of all profiler steps(or called as iterations).
* GPU Utilization: GPU busy time / all steps time. The higher, the better. All steps time is the total time of all profiler steps (also called iterations).
GPU busy time is the time during “all steps time” when there is at least one GPU kernel running on this GPU.
However, this high-level utilization metric is coarse. It can’t tell how many SMs (Stream Multiprocessors) are in use.
For example, a kernel with a single thread running continuously will get 100% GPU utilization.
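A hedged sketch of the busy-time computation described above, assuming kernel `(start, end)` intervals in microseconds have already been extracted from a trace; overlapping intervals are merged so that concurrent kernels are not double-counted:

```python
def gpu_utilization(kernel_intervals, window_start, window_end):
    # Clip each kernel interval to the "all steps time" window.
    clipped = sorted(
        (max(s, window_start), min(e, window_end))
        for s, e in kernel_intervals
        if e > window_start and s < window_end
    )
    busy, cur_start, cur_end = 0.0, None, None
    for s, e in clipped:
        if cur_end is None or s > cur_end:   # gap: close the previous busy span
            if cur_end is not None:
                busy += cur_end - cur_start
            cur_start, cur_end = s, e
        else:                                # overlap: extend the current span
            cur_end = max(cur_end, e)
    if cur_end is not None:
        busy += cur_end - cur_start
    return busy / (window_end - window_start)

# Two overlapping kernels cover 0-30 us of a 100 us window -> 0.3 utilization.
print(gpu_utilization([(0, 20), (10, 30)], 0, 100))  # 0.3
```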

* Est. SM Efficiency: Estimated Stream Multiprocessor Efficiency. The bigger, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%).
* Est. SM Efficiency: Estimated Stream Multiprocessor Efficiency. The higher, the better. For a single kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%).
The overall number is the sum of all kernels’ SM_Eff_K, weighted by each kernel’s execution duration and divided by “all steps time”.
It shows the utilization of the GPU’s Stream Multiprocessors.
Although it is finer grained than the “GPU Utilization” above, it still can’t tell the whole story.
For example, a kernel with only one thread per block can’t fully utilize each SM.
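A hedged sketch of that duration-weighted aggregation, with made-up kernel records of the shape `(blocks, duration_us)`:

```python
def est_sm_efficiency(kernels, num_sms, all_steps_time_us):
    # kernels: iterable of (blocks, duration_us) pairs; values are illustrative.
    weighted = sum(min(blocks / num_sms, 1.0) * dur for blocks, dur in kernels)
    return weighted / all_steps_time_us

# 80 SMs; a half-filling kernel (40 blocks, 200 us) and a saturating one
# (160 blocks, 300 us) over a 1000 us window:
print(est_sm_efficiency([(40, 200), (160, 300)], 80, 1000))
# (0.5 * 200 + 1.0 * 300) / 1000 = 0.4
```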

* Est. Achieved Occupancy: The bigger, the better. The definition of occupancy is [here](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm).
* Est. Achieved Occupancy: For most cases, such as memory-bandwidth-bound kernels, the higher the better. [Reference](http://developer.download.nvidia.com/GTC/PDF/GTC2012/PresentationPDF/S0514-GTC2012-GPU-Performance-Analysis.pdf). The definition of occupancy is [here](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm).
> **Contributor:** The improvement in performance does not normally scale linearly after some point, but I am not sure how best to express this in a clear way.
>
> **Contributor Author:** We will consider a clearer description in the next release.
>
> **Contributor:** I had to fix a couple of linter warnings, so I changed this as well to "a higher value often translates to better performance, especially when the starting value is very low." Sounds OK?
>
> **Contributor Author:** Yes, it is OK.

Occupancy is the ratio of active warps on an SM to the maximum number of
active warps supported by the SM. The theoretical occupancy of a kernel is the upper-limit occupancy of that kernel, limited by multiple
factors such as kernel shape, kernel resource usage, and the GPU compute capability.
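A small illustrative calculation of that ratio (the warp counts are made up; the per-SM maximum depends on the GPU's compute capability):

```python
active_warps_per_sm = 48   # measured average, illustrative
max_warps_per_sm = 64      # hardware limit for this compute capability
occupancy = active_warps_per_sm / max_warps_per_sm
print(occupancy)           # 0.75
```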
10 changes: 5 additions & 5 deletions tb_plugin/examples/resnet50_profiler_api.py
@@ -31,12 +31,12 @@
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(
wait=2,
warmup=3,
active=6),
wait=1,
warmup=1,
active=2),
on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='worker0'),
record_shapes=True,
profile_memory=True,
profile_memory=True, # This will take 1 to 2 minutes. Setting it to False could greatly speed things up.
> **Contributor:** I suggest we also add a similar comment to `with_stack`.
>
> **Contributor Author:** Will do in the next release.
with_stack=True
) as p:
for step, data in enumerate(trainloader, 0):
Expand All @@ -49,6 +49,6 @@
optimizer.zero_grad()
loss.backward()
optimizer.step()
if step + 1 >= 22:
if step + 1 >= 4:
break
p.step()
21 changes: 20 additions & 1 deletion tb_plugin/fe/src/api/generated/api.ts
@@ -596,6 +596,25 @@ export interface Performance {
*/
children?: Array<Performance>
}
/**
*
* @export
* @interface Runs
*/
export interface Runs {
/**
*
* @type {Array<string>}
* @memberof Runs
*/
runs: Array<string>
/**
*
* @type {boolean}
* @memberof Runs
*/
loading: boolean
}
/**
*
* @export
@@ -2162,7 +2181,7 @@ export const DefaultApiFp = function (configuration?: Configuration) {
*/
runsGet(
options?: any
): (fetch?: FetchAPI, basePath?: string) => Promise<Array<string>> {
): (fetch?: FetchAPI, basePath?: string) => Promise<Runs> {
const localVarFetchArgs = DefaultApiFetchParamCreator(
configuration
).runsGet(options)
16 changes: 13 additions & 3 deletions tb_plugin/fe/src/api/openapi.yaml
@@ -13,9 +13,7 @@ paths:
content:
'*/*':
schema:
type: array
items:
type: string
$ref: '#/components/schemas/Runs'
/views:
get:
parameters:
@@ -453,6 +451,18 @@ paths:
type: object
components:
schemas:
Runs:
type: object
required:
- runs
- loading
properties:
runs:
type: array
items:
type: string
loading:
type: boolean
Performance:
type: object
required:
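A hedged sketch of what `/runs` now returns under the `Runs` schema above: an object instead of the previous bare string array (the run name is illustrative):

```python
import json

# Response body shaped per the Runs schema; values are illustrative.
payload = json.loads('{"runs": ["resnet50_num_workers_0"], "loading": true}')
assert isinstance(payload["runs"], list)
assert isinstance(payload["loading"], bool)
```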
19 changes: 18 additions & 1 deletion tb_plugin/fe/src/app.tsx
@@ -2,6 +2,9 @@
* Copyright (c) Microsoft Corporation. All rights reserved.
*--------------------------------------------------------------------------------------------*/

import Card from '@material-ui/core/Card'
import CardContent from '@material-ui/core/CardContent'
import CardHeader from '@material-ui/core/CardHeader'
import ClickAwayListener from '@material-ui/core/ClickAwayListener'
import CssBaseline from '@material-ui/core/CssBaseline'
import Divider from '@material-ui/core/Divider'
@@ -15,6 +18,7 @@ import Select, { SelectProps } from '@material-ui/core/Select'
import { makeStyles } from '@material-ui/core/styles'
import ChevronLeftIcon from '@material-ui/icons/ChevronLeft'
import ChevronRightIcon from '@material-ui/icons/ChevronRight'
import Typography from '@material-ui/core/Typography'
import 'antd/es/button/style/css'
import 'antd/es/list/style/css'
import 'antd/es/table/style/css'
@@ -130,6 +134,7 @@ export const App = () => {

const [run, setRun] = React.useState<string>('')
const [runs, setRuns] = React.useState<string[]>([])
const [runsLoading, setRunsLoading] = React.useState(true)

const [workers, setWorkers] = React.useState<string[]>([])
const [worker, setWorker] = React.useState<string>('')
@@ -152,7 +157,8 @@
while (true) {
try {
const runs = await api.defaultApi.runsGet()
setRuns(runs)
setRuns(runs.runs)
setRunsLoading(runs.loading)
} catch (e) {
console.info('Cannot fetch runs: ', e)
}
@@ -248,6 +254,17 @@
}

const renderContent = () => {
if (!runsLoading && runs.length === 0) {
return (
<Card variant="outlined">
<CardHeader title="No Runs Found"></CardHeader>
<CardContent>
<Typography>There are no runs in the log folder.</Typography>
</CardContent>
</Card>
)
}

if (!loaded || !run || !worker || !view || !span) {
return <FullCircularProgress />
}
Expand Down